From b1a7a91ada8388936ffff92cf1ad57b3e926f412 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 29 Aug 2010 12:13:15 -0400 Subject: [PATCH 01/67] sunrpc: don't shorten buflen twice in xdr_shrink_pagelen On Jan. 14, 2009, 2:50 +0200, andros@netapp.com wrote: > From: Andy Adamson > > The buflen is reset for all cases at the end of xdr_shrink_pagelen. > The data left in the tail after xdr_read_pages is not processed when the > buflen is incorrectly set. Note that in this case we also lose (len - tail->iov_len) bytes from the buffered data in pages. Reported-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index a1f82a87d34d..91f0de944d0f 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -407,8 +407,7 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) if (tail->iov_len > len) { copy = tail->iov_len - len; memmove(p, tail->iov_base, copy); - } else - buf->buflen -= len; + } /* Copy from the inlined pages into the tail */ copy = len; if (copy > tail->iov_len) From 0fe62a35903e11fb41b492bd5b0e8e4c751d5c94 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 29 Aug 2010 12:13:15 -0400 Subject: [PATCH 02/67] sunrpc: clean up xdr_shrink_pagelen use of temporary pointer char *p is used only as a shorthand for tail->iov_base + len in a nested block. Move it there. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 91f0de944d0f..41be21f7f7b9 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -395,7 +395,6 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) { struct kvec *tail; size_t copy; - char *p; unsigned int pglen = buf->page_len; tail = buf->tail; @@ -403,8 +402,8 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) /* Shift the tail first */ if (tail->iov_len != 0) { - p = (char *)tail->iov_base + len; if (tail->iov_len > len) { + char *p = (char *)tail->iov_base + len; copy = tail->iov_len - len; memmove(p, tail->iov_base, copy); } From 2e29ebb8119e6037133921fac09cc5f9d625b511 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 29 Aug 2010 12:13:15 -0400 Subject: [PATCH 03/67] sunrpc: don't use the copy variable in nested block to clean up the code "copy" will be set prior to the block hence it mustn't be used there. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 41be21f7f7b9..42a7ebf2a322 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -404,8 +404,7 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) if (tail->iov_len != 0) { if (tail->iov_len > len) { char *p = (char *)tail->iov_base + len; - copy = tail->iov_len - len; - memmove(p, tail->iov_base, copy); + memmove(p, tail->iov_base, tail->iov_len - len); } /* Copy from the inlined pages into the tail */ copy = len; From 42d6d8ab51ca04afcb8a64759076da624cdb71e8 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 29 Aug 2010 12:13:15 -0400 Subject: [PATCH 04/67] sunrpc: simplify xdr_shrink_pagelen use of "copy" The "copy" variable value can be computed using the existing logic rather than repeating it. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 42a7ebf2a322..3317db3cb102 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -402,14 +402,13 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) /* Shift the tail first */ if (tail->iov_len != 0) { + copy = len; if (tail->iov_len > len) { char *p = (char *)tail->iov_base + len; memmove(p, tail->iov_base, tail->iov_len - len); - } - /* Copy from the inlined pages into the tail */ - copy = len; - if (copy > tail->iov_len) + } else copy = tail->iov_len; + /* Copy from the inlined pages into the tail */ _copy_from_pages((char *)tail->iov_base, buf->pages, buf->page_base + pglen - len, copy); From cf187c2d7ec763cdd459fe43933a5cc4f5f48e1b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 29 Aug 2010 12:13:16 -0400 Subject: [PATCH 05/67] SUNRPC: Don't truncate tail data unnecessarily in xdr_shrink_pagelen If we have unused buffer space, then we should make use of that rather than unnecessarily truncating the message. Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 3317db3cb102..3bbef7f5f826 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -396,12 +396,21 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) struct kvec *tail; size_t copy; unsigned int pglen = buf->page_len; + unsigned int tailbuf_len; tail = buf->tail; BUG_ON (len > pglen); + tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; + /* Shift the tail first */ - if (tail->iov_len != 0) { + if (tailbuf_len != 0) { + unsigned int free_space = tailbuf_len - tail->iov_len; + + if (len < free_space) + free_space = len; + tail->iov_len += free_space; + copy = len; if (tail->iov_len > len) { char *p = (char *)tail->iov_base + len; From ed58b2917be24fc8603128e32d50a1378afe66e1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Sep 2010 10:54:37 -0400 Subject: [PATCH 06/67] NFS: Remove \t from mount debugging message During boot, a random character is displayed instead of a tab. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/mount_clnt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 59047f8d7d72..d610203d95c6 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -436,7 +436,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) for (i = 0; i < entries; i++) { flavors[i] = ntohl(*p++); - dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]); + dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]); } *count = i; From 60ac03685bf513f9d9b6e8e098018b35309ed326 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Sep 2010 10:54:37 -0400 Subject: [PATCH 07/67] NFS: Clean up NFSROOT command line parsing Clean up: To reduce confusion, rename nfs_root_name as nfs_root_parms, as this buffer contains more than just the name of the remote server. Introduce documenting comments around nfs_root_setup(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfsroot.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index df101d9f546a..169b67907a65 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -99,7 +99,7 @@ #define NFS_ROOT "/tftpboot/%s" /* Parameters passed from the kernel command line */ -static char nfs_root_name[256] __initdata = ""; +static char nfs_root_parms[256] __initdata = ""; /* Address of NFS server */ static __be32 servaddr __initdata = 0; @@ -369,7 +369,7 @@ static int __init root_nfs_init(void) * be able to use the client IP address for the remote root * directory (necessary for pure RARP booting). */ - if (root_nfs_name(nfs_root_name) < 0 || + if (root_nfs_name(nfs_root_parms) < 0 || root_nfs_addr() < 0) return -1; @@ -380,23 +380,37 @@ static int __init root_nfs_init(void) return 0; } - /* * Parse NFS server and directory information passed on the kernel * command line. + * + * nfsroot=[:][,] + * + * If there is a "%s" token in the string, it is replaced + * by the ASCII-representation of the client's IP address. */ static int __init nfs_root_setup(char *line) { ROOT_DEV = Root_NFS; + if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { - strlcpy(nfs_root_name, line, sizeof(nfs_root_name)); + strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms)); } else { - int n = strlen(line) + sizeof(NFS_ROOT) - 1; - if (n >= sizeof(nfs_root_name)) - line[sizeof(nfs_root_name) - sizeof(NFS_ROOT) - 2] = '\0'; - sprintf(nfs_root_name, NFS_ROOT, line); + size_t n = strlen(line) + sizeof(NFS_ROOT) - 1; + if (n >= sizeof(nfs_root_parms)) + line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0'; + sprintf(nfs_root_parms, NFS_ROOT, line); } - root_server_addr = root_nfs_parse_addr(nfs_root_name); + + /* + * Extract the IP address of the NFS server containing our + * root file system, if one was specified. + * + * Note: root_nfs_parse_addr() removes the server-ip from + * nfs_root_parms, if it exists. + */ + root_server_addr = root_nfs_parse_addr(nfs_root_parms); + return 1; } From 56463e50d1fc3f070492434cea6303b35ea000de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Sep 2010 10:54:37 -0400 Subject: [PATCH 08/67] NFS: Use super.c for NFSROOT mount option parsing Replace duplicate code in NFSROOT for mounting an NFS server on '/' with logic that uses the existing mainline text-based logic in the NFS client. Add documenting comments where appropriate. Note that this means NFSROOT mounts now use the same default settings as v2/v3 mounts done via mount(2) from user space. vers=3,tcp,rsize=,wsize= As before, however, no version/protocol negotiation with the server is done. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfsroot.c | 182 +++++++++++++++++++++++++++++++++++++++-- include/linux/nfs_fs.h | 10 +-- init/do_mounts.c | 12 +-- 3 files changed, 187 insertions(+), 17 deletions(-) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 169b67907a65..cb4a6bdca871 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -3,9 +3,10 @@ * * Allow an NFS filesystem to be mounted as root. The way this works is: * (1) Use the IP autoconfig mechanism to set local IP addresses and routes. - * (2) Handle RPC negotiation with the system which replied to RARP or - * was reported as a boot server by BOOTP or manually. - * (3) The actual mounting is done later, when init() is running. + * (2) Construct the device string and the options string using DHCP + * option 17 and/or kernel command line options. + * (3) When mount_root() sets up the root file system, pass these strings + * to the NFS client's regular mount interface via sys_mount(). * * * Changes: @@ -65,7 +66,8 @@ * Hua Qin : Support for mounting root file system via * NFS over TCP. * Fabian Frederick: Option parser rebuilt (using parser lib) -*/ + * Chuck Lever : Use super.c's text-based mount option parsing + */ #include #include @@ -101,11 +103,17 @@ /* Parameters passed from the kernel command line */ static char nfs_root_parms[256] __initdata = ""; +/* Text-based mount options passed to super.c */ +static char nfs_root_options[256] __initdata = ""; + /* Address of NFS server */ static __be32 servaddr __initdata = 0; /* Name of directory to mount */ -static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, }; +static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = ""; + +/* server:export path string passed to super.c */ +static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; /* NFS-related data */ static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ @@ -537,7 +545,7 @@ out: * Get the NFS port numbers and file handle, and return the prepared 'data' * argument for mount() if everything went OK. Return NULL otherwise. */ -void * __init nfs_root_data(void) +void * __init old_nfs_root_data(void) { if (root_nfs_init() < 0 || root_nfs_ports() < 0 @@ -546,3 +554,165 @@ void * __init nfs_root_data(void) set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port)); return (void*)&nfs_data; } + +static int __init root_nfs_copy(char *dest, const char *src, + const size_t destlen) +{ + if (strlcpy(dest, src, destlen) > destlen) + return -1; + return 0; +} + +static int __init root_nfs_cat(char *dest, const char *src, + const size_t destlen) +{ + if (strlcat(dest, src, destlen) > destlen) + return -1; + return 0; +} + +/* + * Parse out root export path and mount options from + * passed-in string @incoming. + * + * Copy the export path into @exppath. + */ +static int __init root_nfs_parse_options(char *incoming, char *exppath, + const size_t exppathlen) +{ + char *p; + + /* + * Set the NFS remote path + */ + p = strsep(&incoming, ","); + if (*p != '\0' && strcmp(p, "default") != 0) + if (root_nfs_copy(exppath, p, exppathlen)) + return -1; + + /* + * @incoming now points to the rest of the string; if it + * contains something, append it to our root options buffer + */ + if (incoming != NULL && *incoming != '\0') + if (root_nfs_cat(nfs_root_options, incoming, + sizeof(nfs_root_options))) + return -1; + + /* + * Possibly prepare for more options to be appended + */ + if (nfs_root_options[0] != '\0' && + nfs_root_options[strlen(nfs_root_options)] != ',') + if (root_nfs_cat(nfs_root_options, ",", + sizeof(nfs_root_options))) + return -1; + + return 0; +} + +/* + * Decode the export directory path name and NFS options from + * the kernel command line. This has to be done late in order to + * use a dynamically acquired client IP address for the remote + * root directory path. + * + * Returns zero if successful; otherwise -1 is returned. + */ +static int __init root_nfs_data(char *cmdline) +{ + char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; + int len, retval = -1; + char *tmp = NULL; + const size_t tmplen = sizeof(nfs_export_path); + + tmp = kzalloc(tmplen, GFP_KERNEL); + if (tmp == NULL) + goto out_nomem; + strcpy(tmp, NFS_ROOT); + + if (root_server_path[0] != '\0') { + dprintk("Root-NFS: DHCPv4 option 17: %s\n", + root_server_path); + if (root_nfs_parse_options(root_server_path, tmp, tmplen)) + goto out_optionstoolong; + } + + if (cmdline[0] != '\0') { + dprintk("Root-NFS: nfsroot=%s\n", cmdline); + if (root_nfs_parse_options(cmdline, tmp, tmplen)) + goto out_optionstoolong; + } + + /* + * Append mandatory options for nfsroot so they override + * what has come before + */ + snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4", + &servaddr); + if (root_nfs_cat(nfs_root_options, addr_option, + sizeof(nfs_root_options))) + goto out_optionstoolong; + + /* + * Set up nfs_root_device. For NFS mounts, this looks like + * + * server:/path + * + * At this point, utsname()->nodename contains our local + * IP address or hostname, set by ipconfig. If "%s" exists + * in tmp, substitute the nodename, then shovel the whole + * mess into nfs_root_device. + */ + len = snprintf(nfs_export_path, sizeof(nfs_export_path), + tmp, utsname()->nodename); + if (len > (int)sizeof(nfs_export_path)) + goto out_devnametoolong; + len = snprintf(nfs_root_device, sizeof(nfs_root_device), + "%pI4:%s", &servaddr, nfs_export_path); + if (len > (int)sizeof(nfs_root_device)) + goto out_devnametoolong; + + retval = 0; + +out: + kfree(tmp); + return retval; +out_nomem: + printk(KERN_ERR "Root-NFS: could not allocate memory\n"); + goto out; +out_optionstoolong: + printk(KERN_ERR "Root-NFS: mount options string too long\n"); + goto out; +out_devnametoolong: + printk(KERN_ERR "Root-NFS: root device name too long.\n"); + goto out; +} + +/** + * nfs_root_data - Return prepared 'data' for NFSROOT mount + * @root_device: OUT: address of string containing NFSROOT device + * @root_data: OUT: address of string containing NFSROOT mount options + * + * Returns zero and sets @root_device and @root_data if successful, + * otherwise -1 is returned. + */ +int __init nfs_root_data(char **root_device, char **root_data) +{ +#ifdef NFSROOT_DEBUG + nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT; +#endif /* NFSROOT_DEBUG */ + + servaddr = root_server_addr; + if (servaddr == htonl(INADDR_NONE)) { + printk(KERN_ERR "Root-NFS: no NFS server address\n"); + return -1; + } + + if (root_nfs_data(nfs_root_parms) < 0) + return -1; + + *root_device = nfs_root_device; + *root_data = nfs_root_options; + return 0; +} diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 508f8cf6da37..2a18f1582fa4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -364,6 +364,7 @@ extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ct extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); extern u64 nfs_compat_user_ino64(u64 fileid); extern void nfs_fattr_init(struct nfs_fattr *fattr); +extern unsigned long nfs_inc_attr_generation_counter(void); extern struct nfs_fattr *nfs_alloc_fattr(void); @@ -379,9 +380,12 @@ static inline void nfs_free_fhandle(const struct nfs_fh *fh) kfree(fh); } +/* + * linux/fs/nfs/nfsroot.c + */ +extern int nfs_root_data(char **root_device, char **root_data); /*__init*/ /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */ extern __be32 root_nfs_parse_addr(char *name); /*__init*/ -extern unsigned long nfs_inc_attr_generation_counter(void); /* * linux/fs/nfs/file.c @@ -584,10 +588,6 @@ nfs_fileid_to_ino_t(u64 fileid) return ino; } -/* NFS root */ - -extern void * nfs_root_data(void); - #define nfs_wait_event(clnt, wq, condition) \ ({ \ int __retval = wait_event_killable(wq, condition); \ diff --git a/init/do_mounts.c b/init/do_mounts.c index 02e3ca4fc527..52387f14f9b4 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -291,13 +291,13 @@ out: #ifdef CONFIG_ROOT_NFS static int __init mount_nfs_root(void) { - void *data = nfs_root_data(); + char *root_dev, *root_data; - create_dev("/dev/root", ROOT_DEV); - if (data && - do_mount_root("/dev/root", "nfs", root_mountflags, data) == 0) - return 1; - return 0; + if (nfs_root_data(&root_dev, &root_data) != 0) + return 0; + if (do_mount_root(root_dev, "nfs", root_mountflags, root_data) != 0) + return 0; + return 1; } #endif From 8d2321037896aa4868a67f45b2d6ed52b579a48a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Sep 2010 10:54:37 -0400 Subject: [PATCH 09/67] NFS: Clean up nfsroot.c Clean up: now that mount option parsing for nfsroot is handled in fs/nfs/super.c, remove code in fs/nfs/nfsroot.c that is no longer used. This includes code that constructs the legacy nfs_mount_data structure, and code that does a MNT call to the server. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfsroot.c | 419 +---------------------------------------------- 1 file changed, 1 insertion(+), 418 deletions(-) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index cb4a6bdca871..8e7d623173a9 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -71,22 +71,12 @@ #include #include -#include -#include -#include #include -#include -#include #include #include -#include -#include -#include #include -#include #include #include -#include #include "internal.h" @@ -94,9 +84,6 @@ #undef NFSROOT_DEBUG #define NFSDBG_FACILITY NFSDBG_ROOT -/* Default port to use if server is not running a portmapper */ -#define NFS_MNT_PORT 627 - /* Default path we try to mount. "%s" gets replaced by our IP address */ #define NFS_ROOT "/tftpboot/%s" @@ -107,7 +94,7 @@ static char nfs_root_parms[256] __initdata = ""; static char nfs_root_options[256] __initdata = ""; /* Address of NFS server */ -static __be32 servaddr __initdata = 0; +static __be32 servaddr __initdata = htonl(INADDR_NONE); /* Name of directory to mount */ static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = ""; @@ -115,279 +102,6 @@ static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = ""; /* server:export path string passed to super.c */ static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; -/* NFS-related data */ -static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ -static int nfs_port __initdata = 0; /* Port to connect to for NFS */ -static int mount_port __initdata = 0; /* Mount daemon port number */ - - -/*************************************************************************** - - Parsing of options - - ***************************************************************************/ - -enum { - /* Options that take integer arguments */ - Opt_port, Opt_rsize, Opt_wsize, Opt_timeo, Opt_retrans, Opt_acregmin, - Opt_acregmax, Opt_acdirmin, Opt_acdirmax, - /* Options that take no arguments */ - Opt_soft, Opt_hard, Opt_intr, - Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, - Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, Opt_udp, Opt_tcp, - Opt_acl, Opt_noacl, - /* Error token */ - Opt_err -}; - -static const match_table_t tokens __initconst = { - {Opt_port, "port=%u"}, - {Opt_rsize, "rsize=%u"}, - {Opt_wsize, "wsize=%u"}, - {Opt_timeo, "timeo=%u"}, - {Opt_retrans, "retrans=%u"}, - {Opt_acregmin, "acregmin=%u"}, - {Opt_acregmax, "acregmax=%u"}, - {Opt_acdirmin, "acdirmin=%u"}, - {Opt_acdirmax, "acdirmax=%u"}, - {Opt_soft, "soft"}, - {Opt_hard, "hard"}, - {Opt_intr, "intr"}, - {Opt_nointr, "nointr"}, - {Opt_posix, "posix"}, - {Opt_noposix, "noposix"}, - {Opt_cto, "cto"}, - {Opt_nocto, "nocto"}, - {Opt_ac, "ac"}, - {Opt_noac, "noac"}, - {Opt_lock, "lock"}, - {Opt_nolock, "nolock"}, - {Opt_v2, "nfsvers=2"}, - {Opt_v2, "v2"}, - {Opt_v3, "nfsvers=3"}, - {Opt_v3, "v3"}, - {Opt_udp, "proto=udp"}, - {Opt_udp, "udp"}, - {Opt_tcp, "proto=tcp"}, - {Opt_tcp, "tcp"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_err, NULL} - -}; - -/* - * Parse option string. - */ - -static int __init root_nfs_parse(char *name, char *buf) -{ - - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - if (!name) - return 1; - - /* Set the NFS remote path */ - p = strsep(&name, ","); - if (p[0] != '\0' && strcmp(p, "default") != 0) - strlcpy(buf, p, NFS_MAXPATHLEN); - - while ((p = strsep (&name, ",")) != NULL) { - int token; - if (!*p) - continue; - token = match_token(p, tokens, args); - - /* %u tokens only. Beware if you add new tokens! */ - if (token < Opt_soft && match_int(&args[0], &option)) - return 0; - switch (token) { - case Opt_port: - nfs_port = option; - break; - case Opt_rsize: - nfs_data.rsize = option; - break; - case Opt_wsize: - nfs_data.wsize = option; - break; - case Opt_timeo: - nfs_data.timeo = option; - break; - case Opt_retrans: - nfs_data.retrans = option; - break; - case Opt_acregmin: - nfs_data.acregmin = option; - break; - case Opt_acregmax: - nfs_data.acregmax = option; - break; - case Opt_acdirmin: - nfs_data.acdirmin = option; - break; - case Opt_acdirmax: - nfs_data.acdirmax = option; - break; - case Opt_soft: - nfs_data.flags |= NFS_MOUNT_SOFT; - break; - case Opt_hard: - nfs_data.flags &= ~NFS_MOUNT_SOFT; - break; - case Opt_intr: - case Opt_nointr: - break; - case Opt_posix: - nfs_data.flags |= NFS_MOUNT_POSIX; - break; - case Opt_noposix: - nfs_data.flags &= ~NFS_MOUNT_POSIX; - break; - case Opt_cto: - nfs_data.flags &= ~NFS_MOUNT_NOCTO; - break; - case Opt_nocto: - nfs_data.flags |= NFS_MOUNT_NOCTO; - break; - case Opt_ac: - nfs_data.flags &= ~NFS_MOUNT_NOAC; - break; - case Opt_noac: - nfs_data.flags |= NFS_MOUNT_NOAC; - break; - case Opt_lock: - nfs_data.flags &= ~NFS_MOUNT_NONLM; - break; - case Opt_nolock: - nfs_data.flags |= NFS_MOUNT_NONLM; - break; - case Opt_v2: - nfs_data.flags &= ~NFS_MOUNT_VER3; - break; - case Opt_v3: - nfs_data.flags |= NFS_MOUNT_VER3; - break; - case Opt_udp: - nfs_data.flags &= ~NFS_MOUNT_TCP; - break; - case Opt_tcp: - nfs_data.flags |= NFS_MOUNT_TCP; - break; - case Opt_acl: - nfs_data.flags &= ~NFS_MOUNT_NOACL; - break; - case Opt_noacl: - nfs_data.flags |= NFS_MOUNT_NOACL; - break; - default: - printk(KERN_WARNING "Root-NFS: unknown " - "option: %s\n", p); - return 0; - } - } - - return 1; -} - -/* - * Prepare the NFS data structure and parse all options. - */ -static int __init root_nfs_name(char *name) -{ - static char buf[NFS_MAXPATHLEN] __initdata; - char *cp; - - /* Set some default values */ - memset(&nfs_data, 0, sizeof(nfs_data)); - nfs_port = -1; - nfs_data.version = NFS_MOUNT_VERSION; - nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ - nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; - nfs_data.wsize = NFS_DEF_FILE_IO_SIZE; - nfs_data.acregmin = NFS_DEF_ACREGMIN; - nfs_data.acregmax = NFS_DEF_ACREGMAX; - nfs_data.acdirmin = NFS_DEF_ACDIRMIN; - nfs_data.acdirmax = NFS_DEF_ACDIRMAX; - strcpy(buf, NFS_ROOT); - - /* Process options received from the remote server */ - root_nfs_parse(root_server_path, buf); - - /* Override them by options set on kernel command-line */ - root_nfs_parse(name, buf); - - cp = utsname()->nodename; - if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) { - printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); - return -1; - } - sprintf(nfs_export_path, buf, cp); - - return 1; -} - - -/* - * Get NFS server address. - */ -static int __init root_nfs_addr(void) -{ - if ((servaddr = root_server_addr) == htonl(INADDR_NONE)) { - printk(KERN_ERR "Root-NFS: No NFS server available, giving up.\n"); - return -1; - } - - snprintf(nfs_data.hostname, sizeof(nfs_data.hostname), - "%pI4", &servaddr); - return 0; -} - -/* - * Tell the user what's going on. - */ -#ifdef NFSROOT_DEBUG -static void __init root_nfs_print(void) -{ - printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n", - nfs_export_path, nfs_data.hostname); - printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n", - nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans); - printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n", - nfs_data.acregmin, nfs_data.acregmax, - nfs_data.acdirmin, nfs_data.acdirmax); - printk(KERN_NOTICE "Root-NFS: nfsd port = %d, mountd port = %d, flags = %08x\n", - nfs_port, mount_port, nfs_data.flags); -} -#endif - - -static int __init root_nfs_init(void) -{ -#ifdef NFSROOT_DEBUG - nfs_debug |= NFSDBG_ROOT; -#endif - - /* - * Decode the root directory path name and NFS options from - * the kernel command line. This has to go here in order to - * be able to use the client IP address for the remote root - * directory (necessary for pure RARP booting). - */ - if (root_nfs_name(nfs_root_parms) < 0 || - root_nfs_addr() < 0) - return -1; - -#ifdef NFSROOT_DEBUG - root_nfs_print(); -#endif - - return 0; -} - /* * Parse NFS server and directory information passed on the kernel * command line. @@ -424,137 +138,6 @@ static int __init nfs_root_setup(char *line) __setup("nfsroot=", nfs_root_setup); -/*************************************************************************** - - Routines to actually mount the root directory - - ***************************************************************************/ - -/* - * Construct sockaddr_in from address and port number. - */ -static inline void -set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port) -{ - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = addr; - sin->sin_port = port; -} - -/* - * Query server portmapper for the port of a daemon program. - */ -static int __init root_nfs_getport(int program, int version, int proto) -{ - struct sockaddr_in sin; - - printk(KERN_NOTICE "Looking up port of RPC %d/%d on %pI4\n", - program, version, &servaddr); - set_sockaddr(&sin, servaddr, 0); - return rpcb_getport_sync(&sin, program, version, proto); -} - - -/* - * Use portmapper to find mountd and nfsd port numbers if not overriden - * by the user. Use defaults if portmapper is not available. - * XXX: Is there any nfs server with no portmapper? - */ -static int __init root_nfs_ports(void) -{ - int port; - int nfsd_ver, mountd_ver; - int nfsd_port, mountd_port; - int proto; - - if (nfs_data.flags & NFS_MOUNT_VER3) { - nfsd_ver = NFS3_VERSION; - mountd_ver = NFS_MNT3_VERSION; - nfsd_port = NFS_PORT; - mountd_port = NFS_MNT_PORT; - } else { - nfsd_ver = NFS2_VERSION; - mountd_ver = NFS_MNT_VERSION; - nfsd_port = NFS_PORT; - mountd_port = NFS_MNT_PORT; - } - - proto = (nfs_data.flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP; - - if (nfs_port < 0) { - if ((port = root_nfs_getport(NFS_PROGRAM, nfsd_ver, proto)) < 0) { - printk(KERN_ERR "Root-NFS: Unable to get nfsd port " - "number from server, using default\n"); - port = nfsd_port; - } - nfs_port = port; - dprintk("Root-NFS: Portmapper on server returned %d " - "as nfsd port\n", port); - } - - if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) { - printk(KERN_ERR "Root-NFS: Unable to get mountd port " - "number from server, using default\n"); - port = mountd_port; - } - mount_port = port; - dprintk("Root-NFS: mountd port is %d\n", port); - - return 0; -} - - -/* - * Get a file handle from the server for the directory which is to be - * mounted. - */ -static int __init root_nfs_get_handle(void) -{ - struct sockaddr_in sin; - unsigned int auth_flav_len = 0; - struct nfs_mount_request request = { - .sap = (struct sockaddr *)&sin, - .salen = sizeof(sin), - .dirpath = nfs_export_path, - .version = (nfs_data.flags & NFS_MOUNT_VER3) ? - NFS_MNT3_VERSION : NFS_MNT_VERSION, - .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? - XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP, - .auth_flav_len = &auth_flav_len, - }; - int status = -ENOMEM; - - request.fh = nfs_alloc_fhandle(); - if (!request.fh) - goto out; - set_sockaddr(&sin, servaddr, htons(mount_port)); - status = nfs_mount(&request); - if (status < 0) - printk(KERN_ERR "Root-NFS: Server returned error %d " - "while mounting %s\n", status, nfs_export_path); - else { - nfs_data.root.size = request.fh->size; - memcpy(&nfs_data.root.data, request.fh->data, request.fh->size); - } - nfs_free_fhandle(request.fh); -out: - return status; -} - -/* - * Get the NFS port numbers and file handle, and return the prepared 'data' - * argument for mount() if everything went OK. Return NULL otherwise. - */ -void * __init old_nfs_root_data(void) -{ - if (root_nfs_init() < 0 - || root_nfs_ports() < 0 - || root_nfs_get_handle() < 0) - return NULL; - set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port)); - return (void*)&nfs_data; -} - static int __init root_nfs_copy(char *dest, const char *src, const size_t destlen) { From 306a075362a288683f6346185f97dd0e06df19da Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Sep 2010 10:54:37 -0400 Subject: [PATCH 10/67] NFS: Allow NFSROOT debugging messages to be enabled dynamically As a convenience, introduce a kernel command line option to enable NFSROOT debugging messages. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- Documentation/filesystems/nfs/nfsroot.txt | 22 ++++++++++++++++++++++ Documentation/kernel-parameters.txt | 5 ++++- fs/nfs/nfsroot.c | 19 +++++++++++++------ 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt index f2430a7974e1..90c71c6f0d00 100644 --- a/Documentation/filesystems/nfs/nfsroot.txt +++ b/Documentation/filesystems/nfs/nfsroot.txt @@ -159,6 +159,28 @@ ip=:::::: Default: any +nfsrootdebug + + This parameter enables debugging messages to appear in the kernel + log at boot time so that administrators can verify that the correct + NFS mount options, server address, and root path are passed to the + NFS client. + + +rdinit= + + To specify which file contains the program that starts system + initialization, administrators can use this command line parameter. + The default value of this parameter is "/init". If the specified + file exists and the kernel can execute it, root filesystem related + kernel command line parameters, including `nfsroot=', are ignored. + + A description of the process of mounting the root file system can be + found in: + + Documentation/early-userspace/README + + 3.) Boot Loader diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index f084af0cb8e0..0fe70b2c4194 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1532,12 +1532,15 @@ and is between 256 and 4096 characters. It is defined in the file 1 to enable accounting Default value is 0. - nfsaddrs= [NFS] + nfsaddrs= [NFS] Deprecated. Use ip= instead. See Documentation/filesystems/nfs/nfsroot.txt. nfsroot= [NFS] nfs root filesystem for disk-less boxes. See Documentation/filesystems/nfs/nfsroot.txt. + nfsrootdebug [NFS] enable nfsroot debugging messages. + See Documentation/filesystems/nfs/nfsroot.txt. + nfs.callback_tcpport= [NFS] set the TCP port on which the NFSv4 callback channel should listen. diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 8e7d623173a9..460df3652889 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -67,6 +67,7 @@ * NFS over TCP. * Fabian Frederick: Option parser rebuilt (using parser lib) * Chuck Lever : Use super.c's text-based mount option parsing + * Chuck Lever : Add "nfsrootdebug". */ #include @@ -80,8 +81,6 @@ #include "internal.h" -/* Define this to allow debugging output */ -#undef NFSROOT_DEBUG #define NFSDBG_FACILITY NFSDBG_ROOT /* Default path we try to mount. "%s" gets replaced by our IP address */ @@ -102,6 +101,18 @@ static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = ""; /* server:export path string passed to super.c */ static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; +/* + * When the "nfsrootdebug" kernel command line option is specified, + * enable debugging messages for NFSROOT. + */ +static int __init nfs_root_debug(char *__unused) +{ + nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT; + return 1; +} + +__setup("nfsrootdebug", nfs_root_debug); + /* * Parse NFS server and directory information passed on the kernel * command line. @@ -282,10 +293,6 @@ out_devnametoolong: */ int __init nfs_root_data(char **root_device, char **root_data) { -#ifdef NFSROOT_DEBUG - nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT; -#endif /* NFSROOT_DEBUG */ - servaddr = root_server_addr; if (servaddr == htonl(INADDR_NONE)) { printk(KERN_ERR "Root-NFS: no NFS server address\n"); From 859d5024f450686ad0a42ed3c06f2fa20295c9e6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Sep 2010 10:54:37 -0400 Subject: [PATCH 11/67] SUNRPC: Remove rpcb_getport_sync() Clean up: rpcb_getport_sync() has no more users, so remove it. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 - net/sunrpc/rpcb_clnt.c | 51 ------------------------------------- 2 files changed, 52 deletions(-) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 569dc722a600..a1a40f0c1856 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -137,7 +137,6 @@ int rpcb_register(u32, u32, int, unsigned short); int rpcb_v4_register(const u32 program, const u32 version, const struct sockaddr *address, const char *netid); -int rpcb_getport_sync(struct sockaddr_in *, u32, u32, int); void rpcb_getport_async(struct rpc_task *); void rpc_call_start(struct rpc_task *); diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index dac219a56ae1..c961094ebc62 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -475,57 +475,6 @@ int rpcb_v4_register(const u32 program, const u32 version, return -EAFNOSUPPORT; } -/** - * rpcb_getport_sync - obtain the port for an RPC service on a given host - * @sin: address of remote peer - * @prog: RPC program number to bind - * @vers: RPC version number to bind - * @prot: transport protocol to use to make this request - * - * Return value is the requested advertised port number, - * or a negative errno value. - * - * Called from outside the RPC client in a synchronous task context. - * Uses default timeout parameters specified by underlying transport. - * - * XXX: Needs to support IPv6 - */ -int rpcb_getport_sync(struct sockaddr_in *sin, u32 prog, u32 vers, int prot) -{ - struct rpcbind_args map = { - .r_prog = prog, - .r_vers = vers, - .r_prot = prot, - .r_port = 0, - }; - struct rpc_message msg = { - .rpc_proc = &rpcb_procedures2[RPCBPROC_GETPORT], - .rpc_argp = &map, - .rpc_resp = &map, - }; - struct rpc_clnt *rpcb_clnt; - int status; - - dprintk("RPC: %s(%pI4, %u, %u, %d)\n", - __func__, &sin->sin_addr.s_addr, prog, vers, prot); - - rpcb_clnt = rpcb_create(NULL, (struct sockaddr *)sin, - sizeof(*sin), prot, RPCBVERS_2); - if (IS_ERR(rpcb_clnt)) - return PTR_ERR(rpcb_clnt); - - status = rpc_call_sync(rpcb_clnt, &msg, 0); - rpc_shutdown_client(rpcb_clnt); - - if (status >= 0) { - if (map.r_port != 0) - return map.r_port; - status = -EACCES; - } - return status; -} -EXPORT_SYMBOL_GPL(rpcb_getport_sync); - static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbind_args *map, struct rpc_procinfo *proc) { struct rpc_message msg = { From cd9a1c0e5ac681871d64804f82291649e2a0accb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Sep 2010 10:56:50 -0400 Subject: [PATCH 12/67] NFSv4: Clean up nfs4_atomic_open Start moving the 'struct nameidata' dependent code out of the lower level NFS code in preparation for the removal of open intents. Instead of the struct nameidata, we pass down a partially initialised struct nfs_open_context that will be fully initialised by the atomic open upon success. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 77 ++++++++++++++++++++++++++++++++++++++++-- fs/nfs/inode.c | 8 ++--- fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4proc.c | 40 ++++++---------------- include/linux/nfs_fs.h | 2 ++ 5 files changed, 93 insertions(+), 36 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e257172d438c..17529b5bc551 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -38,6 +38,7 @@ #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" /* #define NFS_DEBUG_VERBOSE 1 */ @@ -1029,9 +1030,61 @@ static int is_atomic_open(struct nameidata *nd) return 1; } +static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd) +{ + struct path path = { + .mnt = nd->path.mnt, + .dentry = dentry, + }; + struct nfs_open_context *ctx; + struct rpc_cred *cred; + fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return ERR_CAST(cred); + ctx = alloc_nfs_open_context(&path, cred, fmode); + put_rpccred(cred); + if (ctx == NULL) + return ERR_PTR(-ENOMEM); + return ctx; +} + +static int do_open(struct inode *inode, struct file *filp) +{ + nfs_fscache_set_inode_cookie(inode, filp); + return 0; +} + +static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx) +{ + struct file *filp; + int ret = 0; + + /* If the open_intent is for execute, we have an extra check to make */ + if (ctx->mode & FMODE_EXEC) { + ret = nfs_may_open(ctx->path.dentry->d_inode, + ctx->cred, + nd->intent.open.flags); + if (ret < 0) + goto out; + } + filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open); + if (IS_ERR(filp)) + ret = PTR_ERR(filp); + else + nfs_file_set_open_context(filp, ctx); +out: + put_nfs_open_context(ctx); + return ret; +} + static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { + struct nfs_open_context *ctx; + struct iattr attr; struct dentry *res = NULL; + int open_flags; int error; dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", @@ -1054,9 +1107,27 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry goto out; } + ctx = nameidata_to_nfs_open_context(dentry, nd); + res = ERR_CAST(ctx); + if (IS_ERR(ctx)) + goto out; + + open_flags = nd->intent.open.flags; + if (nd->flags & LOOKUP_CREATE) { + attr.ia_mode = nd->intent.open.create_mode; + attr.ia_valid = ATTR_MODE; + if (!IS_POSIXACL(dir)) + attr.ia_mode &= ~current_umask(); + } else { + open_flags &= ~O_EXCL; + attr.ia_valid = 0; + BUG_ON(open_flags & O_CREAT); + } + /* Open the file on the server */ - res = nfs4_atomic_open(dir, dentry, nd); + res = nfs4_atomic_open(dir, ctx, open_flags, &attr); if (IS_ERR(res)) { + put_nfs_open_context(ctx); error = PTR_ERR(res); switch (error) { /* Make a negative dentry */ @@ -1074,8 +1145,10 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry default: goto out; } - } else if (res != NULL) + } + if (res != NULL) dentry = res; + nfs_intent_set_file(nd, ctx); out: return res; no_open: diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7d2d6c72aa78..2f9266406afc 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -623,7 +623,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) nfs_revalidate_inode(server, inode); } -static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred) +struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode) { struct nfs_open_context *ctx; @@ -633,6 +633,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct path_get(&ctx->path); ctx->cred = get_rpccred(cred); ctx->state = NULL; + ctx->mode = f_mode; ctx->flags = 0; ctx->error = 0; ctx->dir_cookie = 0; @@ -673,7 +674,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx) * Ensure that mmap has a recent RPC credential for use when writing out * shared pages */ -static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) { struct inode *inode = filp->f_path.dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); @@ -730,11 +731,10 @@ int nfs_open(struct inode *inode, struct file *filp) cred = rpc_lookup_cred(); if (IS_ERR(cred)) return PTR_ERR(cred); - ctx = alloc_nfs_open_context(&filp->f_path, cred); + ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode); put_rpccred(cred); if (ctx == NULL) return -ENOMEM; - ctx->mode = filp->f_mode; nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); nfs_fscache_set_inode_cookie(inode, filp); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 311e15cc8af0..c5cc2a6aceb0 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -242,7 +242,7 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); -extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); +extern struct dentry *nfs4_atomic_open(struct inode *, struct nfs_open_context *, int, struct iattr *); extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 089da5b5d20a..38c3bed2240d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2025,39 +2025,17 @@ out_close: } struct dentry * -nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) { - struct path path = { - .mnt = nd->path.mnt, - .dentry = dentry, - }; + struct dentry *dentry = ctx->path.dentry; struct dentry *parent; - struct iattr attr; - struct rpc_cred *cred; struct nfs4_state *state; struct dentry *res; - int open_flags = nd->intent.open.flags; - fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); - if (nd->flags & LOOKUP_CREATE) { - attr.ia_mode = nd->intent.open.create_mode; - attr.ia_valid = ATTR_MODE; - if (!IS_POSIXACL(dir)) - attr.ia_mode &= ~current_umask(); - } else { - open_flags &= ~O_EXCL; - attr.ia_valid = 0; - BUG_ON(open_flags & O_CREAT); - } - - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return (struct dentry *)cred; parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ nfs_block_sillyrename(parent); - state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred); - put_rpccred(cred); + state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred); if (IS_ERR(state)) { if (PTR_ERR(state) == -ENOENT) { d_add(dentry, NULL); @@ -2067,11 +2045,15 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) return (struct dentry *)state; } res = d_add_unique(dentry, igrab(state->inode)); - if (res != NULL) - path.dentry = res; - nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir)); + if (res != NULL) { + struct dentry *dummy = ctx->path.dentry; + + ctx->path.dentry = dget(res); + dput(dummy); + } + ctx->state = state; + nfs_set_verifier(ctx->path.dentry, nfs_save_change_attribute(dir)); nfs_unblock_sillyrename(parent); - nfs4_intent_set_file(nd, &path, state, fmode); return res; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 2a18f1582fa4..61c89b4ad7c6 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -360,6 +360,8 @@ extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); extern void put_nfs_open_context(struct nfs_open_context *ctx); extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); +extern struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode); +extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx); extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); extern u64 nfs_compat_user_ino64(u64 fileid); From f46e0bd34ec002d0727761da52b8fd47f06d4440 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Sep 2010 10:56:50 -0400 Subject: [PATCH 13/67] NFSv4: Further minor cleanups for nfs4_atomic_open() Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 21 +++++++++++++++------ fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4proc.c | 28 ++++------------------------ 3 files changed, 20 insertions(+), 31 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 17529b5bc551..194676cc5b04 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1084,8 +1084,8 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry struct nfs_open_context *ctx; struct iattr attr; struct dentry *res = NULL; + struct inode *inode; int open_flags; - int error; dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); @@ -1125,13 +1125,15 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry } /* Open the file on the server */ - res = nfs4_atomic_open(dir, ctx, open_flags, &attr); - if (IS_ERR(res)) { + nfs_block_sillyrename(dentry->d_parent); + inode = nfs4_atomic_open(dir, ctx, open_flags, &attr); + if (IS_ERR(inode)) { + nfs_unblock_sillyrename(dentry->d_parent); put_nfs_open_context(ctx); - error = PTR_ERR(res); - switch (error) { + switch (PTR_ERR(inode)) { /* Make a negative dentry */ case -ENOENT: + d_add(dentry, NULL); res = NULL; goto out; /* This turned out not to be a regular file */ @@ -1143,13 +1145,20 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry goto no_open; /* case -EINVAL: */ default: + res = ERR_CAST(inode); goto out; } } - if (res != NULL) + res = d_add_unique(dentry, inode); + if (res != NULL) { + dput(ctx->path.dentry); + ctx->path.dentry = dget(res); dentry = res; + } nfs_intent_set_file(nd, ctx); + nfs_unblock_sillyrename(dentry->d_parent); out: + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); return res; no_open: return nfs_lookup(dir, dentry, nd); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c5cc2a6aceb0..8e1f6396d3de 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -242,7 +242,7 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); -extern struct dentry *nfs4_atomic_open(struct inode *, struct nfs_open_context *, int, struct iattr *); +extern struct inode *nfs4_atomic_open(struct inode *, struct nfs_open_context *, int, struct iattr *); extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 38c3bed2240d..f8d41eed9a6b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2024,37 +2024,17 @@ out_close: return ret; } -struct dentry * +struct inode * nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) { - struct dentry *dentry = ctx->path.dentry; - struct dentry *parent; struct nfs4_state *state; - struct dentry *res; - parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ - nfs_block_sillyrename(parent); state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred); - if (IS_ERR(state)) { - if (PTR_ERR(state) == -ENOENT) { - d_add(dentry, NULL); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - } - nfs_unblock_sillyrename(parent); - return (struct dentry *)state; - } - res = d_add_unique(dentry, igrab(state->inode)); - if (res != NULL) { - struct dentry *dummy = ctx->path.dentry; - - ctx->path.dentry = dget(res); - dput(dummy); - } + if (IS_ERR(state)) + return ERR_CAST(state); ctx->state = state; - nfs_set_verifier(ctx->path.dentry, nfs_save_change_attribute(dir)); - nfs_unblock_sillyrename(parent); - return res; + return igrab(state->inode); } int From b8d4caddd871758ffa156be51b4c8be82fea470d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Sep 2010 10:56:51 -0400 Subject: [PATCH 14/67] NFSv4: Clean up nfs4_open_revalidate Remove references to 'struct nameidata' from the low-level open_revalidate code, and replace them with a struct nfs_open_context which will be correctly initialised upon success. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 11 ++++++++++- fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4proc.c | 23 ++++++----------------- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 194676cc5b04..196775c70948 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1169,6 +1169,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) struct dentry *parent = NULL; struct inode *inode = dentry->d_inode; struct inode *dir; + struct nfs_open_context *ctx; int openflags, ret = 0; if (!is_atomic_open(nd) || d_mountpoint(dentry)) @@ -1194,12 +1195,20 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) /* We can't create new files, or truncate existing ones here */ openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); + ctx = nameidata_to_nfs_open_context(dentry, nd); + ret = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out; /* * Note: we're not holding inode->i_mutex and so may be racing with * operations that change the directory. We therefore save the * change attribute *before* we do the RPC call. */ - ret = nfs4_open_revalidate(dir, dentry, openflags, nd); + ret = nfs4_open_revalidate(dir, ctx, openflags); + if (ret == 1) + nfs_intent_set_file(nd, ctx); + else + put_nfs_open_context(ctx); out: dput(parent); if (!ret) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 8e1f6396d3de..76ec1c6d92ac 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -243,7 +243,7 @@ extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); extern struct inode *nfs4_atomic_open(struct inode *, struct nfs_open_context *, int, struct iattr *); -extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); +extern int nfs4_open_revalidate(struct inode *, struct nfs_open_context *, int); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f8d41eed9a6b..b4762463b19f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2038,21 +2038,11 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags } int -nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd) +nfs4_open_revalidate(struct inode *dir, struct nfs_open_context *ctx, int openflags) { - struct path path = { - .mnt = nd->path.mnt, - .dentry = dentry, - }; - struct rpc_cred *cred; struct nfs4_state *state; - fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE); - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return PTR_ERR(cred); - state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred); - put_rpccred(cred); + state = nfs4_do_open(dir, &ctx->path, ctx->mode, openflags, NULL, ctx->cred); if (IS_ERR(state)) { switch (PTR_ERR(state)) { case -EPERM: @@ -2065,14 +2055,13 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st goto out_drop; } } - if (state->inode == dentry->d_inode) { - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - nfs4_intent_set_file(nd, &path, state, fmode); + ctx->state = state; + if (state->inode == ctx->path.dentry->d_inode) { + nfs_set_verifier(ctx->path.dentry, nfs_save_change_attribute(dir)); return 1; } - nfs4_close_sync(&path, state, fmode); out_drop: - d_drop(dentry); + d_drop(ctx->path.dentry); return 0; } From 535918f14176396646b5547b7d1353c932f24f5e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Sep 2010 10:56:51 -0400 Subject: [PATCH 15/67] NFSv4: Further cleanups for nfs4_open_revalidate() Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 32 ++++++++++++++++++++++++++------ fs/nfs/nfs4_fs.h | 1 - fs/nfs/nfs4proc.c | 28 ---------------------------- 3 files changed, 26 insertions(+), 35 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 196775c70948..dc93d356341b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1204,16 +1204,36 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) * operations that change the directory. We therefore save the * change attribute *before* we do the RPC call. */ - ret = nfs4_open_revalidate(dir, ctx, openflags); - if (ret == 1) + inode = nfs4_atomic_open(dir, ctx, openflags, NULL); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + switch (ret) { + case -EPERM: + case -EACCES: + case -EDQUOT: + case -ENOSPC: + case -EROFS: + goto out_put_ctx; + default: + goto out_drop; + } + } + iput(inode); + if (inode == dentry->d_inode) { + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); nfs_intent_set_file(nd, ctx); - else - put_nfs_open_context(ctx); + } else + goto out_drop; out: dput(parent); - if (!ret) - d_drop(dentry); return ret; +out_drop: + d_drop(dentry); + ret = 0; +out_put_ctx: + put_nfs_open_context(ctx); + goto out; + no_open_dput: dput(parent); no_open: diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 76ec1c6d92ac..6cf12ba9f4e7 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -243,7 +243,6 @@ extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); extern struct inode *nfs4_atomic_open(struct inode *, struct nfs_open_context *, int, struct iattr *); -extern int nfs4_open_revalidate(struct inode *, struct nfs_open_context *, int); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b4762463b19f..83c5ef6e7cef 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2037,34 +2037,6 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags return igrab(state->inode); } -int -nfs4_open_revalidate(struct inode *dir, struct nfs_open_context *ctx, int openflags) -{ - struct nfs4_state *state; - - state = nfs4_do_open(dir, &ctx->path, ctx->mode, openflags, NULL, ctx->cred); - if (IS_ERR(state)) { - switch (PTR_ERR(state)) { - case -EPERM: - case -EACCES: - case -EDQUOT: - case -ENOSPC: - case -EROFS: - return PTR_ERR(state); - default: - goto out_drop; - } - } - ctx->state = state; - if (state->inode == ctx->path.dentry->d_inode) { - nfs_set_verifier(ctx->path.dentry, nfs_save_change_attribute(dir)); - return 1; - } -out_drop: - d_drop(ctx->path.dentry); - return 0; -} - static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) { if (ctx->state == NULL) From c0204fd2b8fe047b18b67e07e1bf2a03691240cd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Sep 2010 10:56:51 -0400 Subject: [PATCH 16/67] NFS: Clean up nfs4_proc_create() Remove all remaining references to the struct nameidata from the low level NFS layers. Again pass down a partially initialised struct nfs_open_context when we want to do atomic open+create. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 47 +++++++++++++++++++++++++++++----- fs/nfs/nfs3proc.c | 2 +- fs/nfs/nfs4proc.c | 56 +++++++++++------------------------------ fs/nfs/proc.c | 2 +- include/linux/nfs_xdr.h | 2 +- 5 files changed, 58 insertions(+), 51 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index dc93d356341b..e37ffddd79c2 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -105,8 +105,9 @@ const struct inode_operations nfs3_dir_inode_operations = { #ifdef CONFIG_NFS_V4 static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); +static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd); const struct inode_operations nfs4_dir_inode_operations = { - .create = nfs_create, + .create = nfs_open_create, .lookup = nfs_atomic_lookup, .link = nfs_link, .unlink = nfs_unlink, @@ -1239,6 +1240,44 @@ no_open_dput: no_open: return nfs_lookup_revalidate(dentry, nd); } + +static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct nfs_open_context *ctx = NULL; + struct iattr attr; + int error; + int open_flags = 0; + + dfprintk(VFS, "NFS: create(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + attr.ia_mode = mode; + attr.ia_valid = ATTR_MODE; + + if ((nd->flags & LOOKUP_CREATE) != 0) { + open_flags = nd->intent.open.flags; + + ctx = nameidata_to_nfs_open_context(dentry, nd); + error = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out_err; + } + + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx); + if (error != 0) + goto out_put_ctx; + if (ctx != NULL) + nfs_intent_set_file(nd, ctx); + return 0; +out_put_ctx: + if (ctx != NULL) + put_nfs_open_context(ctx); +out_err: + d_drop(dentry); + return error; +} + #endif /* CONFIG_NFSV4 */ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) @@ -1369,7 +1408,6 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, { struct iattr attr; int error; - int open_flags = 0; dfprintk(VFS, "NFS: create(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); @@ -1377,10 +1415,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - if ((nd->flags & LOOKUP_CREATE) != 0) - open_flags = nd->intent.open.flags; - - error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); + error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL); if (error != 0) goto out_err; return 0; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index fabb4f2849a1..2be4a7f59f1c 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -313,7 +313,7 @@ static void nfs3_free_createdata(struct nfs3_createdata *data) */ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags, struct nameidata *nd) + int flags, struct nfs_open_context *ctx) { struct nfs3_createdata *data; mode_t mode = sattr->ia_mode; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 83c5ef6e7cef..617b149ee16d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1998,32 +1998,6 @@ out: return status; } -static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode) -{ - struct file *filp; - int ret; - - /* If the open_intent is for execute, we have an extra check to make */ - if (fmode & FMODE_EXEC) { - ret = nfs_may_open(state->inode, - state->owner->so_cred, - nd->intent.open.flags); - if (ret < 0) - goto out_close; - } - filp = lookup_instantiate_filp(nd, path->dentry, NULL); - if (!IS_ERR(filp)) { - struct nfs_open_context *ctx; - ctx = nfs_file_open_context(filp); - ctx->state = state; - return 0; - } - ret = PTR_ERR(filp); -out_close: - nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE)); - return ret; -} - struct inode * nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) { @@ -2491,36 +2465,34 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page, static int nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags, struct nameidata *nd) + int flags, struct nfs_open_context *ctx) { - struct path path = { - .mnt = nd->path.mnt, + struct path my_path = { .dentry = dentry, }; + struct path *path = &my_path; struct nfs4_state *state; - struct rpc_cred *cred; - fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE); + struct rpc_cred *cred = NULL; + fmode_t fmode = 0; int status = 0; - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) { - status = PTR_ERR(cred); - goto out; + if (ctx != NULL) { + cred = ctx->cred; + path = &ctx->path; + fmode = ctx->mode; } - state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred); + state = nfs4_do_open(dir, path, fmode, flags, sattr, cred); d_drop(dentry); if (IS_ERR(state)) { status = PTR_ERR(state); - goto out_putcred; + goto out; } d_add(dentry, igrab(state->inode)); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) - status = nfs4_intent_set_file(nd, &path, state, fmode); + if (ctx != NULL) + ctx->state = state; else - nfs4_close_sync(&path, state, fmode); -out_putcred: - put_rpccred(cred); + nfs4_close_sync(path, state, fmode); out: return status; } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 611bec22f552..4ef39ae88ea5 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -258,7 +258,7 @@ static void nfs_free_createdata(const struct nfs_createdata *data) static int nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags, struct nameidata *nd) + int flags, struct nfs_open_context *ctx) { struct nfs_createdata *data; struct rpc_message msg = { diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index fc461926c412..b1484dad7bef 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1032,7 +1032,7 @@ struct nfs_rpc_ops { int (*readlink)(struct inode *, struct page *, unsigned int, unsigned int); int (*create) (struct inode *, struct dentry *, - struct iattr *, int, struct nameidata *); + struct iattr *, int, struct nfs_open_context *); int (*remove) (struct inode *, struct qstr *); void (*unlink_setup) (struct rpc_message *, struct inode *dir); int (*unlink_done) (struct rpc_task *, struct inode *); From 2b484297e48c3fbb1846fc6ea10036d9465273e7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Sep 2010 10:56:51 -0400 Subject: [PATCH 17/67] NFS: Add an 'open_context' element to struct nfs_rpc_ops Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 7 ++++--- fs/nfs/nfs4_fs.h | 1 - fs/nfs/nfs4proc.c | 3 ++- include/linux/nfs_xdr.h | 4 ++++ 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e37ffddd79c2..1e9e18813ad7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -34,7 +34,6 @@ #include #include -#include "nfs4_fs.h" #include "delegation.h" #include "iostat.h" #include "internal.h" @@ -1127,7 +1126,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry /* Open the file on the server */ nfs_block_sillyrename(dentry->d_parent); - inode = nfs4_atomic_open(dir, ctx, open_flags, &attr); + inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); if (IS_ERR(inode)) { nfs_unblock_sillyrename(dentry->d_parent); put_nfs_open_context(ctx); @@ -1175,8 +1174,10 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) if (!is_atomic_open(nd) || d_mountpoint(dentry)) goto no_open; + parent = dget_parent(dentry); dir = parent->d_inode; + /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. */ @@ -1205,7 +1206,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) * operations that change the directory. We therefore save the * change attribute *before* we do the RPC call. */ - inode = nfs4_atomic_open(dir, ctx, openflags, NULL); + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL); if (IS_ERR(inode)) { ret = PTR_ERR(inode); switch (ret) { diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 6cf12ba9f4e7..d24a8e07b5e2 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -242,7 +242,6 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); -extern struct inode *nfs4_atomic_open(struct inode *, struct nfs_open_context *, int, struct iattr *); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 617b149ee16d..643abd26f2de 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1998,7 +1998,7 @@ out: return status; } -struct inode * +static struct inode * nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) { struct nfs4_state *state; @@ -5358,6 +5358,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, .close_context = nfs4_close_context, + .open_context = nfs4_atomic_open, }; /* diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index b1484dad7bef..6f345f8af4ae 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1065,6 +1065,10 @@ struct nfs_rpc_ops { int (*lock_check_bounds)(const struct file_lock *); void (*clear_acl_cache)(struct inode *); void (*close_context)(struct nfs_open_context *ctx, int); + struct inode * (*open_context) (struct inode *dir, + struct nfs_open_context *ctx, + int open_flags, + struct iattr *iattr); }; /* From 920769f031a8aff87b66bdf49d1a0d0988241ef9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 17 Sep 2010 17:30:25 -0400 Subject: [PATCH 18/67] nfs: standardize the rename args container Each NFS version has its own version of the rename args container. Standardize them on a common one that's identical to the one NFSv4 uses. Signed-off-by: Jeff Layton Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs2xdr.c | 8 ++++---- fs/nfs/nfs3proc.c | 12 +++++------- fs/nfs/nfs3xdr.c | 10 +++++----- fs/nfs/nfs4proc.c | 2 +- fs/nfs/nfs4xdr.c | 2 +- fs/nfs/proc.c | 10 ++++------ include/linux/nfs_xdr.h | 39 ++++++++++++--------------------------- 7 files changed, 32 insertions(+), 51 deletions(-) diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index db8846a0e82e..e15bc0306d0c 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -337,10 +337,10 @@ nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args) static int nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) { - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_array(p, args->fromname, args->fromlen); - p = xdr_encode_fhandle(p, args->tofh); - p = xdr_encode_array(p, args->toname, args->tolen); + p = xdr_encode_fhandle(p, args->old_dir); + p = xdr_encode_array(p, args->old_name->name, args->old_name->len); + p = xdr_encode_fhandle(p, args->new_dir); + p = xdr_encode_array(p, args->new_name->name, args->new_name->len); req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); return 0; } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 2be4a7f59f1c..6dc4ef66f074 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -442,13 +442,11 @@ static int nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, struct inode *new_dir, struct qstr *new_name) { - struct nfs3_renameargs arg = { - .fromfh = NFS_FH(old_dir), - .fromname = old_name->name, - .fromlen = old_name->len, - .tofh = NFS_FH(new_dir), - .toname = new_name->name, - .tolen = new_name->len + struct nfs_renameargs arg = { + .old_dir = NFS_FH(old_dir), + .old_name = old_name, + .new_dir = NFS_FH(new_dir), + .new_name = new_name, }; struct nfs3_renameres res; struct rpc_message msg = { diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 9769704f8ce6..f385759eb35b 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -442,12 +442,12 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args) * Encode RENAME arguments */ static int -nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args) +nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) { - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_array(p, args->fromname, args->fromlen); - p = xdr_encode_fhandle(p, args->tofh); - p = xdr_encode_array(p, args->toname, args->tolen); + p = xdr_encode_fhandle(p, args->old_dir); + p = xdr_encode_array(p, args->old_name->name, args->old_name->len); + p = xdr_encode_fhandle(p, args->new_dir); + p = xdr_encode_array(p, args->new_name->name, args->new_name->len); req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); return 0; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 643abd26f2de..3aa13c1b8dd8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2570,7 +2570,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, struct inode *new_dir, struct qstr *new_name) { struct nfs_server *server = NFS_SERVER(old_dir); - struct nfs4_rename_arg arg = { + struct nfs_renameargs arg = { .old_dir = NFS_FH(old_dir), .new_dir = NFS_FH(new_dir), .old_name = old_name, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 08ef91291132..7a098ae07a1b 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1823,7 +1823,7 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs /* * Encode RENAME request */ -static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args) +static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args) { struct xdr_stream xdr; struct compound_hdr hdr = { diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 4ef39ae88ea5..0aff10c3bbce 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -370,12 +370,10 @@ nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, struct inode *new_dir, struct qstr *new_name) { struct nfs_renameargs arg = { - .fromfh = NFS_FH(old_dir), - .fromname = old_name->name, - .fromlen = old_name->len, - .tofh = NFS_FH(new_dir), - .toname = new_name->name, - .tolen = new_name->len + .old_dir = NFS_FH(old_dir), + .old_name = old_name, + .new_dir = NFS_FH(new_dir), + .new_name = new_name, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_RENAME], diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6f345f8af4ae..acb95fb27bcc 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -399,6 +399,18 @@ struct nfs_removeres { struct nfs4_sequence_res seq_res; }; +/* + * Common arguments to the rename call + */ +struct nfs_renameargs { + const struct nfs_fh *old_dir; + const struct nfs_fh *new_dir; + const struct qstr *old_name; + const struct qstr *new_name; + const u32 *bitmask; + struct nfs4_sequence_args seq_args; +}; + /* * Argument struct for decode_entry function */ @@ -434,15 +446,6 @@ struct nfs_createargs { struct iattr * sattr; }; -struct nfs_renameargs { - struct nfs_fh * fromfh; - const char * fromname; - unsigned int fromlen; - struct nfs_fh * tofh; - const char * toname; - unsigned int tolen; -}; - struct nfs_setattrargs { struct nfs_fh * fh; nfs4_stateid stateid; @@ -586,15 +589,6 @@ struct nfs3_mknodargs { dev_t rdev; }; -struct nfs3_renameargs { - struct nfs_fh * fromfh; - const char * fromname; - unsigned int fromlen; - struct nfs_fh * tofh; - const char * toname; - unsigned int tolen; -}; - struct nfs3_linkargs { struct nfs_fh * fromfh; struct nfs_fh * tofh; @@ -801,15 +795,6 @@ struct nfs4_readlink_res { struct nfs4_sequence_res seq_res; }; -struct nfs4_rename_arg { - const struct nfs_fh * old_dir; - const struct nfs_fh * new_dir; - const struct qstr * old_name; - const struct qstr * new_name; - const u32 * bitmask; - struct nfs4_sequence_args seq_args; -}; - struct nfs4_rename_res { const struct nfs_server * server; struct nfs4_change_info old_cinfo; From e8582a8b96f329083b4da29aa87bc43cc0d80dd1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 17 Sep 2010 17:31:06 -0400 Subject: [PATCH 19/67] nfs: standardize the rename response container Right now, v3 and v4 have their own variants. Create a standard struct that will work for v3 and v4. v2 doesn't get anything but a simple error and so isn't affected by this. Signed-off-by: Jeff Layton Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs3proc.c | 16 ++++++++-------- fs/nfs/nfs3xdr.c | 6 +++--- fs/nfs/nfs4proc.c | 2 +- fs/nfs/nfs4xdr.c | 2 +- include/linux/nfs_xdr.h | 23 +++++++++-------------- 5 files changed, 22 insertions(+), 27 deletions(-) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 6dc4ef66f074..bb41d88e1567 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -448,7 +448,7 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, .new_dir = NFS_FH(new_dir), .new_name = new_name, }; - struct nfs3_renameres res; + struct nfs_renameres res; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], .rpc_argp = &arg, @@ -458,17 +458,17 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); - res.fromattr = nfs_alloc_fattr(); - res.toattr = nfs_alloc_fattr(); - if (res.fromattr == NULL || res.toattr == NULL) + res.old_fattr = nfs_alloc_fattr(); + res.new_fattr = nfs_alloc_fattr(); + if (res.old_fattr == NULL || res.new_fattr == NULL) goto out; status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); - nfs_post_op_update_inode(old_dir, res.fromattr); - nfs_post_op_update_inode(new_dir, res.toattr); + nfs_post_op_update_inode(old_dir, res.old_fattr); + nfs_post_op_update_inode(new_dir, res.new_fattr); out: - nfs_free_fattr(res.toattr); - nfs_free_fattr(res.fromattr); + nfs_free_fattr(res.old_fattr); + nfs_free_fattr(res.new_fattr); dprintk("NFS reply rename: %d\n", status); return status; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index f385759eb35b..89c40f8ec6e6 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -970,14 +970,14 @@ nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) * Decode RENAME reply */ static int -nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res) +nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res) { int status; if ((status = ntohl(*p++)) != 0) status = nfs_stat_to_errno(status); - p = xdr_decode_wcc_data(p, res->fromattr); - p = xdr_decode_wcc_data(p, res->toattr); + p = xdr_decode_wcc_data(p, res->old_fattr); + p = xdr_decode_wcc_data(p, res->new_fattr); return status; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3aa13c1b8dd8..a3c21cc4677b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2577,7 +2577,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, .new_name = new_name, .bitmask = server->attr_bitmask, }; - struct nfs4_rename_res res = { + struct nfs_renameres res = { .server = server, }; struct rpc_message msg = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7a098ae07a1b..b0bd7efe8100 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4873,7 +4873,7 @@ out: /* * Decode RENAME response */ -static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res) +static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res) { struct xdr_stream xdr; struct compound_hdr hdr; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index acb95fb27bcc..9ad132e13d12 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -411,6 +411,15 @@ struct nfs_renameargs { struct nfs4_sequence_args seq_args; }; +struct nfs_renameres { + const struct nfs_server *server; + struct nfs4_change_info old_cinfo; + struct nfs_fattr *old_fattr; + struct nfs4_change_info new_cinfo; + struct nfs_fattr *new_fattr; + struct nfs4_sequence_res seq_res; +}; + /* * Argument struct for decode_entry function */ @@ -623,11 +632,6 @@ struct nfs3_readlinkargs { struct page ** pages; }; -struct nfs3_renameres { - struct nfs_fattr * fromattr; - struct nfs_fattr * toattr; -}; - struct nfs3_linkres { struct nfs_fattr * dir_attr; struct nfs_fattr * fattr; @@ -795,15 +799,6 @@ struct nfs4_readlink_res { struct nfs4_sequence_res seq_res; }; -struct nfs4_rename_res { - const struct nfs_server * server; - struct nfs4_change_info old_cinfo; - struct nfs_fattr * old_fattr; - struct nfs4_change_info new_cinfo; - struct nfs_fattr * new_fattr; - struct nfs4_sequence_res seq_res; -}; - #define NFS4_SETCLIENTID_NAMELEN (127) struct nfs4_setclientid { const nfs4_verifier * sc_verifier; From 779c51795bfb35c2403c924b9de90ca9356bc693 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 17 Sep 2010 17:31:30 -0400 Subject: [PATCH 20/67] nfs: move nfs_sillyrename to unlink.c ...since that's where most of the sillyrenaming code lives. A comment block is added to the beginning as well to clarify how sillyrenaming works. Also, make nfs_async_unlink static as nfs_sillyrename is the only caller. Signed-off-by: Jeff Layton Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 70 ---------------------------------- fs/nfs/unlink.c | 85 +++++++++++++++++++++++++++++++++++++++++- include/linux/nfs_fs.h | 2 +- 3 files changed, 85 insertions(+), 72 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1e9e18813ad7..86f1d41c6078 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1498,76 +1498,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry) return error; } -static int nfs_sillyrename(struct inode *dir, struct dentry *dentry) -{ - static unsigned int sillycounter; - const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2; - const int countersize = sizeof(sillycounter)*2; - const int slen = sizeof(".nfs")+fileidsize+countersize-1; - char silly[slen+1]; - struct qstr qsilly; - struct dentry *sdentry; - int error = -EIO; - - dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", - dentry->d_parent->d_name.name, dentry->d_name.name, - atomic_read(&dentry->d_count)); - nfs_inc_stats(dir, NFSIOS_SILLYRENAME); - - /* - * We don't allow a dentry to be silly-renamed twice. - */ - error = -EBUSY; - if (dentry->d_flags & DCACHE_NFSFS_RENAMED) - goto out; - - sprintf(silly, ".nfs%*.*Lx", - fileidsize, fileidsize, - (unsigned long long)NFS_FILEID(dentry->d_inode)); - - /* Return delegation in anticipation of the rename */ - nfs_inode_return_delegation(dentry->d_inode); - - sdentry = NULL; - do { - char *suffix = silly + slen - countersize; - - dput(sdentry); - sillycounter++; - sprintf(suffix, "%*.*x", countersize, countersize, sillycounter); - - dfprintk(VFS, "NFS: trying to rename %s to %s\n", - dentry->d_name.name, silly); - - sdentry = lookup_one_len(silly, dentry->d_parent, slen); - /* - * N.B. Better to return EBUSY here ... it could be - * dangerous to delete the file while it's in use. - */ - if (IS_ERR(sdentry)) - goto out; - } while(sdentry->d_inode != NULL); /* need negative lookup */ - - qsilly.name = silly; - qsilly.len = strlen(silly); - if (dentry->d_inode) { - error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, - dir, &qsilly); - nfs_mark_for_revalidate(dentry->d_inode); - } else - error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, - dir, &qsilly); - if (!error) { - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - d_move(dentry, sdentry); - error = nfs_async_unlink(dir, dentry); - /* If we return 0 we don't unlink */ - } - dput(sdentry); -out: - return error; -} - /* * Remove a file after making sure there are no pending writes, * and after checking that the file has only one user. diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 2f84adaad427..c3ce865294f1 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -13,9 +13,12 @@ #include #include #include +#include #include "internal.h" #include "nfs4_fs.h" +#include "iostat.h" +#include "delegation.h" struct nfs_unlinkdata { struct hlist_node list; @@ -244,7 +247,7 @@ void nfs_unblock_sillyrename(struct dentry *dentry) * @dir: parent directory of dentry * @dentry: dentry to unlink */ -int +static int nfs_async_unlink(struct inode *dir, struct dentry *dentry) { struct nfs_unlinkdata *data; @@ -303,3 +306,83 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode) if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data))) nfs_free_unlinkdata(data); } + +/** + * nfs_sillyrename - Perform a silly-rename of a dentry + * @dir: inode of directory that contains dentry + * @dentry: dentry to be sillyrenamed + * + * NFSv2/3 is stateless and the server doesn't know when the client is + * holding a file open. To prevent application problems when a file is + * unlinked while it's still open, the client performs a "silly-rename". + * That is, it renames the file to a hidden file in the same directory, + * and only performs the unlink once the last reference to it is put. + * + * The final cleanup is done during dentry_iput. + */ +int +nfs_sillyrename(struct inode *dir, struct dentry *dentry) +{ + static unsigned int sillycounter; + const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2; + const int countersize = sizeof(sillycounter)*2; + const int slen = sizeof(".nfs")+fileidsize+countersize-1; + char silly[slen+1]; + struct qstr qsilly; + struct dentry *sdentry; + int error = -EIO; + + dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + atomic_read(&dentry->d_count)); + nfs_inc_stats(dir, NFSIOS_SILLYRENAME); + + /* + * We don't allow a dentry to be silly-renamed twice. + */ + error = -EBUSY; + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + goto out; + + sprintf(silly, ".nfs%*.*Lx", + fileidsize, fileidsize, + (unsigned long long)NFS_FILEID(dentry->d_inode)); + + /* Return delegation in anticipation of the rename */ + nfs_inode_return_delegation(dentry->d_inode); + + sdentry = NULL; + do { + char *suffix = silly + slen - countersize; + + dput(sdentry); + sillycounter++; + sprintf(suffix, "%*.*x", countersize, countersize, sillycounter); + + dfprintk(VFS, "NFS: trying to rename %s to %s\n", + dentry->d_name.name, silly); + + sdentry = lookup_one_len(silly, dentry->d_parent, slen); + /* + * N.B. Better to return EBUSY here ... it could be + * dangerous to delete the file while it's in use. + */ + if (IS_ERR(sdentry)) + goto out; + } while (sdentry->d_inode != NULL); /* need negative lookup */ + + qsilly.name = silly; + qsilly.len = strlen(silly); + error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, dir, &qsilly); + if (dentry->d_inode) + nfs_mark_for_revalidate(dentry->d_inode); + if (!error) { + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + d_move(dentry, sdentry); + error = nfs_async_unlink(dir, dentry); + /* If we return 0 we don't unlink */ + } + dput(sdentry); +out: + return error; +} diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 61c89b4ad7c6..d929b1883644 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -485,10 +485,10 @@ extern void nfs_release_automount_timer(void); /* * linux/fs/nfs/unlink.c */ -extern int nfs_async_unlink(struct inode *dir, struct dentry *dentry); extern void nfs_complete_unlink(struct dentry *dentry, struct inode *); extern void nfs_block_sillyrename(struct dentry *dentry); extern void nfs_unblock_sillyrename(struct dentry *dentry); +extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); /* * linux/fs/nfs/write.c From d3d4152a5d59af9e13a73efa9e9c24383fbe307f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 17 Sep 2010 17:31:57 -0400 Subject: [PATCH 21/67] nfs: make sillyrename an async operation A synchronous rename can be interrupted by a SIGKILL. If that happens during a sillyrename operation, it's possible for the rename call to be sent to the server, but the task exits before processing the reply. If this happens, the sillyrenamed file won't get cleaned up during nfs_dentry_iput and the server is left with a dangling .nfs* file hanging around. Fix this problem by turning sillyrename into an asynchronous operation and have the task doing the sillyrename just wait on the reply. If the task is killed before the sillyrename completes, it'll still proceed to completion. Signed-off-by: Jeff Layton Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs3proc.c | 23 +++++ fs/nfs/nfs4proc.c | 30 ++++++ fs/nfs/proc.c | 19 ++++ fs/nfs/unlink.c | 200 +++++++++++++++++++++++++++++++++++++--- include/linux/nfs_xdr.h | 2 + 5 files changed, 263 insertions(+), 11 deletions(-) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index bb41d88e1567..4e9d941ab548 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -438,6 +438,27 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) return 1; } +static void +nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; +} + +static int +nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + struct nfs_renameres *res; + + if (nfs3_async_handle_jukebox(task, old_dir)) + return 0; + res = task->tk_msg.rpc_resp; + + nfs_post_op_update_inode(old_dir, res->old_fattr); + nfs_post_op_update_inode(new_dir, res->new_fattr); + return 1; +} + static int nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, struct inode *new_dir, struct qstr *new_name) @@ -842,6 +863,8 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .unlink_setup = nfs3_proc_unlink_setup, .unlink_done = nfs3_proc_unlink_done, .rename = nfs3_proc_rename, + .rename_setup = nfs3_proc_rename_setup, + .rename_done = nfs3_proc_rename_done, .link = nfs3_proc_link, .symlink = nfs3_proc_symlink, .mkdir = nfs3_proc_mkdir, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a3c21cc4677b..c46e45e9b33f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2566,6 +2566,34 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) return 1; } +static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_renameargs *arg = msg->rpc_argp; + struct nfs_renameres *res = msg->rpc_resp; + + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; + arg->bitmask = server->attr_bitmask; + res->server = server; +} + +static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + struct nfs_renameres *res = task->tk_msg.rpc_resp; + + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; + if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + return 0; + + update_changeattr(old_dir, &res->old_cinfo); + nfs_post_op_update_inode(old_dir, res->old_fattr); + update_changeattr(new_dir, &res->new_cinfo); + nfs_post_op_update_inode(new_dir, res->new_fattr); + return 1; +} + static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, struct inode *new_dir, struct qstr *new_name) { @@ -5338,6 +5366,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .unlink_setup = nfs4_proc_unlink_setup, .unlink_done = nfs4_proc_unlink_done, .rename = nfs4_proc_rename, + .rename_setup = nfs4_proc_rename_setup, + .rename_done = nfs4_proc_rename_done, .link = nfs4_proc_link, .symlink = nfs4_proc_symlink, .mkdir = nfs4_proc_mkdir, diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 0aff10c3bbce..e5e84aa2af17 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -365,6 +365,23 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) return 1; } +static void +nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME]; +} + +static int +nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + if (nfs_async_handle_expired_key(task)) + return 0; + nfs_mark_for_revalidate(old_dir); + nfs_mark_for_revalidate(new_dir); + return 1; +} + static int nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, struct inode *new_dir, struct qstr *new_name) @@ -703,6 +720,8 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .unlink_setup = nfs_proc_unlink_setup, .unlink_done = nfs_proc_unlink_done, .rename = nfs_proc_rename, + .rename_setup = nfs_proc_rename_setup, + .rename_done = nfs_proc_rename_done, .link = nfs_proc_link, .symlink = nfs_proc_symlink, .mkdir = nfs_proc_mkdir, diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index c3ce865294f1..698b3e6367ff 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -307,6 +307,174 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode) nfs_free_unlinkdata(data); } +/* Cancel a queued async unlink. Called when a sillyrename run fails. */ +static void +nfs_cancel_async_unlink(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + struct nfs_unlinkdata *data = dentry->d_fsdata; + + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + spin_unlock(&dentry->d_lock); + nfs_free_unlinkdata(data); + return; + } + spin_unlock(&dentry->d_lock); +} + +struct nfs_renamedata { + struct nfs_renameargs args; + struct nfs_renameres res; + struct rpc_cred *cred; + struct inode *old_dir; + struct dentry *old_dentry; + struct nfs_fattr old_fattr; + struct inode *new_dir; + struct dentry *new_dentry; + struct nfs_fattr new_fattr; +}; + +/** + * nfs_async_rename_done - Sillyrename post-processing + * @task: rpc_task of the sillyrename + * @calldata: nfs_renamedata for the sillyrename + * + * Do the directory attribute updates and the d_move + */ +static void nfs_async_rename_done(struct rpc_task *task, void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct inode *old_dir = data->old_dir; + struct inode *new_dir = data->new_dir; + + if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { + nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); + return; + } + + if (task->tk_status != 0) { + nfs_cancel_async_unlink(data->old_dentry); + return; + } + + nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir)); + d_move(data->old_dentry, data->new_dentry); +} + +/** + * nfs_async_rename_release - Release the sillyrename data. + * @calldata: the struct nfs_renamedata to be released + */ +static void nfs_async_rename_release(void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct super_block *sb = data->old_dir->i_sb; + + if (data->old_dentry->d_inode) + nfs_mark_for_revalidate(data->old_dentry->d_inode); + + dput(data->old_dentry); + dput(data->new_dentry); + iput(data->old_dir); + iput(data->new_dir); + nfs_sb_deactive(sb); + put_rpccred(data->cred); + kfree(data); +} + +#if defined(CONFIG_NFS_V4_1) +static void nfs_rename_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->old_dir); + + if (nfs4_setup_sequence(server, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} +#endif /* CONFIG_NFS_V4_1 */ + +static const struct rpc_call_ops nfs_rename_ops = { + .rpc_call_done = nfs_async_rename_done, + .rpc_release = nfs_async_rename_release, +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_rename_prepare, +#endif /* CONFIG_NFS_V4_1 */ +}; + +/** + * nfs_async_rename - perform an asynchronous rename operation + * @old_dir: directory that currently holds the dentry to be renamed + * @new_dir: target directory for the rename + * @old_dentry: original dentry to be renamed + * @new_dentry: dentry to which the old_dentry should be renamed + * + * It's expected that valid references to the dentries and inodes are held + */ +static struct rpc_task * +nfs_async_rename(struct inode *old_dir, struct inode *new_dir, + struct dentry *old_dentry, struct dentry *new_dentry) +{ + struct nfs_renamedata *data; + struct rpc_message msg = { }; + struct rpc_task_setup task_setup_data = { + .rpc_message = &msg, + .callback_ops = &nfs_rename_ops, + .workqueue = nfsiod_workqueue, + .rpc_client = NFS_CLIENT(old_dir), + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return ERR_PTR(-ENOMEM); + task_setup_data.callback_data = data, + + data->cred = rpc_lookup_cred(); + if (IS_ERR(data->cred)) { + task = (struct rpc_task *)data->cred; + kfree(data); + return task; + } + + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + msg.rpc_cred = data->cred; + + /* set up nfs_renamedata */ + data->old_dir = old_dir; + atomic_inc(&old_dir->i_count); + data->new_dir = new_dir; + atomic_inc(&new_dir->i_count); + data->old_dentry = dget(old_dentry); + data->new_dentry = dget(new_dentry); + nfs_fattr_init(&data->old_fattr); + nfs_fattr_init(&data->new_fattr); + + /* set up nfs_renameargs */ + data->args.old_dir = NFS_FH(old_dir); + data->args.old_name = &old_dentry->d_name; + data->args.new_dir = NFS_FH(new_dir); + data->args.new_name = &new_dentry->d_name; + + /* set up nfs_renameres */ + data->res.old_fattr = &data->old_fattr; + data->res.new_fattr = &data->new_fattr; + + nfs_sb_active(old_dir->i_sb); + + NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + nfs_async_rename_release(data); + + return task; +} + /** * nfs_sillyrename - Perform a silly-rename of a dentry * @dir: inode of directory that contains dentry @@ -328,8 +496,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) const int countersize = sizeof(sillycounter)*2; const int slen = sizeof(".nfs")+fileidsize+countersize-1; char silly[slen+1]; - struct qstr qsilly; struct dentry *sdentry; + struct rpc_task *task; int error = -EIO; dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", @@ -371,17 +539,27 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) goto out; } while (sdentry->d_inode != NULL); /* need negative lookup */ - qsilly.name = silly; - qsilly.len = strlen(silly); - error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, dir, &qsilly); - if (dentry->d_inode) - nfs_mark_for_revalidate(dentry->d_inode); - if (!error) { - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - d_move(dentry, sdentry); - error = nfs_async_unlink(dir, dentry); - /* If we return 0 we don't unlink */ + /* queue unlink first. Can't do this from rpc_release as it + * has to allocate memory + */ + error = nfs_async_unlink(dir, dentry); + if (error) + goto out_dput; + + /* run the rename task, undo unlink if it fails */ + task = nfs_async_rename(dir, dir, dentry, sdentry); + if (IS_ERR(task)) { + error = -EBUSY; + nfs_cancel_async_unlink(dentry); + goto out_dput; } + + /* wait for the RPC task to complete, unless a SIGKILL intervenes */ + error = rpc_wait_for_completion_task(task); + if (error == 0) + error = task->tk_status; + rpc_put_task(task); +out_dput: dput(sdentry); out: return error; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9ad132e13d12..172df83ac54b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1018,6 +1018,8 @@ struct nfs_rpc_ops { int (*unlink_done) (struct rpc_task *, struct inode *); int (*rename) (struct inode *, struct qstr *, struct inode *, struct qstr *); + void (*rename_setup) (struct rpc_message *msg, struct inode *dir); + int (*rename_done) (struct rpc_task *task, struct inode *old_dir, struct inode *new_dir); int (*link) (struct inode *, struct inode *, struct qstr *); int (*symlink) (struct inode *, struct dentry *, struct page *, unsigned int, struct iattr *); From f7732d6573c4f29fc1ca5d384bbf82ddfa115030 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 21 Sep 2010 16:52:40 -0400 Subject: [PATCH 22/67] NFS: Fix a use-after-free case in nfs_async_rename() The call to nfs_async_rename_release() after rpc_run_task() is incorrect. The rpc_run_task() is always guaranteed to call the ->rpc_release() method. Signed-off-by: Trond Myklebust --- fs/nfs/unlink.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 698b3e6367ff..47530aacebfd 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -426,7 +426,6 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, .rpc_client = NFS_CLIENT(old_dir), .flags = RPC_TASK_ASYNC, }; - struct rpc_task *task; data = kmalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) @@ -435,7 +434,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, data->cred = rpc_lookup_cred(); if (IS_ERR(data->cred)) { - task = (struct rpc_task *)data->cred; + struct rpc_task *task = ERR_CAST(data->cred); kfree(data); return task; } @@ -468,11 +467,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir); - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - nfs_async_rename_release(data); - - return task; + return rpc_run_task(&task_setup_data); } /** From d688e11007419fd060ae74d8d952a5c4ece735aa Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 21 Sep 2010 16:52:40 -0400 Subject: [PATCH 23/67] NFSv4.1: Fix the slotid initialisation in nfs_async_rename() This fixes an Oopsable condition that was introduced by commit d3d4152a5d59af9e13a73efa9e9c24383fbe307f (nfs: make sillyrename an async operation) Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c46e45e9b33f..72aa7067b85d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2550,6 +2550,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) args->bitmask = server->cache_consistency_bitmask; res->server = server; + res->seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; } @@ -2575,6 +2576,7 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; arg->bitmask = server->attr_bitmask; res->server = server; + res->seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; } static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, From 4fbf6e507888da902b02a3c4f5f493fab1071312 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 21 Sep 2010 16:54:34 -0400 Subject: [PATCH 24/67] SUNRPC: Convert rpciod to use the alloc_workqueue() interface create_workqueue() is a deprecated function. Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index cace6049e4a5..2b08c3d2f4db 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -908,7 +908,7 @@ static int rpciod_start(void) * Create the rpciod thread and wait for it to start. */ dprintk("RPC: creating workqueue rpciod\n"); - wq = create_workqueue("rpciod"); + wq = alloc_workqueue("rpciod", WQ_RESCUER, 0); rpciod_workqueue = wq; return rpciod_workqueue != NULL; } From 609588928fae2c977c887d6d31b1f0aae60ea09e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 21 Sep 2010 16:55:31 -0400 Subject: [PATCH 25/67] NFS: Convert nfsiod to use alloc_workqueue() create_singlethread_workqueue() is deprecated. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2f9266406afc..2ff814272dcf 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1493,7 +1493,7 @@ static int nfsiod_start(void) { struct workqueue_struct *wq; dprintk("RPC: creating workqueue nfsiod\n"); - wq = create_singlethread_workqueue("nfsiod"); + wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0); if (wq == NULL) return -ENOMEM; nfsiod_workqueue = wq; From d141d97437a3c84aa18cfd5c8d91b89c4173f25c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Sep 2010 16:55:47 -0400 Subject: [PATCH 26/67] NFS: Fix NFSv3 debugging messages in fs/nfs/nfs3proc.c Clean up. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs3proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 4e9d941ab548..f8446b38dcb4 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -671,7 +671,8 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, nfs_free_fattr(res.dir_attr); out: - dprintk("NFS reply readdir: %d\n", status); + dprintk("NFS reply readdir%s: %d\n", + plus? "plus" : "", status); return status; } @@ -741,7 +742,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("NFS call fsstat\n"); nfs_fattr_init(stat->fattr); status = rpc_call_sync(server->client, &msg, 0); - dprintk("NFS reply statfs: %d\n", status); + dprintk("NFS reply fsstat: %d\n", status); return status; } From 38359352fcb0d776b562a9e0ed4f0d355d5a332e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Sep 2010 16:55:48 -0400 Subject: [PATCH 27/67] SUNRPC: Correct an rpcbind debugging message Clean up. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index c961094ebc62..63ec116b4dd4 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -211,8 +211,9 @@ static int rpcb_create_local(void) */ clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4); if (IS_ERR(clnt4)) { - dprintk("RPC: failed to create local rpcbind v4 " - "cleint (errno %ld).\n", PTR_ERR(clnt4)); + dprintk("RPC: failed to bind second program to " + "rpcbind v4 client (errno %ld).\n", + PTR_ERR(clnt4)); clnt4 = NULL; } From b4687da7fc5f741af7fee9b0248a2cf2ad9c4478 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Sep 2010 16:55:48 -0400 Subject: [PATCH 28/67] SUNRPC: Refactor logic to NUL-terminate strings in pages Clean up: Introduce a helper to '\0'-terminate XDR strings that are placed in a page in the page cache. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs2xdr.c | 6 +----- fs/nfs/nfs3xdr.c | 6 +----- fs/nfs/nfs4xdr.c | 5 +---- include/linux/sunrpc/xdr.h | 1 + net/sunrpc/xdr.c | 17 +++++++++++++++++ 5 files changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index e15bc0306d0c..79c74387a2fe 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -596,7 +596,6 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy) struct kvec *iov = rcvbuf->head; size_t hdrlen; u32 len, recvd; - char *kaddr; int status; if ((status = ntohl(*p++))) @@ -623,10 +622,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy) return -EIO; } - /* NULL terminate the string we got */ - kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0); - kaddr[len+rcvbuf->page_base] = '\0'; - kunmap_atomic(kaddr, KM_USER0); + xdr_terminate_string(rcvbuf, len); return 0; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 89c40f8ec6e6..52b2fda66e63 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -824,7 +824,6 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) struct kvec *iov = rcvbuf->head; size_t hdrlen; u32 len, recvd; - char *kaddr; int status; status = ntohl(*p++); @@ -857,10 +856,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) return -EIO; } - /* NULL terminate the string we got */ - kaddr = (char*)kmap_atomic(rcvbuf->pages[0], KM_USER0); - kaddr[len+rcvbuf->page_base] = '\0'; - kunmap_atomic(kaddr, KM_USER0); + xdr_terminate_string(rcvbuf, len); return 0; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index b0bd7efe8100..86ab69eb149c 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4299,7 +4299,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) size_t hdrlen; u32 len, recvd; __be32 *p; - char *kaddr; int status; status = decode_op_hdr(xdr, OP_READLINK); @@ -4330,9 +4329,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) * and and null-terminate the text (the VFS expects * null-termination). */ - kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0); - kaddr[len+rcvbuf->page_base] = '\0'; - kunmap_atomic(kaddr, KM_USER0); + xdr_terminate_string(rcvbuf, len); return 0; out_overflow: print_overflow_msg(__func__, xdr); diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 35cf2e8cd7c6..8c1dcbb54d89 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -108,6 +108,7 @@ void xdr_encode_pages(struct xdr_buf *, struct page **, unsigned int, unsigned int); void xdr_inline_pages(struct xdr_buf *, unsigned int, struct page **, unsigned int, unsigned int); +void xdr_terminate_string(struct xdr_buf *, const u32); static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int len) { diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 3bbef7f5f826..e0725d9d8107 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -111,6 +111,23 @@ xdr_decode_string_inplace(__be32 *p, char **sp, } EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); +/** + * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf + * @buf: XDR buffer where string resides + * @len: length of string, in bytes + * + */ +void +xdr_terminate_string(struct xdr_buf *buf, const u32 len) +{ + char *kaddr; + + kaddr = kmap_atomic(buf->pages[0], KM_USER0); + kaddr[buf->page_base + len] = '\0'; + kunmap_atomic(kaddr, KM_USER0); +} +EXPORT_SYMBOL(xdr_terminate_string); + void xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base, unsigned int len) From 63185942c5f138c62de16b4cbc7eee494a58fea8 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 22 Sep 2010 09:50:35 -0400 Subject: [PATCH 29/67] lockd: Remove BKL from the client This patch removes all calls to lock_kernel() from the client. This patch should be applied after the "fs/lock.c prepare for BKL removal" patch submitted by Arnd Bergmann on September 18. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/lockd/clntlock.c | 15 ++++++++++----- fs/lockd/clntproc.c | 13 ++++--------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 64fd427c993c..d5bb86866e6c 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -42,6 +42,7 @@ struct nlm_wait { }; static LIST_HEAD(nlm_blocked); +static DEFINE_SPINLOCK(nlm_blocked_lock); /** * nlmclnt_init - Set up per-NFS mount point lockd data structures @@ -97,7 +98,10 @@ struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock * block->b_lock = fl; init_waitqueue_head(&block->b_wait); block->b_status = nlm_lck_blocked; + + spin_lock(&nlm_blocked_lock); list_add(&block->b_list, &nlm_blocked); + spin_unlock(&nlm_blocked_lock); } return block; } @@ -106,7 +110,9 @@ void nlmclnt_finish_block(struct nlm_wait *block) { if (block == NULL) return; + spin_lock(&nlm_blocked_lock); list_del(&block->b_list); + spin_unlock(&nlm_blocked_lock); kfree(block); } @@ -154,6 +160,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) * Look up blocked request based on arguments. * Warning: must not use cookie to match it! */ + spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { struct file_lock *fl_blocked = block->b_lock; @@ -178,6 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) wake_up(&block->b_wait); res = nlm_granted; } + spin_unlock(&nlm_blocked_lock); return res; } @@ -216,10 +224,6 @@ reclaimer(void *ptr) allow_signal(SIGKILL); down_write(&host->h_rwsem); - - /* This one ensures that our parent doesn't terminate while the - * reclaim is in progress */ - lock_kernel(); lockd_up(); /* note: this cannot fail as lockd is already running */ dprintk("lockd: reclaiming locks for host %s\n", host->h_name); @@ -260,16 +264,17 @@ restart: dprintk("NLM: done reclaiming locks for host %s\n", host->h_name); /* Now, wake up all processes that sleep on a blocked lock */ + spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { if (block->b_host == host) { block->b_status = nlm_lck_denied_grace_period; wake_up(&block->b_wait); } } + spin_unlock(&nlm_blocked_lock); /* Release host handle after use */ nlm_release_host(host); lockd_down(); - unlock_kernel(); return 0; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 7932c399fab4..47ea1e1925b8 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -166,7 +166,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); - lock_kernel(); if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { if (fl->fl_type != F_UNLCK) { call->a_args.block = IS_SETLKW(cmd) ? 1 : 0; @@ -177,10 +176,8 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) status = nlmclnt_test(call, fl); else status = -EINVAL; - fl->fl_ops->fl_release_private(fl); fl->fl_ops = NULL; - unlock_kernel(); dprintk("lockd: clnt proc returns %d\n", status); return status; @@ -226,9 +223,7 @@ void nlm_release_call(struct nlm_rqst *call) static void nlmclnt_rpc_release(void *data) { - lock_kernel(); nlm_release_call(data); - unlock_kernel(); } static int nlm_wait_on_grace(wait_queue_head_t *queue) @@ -448,14 +443,18 @@ out: static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl) { + spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock); new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state; new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner); list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted); + spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock); } static void nlmclnt_locks_release_private(struct file_lock *fl) { + spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock); list_del(&fl->fl_u.nfs_fl.list); + spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock); nlm_put_lockowner(fl->fl_u.nfs_fl.owner); } @@ -721,9 +720,7 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) die: return; retry_rebind: - lock_kernel(); nlm_rebind_host(req->a_host); - unlock_kernel(); retry_unlock: rpc_restart_call(task); } @@ -801,9 +798,7 @@ retry_cancel: /* Don't ever retry more than 3 times */ if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) goto die; - lock_kernel(); nlm_rebind_host(req->a_host); - unlock_kernel(); rpc_restart_call(task); rpc_delay(task, 30 * HZ); } From 5eebde23223aeb0ad2d9e3be6590ff8bbfab0fc2 Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Thu, 23 Sep 2010 08:55:58 -0400 Subject: [PATCH 30/67] nfs: introduce mount option '-olocal_lock' to make locks local NFS clients since 2.6.12 support flock locks by emulating fcntl byte-range locks. Due to this, some windows applications which seem to use both flock (share mode lock mapped as flock by Samba) and fcntl locks sequentially on the same file, can't lock as they falsely assume the file is already locked. The problem was reported on a setup with windows clients accessing excel files on a Samba exported share which is originally a NFS mount from a NetApp filer. Older NFS clients (< 2.6.12) did not see this problem as flock locks were considered local. To support legacy flock behavior, this patch adds a mount option "-olocal_lock=" which can take the following values: 'none' - Neither flock locks nor POSIX locks are local 'flock' - flock locks are local 'posix' - fcntl/POSIX locks are local 'all' - Both flock locks and POSIX locks are local Testing: - This patch was tested by using -olocal_lock option with different values and the NLM calls were noted from the network packet captured. 'none' - NLM calls were seen during both flock() and fcntl(), flock lock was granted, fcntl was denied 'flock' - no NLM calls for flock(), NLM call was seen for fcntl(), granted 'posix' - NLM call was seen for flock() - granted, no NLM call for fcntl() 'all' - no NLM calls were seen during both flock() and fcntl() - No bugs were seen during NFSv4 locking/unlocking in general and NFSv4 reboot recovery. Cc: Neil Brown Signed-off-by: Suresh Jayaraman Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 6 ++-- fs/nfs/file.c | 45 ++++++++++++++++++++--------- fs/nfs/super.c | 59 ++++++++++++++++++++++++++++++++++++++- include/linux/nfs_mount.h | 3 ++ 4 files changed, 97 insertions(+), 16 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 4e7df2adb212..5f01f42b3991 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -635,7 +635,8 @@ static int nfs_create_rpc_client(struct nfs_client *clp, */ static void nfs_destroy_server(struct nfs_server *server) { - if (!(server->flags & NFS_MOUNT_NONLM)) + if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) || + !(server->flags & NFS_MOUNT_LOCAL_FCNTL)) nlmclnt_done(server->nlm_host); } @@ -657,7 +658,8 @@ static int nfs_start_lockd(struct nfs_server *server) if (nlm_init.nfs_version > 3) return 0; - if (server->flags & NFS_MOUNT_NONLM) + if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) && + (server->flags & NFS_MOUNT_LOCAL_FCNTL)) return 0; switch (clp->cl_proto) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index eb51bd6201da..59cbe1ba0511 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -684,7 +684,8 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, return ret; } -static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) +static int +do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status = 0; @@ -699,7 +700,7 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) if (nfs_have_delegation(inode, FMODE_READ)) goto out_noconflict; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) + if (is_local) goto out_noconflict; status = NFS_PROTO(inode)->lock(filp, cmd, fl); @@ -730,7 +731,8 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl) return res; } -static int do_unlk(struct file *filp, int cmd, struct file_lock *fl) +static int +do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status; @@ -745,15 +747,19 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl) * If we're signalled while cleaning up locks on process exit, we * still need to complete the unlock. */ - /* Use local locking if mounted with "-onolock" */ - if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); else status = do_vfs_lock(filp, fl); return status; } -static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) +static int +do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status; @@ -766,8 +772,11 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) if (status != 0) goto out; - /* Use local locking if mounted with "-onolock" */ - if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); else status = do_vfs_lock(filp, fl); @@ -791,6 +800,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = filp->f_mapping->host; int ret = -ENOLCK; + int is_local = 0; dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", filp->f_path.dentry->d_parent->d_name.name, @@ -804,6 +814,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) goto out_err; + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) + is_local = 1; + if (NFS_PROTO(inode)->lock_check_bounds != NULL) { ret = NFS_PROTO(inode)->lock_check_bounds(fl); if (ret < 0) @@ -811,11 +824,11 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) } if (IS_GETLK(cmd)) - ret = do_getlk(filp, cmd, fl); + ret = do_getlk(filp, cmd, fl, is_local); else if (fl->fl_type == F_UNLCK) - ret = do_unlk(filp, cmd, fl); + ret = do_unlk(filp, cmd, fl, is_local); else - ret = do_setlk(filp, cmd, fl); + ret = do_setlk(filp, cmd, fl, is_local); out_err: return ret; } @@ -825,6 +838,9 @@ out_err: */ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { + struct inode *inode = filp->f_mapping->host; + int is_local = 0; + dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name, @@ -833,14 +849,17 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) + is_local = 1; + /* We're simulating flock() locks using posix locks on the server */ fl->fl_owner = (fl_owner_t)filp; fl->fl_start = 0; fl->fl_end = OFFSET_MAX; if (fl->fl_type == F_UNLCK) - return do_unlk(filp, cmd, fl); - return do_setlk(filp, cmd, fl); + return do_unlk(filp, cmd, fl, is_local); + return do_setlk(filp, cmd, fl, is_local); } /* diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ec3966e4706b..2e866d86c220 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -100,6 +100,7 @@ enum { Opt_addr, Opt_mountaddr, Opt_clientaddr, Opt_lookupcache, Opt_fscache_uniq, + Opt_local_lock, /* Special mount options */ Opt_userspace, Opt_deprecated, Opt_sloppy, @@ -171,6 +172,7 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_lookupcache, "lookupcache=%s" }, { Opt_fscache_uniq, "fsc=%s" }, + { Opt_local_lock, "local_lock=%s" }, { Opt_err, NULL } }; @@ -236,6 +238,22 @@ static match_table_t nfs_lookupcache_tokens = { { Opt_lookupcache_err, NULL } }; +enum { + Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix, + Opt_local_lock_none, + + Opt_local_lock_err +}; + +static match_table_t nfs_local_lock_tokens = { + { Opt_local_lock_all, "all" }, + { Opt_local_lock_flock, "flock" }, + { Opt_local_lock_posix, "posix" }, + { Opt_local_lock_none, "none" }, + + { Opt_local_lock_err, NULL } +}; + static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct dentry *, struct kstatfs *); @@ -1009,9 +1027,13 @@ static int nfs_parse_mount_options(char *raw, break; case Opt_lock: mnt->flags &= ~NFS_MOUNT_NONLM; + mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); break; case Opt_nolock: mnt->flags |= NFS_MOUNT_NONLM; + mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); break; case Opt_v2: mnt->flags &= ~NFS_MOUNT_VER3; @@ -1412,6 +1434,34 @@ static int nfs_parse_mount_options(char *raw, mnt->fscache_uniq = string; mnt->options |= NFS_OPTION_FSCACHE; break; + case Opt_local_lock: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, nfs_local_lock_tokens, + args); + kfree(string); + switch (token) { + case Opt_local_lock_all: + mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + case Opt_local_lock_flock: + mnt->flags |= NFS_MOUNT_LOCAL_FLOCK; + break; + case Opt_local_lock_posix: + mnt->flags |= NFS_MOUNT_LOCAL_FCNTL; + break; + case Opt_local_lock_none: + mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + default: + dfprintk(MOUNT, "NFS: invalid " + "local_lock argument\n"); + return 0; + }; + break; /* * Special options @@ -1817,6 +1867,12 @@ static int nfs_validate_mount_data(void *options, if (!args->nfs_server.hostname) goto out_nomem; + if (!(data->flags & NFS_MOUNT_NONLM)) + args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); + else + args->flags |= (NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); /* * The legacy version 6 binary mount data from userspace has a * field used only to transport selinux information into the @@ -2433,7 +2489,8 @@ static void nfs4_fill_super(struct super_block *sb) static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) { - args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3); + args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3| + NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL); } static int nfs4_validate_text_mount_data(void *options, diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h index 5d59ae861aa6..576bddd72e04 100644 --- a/include/linux/nfs_mount.h +++ b/include/linux/nfs_mount.h @@ -71,4 +71,7 @@ struct nfs_mount_data { #define NFS_MOUNT_NORESVPORT 0x40000 #define NFS_MOUNT_LEGACY_INTERFACE 0x80000 +#define NFS_MOUNT_LOCAL_FLOCK 0x100000 +#define NFS_MOUNT_LOCAL_FCNTL 0x200000 + #endif From ef84303ebc77a9041265faaccd56b7fcef151077 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Thu, 23 Sep 2010 12:22:09 -0400 Subject: [PATCH 31/67] NFS: handle inode==NULL in __put_nfs_open_context inode may be NULL when put_nfs_open_context is called from nfs_atomic_lookup before d_add_unique(dentry, inode) Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2ff814272dcf..702ed096e790 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -654,11 +654,14 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { struct inode *inode = ctx->path.dentry->d_inode; - if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + if (inode) { + if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; + list_del(&ctx->list); + spin_unlock(&inode->i_lock); + NFS_PROTO(inode)->close_context(ctx, is_sync); + } else if (!atomic_dec_and_test(&ctx->lock_context.count)) return; - list_del(&ctx->list); - spin_unlock(&inode->i_lock); - NFS_PROTO(inode)->close_context(ctx, is_sync); if (ctx->cred != NULL) put_rpccred(ctx->cred); path_put(&ctx->path); From 7c563cc9f3f4aca70c27bd08a135499227f67014 Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Thu, 23 Sep 2010 14:26:48 -0400 Subject: [PATCH 32/67] nfs: show "local_lock" mount option in /proc/mounts Display the status of 'local_lock' mount option in /proc/mounts. Signed-off-by: Suresh Jayaraman Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2e866d86c220..50ed035cabc4 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -632,6 +632,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, const struct proc_nfs_info *nfs_infop; struct nfs_client *clp = nfss->nfs_client; u32 version = clp->rpc_ops->version; + int local_flock, local_fcntl; seq_printf(m, ",vers=%u", version); seq_printf(m, ",rsize=%u", nfss->rsize); @@ -680,6 +681,18 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, else seq_printf(m, ",lookupcache=pos"); } + + local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK; + local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL; + + if (!local_flock && !local_fcntl) + seq_printf(m, ",local_lock=none"); + else if (local_flock && local_fcntl) + seq_printf(m, ",local_lock=all"); + else if (local_flock) + seq_printf(m, ",local_lock=flock"); + else + seq_printf(m, ",local_lock=posix"); } /* From dfb4f309830359352539919f23accc59a20a3758 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 24 Sep 2010 09:17:01 -0400 Subject: [PATCH 33/67] NFSv4.1: keep seq_res.sr_slot as pointer rather than an index Having to explicitly initialize sr_slotid to NFS4_MAX_SLOT_TABLE resulted in numerous bugs. Keeping the current slot as a pointer to the slot table is more straight forward and robust as it's implicitly set up to NULL wherever the seq_res member is initialized to zeroes. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 52 +++++++++++++++++------------------------ fs/nfs/nfs4xdr.c | 6 ++--- fs/nfs/read.c | 1 - fs/nfs/unlink.c | 3 +-- fs/nfs/write.c | 2 -- include/linux/nfs_xdr.h | 2 +- 6 files changed, 25 insertions(+), 41 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 72aa7067b85d..aee1bcc1fcc9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -334,10 +334,12 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp * Must be called while holding tbl->slot_tbl_lock */ static void -nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) +nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot) { + int free_slotid = free_slot - tbl->slots; int slotid = free_slotid; + BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE); /* clear used bit in bitmap */ __clear_bit(slotid, tbl->used_slots); @@ -379,7 +381,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) struct nfs4_slot_table *tbl; tbl = &res->sr_session->fc_slot_table; - if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { + if (!res->sr_slot) { /* just wake up the next guy waiting since * we may have not consumed a slot after all */ dprintk("%s: No slot\n", __func__); @@ -387,17 +389,15 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) } spin_lock(&tbl->slot_tbl_lock); - nfs4_free_slot(tbl, res->sr_slotid); + nfs4_free_slot(tbl, res->sr_slot); nfs41_check_drain_session_complete(res->sr_session); spin_unlock(&tbl->slot_tbl_lock); - res->sr_slotid = NFS4_MAX_SLOT_TABLE; + res->sr_slot = NULL; } static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { unsigned long timestamp; - struct nfs4_slot_table *tbl; - struct nfs4_slot *slot; struct nfs_client *clp; /* @@ -410,17 +410,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * res->sr_status = NFS_OK; /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */ - if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) + if (!res->sr_slot) goto out; - tbl = &res->sr_session->fc_slot_table; - slot = tbl->slots + res->sr_slotid; - /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: /* Update the slot's sequence and clientid lease timer */ - ++slot->seq_nr; + ++res->sr_slot->seq_nr; timestamp = res->sr_renewal_time; clp = res->sr_session->clp; do_renew_lease(clp, timestamp); @@ -433,12 +430,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * * returned NFS4ERR_DELAY as per Section 2.10.6.2 * of RFC5661. */ - dprintk("%s: slot=%d seq=%d: Operation in progress\n", - __func__, res->sr_slotid, slot->seq_nr); + dprintk("%s: slot=%ld seq=%d: Operation in progress\n", + __func__, + res->sr_slot - res->sr_session->fc_slot_table.slots, + res->sr_slot->seq_nr); goto out_retry; default: /* Just update the slot sequence no. */ - ++slot->seq_nr; + ++res->sr_slot->seq_nr; } out: /* The session may be reset by one of the error handlers. */ @@ -505,10 +504,9 @@ static int nfs41_setup_sequence(struct nfs4_session *session, dprintk("--> %s\n", __func__); /* slot already allocated? */ - if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) + if (res->sr_slot != NULL) return 0; - res->sr_slotid = NFS4_MAX_SLOT_TABLE; tbl = &session->fc_slot_table; spin_lock(&tbl->slot_tbl_lock); @@ -550,7 +548,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session, dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); res->sr_session = session; - res->sr_slotid = slotid; + res->sr_slot = slot; res->sr_renewal_time = jiffies; res->sr_status_flags = 0; /* @@ -576,8 +574,9 @@ int nfs4_setup_sequence(const struct nfs_server *server, goto out; } - dprintk("--> %s clp %p session %p sr_slotid %d\n", - __func__, session->clp, session, res->sr_slotid); + dprintk("--> %s clp %p session %p sr_slot %ld\n", + __func__, session->clp, session, res->sr_slot ? + res->sr_slot - session->fc_slot_table.slots : -1); ret = nfs41_setup_sequence(session, args, res, cache_reply, task); @@ -650,7 +649,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server, .callback_data = &data }; - res->sr_slotid = NFS4_MAX_SLOT_TABLE; + res->sr_slot = NULL; if (privileged) task_setup.callback_ops = &nfs41_call_priv_sync_ops; task = rpc_run_task(&task_setup); @@ -735,7 +734,6 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) p->o_res.server = p->o_arg.server; nfs_fattr_init(&p->f_attr); nfs_fattr_init(&p->dir_attr); - p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; } static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, @@ -1975,7 +1973,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; - calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; path_get(path); calldata->path = *path; @@ -2550,7 +2547,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) args->bitmask = server->cache_consistency_bitmask; res->server = server; - res->seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; + res->seq_res.sr_slot = NULL; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; } @@ -2576,7 +2573,6 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; arg->bitmask = server->attr_bitmask; res->server = server; - res->seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; } static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, @@ -3646,7 +3642,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co memcpy(&data->stateid, stateid, sizeof(data->stateid)); data->res.fattr = &data->fattr; data->res.server = server; - data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; nfs_fattr_init(data->res.fattr); data->timestamp = jiffies; data->rpc_status = 0; @@ -3799,7 +3794,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->arg.fl = &p->fl; p->arg.seqid = seqid; p->res.seqid = seqid; - p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; p->arg.stateid = &lsp->ls_stateid; p->lsp = lsp; atomic_inc(&lsp->ls_count); @@ -3979,7 +3973,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; p->arg.lock_owner.id = lsp->ls_id.id; p->res.lock_seqid = p->arg.lock_seqid; - p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; p->lsp = lsp; p->server = server; atomic_inc(&lsp->ls_count); @@ -4612,7 +4605,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) }; int status; - res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; dprintk("--> %s\n", __func__); task = rpc_run_task(&task_setup); @@ -5105,12 +5097,11 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_ if (!atomic_inc_not_zero(&clp->cl_count)) return ERR_PTR(-EIO); - calldata = kmalloc(sizeof(*calldata), GFP_NOFS); + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); if (calldata == NULL) { nfs_put_client(clp); return ERR_PTR(-ENOMEM); } - calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE; msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; calldata->clp = clp; @@ -5242,7 +5233,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp) goto out; calldata->clp = clp; calldata->arg.one_fs = 0; - calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 86ab69eb149c..3feace66b981 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4665,7 +4665,6 @@ static int decode_sequence(struct xdr_stream *xdr, struct rpc_rqst *rqstp) { #if defined(CONFIG_NFS_V4_1) - struct nfs4_slot *slot; struct nfs4_sessionid id; u32 dummy; int status; @@ -4697,15 +4696,14 @@ static int decode_sequence(struct xdr_stream *xdr, goto out_overflow; /* seqid */ - slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; dummy = be32_to_cpup(p++); - if (dummy != slot->seq_nr) { + if (dummy != res->sr_slot->seq_nr) { dprintk("%s Invalid sequence number\n", __func__); goto out_err; } /* slot id */ dummy = be32_to_cpup(p++); - if (dummy != res->sr_slotid) { + if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) { dprintk("%s Invalid slot id\n", __func__); goto out_err; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 87adc2744246..79859c81a943 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -46,7 +46,6 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); p->npages = pagecount; - p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; if (pagecount <= ARRAY_SIZE(p->page_array)) p->pagevec = p->page_array; else { diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 47530aacebfd..9a16bad5d2ea 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -262,7 +262,6 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) status = PTR_ERR(data->cred); goto out_free; } - data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; data->res.dir_attr = &data->dir_attr; status = -EBUSY; @@ -427,7 +426,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, .flags = RPC_TASK_ASYNC, }; - data = kmalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return ERR_PTR(-ENOMEM); task_setup_data.callback_data = data, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 874972d9427c..4d6d35dd2b4b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -55,7 +55,6 @@ struct nfs_write_data *nfs_commitdata_alloc(void) if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); - p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; } return p; } @@ -75,7 +74,6 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); p->npages = pagecount; - p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; if (pagecount <= ARRAY_SIZE(p->page_array)) p->pagevec = p->page_array; else { diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 172df83ac54b..5772b2c2f063 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -170,7 +170,7 @@ struct nfs4_sequence_args { struct nfs4_sequence_res { struct nfs4_session *sr_session; - u8 sr_slotid; /* slot used to send request */ + struct nfs4_slot *sr_slot; /* slot used to send request */ int sr_status; /* sequence operation status */ unsigned long sr_renewal_time; u32 sr_status_flags; From 5c78f58e2d5cef65c255a556184f1f43c8d84c84 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 27 Sep 2010 15:51:20 -0400 Subject: [PATCH 34/67] NFS: Really fix put_nfs_open_context() In nfs_open_revalidate(), if the open_context() call returns an inode that is not the same as dentry->d_inode, then we will call put_nfs_open_context() with a valid dentry->d_inode, but without the context being part of the nfsi->open_files list. In this case too, we want to just skip the list removal, but we do want to call the ->close_context() callback in order to close the NFSv4 state. Signed-off-by: Trond Myklebust Acked-by: Jeff Layton --- fs/nfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 702ed096e790..18be041abd23 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -639,6 +639,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cr ctx->dir_cookie = 0; nfs_init_lock_context(&ctx->lock_context); ctx->lock_context.open_context = ctx; + INIT_LIST_HEAD(&ctx->list); } return ctx; } @@ -654,14 +655,15 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { struct inode *inode = ctx->path.dentry->d_inode; - if (inode) { + if (!list_empty(&ctx->list)) { if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); - NFS_PROTO(inode)->close_context(ctx, is_sync); } else if (!atomic_dec_and_test(&ctx->lock_context.count)) return; + if (inode != NULL) + NFS_PROTO(inode)->close_context(ctx, is_sync); if (ctx->cred != NULL) put_rpccred(ctx->cred); path_put(&ctx->path); From a00dd6c03dd97a777c291a8af8682be4b5fadf8d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 28 Sep 2010 09:14:01 -0400 Subject: [PATCH 35/67] NFS: don't use FLUSH_SYNC on WB_SYNC_NONE COMMIT calls (try #2) WB_SYNC_NONE is supposed to mean "don't wait on anything". That should also include not waiting for COMMIT calls to complete. WB_SYNC_NONE is also implied when wbc->nonblocking and wbc->for_background are set, so we can replace those checks in nfs_commit_unstable_pages with a check for WB_SYNC_NONE. Signed-off-by: Jeff Layton Reviewed-by: Wu Fengguang Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4d6d35dd2b4b..605e292501f4 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1431,15 +1431,17 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr int flags = FLUSH_SYNC; int ret = 0; - /* Don't commit yet if this is a non-blocking flush and there are - * lots of outstanding writes for this mapping. - */ - if (wbc->sync_mode == WB_SYNC_NONE && - nfsi->ncommit <= (nfsi->npages >> 1)) - goto out_mark_dirty; + if (wbc->sync_mode == WB_SYNC_NONE) { + /* Don't commit yet if this is a non-blocking flush and there + * are a lot of outstanding writes for this mapping. + */ + if (nfsi->ncommit <= (nfsi->npages >> 1)) + goto out_mark_dirty; - if (wbc->nonblocking || wbc->for_background) + /* don't wait for the COMMIT response */ flags = 0; + } + ret = nfs_commit_inode(inode, flags); if (ret >= 0) { if (wbc->sync_mode == WB_SYNC_NONE) { From aa510da5bfe1dfe263215fd0e05dac96e738a782 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 29 Sep 2010 15:11:56 -0400 Subject: [PATCH 36/67] NFS: We must use list_for_each_entry_safe in nfs_access_cache_shrinker We may end up removing the current entry from nfs_access_lru_list. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 86f1d41c6078..fe549f5ef20f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1788,14 +1788,14 @@ static void nfs_access_free_list(struct list_head *head) int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(head); - struct nfs_inode *nfsi; + struct nfs_inode *nfsi, *next; struct nfs_access_entry *cache; if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) return (nr_to_scan == 0) ? 0 : -1; spin_lock(&nfs_access_lru_lock); - list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { + list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { struct inode *inode; if (nr_to_scan-- == 0) From 955a857e062642cd3ebe1dc7bb38c0f85d8f8f17 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 29 Sep 2010 15:41:49 -0400 Subject: [PATCH 37/67] NFS: new idmapper This patch creates a new idmapper system that uses the request-key function to place a call into userspace to map user and group ids to names. The old idmapper was single threaded, which prevented more than one request from running at a single time. This means that a user would have to wait for an upcall to finish before accessing a cached result. The upcall result is stored on a keyring of type id_resolver. See the file Documentation/filesystems/nfs/idmapper.txt for instructions. Signed-off-by: Bryan Schumaker [Trond: fix up the return value of nfs_idmap_lookup_name and clean up code] Signed-off-by: Trond Myklebust --- Documentation/filesystems/nfs/00-INDEX | 2 + Documentation/filesystems/nfs/idmapper.txt | 67 +++++++ fs/nfs/Kconfig | 11 ++ fs/nfs/idmap.c | 211 ++++++++++++++++++++- fs/nfs/inode.c | 7 + fs/nfs/nfs4xdr.c | 4 +- fs/nfs/sysctl.c | 2 + include/linux/nfs_idmap.h | 31 ++- 8 files changed, 329 insertions(+), 6 deletions(-) create mode 100644 Documentation/filesystems/nfs/idmapper.txt diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX index 2f68cd688769..3225a5662114 100644 --- a/Documentation/filesystems/nfs/00-INDEX +++ b/Documentation/filesystems/nfs/00-INDEX @@ -14,3 +14,5 @@ nfsroot.txt - short guide on setting up a diskless box with NFS root filesystem. rpc-cache.txt - introduction to the caching mechanisms in the sunrpc layer. +idmapper.txt + - information for configuring request-keys to be used by idmapper diff --git a/Documentation/filesystems/nfs/idmapper.txt b/Documentation/filesystems/nfs/idmapper.txt new file mode 100644 index 000000000000..c3852041a21f --- /dev/null +++ b/Documentation/filesystems/nfs/idmapper.txt @@ -0,0 +1,67 @@ + +========= +ID Mapper +========= +Id mapper is used by NFS to translate user and group ids into names, and to +translate user and group names into ids. Part of this translation involves +performing an upcall to userspace to request the information. Id mapper will +user request-key to perform this upcall and cache the result. The program +/usr/sbin/nfs.upcall should be called by request-key, and will perform the +translation and initialize a key with the resulting information. + + NFS_USE_NEW_IDMAPPER must be selected when configuring the kernel to use this + feature. + +=========== +Configuring +=========== +The file /etc/request-key.conf will need to be modified so /sbin/request-key can +direct the upcall. The following line should be added: + +#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ... +#====== ======= =============== =============== =============================== +create id_resolver * * /usr/sbin/nfs.upcall %k %d 600 + +This will direct all id_resolver requests to the program /usr/sbin/nfs.upcall. +The last parameter, 600, defines how many seconds into the future the key will +expire. This parameter is optional for /usr/sbin/nfs.upcall. When the timeout +is not specified, nfs.upcall will default to 600 seconds. + +id mapper uses for key descriptions: + uid: Find the UID for the given user + gid: Find the GID for the given group + user: Find the user name for the given UID + group: Find the group name for the given GID + +You can handle any of these individually, rather than using the generic upcall +program. If you would like to use your own program for a uid lookup then you +would edit your request-key.conf so it look similar to this: + +#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ... +#====== ======= =============== =============== =============================== +create id_resolver uid:* * /some/other/program %k %d 600 +create id_resolver * * /usr/sbin/nfs.upcall %k %d 600 + +Notice that the new line was added above the line for the generic program. +request-key will find the first matching line and corresponding program. In +this case, /some/other/program will handle all uid lookups and +/usr/sbin/nfs.upcall will handle gid, user, and group lookups. + +See for more information about the +request-key function. + + +========== +nfs.upcall +========== +nfs.upcall is designed to be called by request-key, and should not be run "by +hand". This program takes two arguments, a serialized key and a key +description. The serialized key is first converted into a key_serial_t, and +then passed as an argument to keyctl_instantiate (both are part of keyutils.h). + +The actual lookups are performed by functions found in nfsidmap.h. nfs.upcall +determines the correct function to call by looking at the first part of the +description string. For example, a uid lookup description will appear as +"uid:user@domain". + +nfs.upcall will return 0 if the key was instantiated, and non-zero otherwise. diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 6c2aad49d731..3f69752d6f18 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -116,3 +116,14 @@ config NFS_USE_KERNEL_DNS select DNS_RESOLVER select KEYS default y + +config NFS_USE_NEW_IDMAPPER + bool "Use the new idmapper upcall routine" + depends on NFS_V4 && KEYS + help + Say Y here if you want NFS to use the new idmapper upcall functions. + You will need /sbin/request-key (usually provided by the keyutils + package). For details, read + . + + If you are unsure, say N. diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 21a84d45916f..dec47ed8b6b9 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -34,6 +34,212 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#ifdef CONFIG_NFS_USE_NEW_IDMAPPER + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NFS_UINT_MAXLEN 11 + +const struct cred *id_resolver_cache; + +struct key_type key_type_id_resolver = { + .name = "id_resolver", + .instantiate = user_instantiate, + .match = user_match, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, +}; + +int nfs_idmap_init(void) +{ + struct cred *cred; + struct key *keyring; + int ret = 0; + + printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name); + + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&key_type_id_resolver); + if (ret < 0) + goto failed_put_key; + + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + id_resolver_cache = cred; + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +void nfs_idmap_quit(void) +{ + key_revoke(id_resolver_cache->thread_keyring); + unregister_key_type(&key_type_id_resolver); + put_cred(id_resolver_cache); +} + +/* + * Assemble the description to pass to request_key() + * This function will allocate a new string and update dest to point + * at it. The caller is responsible for freeing dest. + * + * On error 0 is returned. Otherwise, the length of dest is returned. + */ +static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, + const char *type, size_t typelen, char **desc) +{ + char *cp; + size_t desclen = typelen + namelen + 2; + + *desc = kmalloc(desclen, GFP_KERNEL); + if (!desc) + return -ENOMEM; + + cp = *desc; + memcpy(cp, type, typelen); + cp += typelen; + *cp++ = ':'; + + memcpy(cp, name, namelen); + cp += namelen; + *cp = '\0'; + return desclen; +} + +static ssize_t nfs_idmap_request_key(const char *name, size_t namelen, + const char *type, void *data, size_t data_size) +{ + const struct cred *saved_cred; + struct key *rkey; + char *desc; + struct user_key_payload *payload; + ssize_t ret; + + ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); + if (ret <= 0) + goto out; + + saved_cred = override_creds(id_resolver_cache); + rkey = request_key(&key_type_id_resolver, desc, ""); + revert_creds(saved_cred); + kfree(desc); + if (IS_ERR(rkey)) { + ret = PTR_ERR(rkey); + goto out; + } + + rcu_read_lock(); + rkey->perm |= KEY_USR_VIEW; + + ret = key_validate(rkey); + if (ret < 0) + goto out_up; + + payload = rcu_dereference(rkey->payload.data); + if (IS_ERR_OR_NULL(payload)) { + ret = PTR_ERR(payload); + goto out_up; + } + + ret = payload->datalen; + if (ret > 0 && ret <= data_size) + memcpy(data, payload->data, ret); + else + ret = -EINVAL; + +out_up: + rcu_read_unlock(); + key_put(rkey); +out: + return ret; +} + + +/* ID -> Name */ +static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen) +{ + char id_str[NFS_UINT_MAXLEN]; + int id_len; + ssize_t ret; + + id_len = snprintf(id_str, sizeof(id_str), "%u", id); + ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen); + if (ret < 0) + return -EINVAL; + return ret; +} + +/* Name -> ID */ +static int nfs_idmap_lookup_id(const char *name, size_t namelen, + const char *type, __u32 *id) +{ + char id_str[NFS_UINT_MAXLEN]; + long id_long; + ssize_t data_size; + int ret = 0; + + data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN); + if (data_size <= 0) { + ret = -EINVAL; + } else { + ret = strict_strtol(id_str, 10, &id_long); + *id = (__u32)id_long; + } + return ret; +} + +int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +{ + return nfs_idmap_lookup_id(name, namelen, "uid", uid); +} + +int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid) +{ + return nfs_idmap_lookup_id(name, namelen, "gid", gid); +} + +int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +{ + return nfs_idmap_lookup_name(uid, "user", buf, buflen); +} +int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) +{ + return nfs_idmap_lookup_name(gid, "group", buf, buflen); +} + +#else /* CONFIG_NFS_USE_IDMAPPER not defined */ + #include #include #include @@ -503,16 +709,17 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); } -int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf) +int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) { struct idmap *idmap = clp->cl_idmap; return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); } -int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf) +int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) { struct idmap *idmap = clp->cl_idmap; return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); } +#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 18be041abd23..f2d2c801e0af 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1526,6 +1526,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_idmap_init(); + if (err < 0) + goto out9; + err = nfs_dns_resolver_init(); if (err < 0) goto out8; @@ -1590,6 +1594,8 @@ out6: out7: nfs_dns_resolver_destroy(); out8: + nfs_idmap_quit(); +out9: return err; } @@ -1602,6 +1608,7 @@ static void __exit exit_nfs_fs(void) nfs_destroy_nfspagecache(); nfs_fscache_unregister(); nfs_dns_resolver_destroy(); + nfs_idmap_quit(); #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 3feace66b981..6ea5c9392fe4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -816,7 +816,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if (iap->ia_valid & ATTR_MODE) len += 4; if (iap->ia_valid & ATTR_UID) { - owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name); + owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { dprintk("nfs: couldn't resolve uid %d to string\n", iap->ia_uid); @@ -828,7 +828,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len += 4 + (XDR_QUADLEN(owner_namelen) << 2); } if (iap->ia_valid & ATTR_GID) { - owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group); + owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ); if (owner_grouplen < 0) { dprintk("nfs: couldn't resolve gid %d to string\n", iap->ia_gid); diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index ad4d2e787b20..978aaeb8a093 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -32,6 +32,7 @@ static ctl_table nfs_cb_sysctls[] = { .extra1 = (int *)&nfs_set_port_min, .extra2 = (int *)&nfs_set_port_max, }, +#ifndef CONFIG_NFS_USE_NEW_IDMAPPER { .procname = "idmap_cache_timeout", .data = &nfs_idmap_cache_timeout, @@ -39,6 +40,7 @@ static ctl_table nfs_cb_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, +#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ #endif { .procname = "nfs_mountpoint_timeout", diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h index 91a1c24e0cbf..e8352dc5afb5 100644 --- a/include/linux/nfs_idmap.h +++ b/include/linux/nfs_idmap.h @@ -66,13 +66,40 @@ struct idmap_msg { /* Forward declaration to make this header independent of others */ struct nfs_client; +#ifdef CONFIG_NFS_USE_NEW_IDMAPPER + +int nfs_idmap_init(void); +void nfs_idmap_quit(void); + +static inline int nfs_idmap_new(struct nfs_client *clp) +{ + return 0; +} + +static inline void nfs_idmap_delete(struct nfs_client *clp) +{ +} + +#else /* CONFIG_NFS_USE_NEW_IDMAPPER not set */ + +static inline int nfs_idmap_init(void) +{ + return 0; +} + +static inline void nfs_idmap_quit(void) +{ +} + int nfs_idmap_new(struct nfs_client *); void nfs_idmap_delete(struct nfs_client *); +#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ + int nfs_map_name_to_uid(struct nfs_client *, const char *, size_t, __u32 *); int nfs_map_group_to_gid(struct nfs_client *, const char *, size_t, __u32 *); -int nfs_map_uid_to_name(struct nfs_client *, __u32, char *); -int nfs_map_gid_to_group(struct nfs_client *, __u32, char *); +int nfs_map_uid_to_name(struct nfs_client *, __u32, char *, size_t); +int nfs_map_gid_to_group(struct nfs_client *, __u32, char *, size_t); extern unsigned int nfs_idmap_cache_timeout; #endif /* __KERNEL__ */ From bc4866b6e0b44f8ea0df22a16e5927714beb4983 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Oct 2010 17:59:08 -0400 Subject: [PATCH 38/67] NFS: Don't SIGBUS if nfs_vm_page_mkwrite races with a cache invalidation In the case where we lock the page, and then find out that the page has been thrown out of the page cache, we should just return VM_FAULT_NOPAGE. This is what block_page_mkwrite() does in these situations. Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/file.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 59cbe1ba0511..39672b731736 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -551,7 +551,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct file *filp = vma->vm_file; struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; - int ret = -EINVAL; + int ret = VM_FAULT_NOPAGE; struct address_space *mapping; dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", @@ -567,21 +567,20 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (mapping != dentry->d_inode->i_mapping) goto out_unlock; - ret = 0; pagelen = nfs_page_length(page); if (pagelen == 0) goto out_unlock; - ret = nfs_flush_incompatible(filp, page); - if (ret != 0) - goto out_unlock; + ret = VM_FAULT_LOCKED; + if (nfs_flush_incompatible(filp, page) == 0 && + nfs_updatepage(filp, page, 0, pagelen) == 0) + goto out; - ret = nfs_updatepage(filp, page, 0, pagelen); + ret = VM_FAULT_SIGBUS; out_unlock: - if (!ret) - return VM_FAULT_LOCKED; unlock_page(page); - return VM_FAULT_SIGBUS; +out: + return ret; } static const struct vm_operations_struct nfs_file_vm_ops = { From b0ed9dbc24f1fd912b2dd08b995153cafc1d5b1c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Oct 2010 17:59:08 -0400 Subject: [PATCH 39/67] NFSv4: Fix open recovery NFSv4 open recovery is currently broken: since we do not clear the state->flags states before attempting recovery, we end up with the 'can_open_cached()' function triggering. This again leads to no OPEN call being put on the wire. Reported-by: Sachin Prabhu Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/nfs4proc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index aee1bcc1fcc9..7f25246bf38c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1118,6 +1118,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * clear_bit(NFS_DELEGATED_STATE, &state->flags); smp_rmb(); if (state->n_rdwr != 0) { + clear_bit(NFS_O_RDWR_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); if (ret != 0) return ret; @@ -1125,6 +1126,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * return -ESTALE; } if (state->n_wronly != 0) { + clear_bit(NFS_O_WRONLY_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); if (ret != 0) return ret; @@ -1132,6 +1134,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * return -ESTALE; } if (state->n_rdonly != 0) { + clear_bit(NFS_O_RDONLY_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); if (ret != 0) return ret; From ae1007d37e00144b72906a4bdc47d517ae91bcc1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Oct 2010 17:59:08 -0400 Subject: [PATCH 40/67] NFSv4: Don't call nfs4_state_mark_reclaim_reboot() from error handlers In the case of a server reboot, the state recovery thread starts by calling nfs4_state_end_reclaim_reboot() in order to avoid edge conditions when the server reboots while the client is in the middle of recovery. However, if the client has already marked the nfs4_state as requiring reboot recovery, then the above behaviour will cause the recovery thread to treat the open as if it was part of such an edge condition: the open will be recovered as if it was part of a lease expiration (and all the locks will be lost). Fix is to remove the call to nfs4_state_mark_reclaim_reboot from nfs4_async_handle_error(), and nfs4_handle_exception(). Instead we leave it to the recovery thread to do this for us. Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/nfs4proc.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7f25246bf38c..88a9d93ec1bd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -255,9 +255,6 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, nfs4_state_mark_reclaim_nograce(clp, state); goto do_state_recovery; case -NFS4ERR_STALE_STATEID: - if (state == NULL) - break; - nfs4_state_mark_reclaim_reboot(clp, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: goto do_state_recovery; @@ -3414,9 +3411,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, nfs4_state_mark_reclaim_nograce(clp, state); goto do_state_recovery; case -NFS4ERR_STALE_STATEID: - if (state == NULL) - break; - nfs4_state_mark_reclaim_reboot(clp, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: goto do_state_recovery; From 6eaa61496fb3b93cceface7a296415fc4c030bce Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Oct 2010 17:59:08 -0400 Subject: [PATCH 41/67] NFSv4: Don't call nfs4_reclaim_complete() on receiving NFS4ERR_STALE_CLIENTID If the server sends us an NFS4ERR_STALE_CLIENTID while the state management thread is busy reclaiming state, we do want to treat all state that wasn't reclaimed before the STALE_CLIENTID as if a network partition occurred (see the edge conditions described in RFC3530 and RFC5661). What we do not want to do is to send an nfs4_reclaim_complete(), since we haven't yet even started reclaiming state after the server rebooted. Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/nfs4state.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 3e2f19b04c06..940cf7c070af 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1138,16 +1138,14 @@ static void nfs4_reclaim_complete(struct nfs_client *clp, (void)ops->reclaim_complete(clp); } -static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) +static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) { struct nfs4_state_owner *sp; struct rb_node *pos; struct nfs4_state *state; if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) - return; - - nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); + return 0; for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); @@ -1161,6 +1159,14 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) } nfs_delegation_reap_unclaimed(clp); + return 1; +} + +static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) +{ + if (!nfs4_state_clear_reclaim_reboot(clp)) + return; + nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); } static void nfs_delegation_clear_all(struct nfs_client *clp) @@ -1187,7 +1193,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_LEASE_MOVED: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_end_reclaim_reboot(clp); + nfs4_state_clear_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); break; case -NFS4ERR_EXPIRED: From 898f635c4297e91ceac675d83c4a460f26118342 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 23 Oct 2010 11:24:25 -0400 Subject: [PATCH 42/67] NFSv4: Don't ignore the error return codes from nfs_intent_set_file If nfs_intent_set_file() returns an error, we usually want to pass that back up the stack. Also ensure that nfs_open_revalidate() returns '1' on success. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index fe549f5ef20f..a847acf4d3cd 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1086,6 +1086,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry struct dentry *res = NULL; struct inode *inode; int open_flags; + int err; dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); @@ -1119,9 +1120,8 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry if (!IS_POSIXACL(dir)) attr.ia_mode &= ~current_umask(); } else { - open_flags &= ~O_EXCL; + open_flags &= ~(O_EXCL | O_CREAT); attr.ia_valid = 0; - BUG_ON(open_flags & O_CREAT); } /* Open the file on the server */ @@ -1150,13 +1150,18 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry } } res = d_add_unique(dentry, inode); + nfs_unblock_sillyrename(dentry->d_parent); if (res != NULL) { dput(ctx->path.dentry); ctx->path.dentry = dget(res); dentry = res; } - nfs_intent_set_file(nd, ctx); - nfs_unblock_sillyrename(dentry->d_parent); + err = nfs_intent_set_file(nd, ctx); + if (err < 0) { + if (res != NULL) + dput(res); + return ERR_PTR(err); + } out: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); return res; @@ -1221,11 +1226,13 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) } } iput(inode); - if (inode == dentry->d_inode) { - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - nfs_intent_set_file(nd, ctx); - } else + if (inode != dentry->d_inode) goto out_drop; + + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + ret = nfs_intent_set_file(nd, ctx); + if (ret >= 0) + ret = 1; out: dput(parent); return ret; @@ -1262,20 +1269,24 @@ static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, ctx = nameidata_to_nfs_open_context(dentry, nd); error = PTR_ERR(ctx); if (IS_ERR(ctx)) - goto out_err; + goto out_err_drop; } error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx); if (error != 0) goto out_put_ctx; - if (ctx != NULL) - nfs_intent_set_file(nd, ctx); + if (ctx != NULL) { + error = nfs_intent_set_file(nd, ctx); + if (error < 0) + goto out_err; + } return 0; out_put_ctx: if (ctx != NULL) put_nfs_open_context(ctx); -out_err: +out_err_drop: d_drop(dentry); +out_err: return error; } From 168667c43bbafff11b46014a1e94477ff7619f45 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 19 Oct 2010 19:47:49 -0400 Subject: [PATCH 43/67] NFSv4: The state manager must ignore EKEYEXPIRED. Otherwise, we cannot recover state correctly. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 27 ++++++++++++++++++--------- fs/nfs/nfs4state.c | 23 ++++++++++++++++++++++- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 88a9d93ec1bd..aa771c1af115 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1186,7 +1186,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state int err; do { err = _nfs4_do_open_reclaim(ctx, state); - if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) + if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); } while (exception.retry); @@ -1256,6 +1256,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: nfs4_state_mark_reclaim_nograce(server->nfs_client, state); + case -EKEYEXPIRED: + /* + * User RPCSEC_GSS context has expired. + * We cannot recover this stateid now, so + * skip it and allow recovery thread to + * proceed. + */ case -ENOMEM: err = 0; goto out; @@ -1603,7 +1610,6 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state goto out; case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: - case -EKEYEXPIRED: nfs4_handle_exception(server, err, &exception); err = 0; } @@ -3544,7 +3550,6 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, case -NFS4ERR_RESOURCE: /* The IBM lawyers misread another document! */ case -NFS4ERR_DELAY: - case -EKEYEXPIRED: err = nfs4_delay(clp->cl_rpcclient, &timeout); } } while (err == 0); @@ -4156,7 +4161,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); - if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) + if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); } while (exception.retry); @@ -4181,7 +4186,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request goto out; case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: - case -EKEYEXPIRED: nfs4_handle_exception(server, err, &exception); err = 0; } @@ -4327,13 +4331,21 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) nfs4_state_mark_reclaim_nograce(server->nfs_client, state); err = 0; goto out; + case -EKEYEXPIRED: + /* + * User RPCSEC_GSS context has expired. + * We cannot recover this stateid now, so + * skip it and allow recovery thread to + * proceed. + */ + err = 0; + goto out; case -ENOMEM: case -NFS4ERR_DENIED: /* kill_proc(fl->fl_pid, SIGLOST, 1); */ err = 0; goto out; case -NFS4ERR_DELAY: - case -EKEYEXPIRED: break; } err = nfs4_handle_exception(server, err, &exception); @@ -4562,7 +4574,6 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: - case -EKEYEXPIRED: dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); rpc_delay(task, NFS4_POLL_RETRY_MIN); task->tk_status = 0; @@ -5025,7 +5036,6 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client { switch(task->tk_status) { case -NFS4ERR_DELAY: - case -EKEYEXPIRED: rpc_delay(task, NFS4_POLL_RETRY_MAX); return -EAGAIN; default: @@ -5167,7 +5177,6 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf case -NFS4ERR_WRONG_CRED: /* What to do here? */ break; case -NFS4ERR_DELAY: - case -EKEYEXPIRED: rpc_delay(task, NFS4_POLL_RETRY_MAX); return -EAGAIN; default: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 940cf7c070af..40028ac2b69c 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1063,6 +1063,14 @@ restart: /* Mark the file as being 'closed' */ state->state = 0; break; + case -EKEYEXPIRED: + /* + * User RPCSEC_GSS context has expired. + * We cannot recover this stateid now, so + * skip it and allow recovery thread to + * proceed. + */ + break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: @@ -1181,6 +1189,14 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); } +static void nfs4_warn_keyexpired(const char *s) +{ + printk_ratelimited(KERN_WARNING "Error: state manager" + " encountered RPCSEC_GSS session" + " expired against NFSv4 server %s.\n", + s); +} + static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { @@ -1210,6 +1226,10 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); /* Zero session reset errors */ return 0; + case -EKEYEXPIRED: + /* Nothing we can do */ + nfs4_warn_keyexpired(clp->cl_hostname); + return 0; } return error; } @@ -1420,9 +1440,10 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status) case -NFS4ERR_DELAY: case -NFS4ERR_CLID_INUSE: case -EAGAIN: - case -EKEYEXPIRED: break; + case -EKEYEXPIRED: + nfs4_warn_keyexpired(clp->cl_hostname); case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ default: From 8c7597f6ce212bbc8ca05090e21820ffe9792b3d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Oct 2010 16:18:52 -0700 Subject: [PATCH 44/67] nfs: include ratelimit.h, fix nfs4state build error nfs4state.c uses interfaces from ratelimit.h. It needs to include that header file to fix build errors: fs/nfs/nfs4state.c:1195: warning: type defaults to 'int' in declaration of 'DEFINE_RATELIMIT_STATE' fs/nfs/nfs4state.c:1195: warning: parameter names (without types) in function declaration fs/nfs/nfs4state.c:1195: error: invalid storage class for function 'DEFINE_RATELIMIT_STATE' fs/nfs/nfs4state.c:1195: error: implicit declaration of function '__ratelimit' fs/nfs/nfs4state.c:1195: error: '_rs' undeclared (first use in this function) Signed-off-by: Randy Dunlap Cc: Trond Myklebust Cc: linux-nfs@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 40028ac2b69c..e0dc06d74b6a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include From d1bacf9eb2fd0e7ef870acf84b9e3b157dcfa7dc Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Fri, 24 Sep 2010 14:48:42 -0400 Subject: [PATCH 45/67] NFS: add readdir cache array This patch adds the readdir cache array and functions to retreive the array stored on a cache page, clear the array by freeing allocated memory, add an entry to the array, and search the array for a given cookie. It then modifies readdir to make use of the new cache array. With the new cache array method, we no longer need some of this code. Finally, nfs_llseek_dir() will set file->f_pos to a value greater than 0 and desc->dir_cookie to zero. When we see this, readdir needs to find the file at position file->f_pos from the start of the directory. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 640 +++++++++++++++++++++++++-------------------------- 1 file changed, 313 insertions(+), 327 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a847acf4d3cd..f77243907a08 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -55,6 +55,7 @@ static int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); static int nfs_fsync_dir(struct file *, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); +static int nfs_readdir_clear_array(struct page*, gfp_t); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, @@ -80,6 +81,10 @@ const struct inode_operations nfs_dir_inode_operations = { .setattr = nfs_setattr, }; +const struct address_space_operations nfs_dir_addr_space_ops = { + .releasepage = nfs_readdir_clear_array, +}; + #ifdef CONFIG_NFS_V3 const struct inode_operations nfs3_dir_inode_operations = { .create = nfs_create, @@ -151,51 +156,188 @@ nfs_opendir(struct inode *inode, struct file *filp) return res; } +struct nfs_cache_array_entry { + u64 cookie; + u64 ino; + struct qstr string; +}; + +struct nfs_cache_array { + unsigned int size; + int eof_index; + u64 last_cookie; + struct nfs_cache_array_entry array[0]; +}; + +#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry)) + typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int); typedef struct { struct file *file; struct page *page; unsigned long page_index; - __be32 *ptr; u64 *dir_cookie; loff_t current_index; - struct nfs_entry *entry; decode_dirent_t decode; - int plus; + unsigned long timestamp; unsigned long gencount; - int timestamp_valid; + unsigned int cache_entry_index; + unsigned int plus:1; + unsigned int eof:1; } nfs_readdir_descriptor_t; -/* Now we cache directories properly, by stuffing the dirent - * data directly in the page cache. - * - * Inode invalidation due to refresh etc. takes care of - * _everything_, no sloppy entry flushing logic, no extraneous - * copying, network direct to page cache, the way it was meant - * to be. - * - * NOTE: Dirent information verification is done always by the - * page-in of the RPC reply, nowhere else, this simplies - * things substantially. +/* + * The caller is responsible for calling nfs_readdir_release_array(page) */ static -int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) +struct nfs_cache_array *nfs_readdir_get_array(struct page *page) +{ + if (page == NULL) + return ERR_PTR(-EIO); + return (struct nfs_cache_array *)kmap(page); +} + +static +void nfs_readdir_release_array(struct page *page) +{ + kunmap(page); +} + +/* + * we are freeing strings created by nfs_add_to_readdir_array() + */ +static +int nfs_readdir_clear_array(struct page *page, gfp_t mask) +{ + struct nfs_cache_array *array = nfs_readdir_get_array(page); + int i; + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); + nfs_readdir_release_array(page); + return 0; +} + +/* + * the caller is responsible for freeing qstr.name + * when called by nfs_readdir_add_to_array, the strings will be freed in + * nfs_clear_readdir_array() + */ +static +void nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) +{ + string->len = len; + string->name = kmemdup(name, len, GFP_KERNEL); + string->hash = full_name_hash(string->name, string->len); +} + +static +int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) +{ + struct nfs_cache_array *array = nfs_readdir_get_array(page); + if (IS_ERR(array)) + return PTR_ERR(array); + if (array->size >= MAX_READDIR_ARRAY) { + nfs_readdir_release_array(page); + return -EIO; + } + + array->array[array->size].cookie = entry->prev_cookie; + array->last_cookie = entry->cookie; + array->array[array->size].ino = entry->ino; + nfs_readdir_make_qstr(&array->array[array->size].string, entry->name, entry->len); + if (entry->eof == 1) + array->eof_index = array->size; + array->size++; + nfs_readdir_release_array(page); + return 0; +} + +static +int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + loff_t diff = desc->file->f_pos - desc->current_index; + unsigned int index; + + if (diff < 0) + goto out_eof; + if (diff >= array->size) { + if (array->eof_index > 0) + goto out_eof; + desc->current_index += array->size; + return -EAGAIN; + } + + index = (unsigned int)diff; + *desc->dir_cookie = array->array[index].cookie; + desc->cache_entry_index = index; + if (index == array->eof_index) + desc->eof = 1; + return 0; +out_eof: + desc->eof = 1; + return -EBADCOOKIE; +} + +static +int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + int i; + int status = -EAGAIN; + + for (i = 0; i < array->size; i++) { + if (i == array->eof_index) { + desc->eof = 1; + status = -EBADCOOKIE; + } + if (array->array[i].cookie == *desc->dir_cookie) { + desc->cache_entry_index = i; + status = 0; + break; + } + } + + return status; +} + +static +int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) +{ + struct nfs_cache_array *array; + int status = -EBADCOOKIE; + + if (desc->dir_cookie == NULL) + goto out; + + array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } + + if (*desc->dir_cookie == 0) + status = nfs_readdir_search_for_pos(array, desc); + else + status = nfs_readdir_search_for_cookie(array, desc); + + nfs_readdir_release_array(desc->page); +out: + return status; +} + +/* Fill a page with xdr information before transferring to the cache page */ +static +int nfs_readdir_xdr_filler(struct page *xdr_page, nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct file *file, struct inode *inode) { - struct file *file = desc->file; - struct inode *inode = file->f_path.dentry->d_inode; struct rpc_cred *cred = nfs_file_cred(file); unsigned long timestamp, gencount; int error; - dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n", - __func__, (long long)desc->entry->cookie, - page->index); - again: timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); - error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, + error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, xdr_page, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ @@ -209,12 +351,93 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) } desc->timestamp = timestamp; desc->gencount = gencount; - desc->timestamp_valid = 1; +error: + return error; +} + +/* Fill in an entry based on the xdr code stored in desc->page */ +static +int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, __be32 **ptr) +{ + __be32 *p = *ptr; + p = desc->decode(p, entry, desc->plus); + if (IS_ERR(p)) + return PTR_ERR(p); + *ptr = p; + + entry->fattr->time_start = desc->timestamp; + entry->fattr->gencount = desc->gencount; + return 0; +} + +/* Perform conversion from xdr to cache array */ +static +void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, + struct page *xdr_page, struct page *page) +{ + __be32 *ptr = kmap(xdr_page); + while (xdr_decode(desc, entry, &ptr) == 0) { + if (nfs_readdir_add_to_array(entry, page) == -1) + break; + } + kunmap(xdr_page); +} + +static +int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) +{ + struct page *xdr_page; + struct nfs_entry entry; + struct file *file = desc->file; + struct nfs_cache_array *array; + int status = 0; + + entry.prev_cookie = 0; + entry.cookie = *desc->dir_cookie; + entry.eof = 0; + entry.fh = nfs_alloc_fhandle(); + entry.fattr = nfs_alloc_fattr(); + if (entry.fh == NULL || entry.fattr == NULL) + goto out; + + array = nfs_readdir_get_array(page); + memset(array, 0, sizeof(struct nfs_cache_array)); + array->eof_index = -1; + + xdr_page = alloc_page(GFP_KERNEL); + if (!xdr_page) + goto out_release_array; + do { + status = nfs_readdir_xdr_filler(xdr_page, desc, &entry, file, inode); + if (status < 0) + break; + nfs_readdir_page_filler(desc, &entry, xdr_page, page); + } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY); + + put_page(xdr_page); +out_release_array: + nfs_readdir_release_array(page); +out: + nfs_free_fattr(entry.fattr); + nfs_free_fhandle(entry.fh); + return status; +} + +/* + * Now we cache directories properly, by converting xdr information + * to an array that can be used for lookups later. This results in + * fewer cache pages, since we can store more information on each page. + * We only need to convert from xdr once so future lookups are much simpler + */ +static +int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) +{ + struct inode *inode = desc->file->f_path.dentry->d_inode; + + if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) + goto error; SetPageUptodate(page); - /* Ensure consistent page alignment of the data. - * Note: assumes we have exclusive access to this mapping either - * through inode->i_mutex or some other mechanism. - */ + if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { /* Should never happen */ nfs_zap_mapping(inode, inode->i_mapping); @@ -226,173 +449,59 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) return -EIO; } -static inline -int dir_decode(nfs_readdir_descriptor_t *desc) +static +void cache_page_release(nfs_readdir_descriptor_t *desc) { - __be32 *p = desc->ptr; - p = desc->decode(p, desc->entry, desc->plus); - if (IS_ERR(p)) - return PTR_ERR(p); - desc->ptr = p; - if (desc->timestamp_valid) { - desc->entry->fattr->time_start = desc->timestamp; - desc->entry->fattr->gencount = desc->gencount; - } else - desc->entry->fattr->valid &= ~NFS_ATTR_FATTR; - return 0; -} - -static inline -void dir_page_release(nfs_readdir_descriptor_t *desc) -{ - kunmap(desc->page); page_cache_release(desc->page); desc->page = NULL; - desc->ptr = NULL; } -/* - * Given a pointer to a buffer that has already been filled by a call - * to readdir, find the next entry with cookie '*desc->dir_cookie'. - * - * If the end of the buffer has been reached, return -EAGAIN, if not, - * return the offset within the buffer of the next entry to be - * read. - */ -static inline -int find_dirent(nfs_readdir_descriptor_t *desc) +static +struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - struct nfs_entry *entry = desc->entry; - int loop_count = 0, - status; - - while((status = dir_decode(desc)) == 0) { - dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n", - __func__, (unsigned long long)entry->cookie); - if (entry->prev_cookie == *desc->dir_cookie) - break; - if (loop_count++ > 200) { - loop_count = 0; - schedule(); - } - } - return status; + struct page *page; + page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, + desc->page_index, (filler_t *)nfs_readdir_filler, desc); + if (IS_ERR(page)) + desc->eof = 1; + return page; } /* - * Given a pointer to a buffer that has already been filled by a call - * to readdir, find the entry at offset 'desc->file->f_pos'. - * - * If the end of the buffer has been reached, return -EAGAIN, if not, - * return the offset within the buffer of the next entry to be - * read. + * Returns 0 if desc->dir_cookie was found on page desc->page_index */ -static inline -int find_dirent_index(nfs_readdir_descriptor_t *desc) +static +int find_cache_page(nfs_readdir_descriptor_t *desc) { - struct nfs_entry *entry = desc->entry; - int loop_count = 0, - status; + int res; - for(;;) { - status = dir_decode(desc); - if (status) - break; + desc->page = get_cache_page(desc); + if (IS_ERR(desc->page)) + return PTR_ERR(desc->page); - dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n", - (unsigned long long)entry->cookie, desc->current_index); - - if (desc->file->f_pos == desc->current_index) { - *desc->dir_cookie = entry->cookie; - break; - } - desc->current_index++; - if (loop_count++ > 200) { - loop_count = 0; - schedule(); - } - } - return status; + res = nfs_readdir_search_array(desc); + if (res == 0) + return 0; + cache_page_release(desc); + return res; } -/* - * Find the given page, and call find_dirent() or find_dirent_index in - * order to try to return the next entry. - */ -static inline -int find_dirent_page(nfs_readdir_descriptor_t *desc) -{ - struct inode *inode = desc->file->f_path.dentry->d_inode; - struct page *page; - int status; - - dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n", - __func__, desc->page_index, - (long long) *desc->dir_cookie); - - /* If we find the page in the page_cache, we cannot be sure - * how fresh the data is, so we will ignore readdir_plus attributes. - */ - desc->timestamp_valid = 0; - page = read_cache_page(inode->i_mapping, desc->page_index, - (filler_t *)nfs_readdir_filler, desc); - if (IS_ERR(page)) { - status = PTR_ERR(page); - goto out; - } - - /* NOTE: Someone else may have changed the READDIRPLUS flag */ - desc->page = page; - desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ - if (*desc->dir_cookie != 0) - status = find_dirent(desc); - else - status = find_dirent_index(desc); - if (status < 0) - dir_page_release(desc); - out: - dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); - return status; -} - -/* - * Recurse through the page cache pages, and return a - * filled nfs_entry structure of the next directory entry if possible. - * - * The target for the search is '*desc->dir_cookie' if non-0, - * 'desc->file->f_pos' otherwise - */ +/* Search for desc->dir_cookie from the beginning of the page cache */ static inline int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) { - int loop_count = 0; - int res; + int res = -EAGAIN; + desc->page_index = 0; - /* Always search-by-index from the beginning of the cache */ - if (*desc->dir_cookie == 0) { - dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n", - (long long)desc->file->f_pos); - desc->page_index = 0; - desc->entry->cookie = desc->entry->prev_cookie = 0; - desc->entry->eof = 0; - desc->current_index = 0; - } else - dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n", - (unsigned long long)*desc->dir_cookie); + if (*desc->dir_cookie == 0) + desc->cache_entry_index = 0; - for (;;) { - res = find_dirent_page(desc); + while (1) { + res = find_cache_page(desc); if (res != -EAGAIN) break; - /* Align to beginning of next page */ - desc->page_index ++; - if (loop_count++ > 200) { - loop_count = 0; - schedule(); - } + desc->page_index++; } - - dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res); return res; } @@ -401,8 +510,6 @@ static inline unsigned int dt_type(struct inode *inode) return (inode->i_mode >> 12) & 15; } -static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc); - /* * Once we've found the start of the dirent within a page: fill 'er up... */ @@ -411,49 +518,36 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, filldir_t filldir) { struct file *file = desc->file; - struct nfs_entry *entry = desc->entry; - struct dentry *dentry = NULL; - u64 fileid; - int loop_count = 0, - res; + int i = 0; + int res = 0; + struct nfs_cache_array *array = NULL; + unsigned int d_type = DT_UNKNOWN; + struct dentry *dentry = NULL; - dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", - (unsigned long long)entry->cookie); + array = nfs_readdir_get_array(desc->page); - for(;;) { - unsigned d_type = DT_UNKNOWN; - /* Note: entry->prev_cookie contains the cookie for - * retrieving the current dirent on the server */ - fileid = entry->ino; + for (i = desc->cache_entry_index; i < array->size; i++) { + d_type = DT_UNKNOWN; - /* Get a dentry if we have one */ - if (dentry != NULL) - dput(dentry); - dentry = nfs_readdir_lookup(desc); - - /* Use readdirplus info */ - if (dentry != NULL && dentry->d_inode != NULL) { - d_type = dt_type(dentry->d_inode); - fileid = NFS_FILEID(dentry->d_inode); - } - - res = filldir(dirent, entry->name, entry->len, - file->f_pos, nfs_compat_user_ino64(fileid), - d_type); + res = filldir(dirent, array->array[i].string.name, + array->array[i].string.len, file->f_pos, + nfs_compat_user_ino64(array->array[i].ino), d_type); if (res < 0) break; file->f_pos++; - *desc->dir_cookie = entry->cookie; - if (dir_decode(desc) != 0) { - desc->page_index ++; + desc->cache_entry_index = i; + if (i < (array->size-1)) + *desc->dir_cookie = array->array[i+1].cookie; + else + *desc->dir_cookie = array->last_cookie; + if (i == array->eof_index) { + desc->eof = 1; break; } - if (loop_count++ > 200) { - loop_count = 0; - schedule(); - } } - dir_page_release(desc); + + nfs_readdir_release_array(desc->page); + cache_page_release(desc); if (dentry != NULL) dput(dentry); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", @@ -477,12 +571,9 @@ static inline int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, filldir_t filldir) { - struct file *file = desc->file; - struct inode *inode = file->f_path.dentry->d_inode; - struct rpc_cred *cred = nfs_file_cred(file); struct page *page = NULL; int status; - unsigned long timestamp, gencount; + struct inode *inode = desc->file->f_path.dentry->d_inode; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); @@ -492,38 +583,21 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, status = -ENOMEM; goto out; } - timestamp = jiffies; - gencount = nfs_inc_attr_generation_counter(); - status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, - *desc->dir_cookie, page, - NFS_SERVER(inode)->dtsize, - desc->plus); - desc->page = page; - desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ - if (status >= 0) { - desc->timestamp = timestamp; - desc->gencount = gencount; - desc->timestamp_valid = 1; - if ((status = dir_decode(desc)) == 0) - desc->entry->prev_cookie = *desc->dir_cookie; - } else - status = -EIO; - if (status < 0) - goto out_release; + if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) { + status = -EIO; + goto out_release; + } + + desc->page = page; status = nfs_do_filldir(desc, dirent, filldir); - /* Reset read descriptor so it searches the page cache from - * the start upon the next call to readdir_search_pagecache() */ - desc->page_index = 0; - desc->entry->cookie = desc->entry->prev_cookie = 0; - desc->entry->eof = 0; out: dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; out_release: - dir_page_release(desc); + cache_page_release(desc); goto out; } @@ -537,7 +611,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = dentry->d_inode; nfs_readdir_descriptor_t my_desc, *desc = &my_desc; - struct nfs_entry my_entry; int res = -ENOMEM; dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", @@ -558,26 +631,17 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = NFS_USE_READDIRPLUS(inode); - my_entry.cookie = my_entry.prev_cookie = 0; - my_entry.eof = 0; - my_entry.fh = nfs_alloc_fhandle(); - my_entry.fattr = nfs_alloc_fattr(); - if (my_entry.fh == NULL || my_entry.fattr == NULL) - goto out_alloc_failed; - - desc->entry = &my_entry; - nfs_block_sillyrename(dentry); res = nfs_revalidate_mapping(inode, filp->f_mapping); if (res < 0) goto out; - while(!desc->entry->eof) { + while (desc->eof != 1) { res = readdir_search_pagecache(desc); if (res == -EBADCOOKIE) { /* This means either end of directory */ - if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) { + if (*desc->dir_cookie && desc->eof == 0) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc, dirent, filldir); if (res >= 0) @@ -590,7 +654,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); nfs_zap_caches(inode); desc->plus = 0; - desc->entry->eof = 0; + desc->eof = 0; continue; } if (res < 0) @@ -606,9 +670,6 @@ out: nfs_unblock_sillyrename(dentry); if (res > 0) res = 0; -out_alloc_failed: - nfs_free_fattr(my_entry.fattr); - nfs_free_fhandle(my_entry.fh); dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, res); @@ -1292,81 +1353,6 @@ out_err: #endif /* CONFIG_NFSV4 */ -static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) -{ - struct dentry *parent = desc->file->f_path.dentry; - struct inode *dir = parent->d_inode; - struct nfs_entry *entry = desc->entry; - struct dentry *dentry, *alias; - struct qstr name = { - .name = entry->name, - .len = entry->len, - }; - struct inode *inode; - unsigned long verf = nfs_save_change_attribute(dir); - - switch (name.len) { - case 2: - if (name.name[0] == '.' && name.name[1] == '.') - return dget_parent(parent); - break; - case 1: - if (name.name[0] == '.') - return dget(parent); - } - - spin_lock(&dir->i_lock); - if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) { - spin_unlock(&dir->i_lock); - return NULL; - } - spin_unlock(&dir->i_lock); - - name.hash = full_name_hash(name.name, name.len); - dentry = d_lookup(parent, &name); - if (dentry != NULL) { - /* Is this a positive dentry that matches the readdir info? */ - if (dentry->d_inode != NULL && - (NFS_FILEID(dentry->d_inode) == entry->ino || - d_mountpoint(dentry))) { - if (!desc->plus || entry->fh->size == 0) - return dentry; - if (nfs_compare_fh(NFS_FH(dentry->d_inode), - entry->fh) == 0) - goto out_renew; - } - /* No, so d_drop to allow one to be created */ - d_drop(dentry); - dput(dentry); - } - if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR)) - return NULL; - if (name.len > NFS_SERVER(dir)->namelen) - return NULL; - /* Note: caller is already holding the dir->i_mutex! */ - dentry = d_alloc(parent, &name); - if (dentry == NULL) - return NULL; - dentry->d_op = NFS_PROTO(dir)->dentry_ops; - inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); - if (IS_ERR(inode)) { - dput(dentry); - return NULL; - } - - alias = d_materialise_unique(dentry, inode); - if (alias != NULL) { - dput(dentry); - if (IS_ERR(alias)) - return NULL; - dentry = alias; - } - -out_renew: - nfs_set_verifier(dentry, verf); - return dentry; -} - /* * Code common to create, mkdir, and mknod. */ From baf57a09e9d87b14be5e2788828169394a2525ab Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 24 Sep 2010 18:49:43 -0400 Subject: [PATCH 46/67] NFS: Optimise the readdir searches If we're going through the loop in nfs_readdir() more than once, we usually do not want to restart searching from the beginning of the pages cache. We only want to do that if the previous search failed... Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f77243907a08..33b0ce7a97be 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -491,10 +491,6 @@ static inline int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) { int res = -EAGAIN; - desc->page_index = 0; - - if (*desc->dir_cookie == 0) - desc->cache_entry_index = 0; while (1) { res = find_cache_page(desc); @@ -589,6 +585,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, goto out_release; } + desc->page_index = 0; desc->page = page; status = nfs_do_filldir(desc, dirent, filldir); @@ -653,6 +650,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (res == -ETOOSMALL && desc->plus) { clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); nfs_zap_caches(inode); + desc->page_index = 0; desc->plus = 0; desc->eof = 0; continue; From d39ab9de3b80da5835049b1c3b49da4e84e01c07 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Fri, 24 Sep 2010 18:50:01 -0400 Subject: [PATCH 47/67] NFS: re-add readdir plus This patch adds readdir plus support to the cache array. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 33b0ce7a97be..fd30f185ec01 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -370,6 +370,71 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, __be32 * return 0; } +static +int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) +{ + struct nfs_inode *node; + if (dentry->d_inode == NULL) + goto different; + node = NFS_I(dentry->d_inode); + if (node->fh.size != entry->fh->size) + goto different; + if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0) + goto different; + return 1; +different: + return 0; +} + +static +void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) +{ + struct qstr filename; + struct dentry *dentry = NULL; + struct dentry *alias = NULL; + struct inode *dir = parent->d_inode; + struct inode *inode; + + nfs_readdir_make_qstr(&filename, entry->name, entry->len); + if (filename.len == 1 && filename.name[0] == '.') + dentry = dget(parent); + else if (filename.len == 2 && filename.name[0] == '.' + && filename.name[1] == '.') + dentry = dget_parent(parent); + else + dentry = d_lookup(parent, &filename); + + if (dentry != NULL) { + if (nfs_same_file(dentry, entry)) { + nfs_refresh_inode(dentry->d_inode, entry->fattr); + goto out; + } else { + d_drop(dentry); + dput(dentry); + } + } + + dentry = d_alloc(parent, &filename); + dentry->d_op = NFS_PROTO(dir)->dentry_ops; + inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); + if (IS_ERR(inode)) + goto out; + + alias = d_materialise_unique(dentry, inode); + if (IS_ERR(alias)) + goto out; + else if (alias) { + nfs_set_verifier(alias, nfs_save_change_attribute(dir)); + dput(alias); + } else + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + +out: + dput(dentry); + kfree(filename.name); + return; +} + /* Perform conversion from xdr to cache array */ static void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, @@ -379,6 +444,8 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e while (xdr_decode(desc, entry, &ptr) == 0) { if (nfs_readdir_add_to_array(entry, page) == -1) break; + if (desc->plus == 1) + nfs_prime_dcache(desc->file->f_path.dentry, entry); } kunmap(xdr_page); } From 0715dc632a271fc0fedf3ef4779fe28ac1e53ef4 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Fri, 24 Sep 2010 18:50:01 -0400 Subject: [PATCH 48/67] NFS: remove readdir plus limit We will now use readdir plus even on directories that are very large. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f2d2c801e0af..6eec28656415 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -234,9 +234,6 @@ nfs_init_locked(struct inode *inode, void *opaque) return 0; } -/* Don't use READDIRPLUS on directories that we believe are too large */ -#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE) - /* * This is our front-end to iget that looks up inodes by file handle * instead of inode number. @@ -291,8 +288,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; - if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) - && fattr->size <= NFS_LIMIT_READDIRPLUS) + if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ if ((fattr->valid & NFS_ATTR_FATTR_FSID) From ba8e452a4fe64a51b74d43761e14d99f0666cc45 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 19 Oct 2010 19:58:49 -0400 Subject: [PATCH 49/67] SUNRPC: Add a helper function xdr_inline_peek We sometimes need to be able to read ahead in an xdr_stream without incrementing the current pointer position. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 1 + net/sunrpc/xdr.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 8c1dcbb54d89..ab91d86565fd 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -201,6 +201,7 @@ extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); +extern __be32 *xdr_inline_peek(struct xdr_stream *xdr, size_t nbytes); extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index e0725d9d8107..cd9e841e7492 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -572,6 +572,27 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) } EXPORT_SYMBOL_GPL(xdr_init_decode); +/** + * xdr_inline_peek - Allow read-ahead in the XDR data stream + * @xdr: pointer to xdr_stream struct + * @nbytes: number of bytes of data to decode + * + * Check if the input buffer is long enough to enable us to decode + * 'nbytes' more bytes of data starting at the current position. + * If so return the current pointer without updating the current + * pointer position. + */ +__be32 * xdr_inline_peek(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p = xdr->p; + __be32 *q = p + XDR_QUADLEN(nbytes); + + if (unlikely(q > xdr->end || q < p)) + return NULL; + return p; +} +EXPORT_SYMBOL_GPL(xdr_inline_peek); + /** * xdr_inline_decode - Retrieve non-page XDR data to decode * @xdr: pointer to xdr_stream struct From babddc72a9468884ce1a23db3c3d54b0afa299f0 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 20 Oct 2010 15:44:29 -0400 Subject: [PATCH 50/67] NFS: decode_dirent should use an xdr_stream Convert nfs*xdr.c to use an xdr stream in decode_dirent. This will prevent a kernel oops that has been occuring when reading a vmapped page. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 29 +++++++++---- fs/nfs/internal.h | 6 +-- fs/nfs/nfs2xdr.c | 41 +++++++++++++++--- fs/nfs/nfs3xdr.c | 93 ++++++++++++++++++++++++++++++++++++++--- fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4xdr.c | 71 +++++++++++++++++++++++-------- include/linux/nfs_xdr.h | 2 +- 7 files changed, 203 insertions(+), 41 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index fd30f185ec01..88cbcda76856 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -171,7 +171,7 @@ struct nfs_cache_array { #define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry)) -typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int); +typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); typedef struct { struct file *file; struct page *page; @@ -357,13 +357,11 @@ error: /* Fill in an entry based on the xdr code stored in desc->page */ static -int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, __be32 **ptr) +int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream) { - __be32 *p = *ptr; - p = desc->decode(p, entry, desc->plus); + __be32 *p = desc->decode(stream, entry, desc->plus); if (IS_ERR(p)) return PTR_ERR(p); - *ptr = p; entry->fattr->time_start = desc->timestamp; entry->fattr->gencount = desc->gencount; @@ -438,10 +436,23 @@ out: /* Perform conversion from xdr to cache array */ static void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, - struct page *xdr_page, struct page *page) + struct page *xdr_page, struct page *page, unsigned int buflen) { + struct xdr_stream stream; + struct xdr_buf buf; __be32 *ptr = kmap(xdr_page); - while (xdr_decode(desc, entry, &ptr) == 0) { + + buf.head->iov_base = xdr_page; + buf.head->iov_len = buflen; + buf.tail->iov_len = 0; + buf.page_base = 0; + buf.page_len = 0; + buf.buflen = buf.head->iov_len; + buf.len = buf.head->iov_len; + + xdr_init_decode(&stream, &buf, ptr); + + while (xdr_decode(desc, entry, &stream) == 0) { if (nfs_readdir_add_to_array(entry, page) == -1) break; if (desc->plus == 1) @@ -458,6 +469,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct file *file = desc->file; struct nfs_cache_array *array; int status = 0; + unsigned int array_size = 1; entry.prev_cookie = 0; entry.cookie = *desc->dir_cookie; @@ -476,9 +488,10 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out_release_array; do { status = nfs_readdir_xdr_filler(xdr_page, desc, &entry, file, inode); + if (status < 0) break; - nfs_readdir_page_filler(desc, &entry, xdr_page, page); + nfs_readdir_page_filler(desc, &entry, xdr_page, page, array_size * PAGE_SIZE); } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY); put_page(xdr_page); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c961bc92c107..74b015598a43 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -181,15 +181,15 @@ extern void nfs_destroy_directcache(void); /* nfs2xdr.c */ extern int nfs_stat_to_errno(int); extern struct rpc_procinfo nfs_procedures[]; -extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int); +extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, int); /* nfs3xdr.c */ extern struct rpc_procinfo nfs3_procedures[]; -extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); +extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, int); /* nfs4xdr.c */ #ifdef CONFIG_NFS_V4 -extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); +extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *entry, int plus); #endif #ifdef CONFIG_NFS_V4_1 extern const u32 nfs41_maxread_overhead; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 79c74387a2fe..0210c752e743 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -500,25 +500,56 @@ err_unmap: goto out; } -__be32 * -nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) { - if (!*p++) { - if (!*p) + dprintk("nfs: %s: prematurely hit end of receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + +__be32 * +nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus) +{ + __be32 *p; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (!ntohl(*p++)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (!ntohl(*p++)) return ERR_PTR(-EAGAIN); entry->eof = 1; return ERR_PTR(-EBADCOOKIE); } + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + entry->ino = ntohl(*p++); entry->len = ntohl(*p++); + + p = xdr_inline_decode(xdr, entry->len + 4); + if (unlikely(!p)) + goto out_overflow; entry->name = (const char *) p; p += XDR_QUADLEN(entry->len); entry->prev_cookie = entry->cookie; entry->cookie = ntohl(*p++); - entry->eof = !p[0] && p[1]; + + p = xdr_inline_peek(xdr, 8); + if (p != NULL) + entry->eof = !p[0] && p[1]; + else + entry->eof = 0; return p; + +out_overflow: + print_overflow_msg(__func__, xdr); + return ERR_PTR(-EIO); } /* diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 52b2fda66e63..d562c8d9d56e 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -100,6 +100,13 @@ static const umode_t nfs_type2fmt[] = { [NF3FIFO] = S_IFIFO, }; +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("nfs: %s: prematurely hit end of receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + /* * Common NFS XDR functions as inlines */ @@ -119,6 +126,29 @@ xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh) return NULL; } +static inline __be32 * +xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + fh->size = ntohl(*p++); + + if (fh->size <= NFS3_FHSIZE) { + p = xdr_inline_decode(xdr, fh->size); + if (unlikely(!p)) + goto out_overflow; + memcpy(fh->data, p, fh->size); + return p + XDR_QUADLEN(fh->size); + } + return NULL; + +out_overflow: + print_overflow_msg(__func__, xdr); + return ERR_PTR(-EIO); +} + /* * Encode/decode time. */ @@ -240,6 +270,26 @@ xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr) return p; } +static inline __be32 * +xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (ntohl(*p++)) { + p = xdr_inline_decode(xdr, 84); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_fattr(p, fattr); + } + return p; +out_overflow: + print_overflow_msg(__func__, xdr); + return ERR_PTR(-EIO); +} + static inline __be32 * xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr) { @@ -616,19 +666,33 @@ err_unmap: } __be32 * -nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus) { + __be32 *p; struct nfs_entry old = *entry; - if (!*p++) { - if (!*p) + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (!ntohl(*p++)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (!ntohl(*p++)) return ERR_PTR(-EAGAIN); entry->eof = 1; return ERR_PTR(-EBADCOOKIE); } + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, &entry->ino); entry->len = ntohl(*p++); + + p = xdr_inline_decode(xdr, entry->len + 8); + if (unlikely(!p)) + goto out_overflow; entry->name = (const char *) p; p += XDR_QUADLEN(entry->len); entry->prev_cookie = entry->cookie; @@ -636,10 +700,17 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) if (plus) { entry->fattr->valid = 0; - p = xdr_decode_post_op_attr(p, entry->fattr); + p = xdr_decode_post_op_attr_stream(xdr, entry->fattr); + if (IS_ERR(p)) + goto out_overflow_exit; /* In fact, a post_op_fh3: */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; if (*p++) { - p = xdr_decode_fhandle(p, entry->fh); + p = xdr_decode_fhandle_stream(xdr, entry->fh); + if (IS_ERR(p)) + goto out_overflow_exit; /* Ugh -- server reply was truncated */ if (p == NULL) { dprintk("NFS: FH truncated\n"); @@ -650,8 +721,18 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); } - entry->eof = !p[0] && p[1]; + p = xdr_inline_peek(xdr, 8); + if (p != NULL) + entry->eof = !p[0] && p[1]; + else + entry->eof = 0; + return p; + +out_overflow: + print_overflow_msg(__func__, xdr); +out_overflow_exit: + return ERR_PTR(-EIO); } /* diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index d24a8e07b5e2..c58ea6377506 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -331,7 +331,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid); extern const nfs4_stateid zero_stateid; /* nfs4xdr.c */ -extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); +extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; struct nfs4_mount_data; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 6ea5c9392fe4..a4919e999354 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3950,13 +3950,13 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) __be32 *p; uint32_t namelen, type; - p = xdr_inline_decode(xdr, 32); + p = xdr_inline_decode(xdr, 32); /* read 32 bytes */ if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, &offset); + p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */ p = xdr_decode_hyper(p, &length); - type = be32_to_cpup(p++); - if (fl != NULL) { + type = be32_to_cpup(p++); /* 4 byte read */ + if (fl != NULL) { /* manipulate file lock */ fl->fl_start = (loff_t)offset; fl->fl_end = fl->fl_start + (loff_t)length - 1; if (length == ~(uint64_t)0) @@ -3966,9 +3966,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_type = F_RDLCK; fl->fl_pid = 0; } - p = xdr_decode_hyper(p, &clientid); - namelen = be32_to_cpup(p); - p = xdr_inline_decode(xdr, namelen); + p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */ + namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */ + p = xdr_inline_decode(xdr, namelen); /* variable size field */ if (likely(p)) return -NFS4ERR_DENIED; out_overflow: @@ -5755,21 +5755,33 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, } #endif /* CONFIG_NFS_V4_1 */ -__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus) { uint32_t bitmap[2] = {0}; uint32_t len; - - if (!*p++) { - if (!*p) + __be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (!ntohl(*p++)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (!ntohl(*p++)) return ERR_PTR(-EAGAIN); entry->eof = 1; return ERR_PTR(-EBADCOOKIE); } + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; entry->prev_cookie = entry->cookie; p = xdr_decode_hyper(p, &entry->cookie); entry->len = ntohl(*p++); + + p = xdr_inline_decode(xdr, entry->len + 4); + if (unlikely(!p)) + goto out_overflow; entry->name = (const char *) p; p += XDR_QUADLEN(entry->len); @@ -5782,29 +5794,54 @@ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) len = ntohl(*p++); /* bitmap length */ if (len-- > 0) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; bitmap[0] = ntohl(*p++); if (len-- > 0) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; bitmap[1] = ntohl(*p++); p += len; } } + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; len = XDR_QUADLEN(ntohl(*p++)); /* attribute buffer length */ if (len > 0) { if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) { bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; /* Ignore the return value of rdattr_error for now */ - p++; - len--; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; } - if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID) + if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; xdr_decode_hyper(p, &entry->ino); - else if (bitmap[0] == FATTR4_WORD0_FILEID) + } else if (bitmap[0] == FATTR4_WORD0_FILEID) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; xdr_decode_hyper(p, &entry->ino); - p += len; + } } - entry->eof = !p[0] && p[1]; + p = xdr_inline_peek(xdr, 8); + if (p != NULL) + entry->eof = !p[0] && p[1]; + else + entry->eof = 0; + return p; + +out_overflow: + print_overflow_msg(__func__, xdr); + return ERR_PTR(-EIO); } /* diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 5772b2c2f063..ca0e8fd7feec 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1036,7 +1036,7 @@ struct nfs_rpc_ops { int (*pathconf) (struct nfs_server *, struct nfs_fh *, struct nfs_pathconf *); int (*set_capabilities)(struct nfs_server *, struct nfs_fh *); - __be32 *(*decode_dirent)(__be32 *, struct nfs_entry *, int plus); + __be32 *(*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int plus); void (*read_setup) (struct nfs_read_data *, struct rpc_message *); int (*read_done) (struct rpc_task *, struct nfs_read_data *); void (*write_setup) (struct nfs_write_data *, struct rpc_message *); From afa8ccc978c24d8ab22e3b3b8cbd1054c84c070b Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 20 Oct 2010 15:44:31 -0400 Subject: [PATCH 51/67] NFS: remove page size checking code Remove the page size checking code for a readdir decode. This is now done by decode_dirent with xdr_streams. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs2xdr.c | 54 --------------------------------- fs/nfs/nfs3xdr.c | 78 +----------------------------------------------- fs/nfs/nfs4xdr.c | 66 ---------------------------------------- 3 files changed, 1 insertion(+), 197 deletions(-) diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 0210c752e743..82f026422424 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -423,9 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) struct page **page; size_t hdrlen; unsigned int pglen, recvd; - u32 len; int status, nr = 0; - __be32 *end, *entry, *kaddr; if ((status = ntohl(*p++))) return nfs_stat_to_errno(status); @@ -445,59 +443,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) if (pglen > recvd) pglen = recvd; page = rcvbuf->pages; - kaddr = p = kmap_atomic(*page, KM_USER0); - end = (__be32 *)((char *)p + pglen); - entry = p; - - /* Make sure the packet actually has a value_follows and EOF entry */ - if ((entry + 1) > end) - goto short_pkt; - - for (; *p++; nr++) { - if (p + 2 > end) - goto short_pkt; - p++; /* fileid */ - len = ntohl(*p++); - p += XDR_QUADLEN(len) + 1; /* name plus cookie */ - if (len > NFS2_MAXNAMLEN) { - dprintk("NFS: giant filename in readdir (len 0x%x)!\n", - len); - goto err_unmap; - } - if (p + 2 > end) - goto short_pkt; - entry = p; - } - - /* - * Apparently some server sends responses that are a valid size, but - * contain no entries, and have value_follows==0 and EOF==0. For - * those, just set the EOF marker. - */ - if (!nr && entry[1] == 0) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } - out: - kunmap_atomic(kaddr, KM_USER0); return nr; - short_pkt: - /* - * When we get a short packet there are 2 possibilities. We can - * return an error, or fix up the response to look like a valid - * response and return what we have so far. If there are no - * entries and the packet was short, then return -EIO. If there - * are valid entries in the response, return them and pretend that - * the call was successful, but incomplete. The caller can retry the - * readdir starting at the last cookie. - */ - entry[0] = entry[1] = 0; - if (!nr) - nr = -errno_NFSERR_IO; - goto out; -err_unmap: - nr = -errno_NFSERR_IO; - goto out; } static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index d562c8d9d56e..dc98eb7976c3 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -554,9 +554,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res struct kvec *iov = rcvbuf->head; struct page **page; size_t hdrlen; - u32 len, recvd, pglen; + u32 recvd, pglen; int status, nr = 0; - __be32 *entry, *end, *kaddr; status = ntohl(*p++); /* Decode post_op_attrs */ @@ -586,83 +585,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res if (pglen > recvd) pglen = recvd; page = rcvbuf->pages; - kaddr = p = kmap_atomic(*page, KM_USER0); - end = (__be32 *)((char *)p + pglen); - entry = p; - /* Make sure the packet actually has a value_follows and EOF entry */ - if ((entry + 1) > end) - goto short_pkt; - - for (; *p++; nr++) { - if (p + 3 > end) - goto short_pkt; - p += 2; /* inode # */ - len = ntohl(*p++); /* string length */ - p += XDR_QUADLEN(len) + 2; /* name + cookie */ - if (len > NFS3_MAXNAMLEN) { - dprintk("NFS: giant filename in readdir (len 0x%x)!\n", - len); - goto err_unmap; - } - - if (res->plus) { - /* post_op_attr */ - if (p + 2 > end) - goto short_pkt; - if (*p++) { - p += 21; - if (p + 1 > end) - goto short_pkt; - } - /* post_op_fh3 */ - if (*p++) { - if (p + 1 > end) - goto short_pkt; - len = ntohl(*p++); - if (len > NFS3_FHSIZE) { - dprintk("NFS: giant filehandle in " - "readdir (len 0x%x)!\n", len); - goto err_unmap; - } - p += XDR_QUADLEN(len); - } - } - - if (p + 2 > end) - goto short_pkt; - entry = p; - } - - /* - * Apparently some server sends responses that are a valid size, but - * contain no entries, and have value_follows==0 and EOF==0. For - * those, just set the EOF marker. - */ - if (!nr && entry[1] == 0) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } - out: - kunmap_atomic(kaddr, KM_USER0); return nr; - short_pkt: - /* - * When we get a short packet there are 2 possibilities. We can - * return an error, or fix up the response to look like a valid - * response and return what we have so far. If there are no - * entries and the packet was short, then return -EIO. If there - * are valid entries in the response, return them and pretend that - * the call was successful, but incomplete. The caller can retry the - * readdir starting at the last cookie. - */ - entry[0] = entry[1] = 0; - if (!nr) - nr = -errno_NFSERR_IO; - goto out; -err_unmap: - nr = -errno_NFSERR_IO; - goto out; } __be32 * diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index a4919e999354..8346e977d837 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4200,12 +4200,9 @@ out_overflow: static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) { struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct page *page = *rcvbuf->pages; struct kvec *iov = rcvbuf->head; size_t hdrlen; u32 recvd, pglen = rcvbuf->page_len; - __be32 *end, *entry, *p, *kaddr; - unsigned int nr = 0; int status; status = decode_op_hdr(xdr, OP_READDIR); @@ -4225,71 +4222,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n pglen = recvd; xdr_read_pages(xdr, pglen); - BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE); - kaddr = p = kmap_atomic(page, KM_USER0); - end = p + ((pglen + readdir->pgbase) >> 2); - entry = p; - /* Make sure the packet actually has a value_follows and EOF entry */ - if ((entry + 1) > end) - goto short_pkt; - - for (; *p++; nr++) { - u32 len, attrlen, xlen; - if (end - p < 3) - goto short_pkt; - dprintk("cookie = %Lu, ", *((unsigned long long *)p)); - p += 2; /* cookie */ - len = ntohl(*p++); /* filename length */ - if (len > NFS4_MAXNAMLEN) { - dprintk("NFS: giant filename in readdir (len 0x%x)\n", - len); - goto err_unmap; - } - xlen = XDR_QUADLEN(len); - if (end - p < xlen + 1) - goto short_pkt; - dprintk("filename = %*s\n", len, (char *)p); - p += xlen; - len = ntohl(*p++); /* bitmap length */ - if (end - p < len + 1) - goto short_pkt; - p += len; - attrlen = XDR_QUADLEN(ntohl(*p++)); - if (end - p < attrlen + 2) - goto short_pkt; - p += attrlen; /* attributes */ - entry = p; - } - /* - * Apparently some server sends responses that are a valid size, but - * contain no entries, and have value_follows==0 and EOF==0. For - * those, just set the EOF marker. - */ - if (!nr && entry[1] == 0) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } -out: - kunmap_atomic(kaddr, KM_USER0); return 0; -short_pkt: - /* - * When we get a short packet there are 2 possibilities. We can - * return an error, or fix up the response to look like a valid - * response and return what we have so far. If there are no - * entries and the packet was short, then return -EIO. If there - * are valid entries in the response, return them and pretend that - * the call was successful, but incomplete. The caller can retry the - * readdir starting at the last cookie. - */ - dprintk("%s: short packet at entry %d\n", __func__, nr); - entry[0] = entry[1] = 0; - if (nr) - goto out; -err_unmap: - kunmap_atomic(kaddr, KM_USER0); - return -errno_NFSERR_IO; } static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) From 56e4ebf877b6043c289bda32a5a7385b80c17dee Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 20 Oct 2010 15:44:37 -0400 Subject: [PATCH 52/67] NFS: readdir with vmapped pages We can use vmapped pages to read more information from the network at once. This will reduce the number of calls needed to complete a readdir. Signed-off-by: Bryan Schumaker [trondmy: Added #include for linux/vmalloc.h> in fs/nfs/dir.c] Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 4 +-- fs/nfs/dir.c | 66 +++++++++++++++++++++++++++++++++-------- fs/nfs/internal.h | 6 ++++ fs/nfs/nfs3proc.c | 4 +-- fs/nfs/nfs4proc.c | 8 ++--- fs/nfs/proc.c | 4 +-- include/linux/nfs_xdr.h | 2 +- 7 files changed, 71 insertions(+), 23 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5f01f42b3991..876ba8592706 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -903,8 +903,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); - if (server->dtsize > PAGE_CACHE_SIZE) - server->dtsize = PAGE_CACHE_SIZE; + if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES) + server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES; if (server->dtsize > server->rsize) server->dtsize = server->rsize; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 88cbcda76856..5d7da3ad4e85 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "delegation.h" #include "iostat.h" @@ -327,7 +328,7 @@ out: /* Fill a page with xdr information before transferring to the cache page */ static -int nfs_readdir_xdr_filler(struct page *xdr_page, nfs_readdir_descriptor_t *desc, +int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct file *file, struct inode *inode) { struct rpc_cred *cred = nfs_file_cred(file); @@ -337,7 +338,7 @@ int nfs_readdir_xdr_filler(struct page *xdr_page, nfs_readdir_descriptor_t *desc again: timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); - error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, xdr_page, + error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ @@ -436,11 +437,11 @@ out: /* Perform conversion from xdr to cache array */ static void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, - struct page *xdr_page, struct page *page, unsigned int buflen) + void *xdr_page, struct page *page, unsigned int buflen) { struct xdr_stream stream; struct xdr_buf buf; - __be32 *ptr = kmap(xdr_page); + __be32 *ptr = xdr_page; buf.head->iov_base = xdr_page; buf.head->iov_len = buflen; @@ -458,18 +459,59 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e if (desc->plus == 1) nfs_prime_dcache(desc->file->f_path.dentry, entry); } - kunmap(xdr_page); +} + +static +void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) +{ + unsigned int i; + for (i = 0; i < npages; i++) + put_page(pages[i]); +} + +static +void nfs_readdir_free_large_page(void *ptr, struct page **pages, + unsigned int npages) +{ + vm_unmap_ram(ptr, npages); + nfs_readdir_free_pagearray(pages, npages); +} + +/* + * nfs_readdir_large_page will allocate pages that must be freed with a call + * to nfs_readdir_free_large_page + */ +static +void *nfs_readdir_large_page(struct page **pages, unsigned int npages) +{ + void *ptr; + unsigned int i; + + for (i = 0; i < npages; i++) { + struct page *page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out_freepages; + pages[i] = page; + } + + ptr = vm_map_ram(pages, NFS_MAX_READDIR_PAGES, 0, PAGE_KERNEL); + if (!IS_ERR_OR_NULL(ptr)) + return ptr; +out_freepages: + nfs_readdir_free_pagearray(pages, i); + return NULL; } static int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) { - struct page *xdr_page; + struct page *pages[NFS_MAX_READDIR_PAGES]; + void *pages_ptr = NULL; struct nfs_entry entry; struct file *file = desc->file; struct nfs_cache_array *array; int status = 0; - unsigned int array_size = 1; + unsigned int array_size = ARRAY_SIZE(pages); entry.prev_cookie = 0; entry.cookie = *desc->dir_cookie; @@ -483,18 +525,18 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; - xdr_page = alloc_page(GFP_KERNEL); - if (!xdr_page) + pages_ptr = nfs_readdir_large_page(pages, array_size); + if (!pages_ptr) goto out_release_array; do { - status = nfs_readdir_xdr_filler(xdr_page, desc, &entry, file, inode); + status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); if (status < 0) break; - nfs_readdir_page_filler(desc, &entry, xdr_page, page, array_size * PAGE_SIZE); + nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE); } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY); - put_page(xdr_page); + nfs_readdir_free_large_page(pages_ptr, pages, array_size); out_release_array: nfs_readdir_release_array(page); out: diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 74b015598a43..7b0e894d00c8 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -62,6 +62,12 @@ struct nfs_clone_mount { */ #define NFS_UNSPEC_PORT (-1) +/* + * Maximum number of pages that readdir can use for creating + * a vmapped array of pages. + */ +#define NFS_MAX_READDIR_PAGES 8 + /* * In-kernel mount arguments */ diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index f8446b38dcb4..ce939c062a52 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -630,7 +630,7 @@ out: */ static int nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page *page, unsigned int count, int plus) + u64 cookie, struct page **pages, unsigned int count, int plus) { struct inode *dir = dentry->d_inode; __be32 *verf = NFS_COOKIEVERF(dir); @@ -640,7 +640,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .verf = {verf[0], verf[1]}, .plus = plus, .count = count, - .pages = &page + .pages = pages }; struct nfs3_readdirres res = { .verf = verf, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index aa771c1af115..cb33c73206e3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2823,12 +2823,12 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, } static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page *page, unsigned int count, int plus) + u64 cookie, struct page **pages, unsigned int count, int plus) { struct inode *dir = dentry->d_inode; struct nfs4_readdir_arg args = { .fh = NFS_FH(dir), - .pages = &page, + .pages = pages, .pgbase = 0, .count = count, .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, @@ -2859,14 +2859,14 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, } static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page *page, unsigned int count, int plus) + u64 cookie, struct page **pages, unsigned int count, int plus) { struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), _nfs4_proc_readdir(dentry, cred, cookie, - page, count, plus), + pages, count, plus), &exception); } while (exception.retry); return err; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index e5e84aa2af17..58e7f84fc1fd 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -534,14 +534,14 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name) */ static int nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page *page, unsigned int count, int plus) + u64 cookie, struct page **pages, unsigned int count, int plus) { struct inode *dir = dentry->d_inode; struct nfs_readdirargs arg = { .fh = NFS_FH(dir), .cookie = cookie, .count = count, - .pages = &page, + .pages = pages, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_READDIR], diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ca0e8fd7feec..1b9a17a1f235 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1026,7 +1026,7 @@ struct nfs_rpc_ops { int (*mkdir) (struct inode *, struct dentry *, struct iattr *); int (*rmdir) (struct inode *, struct qstr *); int (*readdir) (struct dentry *, struct rpc_cred *, - u64, struct page *, unsigned int, int); + u64, struct page **, unsigned int, int); int (*mknod) (struct inode *, struct dentry *, struct iattr *, dev_t); int (*statfs) (struct nfs_server *, struct nfs_fh *, From 3c8a1aeed8fd7f89bd0400fad72cbc1ac3460217 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 21 Oct 2010 16:33:16 -0400 Subject: [PATCH 53/67] NFS: nfs_readdir_filler catch all errors Check for all errors, not a specific one. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5d7da3ad4e85..7b8b7c59db9f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -556,7 +556,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) { struct inode *inode = desc->file->f_path.dentry->d_inode; - if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) + if (nfs_readdir_xdr_to_array(desc, page, inode) < 0) goto error; SetPageUptodate(page); From 9942438089d5c0e3adecdcb7bc360b8fe0ce7e62 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 21 Oct 2010 16:33:16 -0400 Subject: [PATCH 54/67] NFS: check xdr_decode for errors Check if the decoded entry has the eof bit set when returning from xdr_decode with an error. If it does, we should set the eof bits in the array before returning. This should keep us from looping when we expect more data but the server doesn't give us anything new. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7b8b7c59db9f..0cbb714a09d8 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -442,6 +442,8 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e struct xdr_stream stream; struct xdr_buf buf; __be32 *ptr = xdr_page; + int status; + struct nfs_cache_array *array; buf.head->iov_base = xdr_page; buf.head->iov_len = buflen; @@ -453,11 +455,23 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e xdr_init_decode(&stream, &buf, ptr); - while (xdr_decode(desc, entry, &stream) == 0) { + + do { + status = xdr_decode(desc, entry, &stream); + if (status != 0) + break; + if (nfs_readdir_add_to_array(entry, page) == -1) break; if (desc->plus == 1) nfs_prime_dcache(desc->file->f_path.dentry, entry); + } while (!entry->eof); + + if (status == -EBADCOOKIE && entry->eof) { + array = nfs_readdir_get_array(page); + array->eof_index = array->size - 1; + status = 0; + nfs_readdir_release_array(page); } } From ae42c70a60fe330d9c2af7c4b92ce78484308e37 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 21 Oct 2010 16:33:17 -0400 Subject: [PATCH 55/67] NFS: introduce generic decode_getattr function Getattr should be able to decode errors and the readdir file handle. decode_getfattr_attrs does the actual attribute decoding, while decode_getfattr_generic will check the opcode before decoding. This will let other functions call decode_getfattr_attrs to decode their attributes. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 117 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 98 insertions(+), 19 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 8346e977d837..b7eff205d3d8 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2848,6 +2848,58 @@ out_overflow: return -EIO; } +static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap) +{ + __be32 *p; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) +{ + __be32 *p; + int len; + + if (fh == NULL) { + bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE; + return 0; + } + + memset(fh, 0, sizeof(*fh)); + + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len > NFS4_FHSIZE) + return -EIO; + fh->size = len; + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + memcpy(fh->data, p, len); + bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) { __be32 *p; @@ -3744,29 +3796,14 @@ xdr_error: return status; } -static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, +static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_fattr *fattr, struct nfs_fh *fh, const struct nfs_server *server, int may_sleep) { - __be32 *savep; - uint32_t attrlen, - bitmap[2] = {0}, - type; int status; umode_t fmode = 0; uint64_t fileid; - - status = decode_op_hdr(xdr, OP_GETATTR); - if (status < 0) - goto xdr_error; - - status = decode_attr_bitmap(xdr, bitmap); - if (status < 0) - goto xdr_error; - - status = decode_attr_length(xdr, &attrlen, &savep); - if (status < 0) - goto xdr_error; - + uint32_t type; status = decode_attr_type(xdr, bitmap, &type); if (status < 0) @@ -3792,6 +3829,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, goto xdr_error; fattr->valid |= status; + status = decode_attr_error(xdr, bitmap); + if (status < 0) + goto xdr_error; + + status = decode_attr_filehandle(xdr, bitmap, fh); + if (status < 0) + goto xdr_error; + status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); if (status < 0) goto xdr_error; @@ -3857,17 +3902,51 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid); if (status < 0) goto xdr_error; - if (status != 0 && !(fattr->valid & status)) { + if (status != 0) { fattr->fileid = fileid; fattr->valid |= status; } +xdr_error: + dprintk("%s: xdr returned %d\n", __func__, -status); + return status; +} + +static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct nfs_fh *fh, const struct nfs_server *server, int may_sleep) +{ + __be32 *savep; + uint32_t attrlen, + bitmap[2] = {0}; + int status; + + status = decode_op_hdr(xdr, OP_GETATTR); + if (status < 0) + goto xdr_error; + + status = decode_attr_bitmap(xdr, bitmap); + if (status < 0) + goto xdr_error; + + status = decode_attr_length(xdr, &attrlen, &savep); + if (status < 0) + goto xdr_error; + + status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep); + if (status < 0) + goto xdr_error; + status = verify_attr_len(xdr, savep, attrlen); xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); return status; } +static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + const struct nfs_server *server, int may_sleep) +{ + return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep); +} static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) { From 82f2e5472e2304e531c2fa85e457f4a71070044e Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 21 Oct 2010 16:33:18 -0400 Subject: [PATCH 56/67] NFS: Readdir plus in v4 By requsting more attributes during a readdir, we can mimic the readdir plus operation that was in NFSv3. To test, I ran the command `ls -lU --color=none` on directories with various numbers of files. Without readdir plus, I see this: n files | 100 | 1,000 | 10,000 | 100,000 | 1,000,000 --------+-----------+-----------+-----------+-----------+---------- real | 0m00.153s | 0m00.589s | 0m05.601s | 0m56.691s | 9m59.128s user | 0m00.007s | 0m00.007s | 0m00.077s | 0m00.703s | 0m06.800s sys | 0m00.010s | 0m00.070s | 0m00.633s | 0m06.423s | 1m10.005s access | 3 | 1 | 1 | 4 | 31 getattr | 2 | 1 | 1 | 1 | 1 lookup | 104 | 1,003 | 10,003 | 100,003 | 1,000,003 readdir | 2 | 16 | 158 | 1,575 | 15,749 total | 111 | 1,021 | 10,163 | 101,583 | 1,015,784 With readdir plus enabled, I see this: n files | 100 | 1,000 | 10,000 | 100,000 | 1,000,000 --------+-----------+-----------+-----------+-----------+---------- real | 0m00.115s | 0m00.206s | 0m01.079s | 0m12.521s | 2m07.528s user | 0m00.003s | 0m00.003s | 0m00.040s | 0m00.290s | 0m03.296s sys | 0m00.007s | 0m00.020s | 0m00.120s | 0m01.357s | 0m17.556s access | 3 | 1 | 1 | 1 | 7 getattr | 2 | 1 | 1 | 1 | 1 lookup | 4 | 3 | 3 | 3 | 3 readdir | 6 | 62 | 630 | 6,300 | 62,993 total | 15 | 67 | 635 | 6,305 | 63,004 Readdir plus disabled has about a 16x increase in the number of rpc calls and is 4 - 5 times slower on large directories. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 5 ++-- fs/nfs/dir.c | 4 +-- fs/nfs/internal.h | 6 ++--- fs/nfs/nfs2xdr.c | 2 +- fs/nfs/nfs3xdr.c | 2 +- fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4proc.c | 1 + fs/nfs/nfs4xdr.c | 55 ++++++++++++++++++++--------------------- include/linux/nfs_xdr.h | 3 ++- 9 files changed, 41 insertions(+), 39 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 876ba8592706..da2f2f024a4d 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1358,8 +1358,9 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; - server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR| - NFS_CAP_POSIX_LOCK; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK; + if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; server->options = data->options; /* Get a client record */ diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 0cbb714a09d8..2a768d05a534 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -172,7 +172,7 @@ struct nfs_cache_array { #define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry)) -typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); +typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); typedef struct { struct file *file; struct page *page; @@ -360,7 +360,7 @@ error: static int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream) { - __be32 *p = desc->decode(stream, entry, desc->plus); + __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus); if (IS_ERR(p)) return PTR_ERR(p); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7b0e894d00c8..db08ff3ff454 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -187,15 +187,15 @@ extern void nfs_destroy_directcache(void); /* nfs2xdr.c */ extern int nfs_stat_to_errno(int); extern struct rpc_procinfo nfs_procedures[]; -extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, int); +extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); /* nfs3xdr.c */ extern struct rpc_procinfo nfs3_procedures[]; -extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, int); +extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); /* nfs4xdr.c */ #ifdef CONFIG_NFS_V4 -extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *entry, int plus); +extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); #endif #ifdef CONFIG_NFS_V4_1 extern const u32 nfs41_maxread_overhead; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 82f026422424..e6bf45710cc7 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -454,7 +454,7 @@ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) } __be32 * -nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus) +nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) { __be32 *p; p = xdr_inline_decode(xdr, 4); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index dc98eb7976c3..31a44df40aea 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -590,7 +590,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res } __be32 * -nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus) +nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) { __be32 *p; struct nfs_entry old = *entry; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c58ea6377506..9fa496387fdf 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -331,7 +331,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid); extern const nfs4_stateid zero_stateid; /* nfs4xdr.c */ -extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *entry, int plus); +extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); extern struct rpc_procinfo nfs4_procedures[]; struct nfs4_mount_data; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cb33c73206e3..f5ab216e8870 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2832,6 +2832,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .pgbase = 0, .count = count, .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, + .plus = plus, }; struct nfs4_readdir_res res; struct rpc_message msg = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index b7eff205d3d8..ccfb1c92b262 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1385,12 +1385,20 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) { - uint32_t attrs[2] = { - FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, - FATTR4_WORD1_MOUNTED_ON_FILEID, - }; + uint32_t attrs[2] = {0, 0}; __be32 *p; + if (readdir->plus) { + attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| + FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE; + attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER| + FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| + FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| + FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + } + attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID; + attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); *p++ = cpu_to_be32(OP_READDIR); p = xdr_encode_hyper(p, readdir->cookie); @@ -1398,11 +1406,15 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ *p++ = cpu_to_be32(readdir->count); *p++ = cpu_to_be32(2); - /* Switch to mounted_on_fileid if the server supports it */ - if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) - attrs[0] &= ~FATTR4_WORD0_FILEID; - else - attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; + + if (!readdir->plus) { + /* Switch to mounted_on_fileid if the server supports it */ + if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) + attrs[0] &= ~FATTR4_WORD0_FILEID; + else + attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; + } + *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); hdr->nops++; @@ -5768,7 +5780,8 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, } #endif /* CONFIG_NFS_V4_1 */ -__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus) +__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + struct nfs_server *server, int plus) { uint32_t bitmap[2] = {0}; uint32_t len; @@ -5824,24 +5837,10 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int goto out_overflow; len = XDR_QUADLEN(ntohl(*p++)); /* attribute buffer length */ if (len > 0) { - if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) { - bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; - /* Ignore the return value of rdattr_error for now */ - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - } - if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID) { - p = xdr_inline_decode(xdr, 8); - if (unlikely(!p)) - goto out_overflow; - xdr_decode_hyper(p, &entry->ino); - } else if (bitmap[0] == FATTR4_WORD0_FILEID) { - p = xdr_inline_decode(xdr, 8); - if (unlikely(!p)) - goto out_overflow; - xdr_decode_hyper(p, &entry->ino); - } + if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0) + goto out_overflow; + if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) + entry->ino = entry->fattr->fileid; } p = xdr_inline_peek(xdr, 8); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 1b9a17a1f235..efe2eab8ac94 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -778,6 +778,7 @@ struct nfs4_readdir_arg { struct page ** pages; /* zero-copy data */ unsigned int pgbase; /* zero-copy data */ const u32 * bitmask; + int plus; struct nfs4_sequence_args seq_args; }; @@ -1036,7 +1037,7 @@ struct nfs_rpc_ops { int (*pathconf) (struct nfs_server *, struct nfs_fh *, struct nfs_pathconf *); int (*set_capabilities)(struct nfs_server *, struct nfs_fh *); - __be32 *(*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int plus); + __be32 *(*decode_dirent)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int plus); void (*read_setup) (struct nfs_read_data *, struct rpc_message *); int (*read_done) (struct rpc_task *, struct nfs_read_data *); void (*write_setup) (struct nfs_write_data *, struct rpc_message *); From 4a201d6e3f4253f918555cc7c27c418f8ac1bb65 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 23 Oct 2010 14:53:23 -0400 Subject: [PATCH 57/67] NFS: Ensure we check all allocation return values in new readdir code Also some clean ups. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 61 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2a768d05a534..257e4052492e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -225,33 +225,42 @@ int nfs_readdir_clear_array(struct page *page, gfp_t mask) * nfs_clear_readdir_array() */ static -void nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) +int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) { string->len = len; string->name = kmemdup(name, len, GFP_KERNEL); - string->hash = full_name_hash(string->name, string->len); + if (string->name == NULL) + return -ENOMEM; + string->hash = full_name_hash(name, len); + return 0; } static int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) { struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array_entry *cache_entry; + int ret; + if (IS_ERR(array)) return PTR_ERR(array); - if (array->size >= MAX_READDIR_ARRAY) { - nfs_readdir_release_array(page); - return -EIO; - } + ret = -EIO; + if (array->size >= MAX_READDIR_ARRAY) + goto out; - array->array[array->size].cookie = entry->prev_cookie; + cache_entry = &array->array[array->size]; + cache_entry->cookie = entry->prev_cookie; + cache_entry->ino = entry->ino; + ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); + if (ret) + goto out; array->last_cookie = entry->cookie; - array->array[array->size].ino = entry->ino; - nfs_readdir_make_qstr(&array->array[array->size].string, entry->name, entry->len); if (entry->eof == 1) array->eof_index = array->size; array->size++; +out: nfs_readdir_release_array(page); - return 0; + return ret; } static @@ -388,21 +397,24 @@ different: static void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) { - struct qstr filename; - struct dentry *dentry = NULL; - struct dentry *alias = NULL; + struct qstr filename = { + .len = entry->len, + .name = entry->name, + }; + struct dentry *dentry; + struct dentry *alias; struct inode *dir = parent->d_inode; struct inode *inode; - nfs_readdir_make_qstr(&filename, entry->name, entry->len); - if (filename.len == 1 && filename.name[0] == '.') - dentry = dget(parent); - else if (filename.len == 2 && filename.name[0] == '.' - && filename.name[1] == '.') - dentry = dget_parent(parent); - else - dentry = d_lookup(parent, &filename); + if (filename.name[0] == '.') { + if (filename.len == 1) + return; + if (filename.len == 2 && filename.name[1] == '.') + return; + } + filename.hash = full_name_hash(filename.name, filename.len); + dentry = d_lookup(parent, &filename); if (dentry != NULL) { if (nfs_same_file(dentry, entry)) { nfs_refresh_inode(dentry->d_inode, entry->fattr); @@ -414,6 +426,9 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) } dentry = d_alloc(parent, &filename); + if (dentry == NULL) + return; + dentry->d_op = NFS_PROTO(dir)->dentry_ops; inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); if (IS_ERR(inode)) @@ -430,8 +445,6 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) out: dput(dentry); - kfree(filename.name); - return; } /* Perform conversion from xdr to cache array */ @@ -508,7 +521,7 @@ void *nfs_readdir_large_page(struct page **pages, unsigned int npages) pages[i] = page; } - ptr = vm_map_ram(pages, NFS_MAX_READDIR_PAGES, 0, PAGE_KERNEL); + ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL); if (!IS_ERR_OR_NULL(ptr)) return ptr; out_freepages: From 7ad07353003d6ff69fe0b987813bb77b4d5ac23d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 23 Oct 2010 15:34:20 -0400 Subject: [PATCH 58/67] NFSv4: Fix up decode_attr_filehandle() to handle the case of empty fh pointer decode_attr_filehandle still needs to skip the XDR-encoded filehandle if someone passes a null pointer argument. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index ccfb1c92b262..a6b00e84bd1a 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2883,12 +2883,8 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru __be32 *p; int len; - if (fh == NULL) { - bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE; - return 0; - } - - memset(fh, 0, sizeof(*fh)); + if (fh != NULL) + memset(fh, 0, sizeof(*fh)); if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U))) return -EIO; @@ -2899,11 +2895,13 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru len = be32_to_cpup(p); if (len > NFS4_FHSIZE) return -EIO; - fh->size = len; p = xdr_inline_decode(xdr, len); if (unlikely(!p)) goto out_overflow; - memcpy(fh->data, p, len); + if (fh != NULL) { + memcpy(fh->data, p, len); + fh->size = len; + } bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE; } return 0; From 3201f3dd7370f2d29dfb689ae16f8f5d4066cc33 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 23 Oct 2010 15:43:10 -0400 Subject: [PATCH 59/67] NFSv4: Fix a regression in decode_getfattr We don't want to have the mounted_on_fileid overwrite the true fileid. We only return the former if the server didn't supply the true fileid. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index a6b00e84bd1a..707975eebb5d 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3912,7 +3912,7 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid); if (status < 0) goto xdr_error; - if (status != 0) { + if (status != 0 && !(fattr->valid & status)) { fattr->fileid = fileid; fattr->valid |= status; } From 4f082222fad3c8471abe0c8e8f18c72f335a34c7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Oct 2010 13:14:02 -0400 Subject: [PATCH 60/67] NFSv4: nfs4_decode_dirent must clear entry->fattr->valid Otherwise, we may end up reading uninitialised data from the resulting struct nfs_fattr. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 707975eebb5d..9bf5e66d11db 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5815,6 +5815,7 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, * since glibc seems to choke on it...) */ entry->ino = 1; + entry->fattr->valid = 0; len = ntohl(*p++); /* bitmap length */ if (len-- > 0) { From 9af8c222ca5eae88f000664f693316480bf58fbc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Oct 2010 11:52:55 -0400 Subject: [PATCH 61/67] NFSv4: Clean up nfs4_decode_dirent Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 9bf5e66d11db..4c43e4874c52 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5803,11 +5803,10 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, p = xdr_decode_hyper(p, &entry->cookie); entry->len = ntohl(*p++); - p = xdr_inline_decode(xdr, entry->len + 4); + p = xdr_inline_decode(xdr, entry->len); if (unlikely(!p)) goto out_overflow; entry->name = (const char *) p; - p += XDR_QUADLEN(entry->len); /* * In case the server doesn't return an inode number, @@ -5817,30 +5816,19 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->ino = 1; entry->fattr->valid = 0; - len = ntohl(*p++); /* bitmap length */ - if (len-- > 0) { - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - bitmap[0] = ntohl(*p++); - if (len-- > 0) { - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - bitmap[1] = ntohl(*p++); - p += len; - } - } - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) + if (decode_attr_bitmap(xdr, bitmap) < 0) + goto out_overflow; + + if (decode_attr_length(xdr, &len, &p) < 0) + goto out_overflow; + + if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0) + goto out_overflow; + if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) + entry->ino = entry->fattr->fileid; + + if (verify_attr_len(xdr, p, len) < 0) goto out_overflow; - len = XDR_QUADLEN(ntohl(*p++)); /* attribute buffer length */ - if (len > 0) { - if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0) - goto out_overflow; - if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) - entry->ino = entry->fattr->fileid; - } p = xdr_inline_peek(xdr, 8); if (p != NULL) From 6f7a35bd23bdbbb40c07ee1120ef047190e77d9b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Oct 2010 12:11:42 -0400 Subject: [PATCH 62/67] NFSv4: Fix up the 'dircount' hint in encode_readdir Also ensure we only ask for either fileid or mounted_on_fileid in the readdirplus case too... Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4c43e4874c52..1e5d68e36e07 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1386,6 +1386,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) { uint32_t attrs[2] = {0, 0}; + uint32_t dircount = readdir->count >> 1; __be32 *p; if (readdir->plus) { @@ -1395,26 +1396,24 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + dircount >>= 1; } attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID; attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + /* Switch to mounted_on_fileid if the server supports it */ + if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) + attrs[0] &= ~FATTR4_WORD0_FILEID; + else + attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); *p++ = cpu_to_be32(OP_READDIR); p = xdr_encode_hyper(p, readdir->cookie); p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); - *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ + *p++ = cpu_to_be32(dircount); *p++ = cpu_to_be32(readdir->count); *p++ = cpu_to_be32(2); - if (!readdir->plus) { - /* Switch to mounted_on_fileid if the server supports it */ - if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) - attrs[0] &= ~FATTR4_WORD0_FILEID; - else - attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - } - *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); hdr->nops++; From 118df3d17f11733b294ea2cd988d56ee376ef9fd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Oct 2010 17:17:31 -0400 Subject: [PATCH 63/67] SUNRPC: After calling xprt_release(), we must restart from call_reserve Rob Leslie reports seeing the following Oops after his Kerberos session expired. BUG: unable to handle kernel NULL pointer dereference at 00000058 IP: [] rpcauth_refreshcred+0x11/0x12c [sunrpc] *pde = 00000000 Oops: 0000 [#1] last sysfs file: /sys/devices/platform/pc87360.26144/temp3_input Modules linked in: autofs4 authenc esp4 xfrm4_mode_transport ipt_LOG ipt_REJECT xt_limit xt_state ipt_REDIRECT xt_owner xt_HL xt_hl xt_tcpudp xt_mark cls_u32 cls_tcindex sch_sfq sch_htb sch_dsmark geodewdt deflate ctr twofish_generic twofish_i586 twofish_common camellia serpent blowfish cast5 cbc xcbc rmd160 sha512_generic sha1_generic hmac crypto_null af_key rpcsec_gss_krb5 nfsd exportfs nfs lockd fscache nfs_acl auth_rpcgss sunrpc ip_gre sit tunnel4 dummy ext3 jbd nf_nat_irc nf_conntrack_irc nf_nat_ftp nf_conntrack_ftp iptable_mangle iptable_nat nf_nat nf_conntrack_ipv4 nf_conntrack nf_defrag_ipv4 iptable_filter ip_tables x_tables pc8736x_gpio nsc_gpio pc87360 hwmon_vid loop aes_i586 aes_generic sha256_generic dm_crypt cs5535_gpio serio_raw cs5535_mfgpt hifn_795x des_generic geode_rng rng_core led_class ext4 mbcache jbd2 crc16 dm_mirror dm_region_hash dm_log dm_snapshot dm_mod sd_mod crc_t10dif ide_pci_generic cs5536 amd74xx ide_core pata_cs5536 ata_generic libata usb_storage via_rhine mii scsi_mod btrfs zlib_deflate crc32c libcrc32c [last unloaded: scsi_wait_scan] Pid: 12875, comm: sudo Not tainted 2.6.36-net5501 #1 / EIP: 0060:[] EFLAGS: 00010292 CPU: 0 EIP is at rpcauth_refreshcred+0x11/0x12c [sunrpc] EAX: 00000000 EBX: defb13a0 ECX: 00000006 EDX: e18683b8 ESI: defb13a0 EDI: 00000000 EBP: 00000000 ESP: de571d58 DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 Process sudo (pid: 12875, ti=de570000 task=decd1430 task.ti=de570000) Stack: e186e008 00000000 defb13a0 0000000d deda6000 e1868f22 e196f12b defb13a0 <0> defb13d8 00000000 00000000 e186e0aa 00000000 defb13a0 de571dac 00000000 <0> e186956c de571e34 debea5c0 de571dc8 e186967a 00000000 debea5c0 de571e34 Call Trace: [] ? rpc_wake_up_next+0x114/0x11b [sunrpc] [] ? call_decode+0x24a/0x5af [sunrpc] [] ? nfs4_xdr_dec_access+0x0/0xa2 [nfs] [] ? __rpc_execute+0x62/0x17b [sunrpc] [] ? rpc_run_task+0x91/0x97 [sunrpc] [] ? rpc_call_sync+0x40/0x5b [sunrpc] [] ? nfs4_proc_access+0x10a/0x176 [nfs] [] ? nfs_do_access+0x2b1/0x2c0 [nfs] [] ? rpcauth_lookupcred+0x62/0x84 [sunrpc] [] ? nfs_permission+0xad/0x13b [nfs] [] ? exec_permission+0x15/0x4b [] ? link_path_walk+0x4f/0x456 [] ? path_walk+0x4c/0xa8 [] ? do_path_lookup+0x1f/0x68 [] ? user_path_at+0x37/0x5f [] ? handle_mm_fault+0x229/0x55b [] ? sys_faccessat+0x93/0x146 [] ? sys_access+0xf/0x13 [] ? syscall_call+0x7/0xb Code: 0f 94 c2 84 d2 74 09 8b 44 24 0c e8 6a e9 8b de 83 c4 14 89 d8 5b 5e 5f 5d c3 55 57 56 53 83 ec 1c fc 89 c6 8b 40 10 89 44 24 04 <8b> 58 58 85 db 0f 85 d4 00 00 00 0f b7 46 70 8b 56 20 89 c5 83 EIP: [] rpcauth_refreshcred+0x11/0x12c [sunrpc] SS:ESP 0068:de571d58 CR2: 0000000000000058 This appears to be caused by the function rpc_verify_header() first calling xprt_release(), then doing a call_refresh. If we release the transport slot, we should _always_ jump back to call_reserve before calling anything else. Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- net/sunrpc/clnt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2388d83b68ff..3d28ae8a888a 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1677,7 +1677,7 @@ rpc_verify_header(struct rpc_task *task) rpcauth_invalcred(task); /* Ensure we obtain a new XID! */ xprt_release(task); - task->tk_action = call_refresh; + task->tk_action = call_reserve; goto out_retry; case RPC_AUTH_BADCRED: case RPC_AUTH_BADVERF: From 6b96724e507fecc3e6440e86426fe4f44359ed66 Mon Sep 17 00:00:00 2001 From: Ricardo Labiaga Date: Tue, 12 Oct 2010 16:30:05 -0700 Subject: [PATCH 64/67] Revalidate caches on lock Instead of blindly zapping the caches, attempt to revalidate them if the server has indicated that it uses high resolution timestamps. NFSv4 should be able to always revalidate the cache since the protocol requires the update of the change attribute on modification of the data. In reality, there are servers (the Linux NFS server for example) that do not obey this requirement and use ctime as the basis for change attribute. Long term, the server needs to be fixed. At this time, and to be on the safe side, continue zapping caches if the server indicates that it does not have a high resolution timestamp. Signed-off-by: Ricardo Labiaga Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 ++ fs/nfs/file.c | 19 ++++++++++++++++--- fs/nfs/nfs3xdr.c | 3 ++- include/linux/nfs_fs_sb.h | 1 + include/linux/nfs_xdr.h | 1 + 5 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index da2f2f024a4d..a63bce8d0596 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -915,6 +915,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->maxfilesize = fsinfo->maxfilesize; + server->time_delta = fsinfo->time_delta; + /* We're airborne Set socket buffersize */ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 39672b731736..c3f2477c16c1 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -757,6 +757,11 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) return status; } +static int +is_time_granular(struct timespec *ts) { + return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); +} + static int do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { @@ -781,13 +786,21 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) status = do_vfs_lock(filp, fl); if (status < 0) goto out; + /* - * Make sure we clear the cache whenever we try to get the lock. + * Revalidate the cache if the server has time stamps granular + * enough to detect subsecond changes. Otherwise, clear the + * cache to prevent missing any changes. + * * This makes locking act as a cache coherency point. */ nfs_sync_mapping(filp->f_mapping); - if (!nfs_have_delegation(inode, FMODE_READ)) - nfs_zap_caches(inode); + if (!nfs_have_delegation(inode, FMODE_READ)) { + if (is_time_granular(&NFS_SERVER(inode)->time_delta)) + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + else + nfs_zap_caches(inode); + } out: return status; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 31a44df40aea..d9a5e832c257 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1044,8 +1044,9 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) res->wtmult = ntohl(*p++); res->dtpref = ntohl(*p++); p = xdr_decode_hyper(p, &res->maxfilesize); + p = xdr_decode_time3(p, &res->time_delta); - /* ignore time_delta and properties */ + /* ignore properties */ res->lease_time = 0; return 0; } diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index c82ee7cd6288..5eef862ec187 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -124,6 +124,7 @@ struct nfs_server { struct nfs_fsid fsid; __u64 maxfilesize; /* maximum file size */ + struct timespec time_delta; /* smallest time granularity */ unsigned long mount_time; /* when this fs was mounted */ dev_t s_dev; /* superblock dev numbers */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index efe2eab8ac94..da7a1300dc60 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -112,6 +112,7 @@ struct nfs_fsinfo { __u32 wtmult; /* writes should be multiple of this */ __u32 dtpref; /* pref. readdir transfer size */ __u64 maxfilesize; + struct timespec time_delta; /* server time granularity */ __u32 lease_time; /* in seconds */ }; From 55b6e7742d5b25182edf410369379b9727b2e5bc Mon Sep 17 00:00:00 2001 From: Ricardo Labiaga Date: Tue, 12 Oct 2010 16:30:06 -0700 Subject: [PATCH 65/67] Ask for time_delta during fsinfo probe Used by the client to determine if the server has a granular enough time stamp. Signed-off-by: Ricardo Labiaga Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- fs/nfs/nfs4xdr.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f5ab216e8870..e87fe612ca18 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -129,7 +129,7 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_LEASE_TIME, - 0 + FATTR4_WORD1_TIME_DELTA }; const u32 nfs4_fs_locations_bitmap[2] = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1e5d68e36e07..7131c761d85c 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3582,6 +3582,24 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s return status; } +static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, + struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) { + status = decode_attr_time(xdr, time); + bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA; + } + dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec, + (long)time->tv_nsec); + return status; +} + static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) { int status = 0; @@ -3982,6 +4000,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) goto xdr_error; fsinfo->wtpref = fsinfo->wtmax; + status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); + if (status != 0) + goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: From 3388bff5cfe91589a912cdc7f00d3aae3aa18adc Mon Sep 17 00:00:00 2001 From: Roman Borisov Date: Wed, 13 Oct 2010 16:54:51 +0400 Subject: [PATCH 66/67] nfs: fix unchecked value Return value of "decode_attr_bitmap()" was not checked; Signed-off-by: Roman Borisov Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7131c761d85c..bd2101d918c8 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2687,7 +2687,10 @@ out_overflow: static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) { if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) { - decode_attr_bitmap(xdr, bitmask); + int ret; + ret = decode_attr_bitmap(xdr, bitmask); + if (unlikely(ret < 0)) + return ret; bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; } else bitmask[0] = bitmask[1] = 0; From 9a84d38031c258a17bb39beed1e500eadee67407 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Oct 2010 18:00:46 -0400 Subject: [PATCH 67/67] SUNRPC: Cleanup duplicate assignment in rpcauth_refreshcred Signed-off-by: Trond Myklebust --- net/sunrpc/auth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 36cb66022a27..b457db034cc9 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -595,7 +595,7 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, int rpcauth_refreshcred(struct rpc_task *task) { - struct rpc_cred *cred = task->tk_rqstp->rq_cred; + struct rpc_cred *cred; int err; cred = task->tk_rqstp->rq_cred;