From 0601f793921157603831d00a9541d92e8f5763f6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 18 May 2009 17:47:56 -0400 Subject: [PATCH 01/22] SUNRPC: requeue tcp socket less frequently Don't requeue the socket in some cases where we know it's unnecessary. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index b7d435c3f19e..7a3e4bfd895d 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -965,7 +965,6 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) goto err_again; /* record not complete */ } len = svsk->sk_reclen; - set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); return len; error: @@ -1115,6 +1114,10 @@ out: /* Reset TCP read info */ svsk->sk_reclen = 0; svsk->sk_tcplen = 0; + /* If we have more data, signal svc_xprt_enqueue() to try again */ + if (svc_recv_available(svsk) > sizeof(rpc_fraghdr)) + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt); if (serv->sv_stats) From 5ee78d483c5812228e971e145b912e0a7e35e571 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 18 May 2009 17:47:56 -0400 Subject: [PATCH 02/22] SUNRPC: svc_tcp_recvfrom cleanup Minor cleanup in preparation for later patches. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 7a3e4bfd895d..733c2f6a1858 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -893,6 +893,7 @@ failed: static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) { struct svc_serv *serv = svsk->sk_xprt.xpt_server; + unsigned int want; int len; if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) @@ -915,9 +916,9 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) { - int want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; struct kvec iov; + want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; iov.iov_len = want; if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0) @@ -1040,8 +1041,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) struct svc_serv *serv = svsk->sk_xprt.xpt_server; int len; struct kvec *vec; - int pnum, vlen; struct rpc_rqst *req = NULL; + unsigned int vlen; + int pnum; dprintk("svc: tcp_recv %p data %d conn %d close %d\n", svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), @@ -1072,7 +1074,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) } pnum = 1; - while (vlen < len) { + while (vlen < svsk->sk_reclen - 8) { vec[pnum].iov_base = (req) ? page_address(req->rq_private_buf.pages[pnum - 1]) : page_address(rqstp->rq_pages[pnum]); @@ -1083,29 +1085,23 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_respages = &rqstp->rq_pages[pnum]; /* Now receive data */ - len = svc_recvfrom(rqstp, vec, pnum, len); + len = svc_recvfrom(rqstp, vec, pnum, svsk->sk_reclen - 8); if (len < 0) goto err_again; - /* - * Account for the 8 bytes we read earlier - */ - len += 8; - if (req) { - xprt_complete_rqst(req->rq_task, len); - len = 0; + xprt_complete_rqst(req->rq_task, svsk->sk_reclen); + rqstp->rq_arg.len = 0; goto out; } - dprintk("svc: TCP complete record (%d bytes)\n", len); - rqstp->rq_arg.len = len; + dprintk("svc: TCP complete record (%d bytes)\n", svsk->sk_reclen); + rqstp->rq_arg.len = svsk->sk_reclen; rqstp->rq_arg.page_base = 0; - if (len <= rqstp->rq_arg.head[0].iov_len) { - rqstp->rq_arg.head[0].iov_len = len; + if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len; rqstp->rq_arg.page_len = 0; - } else { - rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; - } + } else + rqstp->rq_arg.page_len = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; @@ -1123,7 +1119,7 @@ out: if (serv->sv_stats) serv->sv_stats->nettcpcnt++; - return len; + return rqstp->rq_arg.len; err_again: if (len == -EAGAIN) { From 48e6555c7b3bf0d92f8167d8b8b8ecf4a3fdab84 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 14 Feb 2011 14:52:03 -0500 Subject: [PATCH 03/22] svcrpc: note network-order types in svc_process_calldir Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 733c2f6a1858..1955e1a1e390 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -982,9 +982,9 @@ static int svc_process_calldir(struct svc_sock *svsk, struct svc_rqst *rqstp, struct rpc_rqst **reqpp, struct kvec *vec) { struct rpc_rqst *req = NULL; - u32 *p; - u32 xid; - u32 calldir; + __be32 *p; + __be32 xid; + __be32 calldir; int len; len = svc_recvfrom(rqstp, vec, 1, 8); From cc6c2127f2316c2b2ad1e8919b45cde5e03f65aa Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 14 Feb 2011 15:03:35 -0500 Subject: [PATCH 04/22] svcrpc: close connection if client sends short packet If the client sents a record too short to contain even the beginning of the rpc header, then just close the connection. The current code drops the record data and continues. I don't see the point. It's a hopeless situation and simpler just to cut off the connection completely. Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 1955e1a1e390..62ff7c5c09c2 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -955,6 +955,9 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) } } + if (svsk->sk_reclen < 8) + goto err_delete; /* client is nuts. */ + /* Check whether enough data is available */ len = svc_recv_available(svsk); if (len < 0) @@ -1058,20 +1061,10 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) vec[0] = rqstp->rq_arg.head[0]; vlen = PAGE_SIZE; - /* - * We have enough data for the whole tcp record. Let's try and read the - * first 8 bytes to get the xid and the call direction. We can use this - * to figure out if this is a call or a reply to a callback. If - * sk_reclen is < 8 (xid and calldir), then this is a malformed packet. - * In that case, don't bother with the calldir and just read the data. - * It will be rejected in svc_process. - */ - if (len >= 8) { - len = svc_process_calldir(svsk, rqstp, &req, vec); - if (len < 0) - goto err_again; - vlen -= 8; - } + len = svc_process_calldir(svsk, rqstp, &req, vec); + if (len < 0) + goto err_again; + vlen -= 8; pnum = 1; while (vlen < svsk->sk_reclen - 8) { From 586c52cc61b5b84c70102208b78269ef5924bf49 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 18 May 2009 17:47:56 -0400 Subject: [PATCH 05/22] svcrpc: copy cb reply instead of pages It's much simpler just to copy the cb reply data than to play tricks with pages. Callback replies will typically be very small (at least until we implement cb_getattr, in which case files with very long ACLs could pose a problem), so there's no loss in efficiency. Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 120 ++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 65 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 62ff7c5c09c2..40b502b11442 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -981,57 +981,58 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) return -EAGAIN; } -static int svc_process_calldir(struct svc_sock *svsk, struct svc_rqst *rqstp, - struct rpc_rqst **reqpp, struct kvec *vec) +static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) { + struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt; struct rpc_rqst *req = NULL; - __be32 *p; + struct kvec *src, *dst; + __be32 *p = (__be32 *)rqstp->rq_arg.head[0].iov_base; __be32 xid; __be32 calldir; - int len; - len = svc_recvfrom(rqstp, vec, 1, 8); - if (len < 0) - goto error; - - p = (u32 *)rqstp->rq_arg.head[0].iov_base; xid = *p++; calldir = *p; - if (calldir == 0) { - /* REQUEST is the most common case */ - vec[0] = rqstp->rq_arg.head[0]; - } else { - /* REPLY */ - struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt; + if (bc_xprt) + req = xprt_lookup_rqst(bc_xprt, xid); - if (bc_xprt) - req = xprt_lookup_rqst(bc_xprt, xid); - - if (!req) { - printk(KERN_NOTICE - "%s: Got unrecognized reply: " - "calldir 0x%x xpt_bc_xprt %p xid %08x\n", - __func__, ntohl(calldir), - bc_xprt, xid); - vec[0] = rqstp->rq_arg.head[0]; - goto out; - } - - memcpy(&req->rq_private_buf, &req->rq_rcv_buf, - sizeof(struct xdr_buf)); - /* copy the xid and call direction */ - memcpy(req->rq_private_buf.head[0].iov_base, - rqstp->rq_arg.head[0].iov_base, 8); - vec[0] = req->rq_private_buf.head[0]; + if (!req) { + printk(KERN_NOTICE + "%s: Got unrecognized reply: " + "calldir 0x%x xpt_bc_xprt %p xid %08x\n", + __func__, ntohl(calldir), + bc_xprt, xid); + return -EAGAIN; } - out: - vec[0].iov_base += 8; - vec[0].iov_len -= 8; - len = svsk->sk_reclen - 8; - error: - *reqpp = req; - return len; + + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); + /* + * XXX!: cheating for now! Only copying HEAD. + * But we know this is good enough for now (in fact, for any + * callback reply in the forseeable future). + */ + dst = &req->rq_private_buf.head[0]; + src = &rqstp->rq_arg.head[0]; + if (dst->iov_len < src->iov_len) + return -EAGAIN; /* whatever; just giving up. */ + memcpy(dst->iov_base, src->iov_base, src->iov_len); + xprt_complete_rqst(req->rq_task, svsk->sk_reclen); + rqstp->rq_arg.len = 0; + return 0; +} + +static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len) +{ + int i = 0; + int t = 0; + + while (t < len) { + vec[i].iov_base = page_address(pages[i]); + vec[i].iov_len = PAGE_SIZE; + i++; + t += PAGE_SIZE; + } + return i; } /* @@ -1044,8 +1045,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) struct svc_serv *serv = svsk->sk_xprt.xpt_server; int len; struct kvec *vec; - struct rpc_rqst *req = NULL; - unsigned int vlen; + __be32 *p; + __be32 calldir; int pnum; dprintk("svc: tcp_recv %p data %d conn %d close %d\n", @@ -1058,35 +1059,17 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) goto error; vec = rqstp->rq_vec; - vec[0] = rqstp->rq_arg.head[0]; - vlen = PAGE_SIZE; - len = svc_process_calldir(svsk, rqstp, &req, vec); - if (len < 0) - goto err_again; - vlen -= 8; + pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], + svsk->sk_reclen); - pnum = 1; - while (vlen < svsk->sk_reclen - 8) { - vec[pnum].iov_base = (req) ? - page_address(req->rq_private_buf.pages[pnum - 1]) : - page_address(rqstp->rq_pages[pnum]); - vec[pnum].iov_len = PAGE_SIZE; - pnum++; - vlen += PAGE_SIZE; - } rqstp->rq_respages = &rqstp->rq_pages[pnum]; /* Now receive data */ - len = svc_recvfrom(rqstp, vec, pnum, svsk->sk_reclen - 8); + len = svc_recvfrom(rqstp, vec, pnum, svsk->sk_reclen); if (len < 0) goto err_again; - if (req) { - xprt_complete_rqst(req->rq_task, svsk->sk_reclen); - rqstp->rq_arg.len = 0; - goto out; - } dprintk("svc: TCP complete record (%d bytes)\n", svsk->sk_reclen); rqstp->rq_arg.len = svsk->sk_reclen; rqstp->rq_arg.page_base = 0; @@ -1099,7 +1082,14 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; -out: + p = (__be32 *)rqstp->rq_arg.head[0].iov_base; + calldir = p[1]; + if (calldir) { + len = receive_cb_reply(svsk, rqstp); + if (len < 0) + goto err_again; + } + /* Reset TCP read info */ svsk->sk_reclen = 0; svsk->sk_tcplen = 0; From 31d68ef65c7d49def19c1bae4e01b87d66cf5a56 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 24 Feb 2011 11:25:33 -0800 Subject: [PATCH 06/22] SUNRPC: Don't wait for full record to receive tcp data Ensure that we immediately read and buffer data from the incoming TCP stream so that we grow the receive window quickly, and don't deadlock on large READ or WRITE requests. Also do some minor exit cleanup. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svcsock.h | 1 + net/sunrpc/svcsock.c | 144 +++++++++++++++++++++++++-------- 2 files changed, 113 insertions(+), 32 deletions(-) diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 04dba23c59f2..85c50b40759d 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -28,6 +28,7 @@ struct svc_sock { /* private TCP part */ u32 sk_reclen; /* length of record */ u32 sk_tcplen; /* current read length */ + struct page * sk_pages[RPCSVC_MAXPAGES]; /* received data */ }; /* diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 40b502b11442..a4fafcbc6ea0 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -387,6 +387,33 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, return len; } +static int svc_partial_recvfrom(struct svc_rqst *rqstp, + struct kvec *iov, int nr, + int buflen, unsigned int base) +{ + size_t save_iovlen; + void __user *save_iovbase; + unsigned int i; + int ret; + + if (base == 0) + return svc_recvfrom(rqstp, iov, nr, buflen); + + for (i = 0; i < nr; i++) { + if (iov[i].iov_len > base) + break; + base -= iov[i].iov_len; + } + save_iovlen = iov[i].iov_len; + save_iovbase = iov[i].iov_base; + iov[i].iov_len -= base; + iov[i].iov_base += base; + ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen); + iov[i].iov_len = save_iovlen; + iov[i].iov_base = save_iovbase; + return ret; +} + /* * Set socket snd and rcv buffer lengths */ @@ -884,6 +911,56 @@ failed: return NULL; } +static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) +{ + unsigned int i, len, npages; + + if (svsk->sk_tcplen <= sizeof(rpc_fraghdr)) + return 0; + len = svsk->sk_tcplen - sizeof(rpc_fraghdr); + npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + if (rqstp->rq_pages[i] != NULL) + put_page(rqstp->rq_pages[i]); + BUG_ON(svsk->sk_pages[i] == NULL); + rqstp->rq_pages[i] = svsk->sk_pages[i]; + svsk->sk_pages[i] = NULL; + } + rqstp->rq_arg.head[0].iov_base = page_address(rqstp->rq_pages[0]); + return len; +} + +static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) +{ + unsigned int i, len, npages; + + if (svsk->sk_tcplen <= sizeof(rpc_fraghdr)) + return; + len = svsk->sk_tcplen - sizeof(rpc_fraghdr); + npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + svsk->sk_pages[i] = rqstp->rq_pages[i]; + rqstp->rq_pages[i] = NULL; + } +} + +static void svc_tcp_clear_pages(struct svc_sock *svsk) +{ + unsigned int i, len, npages; + + if (svsk->sk_tcplen <= sizeof(rpc_fraghdr)) + goto out; + len = svsk->sk_tcplen - sizeof(rpc_fraghdr); + npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + BUG_ON(svsk->sk_pages[i] == NULL); + put_page(svsk->sk_pages[i]); + svsk->sk_pages[i] = NULL; + } +out: + svsk->sk_tcplen = 0; +} + /* * Receive data. * If we haven't gotten the record length yet, get the next four bytes. @@ -928,7 +1005,7 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) if (len < want) { dprintk("svc: short recvfrom while reading record " "length (%d of %d)\n", len, want); - goto err_again; /* record header not complete */ + return -EAGAIN; } svsk->sk_reclen = ntohl(svsk->sk_reclen); @@ -958,26 +1035,14 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) if (svsk->sk_reclen < 8) goto err_delete; /* client is nuts. */ - /* Check whether enough data is available */ - len = svc_recv_available(svsk); - if (len < 0) - goto error; - - if (len < svsk->sk_reclen) { - dprintk("svc: incomplete TCP record (%d of %d)\n", - len, svsk->sk_reclen); - goto err_again; /* record not complete */ - } len = svsk->sk_reclen; return len; - error: - if (len == -EAGAIN) - dprintk("RPC: TCP recv_record got EAGAIN\n"); +error: + dprintk("RPC: TCP recv_record got %d\n", len); return len; - err_delete: +err_delete: set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - err_again: return -EAGAIN; } @@ -1035,6 +1100,7 @@ static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len) return i; } + /* * Receive data from a TCP socket. */ @@ -1045,6 +1111,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) struct svc_serv *serv = svsk->sk_xprt.xpt_server; int len; struct kvec *vec; + unsigned int want, base; __be32 *p; __be32 calldir; int pnum; @@ -1058,6 +1125,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) if (len < 0) goto error; + base = svc_tcp_restore_pages(svsk, rqstp); + want = svsk->sk_reclen - base; + vec = rqstp->rq_vec; pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], @@ -1066,11 +1136,18 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_respages = &rqstp->rq_pages[pnum]; /* Now receive data */ - len = svc_recvfrom(rqstp, vec, pnum, svsk->sk_reclen); - if (len < 0) - goto err_again; + len = svc_partial_recvfrom(rqstp, vec, pnum, want, base); + if (len >= 0) + svsk->sk_tcplen += len; + if (len != want) { + if (len < 0 && len != -EAGAIN) + goto err_other; + svc_tcp_save_pages(svsk, rqstp); + dprintk("svc: incomplete TCP record (%d of %d)\n", + svsk->sk_tcplen, svsk->sk_reclen); + goto err_noclose; + } - dprintk("svc: TCP complete record (%d bytes)\n", svsk->sk_reclen); rqstp->rq_arg.len = svsk->sk_reclen; rqstp->rq_arg.page_base = 0; if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) { @@ -1087,7 +1164,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) if (calldir) { len = receive_cb_reply(svsk, rqstp); if (len < 0) - goto err_again; + goto error; } /* Reset TCP read info */ @@ -1102,20 +1179,20 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) if (serv->sv_stats) serv->sv_stats->nettcpcnt++; + dprintk("svc: TCP complete record (%d bytes)\n", rqstp->rq_arg.len); return rqstp->rq_arg.len; -err_again: - if (len == -EAGAIN) { - dprintk("RPC: TCP recvfrom got EAGAIN\n"); - return len; - } error: - if (len != -EAGAIN) { - printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", - svsk->sk_xprt.xpt_server->sv_name, -len); - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - } + if (len != -EAGAIN) + goto err_other; + dprintk("RPC: TCP recvfrom got EAGAIN\n"); return -EAGAIN; +err_other: + printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", + svsk->sk_xprt.xpt_server->sv_name, -len); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); +err_noclose: + return -EAGAIN; /* record not complete */ } /* @@ -1286,6 +1363,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) svsk->sk_reclen = 0; svsk->sk_tcplen = 0; + memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages)); tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; @@ -1544,8 +1622,10 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) svc_sock_detach(xprt); - if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) + if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) { + svc_tcp_clear_pages(svsk); kernel_sock_shutdown(svsk->sk_sock, SHUT_RDWR); + } } /* From 9660439861aa8dbd5e2b8087f33e20760c2c9afc Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 21 Oct 2008 14:13:47 -0400 Subject: [PATCH 07/22] svcrpc: take advantage of tcp autotuning Allow the NFSv4 server to make use of TCP autotuning behaviour, which was previously disabled by setting the sk_userlocks variable. Set the receive buffers to be big enough to receive the whole RPC request, and set this for the listening socket, not the accept socket. Remove the code that readjusts the receive/send buffer sizes for the accepted socket. Previously this code was used to influence the TCP window management behaviour, which is no longer needed when autotuning is enabled. This can improve IO bandwidth on networks with high bandwidth-delay products, where a large tcp window is required. It also simplifies performance tuning, since getting adequate tcp buffers previously required increasing the number of nfsd threads. Signed-off-by: Olga Kornievskaia Cc: Jim Rees Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index a4fafcbc6ea0..213dea8b283c 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -436,7 +436,6 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, lock_sock(sock->sk); sock->sk->sk_sndbuf = snd * 2; sock->sk->sk_rcvbuf = rcv * 2; - sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; sock->sk->sk_write_space(sock->sk); release_sock(sock->sk); #endif @@ -973,23 +972,6 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) unsigned int want; int len; - if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) - /* sndbuf needs to have room for one request - * per thread, otherwise we can stall even when the - * network isn't a bottleneck. - * - * We count all threads rather than threads in a - * particular pool, which provides an upper bound - * on the number of threads which will access the socket. - * - * rcvbuf just needs to be able to hold a few requests. - * Normally they will be removed from the queue - * as soon a a complete request arrives. - */ - svc_sock_setbufsize(svsk->sk_sock, - (serv->sv_nrthreads+3) * serv->sv_max_mesg, - 3 * serv->sv_max_mesg); - clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) { @@ -1367,15 +1349,6 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; - /* initialise setting must have enough space to - * receive and respond to one request. - * svc_tcp_recvfrom will re-adjust if necessary - */ - svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, - 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); - - set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (sk->sk_state != TCP_ESTABLISHED) set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); @@ -1439,8 +1412,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Initialize the socket */ if (sock->type == SOCK_DGRAM) svc_udp_init(svsk, serv); - else + else { + /* initialise setting must have enough space to + * receive and respond to one request. + */ + svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg, + 4 * serv->sv_max_mesg); svc_tcp_init(svsk, serv); + } dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); From 8985ef0b8af895c3b85a8c1b7108e0169fcbd20b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 9 Apr 2011 10:03:10 -0400 Subject: [PATCH 08/22] svcrpc: complete svsk processing on cb receive failure Currently when there's some failure to receive a callback (because we couldn't find a matching xid, for example), we exit svc_recv with sk_tcplen still set but without any pages saved with the socket. This will cause a crash later in svc_tcp_restore_pages. Instead, make sure we reset that tcp information whether the callback received failed or succeeded. Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 213dea8b283c..af04f779ce9f 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1143,11 +1143,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) p = (__be32 *)rqstp->rq_arg.head[0].iov_base; calldir = p[1]; - if (calldir) { + if (calldir) len = receive_cb_reply(svsk, rqstp); - if (len < 0) - goto error; - } /* Reset TCP read info */ svsk->sk_reclen = 0; @@ -1156,6 +1153,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) if (svc_recv_available(svsk) > sizeof(rpc_fraghdr)) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + if (len < 0) + goto error; svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt); if (serv->sv_stats) From aea93397db4b39c9d15443a0e7cc9a380ba990c6 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 10 Apr 2011 10:35:12 -0400 Subject: [PATCH 09/22] nfsd: distinguish functions of NFSD_MAY_* flags Most of the NFSD_MAY_* flags actually request permissions, but over the years we've accreted a few that modify the behavior of the permission or open code in other ways. Distinguish the two cases a little more. In particular, allow the shortcut at the start of nfsd_permission to ignore the non-permission-requesting bits. Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 2 +- fs/nfsd/vfs.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2e1cebde90df..a76ef7e0b3d0 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -2027,7 +2027,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, struct inode *inode = dentry->d_inode; int err; - if (acc == NFSD_MAY_NOP) + if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP) return 0; #if 0 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 9a370a5e36b7..1036913e6e86 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -17,6 +17,9 @@ #define NFSD_MAY_SATTR 8 #define NFSD_MAY_TRUNC 16 #define NFSD_MAY_LOCK 32 +#define NFSD_MAY_MASK 63 + +/* extra hints to permission and open routines: */ #define NFSD_MAY_OWNER_OVERRIDE 64 #define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 From 204f4ce75434c3453907813f8a819d4cf2a5728f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 8 Apr 2011 16:32:54 -0400 Subject: [PATCH 10/22] nfsd4: allow fh_verify caller to skip pseudoflavor checks Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsfh.c | 2 +- fs/nfsd/vfs.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 55c8e63af0be..90c6aa6d5e0f 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -344,7 +344,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) * which clients virtually always use auth_sys for, * even while using RPCSEC_GSS for NFS. */ - if (access & NFSD_MAY_LOCK) + if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS) goto skip_pseudoflavor_check; /* * Clients may expect to be able to use auth_sys during mount, diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 1036913e6e86..4d2509f766d4 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -24,6 +24,7 @@ #define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 #define NFSD_MAY_NOT_BREAK_LEASE 512 +#define NFSD_MAY_BYPASS_GSS 1024 #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) From 22b03214962ec2a9748abc9987fc2e66dec4626d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 9 Apr 2011 11:23:24 -0400 Subject: [PATCH 11/22] nfsd4: introduce OPDESC helper Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5fcb1396a7e3..126b8f75b576 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1031,6 +1031,11 @@ static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args) return nfs_ok; } +static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) +{ + return &nfsd4_ops[op->opnum]; +} + /* * COMPOUND call. */ @@ -1108,7 +1113,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, goto encode_op; } - opdesc = &nfsd4_ops[op->opnum]; + opdesc = OPDESC(op); if (!cstate->current_fh.fh_dentry) { if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { From 29a78a3ed7fc9c4ee49962751eb321b038c190a2 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 9 Apr 2011 11:28:53 -0400 Subject: [PATCH 12/22] nfsd4: make fh_verify responsibility of nfsd_lookup_dentry caller The secinfo caller actually won't want this. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 3 +++ fs/nfsd/vfs.c | 9 +++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 126b8f75b576..8059adae013b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -762,6 +762,9 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 err; fh_init(&resfh, NFS4_FHSIZE); + err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC); + if (err) + return err; err = nfsd_lookup_dentry(rqstp, &cstate->current_fh, secinfo->si_name, secinfo->si_namelen, &exp, &dentry); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a76ef7e0b3d0..e53313972c30 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -181,16 +181,10 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, struct svc_export *exp; struct dentry *dparent; struct dentry *dentry; - __be32 err; int host_err; dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); - /* Obtain dentry and export. */ - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); - if (err) - return err; - dparent = fhp->fh_dentry; exp = fhp->fh_export; exp_get(exp); @@ -254,6 +248,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, struct dentry *dentry; __be32 err; + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (err) + return err; err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); if (err) return err; From 68d93184352f2e723f135b0a9bad93b58f9d120b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 8 Apr 2011 17:00:50 -0400 Subject: [PATCH 13/22] nfsd4: fix wrongsec handling for PUTFH + op cases When PUTFH is followed by an operation that uses the filehandle, and when the current client is using a security flavor that is inconsistent with the given filehandle, we have a choice: we can return WRONGSEC either when the current filehandle is set using the PUTFH, or when the filehandle is first used by the following operation. Follow the recommendations of RFC 5661 in making this choice. (Our current behavior prevented the client from doing security negotiation by returning WRONGSEC on PUTFH+SECINFO_NO_NAME.) Signed-off-by: J. Bruce Fields --- fs/nfsd/export.c | 6 ----- fs/nfsd/nfs4proc.c | 59 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index ad000aeb21a2..b9566e46219f 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1354,12 +1354,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); - if (rv) - goto out; - rv = check_nfsd_access(exp, rqstp); - if (rv) - fh_put(fhp); -out: exp_put(exp); return rv; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8059adae013b..ad32568a1aa7 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -403,7 +403,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen; memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval, putfh->pf_fhlen); - return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS); } static __be32 @@ -989,6 +989,9 @@ enum nfsd4_op_flags { ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */ + /* For rfc 5661 section 2.6.3.1.1: */ + OP_HANDLES_WRONGSEC = 1 << 3, + OP_IS_PUTFH_LIKE = 1 << 4, }; struct nfsd4_operation { @@ -1039,6 +1042,39 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) return &nfsd4_ops[op->opnum]; } +static bool need_wrongsec_check(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; + struct nfsd4_op *next = &argp->ops[resp->opcnt]; + struct nfsd4_operation *thisd; + struct nfsd4_operation *nextd; + + thisd = OPDESC(this); + /* + * Most ops check wronsec on our own; only the putfh-like ops + * have special rules. + */ + if (!(thisd->op_flags & OP_IS_PUTFH_LIKE)) + return false; + /* + * rfc 5661 2.6.3.1.1.6: don't bother erroring out a + * put-filehandle operation if we're not going to use the + * result: + */ + if (argp->opcnt == resp->opcnt) + return false; + + nextd = OPDESC(next); + /* + * Rest of 2.6.3.1.1: certain operations will return WRONGSEC + * errors themselves as necessary; others should check for them + * now: + */ + return !(nextd->op_flags & OP_HANDLES_WRONGSEC); +} + /* * COMPOUND call. */ @@ -1134,6 +1170,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, else BUG_ON(op->status == nfs_ok); + if (!op->status && need_wrongsec_check(rqstp)) + op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp); + encode_op: /* Only from SEQUENCE */ if (resp->cstate.status == nfserr_replay_cache) { @@ -1225,10 +1264,12 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOOKUP] = { .op_func = (nfsd4op_func)nfsd4_lookup, + .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_LOOKUP", }, [OP_LOOKUPP] = { .op_func = (nfsd4op_func)nfsd4_lookupp, + .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_LOOKUPP", }, [OP_NVERIFY] = { @@ -1237,6 +1278,7 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_OPEN] = { .op_func = (nfsd4op_func)nfsd4_open, + .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_OPEN", }, [OP_OPEN_CONFIRM] = { @@ -1249,17 +1291,20 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, .op_name = "OP_PUTFH", }, [OP_PUTPUBFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, .op_name = "OP_PUTPUBFH", }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, .op_name = "OP_PUTROOTFH", }, [OP_READ] = { @@ -1289,15 +1334,18 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_RESTOREFH] = { .op_func = (nfsd4op_func)nfsd4_restorefh, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, .op_name = "OP_RESTOREFH", }, [OP_SAVEFH] = { .op_func = (nfsd4op_func)nfsd4_savefh, + .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SAVEFH", }, [OP_SECINFO] = { .op_func = (nfsd4op_func)nfsd4_secinfo, + .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO", }, [OP_SETATTR] = { @@ -1361,6 +1409,7 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_SECINFO_NO_NAME] = { .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, + .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO_NO_NAME", }, }; From ac6721a13e5b1a90728e790600f827a5e5f5da2f Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Wed, 20 Apr 2011 17:06:25 +0800 Subject: [PATCH 14/22] nfsd41: make sure nfs server process OPEN with EXCLUSIVE4_1 correctly The NFS server uses nfsd_create_v3 to handle EXCLUSIVE4_1 opens, but that function is not prepared to handle them. Rename nfsd_create_v3() to do_nfsd_create(), and add handling of EXCLUSIVE4_1. Signed-off-by: Mi Jinlong Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3proc.c | 2 +- fs/nfsd/nfs4proc.c | 4 ++-- fs/nfsd/vfs.c | 20 ++++++++++++++++---- fs/nfsd/vfs.h | 2 +- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 2247fc91d5e9..9095f3c21df9 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -245,7 +245,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, } /* Now create the file and set attributes */ - nfserr = nfsd_create_v3(rqstp, dirfhp, argp->name, argp->len, + nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, attr, newfhp, argp->createmode, argp->verf, NULL, NULL); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ad32568a1aa7..3a6dbd70b34b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -196,9 +196,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o /* * Note: create modes (UNCHECKED,GUARDED...) are the same - * in NFSv4 as in v3. + * in NFSv4 as in v3 except EXCLUSIVE4_1. */ - status = nfsd_create_v3(rqstp, current_fh, open->op_fname.data, + status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &open->op_iattr, &resfh, open->op_createmode, (u32 *)open->op_verf.data, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index e53313972c30..389e45ad200b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1337,11 +1337,18 @@ out_nfserr: } #ifdef CONFIG_NFSD_V3 + +static inline int nfsd_create_is_exclusive(int createmode) +{ + return createmode == NFS3_CREATE_EXCLUSIVE + || createmode == NFS4_CREATE_EXCLUSIVE4_1; +} + /* - * NFSv3 version of nfsd_create + * NFSv3 and NFSv4 version of nfsd_create */ __be32 -nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, +do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, char *fname, int flen, struct iattr *iap, struct svc_fh *resfhp, int createmode, u32 *verifier, int *truncp, int *created) @@ -1386,7 +1393,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) goto out; - if (createmode == NFS3_CREATE_EXCLUSIVE) { + if (nfsd_create_is_exclusive(createmode)) { /* solaris7 gets confused (bugid 4218508) if these have * the high bit set, so just clear the high bits. If this is * ever changed to use different attrs for storing the @@ -1427,6 +1434,11 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, && dchild->d_inode->i_atime.tv_sec == v_atime && dchild->d_inode->i_size == 0 ) break; + case NFS4_CREATE_EXCLUSIVE4_1: + if ( dchild->d_inode->i_mtime.tv_sec == v_mtime + && dchild->d_inode->i_atime.tv_sec == v_atime + && dchild->d_inode->i_size == 0 ) + goto set_attr; /* fallthru */ case NFS3_CREATE_GUARDED: err = nfserr_exist; @@ -1445,7 +1457,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_check_ignore_resizing(iap); - if (createmode == NFS3_CREATE_EXCLUSIVE) { + if (nfsd_create_is_exclusive(createmode)) { /* Cram the verifier into atime/mtime */ iap->ia_valid = ATTR_MTIME|ATTR_ATIME | ATTR_MTIME_SET|ATTR_ATIME_SET; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 4d2509f766d4..e0bbac04d1dd 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -58,7 +58,7 @@ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, int type, dev_t rdev, struct svc_fh *res); #ifdef CONFIG_NFSD_V3 __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); -__be32 nfsd_create_v3(struct svc_rqst *, struct svc_fh *, +__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, struct svc_fh *res, int createmode, u32 *verifier, int *truncp, int *created); From a62573dc353b255dbbc8520bfdcb1c8a8c1ada87 Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Wed, 23 Mar 2011 17:57:07 +0800 Subject: [PATCH 15/22] nfsd41: add flag checking for create_session Teach the NFS server to reject invalid create_session flags. Also do some minor formatting adjustments. Signed-off-by: Mi Jinlong Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 3 +++ include/linux/nfs4.h | 8 +++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fbde6f79922e..6dbaa379c863 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1515,6 +1515,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, bool confirm_me = false; int status = 0; + if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) + return nfserr_inval; + nfs4_lock_state(); unconf = find_unconfirmed_client(&cr_ses->clientid); conf = find_confirmed_client(&cr_ses->clientid); diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index b528f6d4b860..8937727e7039 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -570,9 +570,11 @@ struct nfs4_sessionid { }; /* Create Session Flags */ -#define SESSION4_PERSIST 0x001 -#define SESSION4_BACK_CHAN 0x002 -#define SESSION4_RDMA 0x004 +#define SESSION4_PERSIST 0x001 +#define SESSION4_BACK_CHAN 0x002 +#define SESSION4_RDMA 0x004 + +#define SESSION4_FLAG_MASK_A 0x007 enum state_protect_how4 { SP4_NONE = 0, From b7c66360dc34e64742edabf4dc410070aa883119 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 22 Apr 2011 12:45:59 -0400 Subject: [PATCH 16/22] nfsd v4.1 lOCKT clientid field must be ignored RFC 5661 Section 18.11.3 The clientid field of the owner MAY be set to any value by the client and MUST be ignored by the server. The reason the server MUST ignore the clientid field is that the server MUST derive the client ID from the session ID from the SEQUENCE operation of the COMPOUND request. Signed-off-by: Andy Adamson Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c6766af00d98..6e7f10a89354 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -588,8 +588,6 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) READ_BUF(lockt->lt_owner.len); READMEM(lockt->lt_owner.data, lockt->lt_owner.len); - if (argp->minorversion && !zero_clientid(&lockt->lt_clientid)) - return nfserr_inval; DECODE_TAIL; } From 868b89c3dc25eec03985b31ff070e8f73c818980 Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Wed, 27 Apr 2011 09:09:58 +0800 Subject: [PATCH 17/22] nfsd41: compare request's opcnt with session's maxops at nfsd4_sequence Make sure nfs server errors out if request contains more ops than channel allows. Signed-off-by: Mi Jinlong [bfields@redhat.com: use helper function] Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6dbaa379c863..3196dc388578 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1724,6 +1724,13 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi return; } +static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session) +{ + struct nfsd4_compoundargs *args = rqstp->rq_argp; + + return args->opcnt > session->se_fchannel.maxops; +} + __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -1752,6 +1759,10 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (!session) goto out; + status = nfserr_too_many_ops; + if (nfsd4_session_too_many_ops(rqstp, session)) + goto out; + status = nfserr_badslot; if (seq->slotid >= session->se_fchannel.maxreqs) goto out; From bcecf1ccc336200ee488e8eb68acdafc4b0dbd1a Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Wed, 27 Apr 2011 09:14:30 +0800 Subject: [PATCH 18/22] nfsd41: error out on repeated RECLAIM_COMPLETE Servers are supposed to return nfserr_complete_already to clients that attempt to send multiple RECLAIM_COMPLETEs. Signed-off-by: Mi Jinlong Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3196dc388578..2bb03f86a037 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1818,6 +1818,8 @@ out: __be32 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) { + int status = 0; + if (rc->rca_one_fs) { if (!cstate->current_fh.fh_dentry) return nfserr_nofilehandle; @@ -1827,9 +1829,14 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta */ return nfs_ok; } + nfs4_lock_state(); - if (is_client_expired(cstate->session->se_client)) { - nfs4_unlock_state(); + status = nfserr_complete_already; + if (cstate->session->se_client->cl_firststate) + goto out; + + status = nfserr_stale_clientid; + if (is_client_expired(cstate->session->se_client)) /* * The following error isn't really legal. * But we only get here if the client just explicitly @@ -1837,11 +1844,13 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta * error it gets back on an operation for the dead * client. */ - return nfserr_stale_clientid; - } + goto out; + + status = nfs_ok; nfsd4_create_clid_dir(cstate->session->se_client); +out: nfs4_unlock_state(); - return nfs_ok; + return status; } __be32 From fccb13c947de83a368e1f3c2216bbf4d8d41efa1 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 27 Apr 2011 15:47:14 -0400 Subject: [PATCH 19/22] NFSD: Remove setting unused variable in nfsd_vfs_read() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling gave me this warning: fs/nfsd/vfs.c: In function ‘nfsd_vfs_read’: fs/nfsd/vfs.c:880:16: warning: variable ‘inode’ set but not used [-Wunused-but-set-variable] I discovered that a local variable "inode" was being set towards the beginning of nfsd_vfs_read() and then ignored for the rest of the function. Signed-off-by: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 389e45ad200b..f4e056ae53e4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -874,13 +874,11 @@ static __be32 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { - struct inode *inode; mm_segment_t oldfs; __be32 err; int host_err; err = nfserr_perm; - inode = file->f_path.dentry->d_inode; if (file->f_op->splice_read && rqstp->rq_splice_ok) { struct splice_desc sd = { From 1db2b9dde3317e181f76860410cb0e7433896f28 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 27 Apr 2011 15:47:15 -0400 Subject: [PATCH 20/22] NFSD: Check status from nfsd4_map_bcts_dir() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling gave me this warning: fs/nfsd/nfs4state.c: In function ‘nfsd4_bind_conn_to_session’: fs/nfsd/nfs4state.c:1623:9: warning: variable ‘status’ set but not used [-Wunused-but-set-variable] The local variable "status" was being set by nfsd4_map_bcts_dir() and then ignored before calling nfsd4_new_conn(). Signed-off-by: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2bb03f86a037..a2ea14f40b42 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1636,8 +1636,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, return nfserr_badsession; status = nfsd4_map_bcts_dir(&bcts->dir); - nfsd4_new_conn(rqstp, cstate->session, bcts->dir); - return nfs_ok; + if (!status) + nfsd4_new_conn(rqstp, cstate->session, bcts->dir); + return status; } static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) From 6ce2357f1e73e0da8ebeace6ec829f48a646bb8c Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 27 Apr 2011 15:47:16 -0400 Subject: [PATCH 21/22] NFSD: Remove unused variable from nfsd4_decode_bind_conn_to_session() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling gave me this warning: fs/nfsd/nfs4xdr.c: In function ‘nfsd4_decode_bind_conn_to_session’: fs/nfsd/nfs4xdr.c:427:6: warning: variable ‘dummy’ set but not used [-Wunused-but-set-variable] The local variable "dummy" wasn't being used past the READ32() macro that set it. READ_BUF() should ensure that the xdr buffer is pushed past the data read into dummy already, so nothing needs to be read in. Signed-off-by: Bryan Schumaker [bfields@redhat.com: minor comment fixup.] Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 6e7f10a89354..195a91d9405b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -424,15 +424,12 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) { DECODE_HEAD; - u32 dummy; READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); READ32(bcts->dir); - /* XXX: Perhaps Tom Tucker could help us figure out how we - * should be using ctsa_use_conn_in_rdma_mode: */ - READ32(dummy); - + /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker + * could help us figure out we should be using it. */ DECODE_TAIL; } From c47d832bc0155153920e507f075647519bad09a2 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Mon, 16 May 2011 16:38:14 +0200 Subject: [PATCH 22/22] nfsd: make local functions static This also fixes a number of sparse warnings. Signed-off-by: Daniel Mack Cc: Trond Myklebust Cc: J. Bruce Fields Cc: Neil Brown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3xdr.c | 2 +- fs/nfsd/nfs4state.c | 4 ++-- fs/nfsd/nfs4xdr.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 7e84a852cdae..291f2d2551b3 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -842,7 +842,7 @@ out: return rv; } -__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) +static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) { struct svc_fh fh; int err; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a2ea14f40b42..a8e4e91587ab 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2482,7 +2482,7 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid) return NULL; } -int share_access_to_flags(u32 share_access) +static int share_access_to_flags(u32 share_access) { share_access &= ~NFS4_SHARE_WANT_MASK; @@ -2902,7 +2902,7 @@ out: return status; } -struct lock_manager nfsd4_manager = { +static struct lock_manager nfsd4_manager = { }; static void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 195a91d9405b..990181103214 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3115,7 +3115,7 @@ nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, return nfserr; } -__be32 +static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_sequence *seq) {