From 2785259631697ebb0749a3782cca206e2e542939 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 4 Feb 2008 23:48:37 -0800 Subject: [PATCH 01/80] nfs: use GFP_NOFS preloads for radix-tree insertion NFS should use GFP_NOFS mode radix tree preloads rather than GFP_ATOMIC allocations at radix-tree insertion-time. This is important to reduce the atomic memory requirement. Signed-off-by: Nick Piggin Cc: Trond Myklebust Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f55c437124a2..7be42e6eb63a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -360,15 +360,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) /* * Insert a write request into an inode */ -static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(inode); int error; error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); - BUG_ON(error == -EEXIST); - if (error) - return error; + BUG_ON(error); if (!nfsi->npages) { igrab(inode); if (nfs_have_delegation(inode, FMODE_WRITE)) @@ -378,8 +376,8 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) set_page_private(req->wb_page, (unsigned long)req); nfsi->npages++; kref_get(&req->wb_kref); - radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); - return 0; + radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, + NFS_PAGE_TAG_LOCKED); } /* @@ -591,6 +589,13 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, /* Loop over all inode entries and see if we find * A request for the page we wish to update */ + if (new) { + if (radix_tree_preload(GFP_NOFS)) { + nfs_release_request(new); + return ERR_PTR(-ENOMEM); + } + } + spin_lock(&inode->i_lock); req = nfs_page_find_request_locked(page); if (req) { @@ -601,28 +606,27 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, error = nfs_wait_on_request(req); nfs_release_request(req); if (error < 0) { - if (new) + if (new) { + radix_tree_preload_end(); nfs_release_request(new); + } return ERR_PTR(error); } continue; } spin_unlock(&inode->i_lock); - if (new) + if (new) { + radix_tree_preload_end(); nfs_release_request(new); + } break; } if (new) { - int error; nfs_lock_request_dontget(new); - error = nfs_inode_add_request(inode, new); - if (error) { - spin_unlock(&inode->i_lock); - nfs_unlock_request(new); - return ERR_PTR(error); - } + nfs_inode_add_request(inode, new); spin_unlock(&inode->i_lock); + radix_tree_preload_end(); req = new; goto zero_page; } From cbc20059259edee4ea24c923e6627c394830c972 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Feb 2008 11:11:30 -0500 Subject: [PATCH 02/80] SUNRPC: Declare as const the rpc_message arguments to rpc_call_sync/async Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 9 +++++---- net/sunrpc/clnt.c | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 129a86e25d29..6fff7f82ef12 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -127,11 +127,12 @@ int rpcb_getport_sync(struct sockaddr_in *, u32, u32, int); void rpcb_getport_async(struct rpc_task *); void rpc_call_start(struct rpc_task *); -int rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, - int flags, const struct rpc_call_ops *tk_ops, +int rpc_call_async(struct rpc_clnt *clnt, + const struct rpc_message *msg, int flags, + const struct rpc_call_ops *tk_ops, void *calldata); -int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, - int flags); +int rpc_call_sync(struct rpc_clnt *clnt, + const struct rpc_message *msg, int flags); struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags); void rpc_restart_call(struct rpc_task *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8c6a7f1a25e9..a23512bb240d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -548,7 +548,7 @@ EXPORT_SYMBOL_GPL(rpc_run_task); * @msg: RPC call parameters * @flags: RPC call flags */ -int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) +int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags) { struct rpc_task *task; struct rpc_task_setup task_setup_data = { @@ -579,7 +579,7 @@ EXPORT_SYMBOL_GPL(rpc_call_sync); * @data: user call data */ int -rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags, +rpc_call_async(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags, const struct rpc_call_ops *tk_ops, void *data) { struct rpc_task *task; From 4b5621f6b127bce9218998c187bd25bf7f9fc371 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 25 Feb 2008 15:56:29 -0800 Subject: [PATCH 03/80] NFS: Fix an f_mode/f_flags confusion in fs/nfs/write.c O_SYNC is stored in filp->f_flags. Thanks to Al Viro for pointing out the bug. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f55c437124a2..80c61fdb2720 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -734,7 +734,7 @@ int nfs_updatepage(struct file *file, struct page *page, */ if (nfs_write_pageuptodate(page, inode) && inode->i_flock == NULL && - !(file->f_mode & O_SYNC)) { + !(file->f_flags & O_SYNC)) { count = max(count + offset, nfs_page_length(page)); offset = 0; } From 383ba71938519959be8e0b598ec658f0c211ff45 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 19 Feb 2008 20:04:20 -0500 Subject: [PATCH 04/80] NFS: Fix a deadlock with lazy umount We can't allow rpc callback functions like task->tk_ops->rpc_call_prepare() and task->tk_ops->rpc_call_done() to call mntput() in any way, since that will cause a deadlock when the call to rpc_shutdown_client() attempts to wait on 'task' to complete. We can avoid the above deadlock by moving calls to mntput to task->tk_ops->rpc_release() callback, since at that time the task will be marked as completed, and so rpc_shutdown_client won't attempt to wait on it. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 5 +++-- fs/nfs/inode.c | 6 +++++- fs/nfs/read.c | 7 +++++-- fs/nfs/write.c | 13 ++++++++++--- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 16844f98f50e..e0170407a885 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -323,7 +323,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = ctx; + data->args.context = get_nfs_open_context(ctx); data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -546,6 +546,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->args.fh = NFS_FH(data->inode); data->args.offset = 0; data->args.count = 0; + data->args.context = get_nfs_open_context(dreq->ctx); data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -728,7 +729,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = ctx; + data->args.context = get_nfs_open_context(ctx); data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 966a8850aa30..a499fb58d858 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -521,8 +521,12 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) { - struct inode *inode = ctx->path.dentry->d_inode; + struct inode *inode; + if (ctx == NULL) + return; + + inode = ctx->path.dentry->d_inode; if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) return; list_del(&ctx->list); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 3d7d9631e125..fab0d3720a03 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -73,7 +73,10 @@ static void nfs_readdata_free(struct nfs_read_data *rdata) void nfs_readdata_release(void *data) { - nfs_readdata_free(data); + struct nfs_read_data *rdata = data; + + put_nfs_open_context(rdata->args.context); + nfs_readdata_free(rdata); } static @@ -186,7 +189,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->args.pgbase = req->wb_pgbase + offset; data->args.pages = data->pagevec; data->args.count = count; - data->args.context = req->wb_context; + data->args.context = get_nfs_open_context(req->wb_context); data->res.fattr = &data->fattr; data->res.count = count; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 80c61fdb2720..69b4158d9a10 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -105,8 +105,11 @@ static void nfs_writedata_free(struct nfs_write_data *wdata) call_rcu_bh(&wdata->task.u.tk_rcu, nfs_writedata_rcu_free); } -void nfs_writedata_release(void *wdata) +void nfs_writedata_release(void *data) { + struct nfs_write_data *wdata = data; + + put_nfs_open_context(wdata->args.context); nfs_writedata_free(wdata); } @@ -816,7 +819,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req, data->args.pgbase = req->wb_pgbase + offset; data->args.pages = data->pagevec; data->args.count = count; - data->args.context = req->wb_context; + data->args.context = get_nfs_open_context(req->wb_context); data->args.stable = NFS_UNSTABLE; if (how & FLUSH_STABLE) { data->args.stable = NFS_DATA_SYNC; @@ -1153,8 +1156,11 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -void nfs_commit_release(void *wdata) +void nfs_commit_release(void *data) { + struct nfs_write_data *wdata = data; + + put_nfs_open_context(wdata->args.context); nfs_commit_free(wdata); } @@ -1197,6 +1203,7 @@ static void nfs_commit_rpcsetup(struct list_head *head, /* Note: we always request a commit of the entire inode */ data->args.offset = 0; data->args.count = 0; + data->args.context = get_nfs_open_context(first->wb_context); data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; From 32bfb5c0f495dd88ef6bac4b76885d0820563739 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 19 Feb 2008 20:04:21 -0500 Subject: [PATCH 05/80] SUNRPC: Allow the rpc_release() callback to be run on another workqueue A lot of the work done by the rpc_release() callback is inappropriate for rpciod as it will often involve things like starting a new rpc call in order to clean up state after an interrupted NFSv4 open() call, or calls to mntput(), etc. This patch allows the caller of rpc_run_task() to specify that the rpc_release callback should run on a different workqueue than the default rpciod_workqueue. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 1 + net/sunrpc/sched.c | 29 +++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index f689f02e6793..fefb0ab52189 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -123,6 +123,7 @@ struct rpc_task_setup { const struct rpc_message *rpc_message; const struct rpc_call_ops *callback_ops; void *callback_data; + struct workqueue_struct *workqueue; unsigned short flags; signed char priority; }; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 4c669121e607..3e0b22382a3b 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -326,7 +326,7 @@ static void rpc_make_runnable(struct rpc_task *task) int status; INIT_WORK(&task->u.tk_work, rpc_async_schedule); - status = queue_work(task->tk_workqueue, &task->u.tk_work); + status = queue_work(rpciod_workqueue, &task->u.tk_work); if (status < 0) { printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); task->tk_status = status; @@ -832,7 +832,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta task->tk_owner = current->tgid; /* Initialize workqueue for async tasks */ - task->tk_workqueue = rpciod_workqueue; + task->tk_workqueue = task_setup_data->workqueue; task->tk_client = task_setup_data->rpc_client; if (task->tk_client != NULL) { @@ -868,7 +868,7 @@ rpc_alloc_task(void) return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); } -static void rpc_free_task(struct rcu_head *rcu) +static void rpc_free_task_rcu(struct rcu_head *rcu) { struct rpc_task *task = container_of(rcu, struct rpc_task, u.tk_rcu); dprintk("RPC: %5u freeing task\n", task->tk_pid); @@ -898,12 +898,23 @@ out: return task; } - -void rpc_put_task(struct rpc_task *task) +static void rpc_free_task(struct rpc_task *task) { const struct rpc_call_ops *tk_ops = task->tk_ops; void *calldata = task->tk_calldata; + if (task->tk_flags & RPC_TASK_DYNAMIC) + call_rcu_bh(&task->u.tk_rcu, rpc_free_task_rcu); + rpc_release_calldata(tk_ops, calldata); +} + +static void rpc_async_release(struct work_struct *work) +{ + rpc_free_task(container_of(work, struct rpc_task, u.tk_work)); +} + +void rpc_put_task(struct rpc_task *task) +{ if (!atomic_dec_and_test(&task->tk_count)) return; /* Release resources */ @@ -915,9 +926,11 @@ void rpc_put_task(struct rpc_task *task) rpc_release_client(task->tk_client); task->tk_client = NULL; } - if (task->tk_flags & RPC_TASK_DYNAMIC) - call_rcu_bh(&task->u.tk_rcu, rpc_free_task); - rpc_release_calldata(tk_ops, calldata); + if (task->tk_workqueue != NULL) { + INIT_WORK(&task->u.tk_work, rpc_async_release); + queue_work(task->tk_workqueue, &task->u.tk_work); + } else + rpc_free_task(task); } EXPORT_SYMBOL_GPL(rpc_put_task); From 5746006f1d17d9d5a3015051ea54de4341cb31f9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 19 Feb 2008 20:04:22 -0500 Subject: [PATCH 06/80] NFS: Add an nfsiod workqueue NFS post-rpciod cleanups often involve tasks that cannot be safely performed within the rpciod context (due to deadlock concerns). We therefore add a dedicated NFS workqueue that can perform tasks like cleaning up state after an interrupted NFSv4 open() call, or calling put_nfs_open_context() after an asynchronous read or write call. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 37 +++++++++++++++++++++++++++++++++++++ fs/nfs/internal.h | 1 + 2 files changed, 38 insertions(+) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index a499fb58d858..36cb99985d22 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1219,6 +1219,36 @@ static void nfs_destroy_inodecache(void) kmem_cache_destroy(nfs_inode_cachep); } +struct workqueue_struct *nfsiod_workqueue; + +/* + * start up the nfsiod workqueue + */ +static int nfsiod_start(void) +{ + struct workqueue_struct *wq; + dprintk("RPC: creating workqueue nfsiod\n"); + wq = create_singlethread_workqueue("nfsiod"); + if (wq == NULL) + return -ENOMEM; + nfsiod_workqueue = wq; + return 0; +} + +/* + * Destroy the nfsiod workqueue + */ +static void nfsiod_stop(void) +{ + struct workqueue_struct *wq; + + wq = nfsiod_workqueue; + if (wq == NULL) + return; + nfsiod_workqueue = NULL; + destroy_workqueue(wq); +} + /* * Initialize NFS */ @@ -1226,6 +1256,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfsiod_start(); + if (err) + goto out6; + err = nfs_fs_proc_init(); if (err) goto out5; @@ -1272,6 +1306,8 @@ out3: out4: nfs_fs_proc_exit(); out5: + nfsiod_stop(); +out6: return err; } @@ -1287,6 +1323,7 @@ static void __exit exit_nfs_fs(void) #endif unregister_nfs_fs(); nfs_fs_proc_exit(); + nfsiod_stop(); } /* Not quite true; I just maintain it */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 0f5619611b8d..985dc2931031 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -143,6 +143,7 @@ extern struct rpc_procinfo nfs4_procedures[]; extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); /* inode.c */ +extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); extern int nfs_write_inode(struct inode *,int); From 101070ca2fe67186f5f5517b66cb4757b17f4e29 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 19 Feb 2008 20:04:23 -0500 Subject: [PATCH 07/80] NFS: Ensure that the asynchronous RPC calls complete on nfsiod. We want to ensure that rpc_call_ops that involve mntput() are run on nfsiod rather than on rpciod, so that they don't deadlock when the resulting umount calls rpc_shutdown_client(). Hence we specify that read, write and commit calls must complete on nfsiod. Ditto for NFSv4 open, lock, locku and close asynchronous calls. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 4 ++++ fs/nfs/nfs4proc.c | 6 ++++++ fs/nfs/read.c | 1 + fs/nfs/write.c | 2 ++ 4 files changed, 13 insertions(+) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e0170407a885..e44200579c8d 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -280,6 +280,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, .rpc_client = NFS_CLIENT(inode), .rpc_message = &msg, .callback_ops = &nfs_read_direct_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; unsigned int pgbase; @@ -446,6 +447,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) struct rpc_task_setup task_setup_data = { .rpc_client = NFS_CLIENT(inode), .callback_ops = &nfs_write_direct_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; @@ -537,6 +539,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) .rpc_message = &msg, .callback_ops = &nfs_commit_direct_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; @@ -683,6 +686,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, .rpc_client = NFS_CLIENT(inode), .rpc_message = &msg, .callback_ops = &nfs_write_direct_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; size_t wsize = NFS_SERVER(inode)->wsize; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7ce07862c2fb..b6db833f33e6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -51,6 +51,7 @@ #include "nfs4_fs.h" #include "delegation.h" +#include "internal.h" #include "iostat.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -773,6 +774,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) .rpc_message = &msg, .callback_ops = &nfs4_open_confirm_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; int status; @@ -910,6 +912,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) .rpc_message = &msg, .callback_ops = &nfs4_open_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; int status; @@ -1315,6 +1318,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_close_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; int status = -ENOMEM; @@ -3235,6 +3239,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, .rpc_client = NFS_CLIENT(lsp->ls_state->inode), .rpc_message = &msg, .callback_ops = &nfs4_locku_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; @@ -3419,6 +3424,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f .rpc_client = NFS_CLIENT(state->inode), .rpc_message = &msg, .callback_ops = &nfs4_lock_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; int ret; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index fab0d3720a03..87546cd277d5 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -177,6 +177,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, .rpc_message = &msg, .callback_ops = call_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC | swap_flags, }; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 69b4158d9a10..84495ef90728 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -803,6 +803,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req, .rpc_message = &msg, .callback_ops = call_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = flags, .priority = priority, }; @@ -1187,6 +1188,7 @@ static void nfs_commit_rpcsetup(struct list_head *head, .rpc_message = &msg, .callback_ops = &nfs_commit_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = flags, .priority = priority, }; From fde95c7554aa77f9a242f32b0b5f8f15395abf52 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 22 Feb 2008 15:09:26 -0500 Subject: [PATCH 08/80] SUNRPC: Clean up rpc_run_timer() All RPC timeout callback functions are expected to wake the task up. We can enforce this by moving the wakeup back into rpc_run_timer. Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 67 ++++++++++++++++++++++++---------------------- net/sunrpc/xprt.c | 2 -- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 3e0b22382a3b..9433a113862c 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -38,7 +38,6 @@ static struct kmem_cache *rpc_buffer_slabp __read_mostly; static mempool_t *rpc_task_mempool __read_mostly; static mempool_t *rpc_buffer_mempool __read_mostly; -static void __rpc_default_timer(struct rpc_task *task); static void rpc_async_schedule(struct work_struct *); static void rpc_release_task(struct rpc_task *task); @@ -66,25 +65,13 @@ __rpc_disable_timer(struct rpc_task *task) } /* - * Run a timeout function. - * We use the callback in order to allow __rpc_wake_up_task() - * and friends to disable the timer synchronously on SMP systems - * without calling del_timer_sync(). The latter could cause a - * deadlock if called while we're holding spinlocks... + * Default timeout handler if none specified by user */ -static void rpc_run_timer(struct rpc_task *task) +static void +__rpc_default_timer(struct rpc_task *task) { - void (*callback)(struct rpc_task *); - - callback = task->tk_timeout_fn; - task->tk_timeout_fn = NULL; - if (callback && RPC_IS_QUEUED(task)) { - dprintk("RPC: %5u running timer\n", task->tk_pid); - callback(task); - } - smp_mb__before_clear_bit(); - clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); - smp_mb__after_clear_bit(); + dprintk("RPC: %5u timeout (default timer)\n", task->tk_pid); + task->tk_status = -ETIMEDOUT; } /* @@ -415,17 +402,6 @@ static void __rpc_wake_up_task(struct rpc_task *task) } } -/* - * Default timeout handler if none specified by user - */ -static void -__rpc_default_timer(struct rpc_task *task) -{ - dprintk("RPC: %5u timeout (default timer)\n", task->tk_pid); - task->tk_status = -ETIMEDOUT; - rpc_wake_up_task(task); -} - /* * Wake up the specified task */ @@ -578,9 +554,37 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) } EXPORT_SYMBOL_GPL(rpc_wake_up_status); +/* + * Run a timeout function. + */ +static void rpc_run_timer(unsigned long ptr) +{ + struct rpc_task *task = (struct rpc_task *)ptr; + void (*callback)(struct rpc_task *); + + if (!rpc_start_wakeup(task)) + goto out; + if (RPC_IS_QUEUED(task)) { + struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq; + callback = task->tk_timeout_fn; + + dprintk("RPC: %5u running timer\n", task->tk_pid); + if (callback != NULL) + callback(task); + /* Note: we're already in a bh-safe context */ + spin_lock(&queue->lock); + __rpc_do_wake_up_task(task); + spin_unlock(&queue->lock); + } + rpc_finish_wakeup(task); +out: + smp_mb__before_clear_bit(); + clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); + smp_mb__after_clear_bit(); +} + static void __rpc_atrun(struct rpc_task *task) { - rpc_wake_up_task(task); } /* @@ -816,8 +820,7 @@ EXPORT_SYMBOL_GPL(rpc_free); static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data) { memset(task, 0, sizeof(*task)); - setup_timer(&task->tk_timer, (void (*)(unsigned long))rpc_run_timer, - (unsigned long)task); + setup_timer(&task->tk_timer, rpc_run_timer, (unsigned long)task); atomic_set(&task->tk_count, 1); task->tk_flags = task_setup_data->flags; task->tk_ops = task_setup_data->callback_ops; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d5553b8179f9..96c212ddc41a 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -777,8 +777,6 @@ static void xprt_timer(struct rpc_task *task) xprt->ops->timer(task); task->tk_status = -ETIMEDOUT; } - task->tk_timeout = 0; - rpc_wake_up_task(task); spin_unlock(&xprt->transport_lock); } From 96ef13b283934fbf60b732e6c4ce23e8babd0042 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 22 Feb 2008 15:46:41 -0500 Subject: [PATCH 09/80] SUNRPC: Add a new helper rpc_wake_up_queued_task() In all cases where we currently use rpc_wake_up_task(), we almost always know on which waitqueue the rpc_task is actually sleeping. This will allows us to simplify the queue locking in a future patch. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 4 ++- net/sunrpc/clnt.c | 2 +- net/sunrpc/sched.c | 64 +++++++++++++++++------------------- 3 files changed, 34 insertions(+), 36 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index fefb0ab52189..83f67779cf00 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -33,7 +33,6 @@ struct rpc_wait_queue; struct rpc_wait { struct list_head list; /* wait queue links */ struct list_head links; /* Links to related tasks */ - struct rpc_wait_queue * rpc_waitq; /* RPC wait queue we're on */ }; /* @@ -80,6 +79,7 @@ struct rpc_task { struct workqueue_struct *tk_workqueue; /* Normally rpciod, but could * be any workqueue */ + struct rpc_wait_queue *tk_waitqueue; /* RPC wait queue we're on */ union { struct work_struct tk_work; /* Async task work queue */ struct rpc_wait tk_wait; /* RPC wait */ @@ -233,6 +233,8 @@ void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *, rpc_action action, rpc_action timer); void rpc_wake_up_task(struct rpc_task *); +void rpc_wake_up_queued_task(struct rpc_wait_queue *, + struct rpc_task *); void rpc_wake_up(struct rpc_wait_queue *); struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); void rpc_wake_up_status(struct rpc_wait_queue *, int); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8c6a7f1a25e9..fe95bd0ab1e9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1535,7 +1535,7 @@ void rpc_show_tasks(void) proc = -1; if (RPC_IS_QUEUED(t)) - rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq); + rpc_waitq = rpc_qname(t->tk_waitqueue); printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n", t->tk_pid, proc, diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9433a113862c..9233ace076aa 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -148,7 +148,7 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task * list_add(&task->u.tk_wait.list, &queue->tasks[0]); else list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); - task->u.tk_wait.rpc_waitq = queue; + task->tk_waitqueue = queue; queue->qlen++; rpc_set_queued(task); @@ -175,11 +175,8 @@ static void __rpc_remove_wait_queue_priority(struct rpc_task *task) * Remove request from queue. * Note: must be called with spin lock held. */ -static void __rpc_remove_wait_queue(struct rpc_task *task) +static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) { - struct rpc_wait_queue *queue; - queue = task->u.tk_wait.rpc_waitq; - if (RPC_IS_PRIORITY(queue)) __rpc_remove_wait_queue_priority(task); else @@ -364,11 +361,12 @@ EXPORT_SYMBOL_GPL(rpc_sleep_on); /** * __rpc_do_wake_up_task - wake up a single rpc_task + * @queue: wait queue * @task: task to be woken up * * Caller must hold queue->lock, and have cleared the task queued flag. */ -static void __rpc_do_wake_up_task(struct rpc_task *task) +static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task) { dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", task->tk_pid, jiffies); @@ -383,7 +381,7 @@ static void __rpc_do_wake_up_task(struct rpc_task *task) } __rpc_disable_timer(task); - __rpc_remove_wait_queue(task); + __rpc_remove_wait_queue(queue, task); rpc_make_runnable(task); @@ -391,35 +389,37 @@ static void __rpc_do_wake_up_task(struct rpc_task *task) } /* - * Wake up the specified task + * Wake up a queued task while the queue lock is being held */ -static void __rpc_wake_up_task(struct rpc_task *task) +static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) { + if (!RPC_IS_QUEUED(task) || task->tk_waitqueue != queue) + return; if (rpc_start_wakeup(task)) { - if (RPC_IS_QUEUED(task)) - __rpc_do_wake_up_task(task); + __rpc_do_wake_up_task(queue, task); rpc_finish_wakeup(task); } } +/* + * Wake up a task on a specific queue + */ +void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + rcu_read_lock_bh(); + spin_lock(&queue->lock); + rpc_wake_up_task_queue_locked(queue, task); + spin_unlock(&queue->lock); + rcu_read_unlock_bh(); +} +EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task); + /* * Wake up the specified task */ void rpc_wake_up_task(struct rpc_task *task) { - rcu_read_lock_bh(); - if (rpc_start_wakeup(task)) { - if (RPC_IS_QUEUED(task)) { - struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq; - - /* Note: we're already in a bh-safe context */ - spin_lock(&queue->lock); - __rpc_do_wake_up_task(task); - spin_unlock(&queue->lock); - } - rpc_finish_wakeup(task); - } - rcu_read_unlock_bh(); + rpc_wake_up_queued_task(task->tk_waitqueue, task); } EXPORT_SYMBOL_GPL(rpc_wake_up_task); @@ -471,7 +471,7 @@ new_queue: new_owner: rpc_set_waitqueue_owner(queue, task->tk_owner); out: - __rpc_wake_up_task(task); + rpc_wake_up_task_queue_locked(queue, task); return task; } @@ -490,7 +490,7 @@ struct rpc_task * rpc_wake_up_next(struct rpc_wait_queue *queue) task = __rpc_wake_up_next_priority(queue); else { task_for_first(task, &queue->tasks[0]) - __rpc_wake_up_task(task); + rpc_wake_up_task_queue_locked(queue, task); } spin_unlock(&queue->lock); rcu_read_unlock_bh(); @@ -515,7 +515,7 @@ void rpc_wake_up(struct rpc_wait_queue *queue) head = &queue->tasks[queue->maxpriority]; for (;;) { list_for_each_entry_safe(task, next, head, u.tk_wait.list) - __rpc_wake_up_task(task); + rpc_wake_up_task_queue_locked(queue, task); if (head == &queue->tasks[0]) break; head--; @@ -543,7 +543,7 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) for (;;) { list_for_each_entry_safe(task, next, head, u.tk_wait.list) { task->tk_status = status; - __rpc_wake_up_task(task); + rpc_wake_up_task_queue_locked(queue, task); } if (head == &queue->tasks[0]) break; @@ -562,10 +562,8 @@ static void rpc_run_timer(unsigned long ptr) struct rpc_task *task = (struct rpc_task *)ptr; void (*callback)(struct rpc_task *); - if (!rpc_start_wakeup(task)) - goto out; if (RPC_IS_QUEUED(task)) { - struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq; + struct rpc_wait_queue *queue = task->tk_waitqueue; callback = task->tk_timeout_fn; dprintk("RPC: %5u running timer\n", task->tk_pid); @@ -573,11 +571,9 @@ static void rpc_run_timer(unsigned long ptr) callback(task); /* Note: we're already in a bh-safe context */ spin_lock(&queue->lock); - __rpc_do_wake_up_task(task); + rpc_wake_up_task_queue_locked(queue, task); spin_unlock(&queue->lock); } - rpc_finish_wakeup(task); -out: smp_mb__before_clear_bit(); clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); smp_mb__after_clear_bit(); From fda1393938035559b417dd5b26b9cc293a7aee00 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 22 Feb 2008 16:34:12 -0500 Subject: [PATCH 10/80] SUNRPC: Convert users of rpc_wake_up_task to use rpc_wake_up_queued_task Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- include/linux/sunrpc/sched.h | 1 - net/sunrpc/clnt.c | 2 +- net/sunrpc/sched.c | 3 +-- net/sunrpc/xprt.c | 10 +++++----- 5 files changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b6db833f33e6..54743396b666 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2768,7 +2768,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL); nfs4_schedule_state_recovery(clp); if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) - rpc_wake_up_task(task); + rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); task->tk_status = 0; return -EAGAIN; case -NFS4ERR_DELAY: diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 83f67779cf00..7963ef0ffb89 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -232,7 +232,6 @@ void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *, rpc_action action, rpc_action timer); -void rpc_wake_up_task(struct rpc_task *); void rpc_wake_up_queued_task(struct rpc_wait_queue *, struct rpc_task *); void rpc_wake_up(struct rpc_wait_queue *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index fe95bd0ab1e9..0e209af39797 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1066,7 +1066,7 @@ call_transmit(struct rpc_task *task) if (task->tk_msg.rpc_proc->p_decode != NULL) return; task->tk_action = rpc_exit_task; - rpc_wake_up_task(task); + rpc_wake_up_queued_task(&task->tk_xprt->pending, task); } /* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9233ace076aa..35acdc39bfcb 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -417,11 +417,10 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task); /* * Wake up the specified task */ -void rpc_wake_up_task(struct rpc_task *task) +static void rpc_wake_up_task(struct rpc_task *task) { rpc_wake_up_queued_task(task->tk_waitqueue, task); } -EXPORT_SYMBOL_GPL(rpc_wake_up_task); /* * Wake up the next task on a priority queue. diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 96c212ddc41a..6e2772217e55 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -472,7 +472,7 @@ void xprt_write_space(struct rpc_xprt *xprt) if (xprt->snd_task) { dprintk("RPC: write space: waking waiting task on " "xprt %p\n", xprt); - rpc_wake_up_task(xprt->snd_task); + rpc_wake_up_queued_task(&xprt->pending, xprt->snd_task); } spin_unlock_bh(&xprt->transport_lock); } @@ -602,8 +602,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(rpciod_workqueue, &xprt->task_cleanup); - else if (xprt->snd_task != NULL) - rpc_wake_up_task(xprt->snd_task); + xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock_bh(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_force_disconnect); @@ -749,18 +748,19 @@ EXPORT_SYMBOL_GPL(xprt_update_rtt); void xprt_complete_rqst(struct rpc_task *task, int copied) { struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; dprintk("RPC: %5u xid %08x complete (%d bytes received)\n", task->tk_pid, ntohl(req->rq_xid), copied); - task->tk_xprt->stat.recvs++; + xprt->stat.recvs++; task->tk_rtt = (long)jiffies - req->rq_xtime; list_del_init(&req->rq_list); /* Ensure all writes are done before we update req->rq_received */ smp_wmb(); req->rq_received = req->rq_private_buf.len = copied; - rpc_wake_up_task(task); + rpc_wake_up_queued_task(&xprt->pending, task); } EXPORT_SYMBOL_GPL(xprt_complete_rqst); From 5d00837b90340af9106dcd93af75fd664c8eb87f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 22 Feb 2008 16:34:17 -0500 Subject: [PATCH 11/80] SUNRPC: Run rpc timeout functions as callbacks instead of in softirqs An audit of the current RPC timeout functions shows that they don't really ever need to run in the softirq context. As long as the softirq is able to signal that the wakeup is due to a timeout (which it can do by setting task->tk_status to -ETIMEDOUT) then the callback functions can just run as standard task->tk_callback functions (in the rpciod/process context). The only possible border-line case would be xprt_timer() for the case of UDP, when the callback is used to reduce the size of the transport congestion window. In testing, however, the effect of moving that update to a callback would appear to be minor. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- fs/nfs/nfs4state.c | 2 +- include/linux/sunrpc/sched.h | 4 +-- net/sunrpc/auth_gss/auth_gss.c | 4 +-- net/sunrpc/rpcb_clnt.c | 2 +- net/sunrpc/sched.c | 50 ++++++++++------------------------ net/sunrpc/xprt.c | 28 ++++++++++--------- 7 files changed, 36 insertions(+), 56 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 54743396b666..bbb0d58ee6ac 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2765,7 +2765,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: - rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL); + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); nfs4_schedule_state_recovery(clp); if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index b962397004c1..a2ef02824aa8 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -731,7 +731,7 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) list_add_tail(&seqid->list, &sequence->list); if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid) goto unlock; - rpc_sleep_on(&sequence->wait, task, NULL, NULL); + rpc_sleep_on(&sequence->wait, task, NULL); status = -EAGAIN; unlock: spin_unlock(&sequence->lock); diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 7963ef0ffb89..503a937bdca9 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -56,12 +56,10 @@ struct rpc_task { __u8 tk_cred_retry; /* - * timeout_fn to be executed by timer bottom half * callback to be executed after waking up * action next procedure for async tasks * tk_ops caller callbacks */ - void (*tk_timeout_fn)(struct rpc_task *); void (*tk_callback)(struct rpc_task *); void (*tk_action)(struct rpc_task *); const struct rpc_call_ops *tk_ops; @@ -231,7 +229,7 @@ void rpc_execute(struct rpc_task *); void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *, - rpc_action action, rpc_action timer); + rpc_action action); void rpc_wake_up_queued_task(struct rpc_wait_queue *, struct rpc_task *); void rpc_wake_up(struct rpc_wait_queue *); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 6dac38792288..dc6391bcda11 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -408,13 +408,13 @@ gss_refresh_upcall(struct rpc_task *task) } spin_lock(&inode->i_lock); if (gss_cred->gc_upcall != NULL) - rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL, NULL); + rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL); else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { task->tk_timeout = 0; gss_cred->gc_upcall = gss_msg; /* gss_upcall_callback will release the reference to gss_upcall_msg */ atomic_inc(&gss_msg->count); - rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback, NULL); + rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback); } else err = gss_msg->msg.errno; spin_unlock(&inode->i_lock); diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 3164a0871cf0..f480c718b400 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -298,7 +298,7 @@ void rpcb_getport_async(struct rpc_task *task) /* Put self on queue before sending rpcbind request, in case * rpcb_getport_done completes before we return from rpc_run_task */ - rpc_sleep_on(&xprt->binding, task, NULL, NULL); + rpc_sleep_on(&xprt->binding, task, NULL); /* Someone else may have bound if we slept */ if (xprt_bound(xprt)) { diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 35acdc39bfcb..caf12fd6b6af 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -56,29 +56,18 @@ struct workqueue_struct *rpciod_workqueue; * queue->lock and bh_disabled in order to avoid races within * rpc_run_timer(). */ -static inline void +static void __rpc_disable_timer(struct rpc_task *task) { dprintk("RPC: %5u disabling timer\n", task->tk_pid); - task->tk_timeout_fn = NULL; task->tk_timeout = 0; } -/* - * Default timeout handler if none specified by user - */ -static void -__rpc_default_timer(struct rpc_task *task) -{ - dprintk("RPC: %5u timeout (default timer)\n", task->tk_pid); - task->tk_status = -ETIMEDOUT; -} - /* * Set up a timer for the current task. */ -static inline void -__rpc_add_timer(struct rpc_task *task, rpc_action timer) +static void +__rpc_add_timer(struct rpc_task *task) { if (!task->tk_timeout) return; @@ -86,10 +75,6 @@ __rpc_add_timer(struct rpc_task *task, rpc_action timer) dprintk("RPC: %5u setting alarm for %lu ms\n", task->tk_pid, task->tk_timeout * 1000 / HZ); - if (timer) - task->tk_timeout_fn = timer; - else - task->tk_timeout_fn = __rpc_default_timer; set_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); mod_timer(&task->tk_timer, jiffies + task->tk_timeout); } @@ -297,7 +282,6 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); */ static void rpc_make_runnable(struct rpc_task *task) { - BUG_ON(task->tk_timeout_fn); rpc_clear_queued(task); if (rpc_test_and_set_running(task)) return; @@ -327,7 +311,7 @@ static void rpc_make_runnable(struct rpc_task *task) * as it's on a wait queue. */ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, - rpc_action action, rpc_action timer) + rpc_action action) { dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n", task->tk_pid, rpc_qname(q), jiffies); @@ -341,11 +325,11 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, BUG_ON(task->tk_callback != NULL); task->tk_callback = action; - __rpc_add_timer(task, timer); + __rpc_add_timer(task); } void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, - rpc_action action, rpc_action timer) + rpc_action action) { /* Mark the task as being activated if so needed */ rpc_set_active(task); @@ -354,7 +338,7 @@ void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, * Protect the queue operations. */ spin_lock_bh(&q->lock); - __rpc_sleep_on(q, task, action, timer); + __rpc_sleep_on(q, task, action); spin_unlock_bh(&q->lock); } EXPORT_SYMBOL_GPL(rpc_sleep_on); @@ -559,20 +543,15 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_status); static void rpc_run_timer(unsigned long ptr) { struct rpc_task *task = (struct rpc_task *)ptr; - void (*callback)(struct rpc_task *); + struct rpc_wait_queue *queue = task->tk_waitqueue; - if (RPC_IS_QUEUED(task)) { - struct rpc_wait_queue *queue = task->tk_waitqueue; - callback = task->tk_timeout_fn; - - dprintk("RPC: %5u running timer\n", task->tk_pid); - if (callback != NULL) - callback(task); - /* Note: we're already in a bh-safe context */ - spin_lock(&queue->lock); + spin_lock(&queue->lock); + if (RPC_IS_QUEUED(task) && task->tk_waitqueue == queue) { + dprintk("RPC: %5u timeout\n", task->tk_pid); + task->tk_status = -ETIMEDOUT; rpc_wake_up_task_queue_locked(queue, task); - spin_unlock(&queue->lock); } + spin_unlock(&queue->lock); smp_mb__before_clear_bit(); clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); smp_mb__after_clear_bit(); @@ -580,6 +559,7 @@ static void rpc_run_timer(unsigned long ptr) static void __rpc_atrun(struct rpc_task *task) { + task->tk_status = 0; } /* @@ -588,7 +568,7 @@ static void __rpc_atrun(struct rpc_task *task) void rpc_delay(struct rpc_task *task, unsigned long delay) { task->tk_timeout = delay; - rpc_sleep_on(&delay_queue, task, NULL, __rpc_atrun); + rpc_sleep_on(&delay_queue, task, __rpc_atrun); } EXPORT_SYMBOL_GPL(rpc_delay); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 6e2772217e55..9bf118c54316 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -188,9 +188,9 @@ out_sleep: task->tk_timeout = 0; task->tk_status = -EAGAIN; if (req && req->rq_ntrans) - rpc_sleep_on(&xprt->resend, task, NULL, NULL); + rpc_sleep_on(&xprt->resend, task, NULL); else - rpc_sleep_on(&xprt->sending, task, NULL, NULL); + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt); @@ -238,9 +238,9 @@ out_sleep: task->tk_timeout = 0; task->tk_status = -EAGAIN; if (req && req->rq_ntrans) - rpc_sleep_on(&xprt->resend, task, NULL, NULL); + rpc_sleep_on(&xprt->resend, task, NULL); else - rpc_sleep_on(&xprt->sending, task, NULL, NULL); + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); @@ -453,7 +453,7 @@ void xprt_wait_for_buffer_space(struct rpc_task *task) struct rpc_xprt *xprt = req->rq_xprt; task->tk_timeout = req->rq_timeout; - rpc_sleep_on(&xprt->pending, task, NULL, NULL); + rpc_sleep_on(&xprt->pending, task, NULL); } EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); @@ -652,7 +652,7 @@ void xprt_connect(struct rpc_task *task) task->tk_rqstp->rq_bytes_sent = 0; task->tk_timeout = xprt->connect_timeout; - rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); + rpc_sleep_on(&xprt->pending, task, xprt_connect_status); xprt->stat.connect_start = jiffies; xprt->ops->connect(task); } @@ -769,15 +769,17 @@ static void xprt_timer(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; + if (task->tk_status != -ETIMEDOUT) + return; dprintk("RPC: %5u xprt_timer\n", task->tk_pid); - spin_lock(&xprt->transport_lock); + spin_lock_bh(&xprt->transport_lock); if (!req->rq_received) { if (xprt->ops->timer) xprt->ops->timer(task); - task->tk_status = -ETIMEDOUT; - } - spin_unlock(&xprt->transport_lock); + } else + task->tk_status = 0; + spin_unlock_bh(&xprt->transport_lock); } /** @@ -862,7 +864,7 @@ void xprt_transmit(struct rpc_task *task) if (!xprt_connected(xprt)) task->tk_status = -ENOTCONN; else if (!req->rq_received) - rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); + rpc_sleep_on(&xprt->pending, task, xprt_timer); spin_unlock_bh(&xprt->transport_lock); return; } @@ -873,7 +875,7 @@ void xprt_transmit(struct rpc_task *task) */ task->tk_status = status; if (status == -ECONNREFUSED) - rpc_sleep_on(&xprt->sending, task, NULL, NULL); + rpc_sleep_on(&xprt->sending, task, NULL); } static inline void do_xprt_reserve(struct rpc_task *task) @@ -893,7 +895,7 @@ static inline void do_xprt_reserve(struct rpc_task *task) dprintk("RPC: waiting for request slot\n"); task->tk_status = -EAGAIN; task->tk_timeout = 0; - rpc_sleep_on(&xprt->backlog, task, NULL, NULL); + rpc_sleep_on(&xprt->backlog, task, NULL); } /** From f6a1cc89309f0ae847a9b6fe418d1c4215e5bc55 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 22 Feb 2008 17:06:55 -0500 Subject: [PATCH 12/80] SUNRPC: Add a (empty for the moment) destructor for rpc_wait_queues Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 ++ fs/nfs/nfs4state.c | 6 +++++- include/linux/sunrpc/sched.h | 1 + net/sunrpc/auth_gss/auth_gss.c | 1 + net/sunrpc/sched.c | 21 ++++++++++++++------- net/sunrpc/xprt.c | 5 +++++ 6 files changed, 28 insertions(+), 8 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index c5c0175898f6..06f064d8fbbe 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -170,6 +170,8 @@ static void nfs4_shutdown_client(struct nfs_client *clp) BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners)); if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) nfs_idmap_delete(clp); + + rpc_destroy_wait_queue(&clp->cl_rpcwaitq); #endif } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a2ef02824aa8..7775435ea7a5 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -292,8 +292,10 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct spin_unlock(&clp->cl_lock); if (sp == new) get_rpccred(cred); - else + else { + rpc_destroy_wait_queue(&new->so_sequence.wait); kfree(new); + } return sp; } @@ -310,6 +312,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp) return; nfs4_remove_state_owner(clp, sp); spin_unlock(&clp->cl_lock); + rpc_destroy_wait_queue(&sp->so_sequence.wait); put_rpccred(cred); kfree(sp); } @@ -529,6 +532,7 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) spin_lock(&clp->cl_lock); nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); spin_unlock(&clp->cl_lock); + rpc_destroy_wait_queue(&lsp->ls_sequence.wait); kfree(lsp); } diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 503a937bdca9..d39729e2b893 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -228,6 +228,7 @@ void rpc_killall_tasks(struct rpc_clnt *); void rpc_execute(struct rpc_task *); void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); +void rpc_destroy_wait_queue(struct rpc_wait_queue *); void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *, rpc_action action); void rpc_wake_up_queued_task(struct rpc_wait_queue *, diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index dc6391bcda11..ef6384961808 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -266,6 +266,7 @@ gss_release_msg(struct gss_upcall_msg *gss_msg) BUG_ON(!list_empty(&gss_msg->list)); if (gss_msg->ctx != NULL) gss_put_ctx(gss_msg->ctx); + rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue); kfree(gss_msg); } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index caf12fd6b6af..86aa897e7b08 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -214,6 +214,11 @@ void rpc_init_wait_queue(struct rpc_wait_queue *queue, const char *qname) } EXPORT_SYMBOL_GPL(rpc_init_wait_queue); +void rpc_destroy_wait_queue(struct rpc_wait_queue *queue) +{ +} +EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); + static int rpc_wait_bit_killable(void *word) { if (fatal_signal_pending(current)) @@ -1020,11 +1025,20 @@ rpc_destroy_mempool(void) kmem_cache_destroy(rpc_task_slabp); if (rpc_buffer_slabp) kmem_cache_destroy(rpc_buffer_slabp); + rpc_destroy_wait_queue(&delay_queue); } int rpc_init_mempool(void) { + /* + * The following is not strictly a mempool initialisation, + * but there is no harm in doing it here + */ + rpc_init_wait_queue(&delay_queue, "delayq"); + if (!rpciod_start()) + goto err_nomem; + rpc_task_slabp = kmem_cache_create("rpc_tasks", sizeof(struct rpc_task), 0, SLAB_HWCACHE_ALIGN, @@ -1045,13 +1059,6 @@ rpc_init_mempool(void) rpc_buffer_slabp); if (!rpc_buffer_mempool) goto err_nomem; - if (!rpciod_start()) - goto err_nomem; - /* - * The following is not strictly a mempool initialisation, - * but there is no harm in doing it here - */ - rpc_init_wait_queue(&delay_queue, "delayq"); return 0; err_nomem: rpc_destroy_mempool(); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 9bf118c54316..85199c647022 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1052,6 +1052,11 @@ static void xprt_destroy(struct kref *kref) xprt->shutdown = 1; del_timer_sync(&xprt->timer); + rpc_destroy_wait_queue(&xprt->binding); + rpc_destroy_wait_queue(&xprt->pending); + rpc_destroy_wait_queue(&xprt->sending); + rpc_destroy_wait_queue(&xprt->resend); + rpc_destroy_wait_queue(&xprt->backlog); /* * Tear down transport state and free the rpc_xprt */ From 36df9aae3158ce8fc4ede241169dc94ac910d884 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 Jul 2007 16:18:52 -0400 Subject: [PATCH 13/80] SUNRPC: Add a timer function to wait queues. This is designed to replace the timeout timer in the individual rpc_tasks. By putting the timer function in the wait queue, we will eventually be able to reduce the total number of timers in use by the RPC subsystem. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 9 +++++++ net/sunrpc/sched.c | 46 ++++++++++++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index d39729e2b893..7751d3a05497 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -33,6 +33,8 @@ struct rpc_wait_queue; struct rpc_wait { struct list_head list; /* wait queue links */ struct list_head links; /* Links to related tasks */ + struct list_head timer_list; /* Timer list */ + unsigned long expires; }; /* @@ -191,6 +193,12 @@ struct rpc_task_setup { #define RPC_PRIORITY_HIGH (1) #define RPC_NR_PRIORITY (1 + RPC_PRIORITY_HIGH - RPC_PRIORITY_LOW) +struct rpc_timer { + struct timer_list timer; + struct list_head list; + unsigned long expires; +}; + /* * RPC synchronization objects */ @@ -203,6 +211,7 @@ struct rpc_wait_queue { unsigned char count; /* # task groups remaining serviced so far */ unsigned char nr; /* # tasks remaining for cookie */ unsigned short qlen; /* total # tasks waiting in queue */ + struct rpc_timer timer_list; #ifdef RPC_DEBUG const char * name; #endif diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 86aa897e7b08..29b1c1441f4d 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -40,6 +40,7 @@ static mempool_t *rpc_buffer_mempool __read_mostly; static void rpc_async_schedule(struct work_struct *); static void rpc_release_task(struct rpc_task *task); +static void __rpc_queue_timer_fn(unsigned long ptr); /* * RPC tasks sit here while waiting for conditions to improve. @@ -59,8 +60,18 @@ struct workqueue_struct *rpciod_workqueue; static void __rpc_disable_timer(struct rpc_task *task) { + if (task->tk_timeout == 0) + return; dprintk("RPC: %5u disabling timer\n", task->tk_pid); task->tk_timeout = 0; + list_del(&task->u.tk_wait.timer_list); +} + +static void +rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires) +{ + queue->timer_list.expires = expires; + mod_timer(&queue->timer_list.timer, expires); } /* @@ -153,7 +164,6 @@ static void __rpc_remove_wait_queue_priority(struct rpc_task *task) list_move(&t->u.tk_wait.list, &task->u.tk_wait.list); list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links); } - list_del(&task->u.tk_wait.list); } /* @@ -162,10 +172,10 @@ static void __rpc_remove_wait_queue_priority(struct rpc_task *task) */ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) { + __rpc_disable_timer(task); if (RPC_IS_PRIORITY(queue)) __rpc_remove_wait_queue_priority(task); - else - list_del(&task->u.tk_wait.list); + list_del(&task->u.tk_wait.list); queue->qlen--; dprintk("RPC: %5u removed from queue %p \"%s\"\n", task->tk_pid, queue, rpc_qname(queue)); @@ -198,6 +208,9 @@ static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const c INIT_LIST_HEAD(&queue->tasks[i]); queue->maxpriority = nr_queues - 1; rpc_reset_waitqueue_priority(queue); + queue->qlen = 0; + setup_timer(&queue->timer_list.timer, __rpc_queue_timer_fn, (unsigned long)queue); + INIT_LIST_HEAD(&queue->timer_list.list); #ifdef RPC_DEBUG queue->name = qname; #endif @@ -216,6 +229,7 @@ EXPORT_SYMBOL_GPL(rpc_init_wait_queue); void rpc_destroy_wait_queue(struct rpc_wait_queue *queue) { + del_timer_sync(&queue->timer_list.timer); } EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); @@ -369,7 +383,6 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task return; } - __rpc_disable_timer(task); __rpc_remove_wait_queue(queue, task); rpc_make_runnable(task); @@ -562,6 +575,31 @@ static void rpc_run_timer(unsigned long ptr) smp_mb__after_clear_bit(); } +static void __rpc_queue_timer_fn(unsigned long ptr) +{ + struct rpc_wait_queue *queue = (struct rpc_wait_queue *)ptr; + struct rpc_task *task, *n; + unsigned long expires, now, timeo; + + spin_lock(&queue->lock); + expires = now = jiffies; + list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) { + timeo = task->u.tk_wait.expires; + if (time_after_eq(now, timeo)) { + list_del_init(&task->u.tk_wait.timer_list); + dprintk("RPC: %5u timeout\n", task->tk_pid); + task->tk_status = -ETIMEDOUT; + rpc_wake_up_task_queue_locked(queue, task); + continue; + } + if (expires == now || time_after(expires, timeo)) + expires = timeo; + } + if (!list_empty(&queue->timer_list.list)) + rpc_set_queue_timer(queue, expires); + spin_unlock(&queue->lock); +} + static void __rpc_atrun(struct rpc_task *task) { task->tk_status = 0; From eb276c0e10187702928aeaa133e1d3dbaf3eafc7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 22 Feb 2008 17:27:59 -0500 Subject: [PATCH 14/80] SUNRPC: Switch tasks to using the rpc_waitqueue's timer function Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 9 +----- net/sunrpc/sched.c | 60 ++++++------------------------------ 2 files changed, 11 insertions(+), 58 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 7751d3a05497..0d7be1642dc0 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -67,12 +67,6 @@ struct rpc_task { const struct rpc_call_ops *tk_ops; void * tk_calldata; - /* - * tk_timer is used for async processing by the RPC scheduling - * primitives. You should not access this directly unless - * you have a pathological interest in kernel oopses. - */ - struct timer_list tk_timer; /* kernel timer */ unsigned long tk_timeout; /* timeout for rpc_sleep() */ unsigned short tk_flags; /* misc flags */ unsigned long tk_runstate; /* Task run status */ @@ -149,8 +143,7 @@ struct rpc_task_setup { #define RPC_TASK_RUNNING 0 #define RPC_TASK_QUEUED 1 #define RPC_TASK_WAKEUP 2 -#define RPC_TASK_HAS_TIMER 3 -#define RPC_TASK_ACTIVE 4 +#define RPC_TASK_ACTIVE 3 #define RPC_IS_RUNNING(t) test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) #define rpc_set_running(t) set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 29b1c1441f4d..043eef4c15a2 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -58,13 +58,15 @@ struct workqueue_struct *rpciod_workqueue; * rpc_run_timer(). */ static void -__rpc_disable_timer(struct rpc_task *task) +__rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task) { if (task->tk_timeout == 0) return; dprintk("RPC: %5u disabling timer\n", task->tk_pid); task->tk_timeout = 0; list_del(&task->u.tk_wait.timer_list); + if (list_empty(&queue->timer_list.list)) + del_timer(&queue->timer_list.timer); } static void @@ -78,7 +80,7 @@ rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires) * Set up a timer for the current task. */ static void -__rpc_add_timer(struct rpc_task *task) +__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) { if (!task->tk_timeout) return; @@ -86,23 +88,10 @@ __rpc_add_timer(struct rpc_task *task) dprintk("RPC: %5u setting alarm for %lu ms\n", task->tk_pid, task->tk_timeout * 1000 / HZ); - set_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); - mod_timer(&task->tk_timer, jiffies + task->tk_timeout); -} - -/* - * Delete any timer for the current task. Because we use del_timer_sync(), - * this function should never be called while holding queue->lock. - */ -static void -rpc_delete_timer(struct rpc_task *task) -{ - if (RPC_IS_QUEUED(task)) - return; - if (test_and_clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate)) { - del_singleshot_timer_sync(&task->tk_timer); - dprintk("RPC: %5u deleting timer\n", task->tk_pid); - } + task->u.tk_wait.expires = jiffies + task->tk_timeout; + if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires)) + rpc_set_queue_timer(queue, task->u.tk_wait.expires); + list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list); } /* @@ -172,7 +161,7 @@ static void __rpc_remove_wait_queue_priority(struct rpc_task *task) */ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) { - __rpc_disable_timer(task); + __rpc_disable_timer(queue, task); if (RPC_IS_PRIORITY(queue)) __rpc_remove_wait_queue_priority(task); list_del(&task->u.tk_wait.list); @@ -344,7 +333,7 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, BUG_ON(task->tk_callback != NULL); task->tk_callback = action; - __rpc_add_timer(task); + __rpc_add_timer(q, task); } void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, @@ -555,26 +544,6 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) } EXPORT_SYMBOL_GPL(rpc_wake_up_status); -/* - * Run a timeout function. - */ -static void rpc_run_timer(unsigned long ptr) -{ - struct rpc_task *task = (struct rpc_task *)ptr; - struct rpc_wait_queue *queue = task->tk_waitqueue; - - spin_lock(&queue->lock); - if (RPC_IS_QUEUED(task) && task->tk_waitqueue == queue) { - dprintk("RPC: %5u timeout\n", task->tk_pid); - task->tk_status = -ETIMEDOUT; - rpc_wake_up_task_queue_locked(queue, task); - } - spin_unlock(&queue->lock); - smp_mb__before_clear_bit(); - clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); - smp_mb__after_clear_bit(); -} - static void __rpc_queue_timer_fn(unsigned long ptr) { struct rpc_wait_queue *queue = (struct rpc_wait_queue *)ptr; @@ -586,7 +555,6 @@ static void __rpc_queue_timer_fn(unsigned long ptr) list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) { timeo = task->u.tk_wait.expires; if (time_after_eq(now, timeo)) { - list_del_init(&task->u.tk_wait.timer_list); dprintk("RPC: %5u timeout\n", task->tk_pid); task->tk_status = -ETIMEDOUT; rpc_wake_up_task_queue_locked(queue, task); @@ -666,10 +634,6 @@ static void __rpc_execute(struct rpc_task *task) BUG_ON(RPC_IS_QUEUED(task)); for (;;) { - /* - * Garbage collection of pending timers... - */ - rpc_delete_timer(task); /* * Execute any pending callback. @@ -838,7 +802,6 @@ EXPORT_SYMBOL_GPL(rpc_free); static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data) { memset(task, 0, sizeof(*task)); - setup_timer(&task->tk_timer, rpc_run_timer, (unsigned long)task); atomic_set(&task->tk_count, 1); task->tk_flags = task_setup_data->flags; task->tk_ops = task_setup_data->callback_ops; @@ -971,9 +934,6 @@ static void rpc_release_task(struct rpc_task *task) } BUG_ON (RPC_IS_QUEUED(task)); - /* Synchronously delete any running timer */ - rpc_delete_timer(task); - #ifdef RPC_DEBUG task->tk_magic = 0; #endif From f5fb7b06e4e4ab18326f067f4317b2016ce18af2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 25 Feb 2008 21:40:50 -0800 Subject: [PATCH 15/80] SUNRPC: Eliminate the now-redundant rpc_start_wakeup() Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 12 +----------- net/sunrpc/sched.c | 8 ++------ 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 0d7be1642dc0..bf69fce84ed0 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -142,8 +142,7 @@ struct rpc_task_setup { #define RPC_TASK_RUNNING 0 #define RPC_TASK_QUEUED 1 -#define RPC_TASK_WAKEUP 2 -#define RPC_TASK_ACTIVE 3 +#define RPC_TASK_ACTIVE 2 #define RPC_IS_RUNNING(t) test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) #define rpc_set_running(t) set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) @@ -165,15 +164,6 @@ struct rpc_task_setup { smp_mb__after_clear_bit(); \ } while (0) -#define rpc_start_wakeup(t) \ - (test_and_set_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate) == 0) -#define rpc_finish_wakeup(t) \ - do { \ - smp_mb__before_clear_bit(); \ - clear_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate); \ - smp_mb__after_clear_bit(); \ - } while (0) - #define RPC_IS_ACTIVATED(t) test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate) /* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 043eef4c15a2..88a686a8e43e 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -384,12 +384,8 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task */ static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) { - if (!RPC_IS_QUEUED(task) || task->tk_waitqueue != queue) - return; - if (rpc_start_wakeup(task)) { - __rpc_do_wake_up_task(queue, task); - rpc_finish_wakeup(task); - } + if (RPC_IS_QUEUED(task) && task->tk_waitqueue == queue) + __rpc_do_wake_up_task(queue, task); } /* From ff2d7db848f8db7cade39e55f78f86d77e0de01a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 25 Feb 2008 21:40:51 -0800 Subject: [PATCH 16/80] SUNRPC: Ensure that we read all available tcp data Don't stop until we run out of data, or we hit an error. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 30e7ac243a90..8bd3b0f73ac0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1073,6 +1073,7 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes) { struct rpc_xprt *xprt; read_descriptor_t rd_desc; + int read; dprintk("RPC: xs_tcp_data_ready...\n"); @@ -1084,8 +1085,10 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes) /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ rd_desc.arg.data = xprt; - rd_desc.count = 65536; - tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); + do { + rd_desc.count = 65536; + read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); + } while (read > 0); out: read_unlock(&sk->sk_callback_lock); } From 5e4424af9a1f062c6451681dff24a26e27741cc6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 25 Feb 2008 21:53:49 -0800 Subject: [PATCH 17/80] SUNRPC: Remove now-redundant RCU-safe rpc_task free path Now that we've tightened up the locking rules for RPC queue wakeups, we can remove the RCU-safe kfree calls... Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 8 +------- fs/nfs/write.c | 16 ++-------------- include/linux/sunrpc/sched.h | 2 -- net/sunrpc/sched.c | 37 ++++++++++++------------------------ 4 files changed, 15 insertions(+), 48 deletions(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 87546cd277d5..be9e8270f4d7 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -58,19 +58,13 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) return p; } -static void nfs_readdata_rcu_free(struct rcu_head *head) +static void nfs_readdata_free(struct nfs_read_data *p) { - struct nfs_read_data *p = container_of(head, struct nfs_read_data, task.u.tk_rcu); if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_rdata_mempool); } -static void nfs_readdata_free(struct nfs_read_data *rdata) -{ - call_rcu_bh(&rdata->task.u.tk_rcu, nfs_readdata_rcu_free); -} - void nfs_readdata_release(void *data) { struct nfs_read_data *rdata = data; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 84495ef90728..30513029c00c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -58,19 +58,13 @@ struct nfs_write_data *nfs_commit_alloc(void) return p; } -static void nfs_commit_rcu_free(struct rcu_head *head) +void nfs_commit_free(struct nfs_write_data *p) { - struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu); if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_commit_mempool); } -void nfs_commit_free(struct nfs_write_data *wdata) -{ - call_rcu_bh(&wdata->task.u.tk_rcu, nfs_commit_rcu_free); -} - struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) { struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); @@ -92,19 +86,13 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) return p; } -static void nfs_writedata_rcu_free(struct rcu_head *head) +static void nfs_writedata_free(struct nfs_write_data *p) { - struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu); if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_wdata_mempool); } -static void nfs_writedata_free(struct nfs_write_data *wdata) -{ - call_rcu_bh(&wdata->task.u.tk_rcu, nfs_writedata_rcu_free); -} - void nfs_writedata_release(void *data) { struct nfs_write_data *wdata = data; diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index bf69fce84ed0..d1a5c8c1a0f1 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -11,7 +11,6 @@ #include #include -#include #include #include #include @@ -77,7 +76,6 @@ struct rpc_task { union { struct work_struct tk_work; /* Async task work queue */ struct rpc_wait tk_wait; /* RPC wait */ - struct rcu_head tk_rcu; /* for task deletion */ } u; unsigned short tk_timeouts; /* maj timeouts */ diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 88a686a8e43e..cae219c8caeb 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -393,11 +393,9 @@ static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct r */ void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) { - rcu_read_lock_bh(); - spin_lock(&queue->lock); + spin_lock_bh(&queue->lock); rpc_wake_up_task_queue_locked(queue, task); - spin_unlock(&queue->lock); - rcu_read_unlock_bh(); + spin_unlock_bh(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task); @@ -470,16 +468,14 @@ struct rpc_task * rpc_wake_up_next(struct rpc_wait_queue *queue) dprintk("RPC: wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue)); - rcu_read_lock_bh(); - spin_lock(&queue->lock); + spin_lock_bh(&queue->lock); if (RPC_IS_PRIORITY(queue)) task = __rpc_wake_up_next_priority(queue); else { task_for_first(task, &queue->tasks[0]) rpc_wake_up_task_queue_locked(queue, task); } - spin_unlock(&queue->lock); - rcu_read_unlock_bh(); + spin_unlock_bh(&queue->lock); return task; } @@ -496,8 +492,7 @@ void rpc_wake_up(struct rpc_wait_queue *queue) struct rpc_task *task, *next; struct list_head *head; - rcu_read_lock_bh(); - spin_lock(&queue->lock); + spin_lock_bh(&queue->lock); head = &queue->tasks[queue->maxpriority]; for (;;) { list_for_each_entry_safe(task, next, head, u.tk_wait.list) @@ -506,8 +501,7 @@ void rpc_wake_up(struct rpc_wait_queue *queue) break; head--; } - spin_unlock(&queue->lock); - rcu_read_unlock_bh(); + spin_unlock_bh(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up); @@ -523,8 +517,7 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) struct rpc_task *task, *next; struct list_head *head; - rcu_read_lock_bh(); - spin_lock(&queue->lock); + spin_lock_bh(&queue->lock); head = &queue->tasks[queue->maxpriority]; for (;;) { list_for_each_entry_safe(task, next, head, u.tk_wait.list) { @@ -535,8 +528,7 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) break; head--; } - spin_unlock(&queue->lock); - rcu_read_unlock_bh(); + spin_unlock_bh(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_status); @@ -848,13 +840,6 @@ rpc_alloc_task(void) return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); } -static void rpc_free_task_rcu(struct rcu_head *rcu) -{ - struct rpc_task *task = container_of(rcu, struct rpc_task, u.tk_rcu); - dprintk("RPC: %5u freeing task\n", task->tk_pid); - mempool_free(task, rpc_task_mempool); -} - /* * Create a new task for the specified client. */ @@ -883,8 +868,10 @@ static void rpc_free_task(struct rpc_task *task) const struct rpc_call_ops *tk_ops = task->tk_ops; void *calldata = task->tk_calldata; - if (task->tk_flags & RPC_TASK_DYNAMIC) - call_rcu_bh(&task->u.tk_rcu, rpc_free_task_rcu); + if (task->tk_flags & RPC_TASK_DYNAMIC) { + dprintk("RPC: %5u freeing task\n", task->tk_pid); + mempool_free(task, rpc_task_mempool); + } rpc_release_calldata(tk_ops, calldata); } From 25337fdc85951dfeac944f16cb565904c619077a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Mar 2008 14:40:14 -0400 Subject: [PATCH 18/80] SUNRPC: Fix a bug in rpcauth_lookup_credcache() The hash bucket is for some reason always being set to zero. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/auth.h | 4 ++-- net/sunrpc/auth.c | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 7a69ca3bebaf..84d5f3a05b16 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -59,8 +59,8 @@ struct rpc_cred { /* * Client authentication handle */ -#define RPC_CREDCACHE_NR 8 -#define RPC_CREDCACHE_MASK (RPC_CREDCACHE_NR - 1) +#define RPC_CREDCACHE_HASHBITS 4 +#define RPC_CREDCACHE_NR (1 << RPC_CREDCACHE_HASHBITS) struct rpc_cred_cache { struct hlist_head hashtable[RPC_CREDCACHE_NR]; spinlock_t lock; diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index eca941ce298b..b38f6ee2f5e2 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -280,7 +281,9 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, struct hlist_node *pos; struct rpc_cred *cred = NULL, *entry, *new; - int nr = 0; + unsigned int nr; + + nr = hash_long(acred->uid, RPC_CREDCACHE_HASHBITS); if (!(flags & RPCAUTH_LOOKUP_ROOTCREDS)) nr = acred->uid & RPC_CREDCACHE_MASK; From af093835774931de898a9baf7b4041fa0d100f77 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Mar 2008 12:12:16 -0400 Subject: [PATCH 19/80] SUNRPC: Fix RPCAUTH_LOOKUP_ROOTCREDS The current RPCAUTH_LOOKUP_ROOTCREDS flag only works for AUTH_SYS authentication, and then only as a special case in the code. This patch removes the auth_sys special casing, and replaces it with generic code. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/auth.h | 4 +-- net/sunrpc/auth.c | 35 +++++++++++++----------- net/sunrpc/auth_unix.c | 54 ++++++++++++++++--------------------- net/sunrpc/sched.c | 4 ++- 4 files changed, 48 insertions(+), 49 deletions(-) diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 84d5f3a05b16..012566a6257b 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -89,7 +89,6 @@ struct rpc_auth { /* Flags for rpcauth_lookupcred() */ #define RPCAUTH_LOOKUP_NEW 0x01 /* Accept an uninitialised cred */ -#define RPCAUTH_LOOKUP_ROOTCREDS 0x02 /* This really ought to go! */ /* * Client authentication ops @@ -136,7 +135,8 @@ void rpcauth_release(struct rpc_auth *); struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int); void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *, int); -struct rpc_cred * rpcauth_bindcred(struct rpc_task *); +void rpcauth_bindcred(struct rpc_task *); +void rpcauth_bind_root_cred(struct rpc_task *); void rpcauth_holdcred(struct rpc_task *); void put_rpccred(struct rpc_cred *); void rpcauth_unbindcred(struct rpc_task *); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index b38f6ee2f5e2..b0f2b2ee7cc0 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -285,9 +285,6 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, nr = hash_long(acred->uid, RPC_CREDCACHE_HASHBITS); - if (!(flags & RPCAUTH_LOOKUP_ROOTCREDS)) - nr = acred->uid & RPC_CREDCACHE_MASK; - rcu_read_lock(); hlist_for_each_entry_rcu(entry, pos, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) @@ -378,30 +375,38 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, } EXPORT_SYMBOL_GPL(rpcauth_init_cred); -struct rpc_cred * -rpcauth_bindcred(struct rpc_task *task) +void +rpcauth_bind_root_cred(struct rpc_task *task) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { - .uid = current->fsuid, - .gid = current->fsgid, - .group_info = current->group_info, + .uid = 0, + .gid = 0, }; struct rpc_cred *ret; - int flags = 0; dprintk("RPC: %5u looking up %s cred\n", task->tk_pid, task->tk_client->cl_auth->au_ops->au_name); - get_group_info(acred.group_info); - if (task->tk_flags & RPC_TASK_ROOTCREDS) - flags |= RPCAUTH_LOOKUP_ROOTCREDS; - ret = auth->au_ops->lookup_cred(auth, &acred, flags); + ret = auth->au_ops->lookup_cred(auth, &acred, 0); + if (!IS_ERR(ret)) + task->tk_msg.rpc_cred = ret; + else + task->tk_status = PTR_ERR(ret); +} + +void +rpcauth_bindcred(struct rpc_task *task) +{ + struct rpc_auth *auth = task->tk_client->cl_auth; + struct rpc_cred *ret; + + dprintk("RPC: %5u looking up %s cred\n", + task->tk_pid, auth->au_ops->au_name); + ret = rpcauth_lookupcred(auth, 0); if (!IS_ERR(ret)) task->tk_msg.rpc_cred = ret; else task->tk_status = PTR_ERR(ret); - put_group_info(acred.group_info); - return ret; } void diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 5ed91e5bcee4..b763710d3dbd 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -60,7 +60,8 @@ static struct rpc_cred * unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { struct unx_cred *cred; - int i; + unsigned int groups = 0; + unsigned int i; dprintk("RPC: allocating UNIX cred for uid %d gid %d\n", acred->uid, acred->gid); @@ -70,21 +71,17 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops); cred->uc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; - if (flags & RPCAUTH_LOOKUP_ROOTCREDS) { - cred->uc_uid = 0; - cred->uc_gid = 0; - cred->uc_gids[0] = NOGROUP; - } else { - int groups = acred->group_info->ngroups; - if (groups > NFS_NGROUPS) - groups = NFS_NGROUPS; - cred->uc_gid = acred->gid; - for (i = 0; i < groups; i++) - cred->uc_gids[i] = GROUP_AT(acred->group_info, i); - if (i < NFS_NGROUPS) - cred->uc_gids[i] = NOGROUP; - } + if (acred->group_info != NULL) + groups = acred->group_info->ngroups; + if (groups > NFS_NGROUPS) + groups = NFS_NGROUPS; + + cred->uc_gid = acred->gid; + for (i = 0; i < groups; i++) + cred->uc_gids[i] = GROUP_AT(acred->group_info, i); + if (i < NFS_NGROUPS) + cred->uc_gids[i] = NOGROUP; return &cred->uc_base; } @@ -118,26 +115,21 @@ static int unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags) { struct unx_cred *cred = container_of(rcred, struct unx_cred, uc_base); - int i; + unsigned int groups = 0; + unsigned int i; - if (!(flags & RPCAUTH_LOOKUP_ROOTCREDS)) { - int groups; - if (cred->uc_uid != acred->uid - || cred->uc_gid != acred->gid) - return 0; + if (cred->uc_uid != acred->uid || cred->uc_gid != acred->gid) + return 0; + if (acred->group_info != NULL) groups = acred->group_info->ngroups; - if (groups > NFS_NGROUPS) - groups = NFS_NGROUPS; - for (i = 0; i < groups ; i++) - if (cred->uc_gids[i] != GROUP_AT(acred->group_info, i)) - return 0; - return 1; - } - return (cred->uc_uid == 0 - && cred->uc_gid == 0 - && cred->uc_gids[0] == (gid_t) NOGROUP); + if (groups > NFS_NGROUPS) + groups = NFS_NGROUPS; + for (i = 0; i < groups ; i++) + if (cred->uc_gids[i] != GROUP_AT(acred->group_info, i)) + return 0; + return 1; } /* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index cae219c8caeb..7db956f6e018 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -821,8 +821,10 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta /* Bind the user cred */ if (task->tk_msg.rpc_cred != NULL) rpcauth_holdcred(task); - else + else if (!(task_setup_data->flags & RPC_TASK_ROOTCREDS)) rpcauth_bindcred(task); + else + rpcauth_bind_root_cred(task); if (task->tk_action == NULL) rpc_call_start(task); } From 4ccda2cdd8d156b6f49440653d5d6997e0facf97 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Mar 2008 16:20:55 -0400 Subject: [PATCH 20/80] SUNRPC: Clean up rpcauth_bindcred() Signed-off-by: Trond Myklebust --- include/linux/sunrpc/auth.h | 4 +--- net/sunrpc/auth.c | 28 ++++++++++++++++++---------- net/sunrpc/sched.c | 11 ++++------- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 012566a6257b..348546c7826c 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -135,9 +135,7 @@ void rpcauth_release(struct rpc_auth *); struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int); void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *, int); -void rpcauth_bindcred(struct rpc_task *); -void rpcauth_bind_root_cred(struct rpc_task *); -void rpcauth_holdcred(struct rpc_task *); +void rpcauth_bindcred(struct rpc_task *, struct rpc_cred *, int); void put_rpccred(struct rpc_cred *); void rpcauth_unbindcred(struct rpc_task *); __be32 * rpcauth_marshcred(struct rpc_task *, __be32 *); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index b0f2b2ee7cc0..1cdb163c4f89 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -375,7 +375,15 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, } EXPORT_SYMBOL_GPL(rpcauth_init_cred); -void +static void +rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred) +{ + task->tk_msg.rpc_cred = get_rpccred(cred); + dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid, + cred->cr_auth->au_ops->au_name, cred); +} + +static void rpcauth_bind_root_cred(struct rpc_task *task) { struct rpc_auth *auth = task->tk_client->cl_auth; @@ -394,8 +402,8 @@ rpcauth_bind_root_cred(struct rpc_task *task) task->tk_status = PTR_ERR(ret); } -void -rpcauth_bindcred(struct rpc_task *task) +static void +rpcauth_bind_new_cred(struct rpc_task *task) { struct rpc_auth *auth = task->tk_client->cl_auth; struct rpc_cred *ret; @@ -410,14 +418,14 @@ rpcauth_bindcred(struct rpc_task *task) } void -rpcauth_holdcred(struct rpc_task *task) +rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; - if (cred != NULL) { - get_rpccred(cred); - dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid, - cred->cr_auth->au_ops->au_name, cred); - } + if (cred != NULL) + rpcauth_generic_bind_cred(task, cred); + else if (flags & RPC_TASK_ROOTCREDS) + rpcauth_bind_root_cred(task); + else + rpcauth_bind_new_cred(task); } void diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 7db956f6e018..6eab9bf94baf 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -817,14 +817,11 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta task->tk_action = rpc_prepare_task; if (task_setup_data->rpc_message != NULL) { - memcpy(&task->tk_msg, task_setup_data->rpc_message, sizeof(task->tk_msg)); + task->tk_msg.rpc_proc = task_setup_data->rpc_message->rpc_proc; + task->tk_msg.rpc_argp = task_setup_data->rpc_message->rpc_argp; + task->tk_msg.rpc_resp = task_setup_data->rpc_message->rpc_resp; /* Bind the user cred */ - if (task->tk_msg.rpc_cred != NULL) - rpcauth_holdcred(task); - else if (!(task_setup_data->flags & RPC_TASK_ROOTCREDS)) - rpcauth_bindcred(task); - else - rpcauth_bind_root_cred(task); + rpcauth_bindcred(task, task_setup_data->rpc_message->rpc_cred, task_setup_data->flags); if (task->tk_action == NULL) rpc_call_start(task); } From 9a559efd4199c9812d339e23cc1b6055366b224f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Mar 2008 12:24:49 -0400 Subject: [PATCH 21/80] SUNRPC: Add a generic RPC credential Add an rpc credential that is not tied to any particular auth mechanism, but that can be cached by NFS, and later used to look up a cred for whichever auth mechanism that turns out to be valid when the RPC call is being made. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/auth.h | 3 + net/sunrpc/Makefile | 2 +- net/sunrpc/auth.c | 1 + net/sunrpc/auth_generic.c | 142 ++++++++++++++++++++++++++++++++++++ 4 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 net/sunrpc/auth_generic.c diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 348546c7826c..70644ed67996 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -125,9 +125,12 @@ extern const struct rpc_authops authunix_ops; extern const struct rpc_authops authnull_ops; void __init rpc_init_authunix(void); +void __init rpc_init_generic_auth(void); void __init rpcauth_init_module(void); void __exit rpcauth_remove_module(void); +void __exit rpc_destroy_generic_auth(void); +struct rpc_cred * rpc_lookup_cred(void); int rpcauth_register(const struct rpc_authops *); int rpcauth_unregister(const struct rpc_authops *); struct rpc_auth * rpcauth_create(rpc_authflavor_t, struct rpc_clnt *); diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 92e1dbe50947..5369aa369b35 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ - auth.o auth_null.o auth_unix.o \ + auth.o auth_null.o auth_unix.o auth_generic.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ rpcb_clnt.o timer.o xdr.o \ sunrpc_syms.o cache.o rpc_pipe.o \ diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 1cdb163c4f89..012f2a320638 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -566,6 +566,7 @@ static struct shrinker rpc_cred_shrinker = { void __init rpcauth_init_module(void) { rpc_init_authunix(); + rpc_init_generic_auth(); register_shrinker(&rpc_cred_shrinker); } diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c new file mode 100644 index 000000000000..6f129b1b20a0 --- /dev/null +++ b/net/sunrpc/auth_generic.c @@ -0,0 +1,142 @@ +/* + * Generic RPC credential + * + * Copyright (C) 2008, Trond Myklebust + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +struct generic_cred { + struct rpc_cred gc_base; + struct auth_cred acred; +}; + +static struct rpc_auth generic_auth; +static struct rpc_cred_cache generic_cred_cache; +static const struct rpc_credops generic_credops; + +/* + * Public call interface + */ +struct rpc_cred *rpc_lookup_cred(void) +{ + return rpcauth_lookupcred(&generic_auth, 0); +} +EXPORT_SYMBOL_GPL(rpc_lookup_cred); + +/* + * Lookup generic creds for current process + */ +static struct rpc_cred * +generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + return rpcauth_lookup_credcache(&generic_auth, acred, flags); +} + +static struct rpc_cred * +generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + struct generic_cred *gcred; + + gcred = kmalloc(sizeof(*gcred), GFP_KERNEL); + if (gcred == NULL) + return ERR_PTR(-ENOMEM); + + rpcauth_init_cred(&gcred->gc_base, acred, &generic_auth, &generic_credops); + gcred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; + + gcred->acred.uid = acred->uid; + gcred->acred.gid = acred->gid; + gcred->acred.group_info = acred->group_info; + if (gcred->acred.group_info != NULL) + get_group_info(gcred->acred.group_info); + + dprintk("RPC: allocated generic cred %p for uid %d gid %d\n", + gcred, acred->uid, acred->gid); + return &gcred->gc_base; +} + +static void +generic_free_cred(struct rpc_cred *cred) +{ + struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base); + + dprintk("RPC: generic_free_cred %p\n", gcred); + if (gcred->acred.group_info != NULL) + put_group_info(gcred->acred.group_info); + kfree(gcred); +} + +static void +generic_free_cred_callback(struct rcu_head *head) +{ + struct rpc_cred *cred = container_of(head, struct rpc_cred, cr_rcu); + generic_free_cred(cred); +} + +static void +generic_destroy_cred(struct rpc_cred *cred) +{ + call_rcu(&cred->cr_rcu, generic_free_cred_callback); +} + +/* + * Match credentials against current process creds. + */ +static int +generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) +{ + struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base); + + if (gcred->acred.uid != acred->uid || + gcred->acred.gid != acred->gid || + gcred->acred.group_info != acred->group_info) + return 0; + return 1; +} + +void __init rpc_init_generic_auth(void) +{ + spin_lock_init(&generic_cred_cache.lock); +} + +void __exit rpc_destroy_generic_auth(void) +{ + rpcauth_clear_credcache(&generic_cred_cache); +} + +static struct rpc_cred_cache generic_cred_cache = { + {{ NULL, },}, +}; + +static const struct rpc_authops generic_auth_ops = { + .owner = THIS_MODULE, +#ifdef RPC_DEBUG + .au_name = "Generic", +#endif + .lookup_cred = generic_lookup_cred, + .crcreate = generic_create_cred, +}; + +static struct rpc_auth generic_auth = { + .au_ops = &generic_auth_ops, + .au_count = ATOMIC_INIT(0), + .au_credcache = &generic_cred_cache, +}; + +static const struct rpc_credops generic_credops = { + .cr_name = "Generic cred", + .crdestroy = generic_destroy_cred, + .crmatch = generic_match, +}; From 5c691044ecbca04dd558fca4c754121689fe1b34 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Mar 2008 16:21:07 -0400 Subject: [PATCH 22/80] SUNRPC: Add an rpc_credop callback for binding a credential to an rpc_task We need the ability to treat 'generic' creds specially, since they want to bind instances of the auth cred instead of binding themselves. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/auth.h | 2 ++ net/sunrpc/auth.c | 5 +++-- net/sunrpc/auth_generic.c | 15 +++++++++++++++ net/sunrpc/auth_gss/auth_gss.c | 2 ++ net/sunrpc/auth_null.c | 1 + net/sunrpc/auth_unix.c | 1 + 6 files changed, 24 insertions(+), 2 deletions(-) diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 70644ed67996..e93cd8aa3eb6 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -112,6 +112,7 @@ struct rpc_credops { void (*crdestroy)(struct rpc_cred *); int (*crmatch)(struct auth_cred *, struct rpc_cred *, int); + void (*crbind)(struct rpc_task *, struct rpc_cred *); __be32 * (*crmarshal)(struct rpc_task *, __be32 *); int (*crrefresh)(struct rpc_task *); __be32 * (*crvalidate)(struct rpc_task *, __be32 *); @@ -139,6 +140,7 @@ struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred * void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *, int); void rpcauth_bindcred(struct rpc_task *, struct rpc_cred *, int); +void rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *); void put_rpccred(struct rpc_cred *); void rpcauth_unbindcred(struct rpc_task *); __be32 * rpcauth_marshcred(struct rpc_task *, __be32 *); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 012f2a320638..d65dd7940102 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -375,13 +375,14 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, } EXPORT_SYMBOL_GPL(rpcauth_init_cred); -static void +void rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred) { task->tk_msg.rpc_cred = get_rpccred(cred); dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid, cred->cr_auth->au_ops->au_name, cred); } +EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred); static void rpcauth_bind_root_cred(struct rpc_task *task) @@ -421,7 +422,7 @@ void rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) { if (cred != NULL) - rpcauth_generic_bind_cred(task, cred); + cred->cr_ops->crbind(task, cred); else if (flags & RPC_TASK_ROOTCREDS) rpcauth_bind_root_cred(task); else diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 6f129b1b20a0..6a3f77c9e4d3 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -35,6 +35,20 @@ struct rpc_cred *rpc_lookup_cred(void) } EXPORT_SYMBOL_GPL(rpc_lookup_cred); +static void +generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred) +{ + struct rpc_auth *auth = task->tk_client->cl_auth; + struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred; + struct rpc_cred *ret; + + ret = auth->au_ops->lookup_cred(auth, acred, 0); + if (!IS_ERR(ret)) + task->tk_msg.rpc_cred = ret; + else + task->tk_status = PTR_ERR(ret); +} + /* * Lookup generic creds for current process */ @@ -138,5 +152,6 @@ static struct rpc_auth generic_auth = { static const struct rpc_credops generic_credops = { .cr_name = "Generic cred", .crdestroy = generic_destroy_cred, + .crbind = generic_bind_cred, .crmatch = generic_match, }; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index ef6384961808..d34f6dfc7516 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1300,6 +1300,7 @@ static const struct rpc_credops gss_credops = { .cr_name = "AUTH_GSS", .crdestroy = gss_destroy_cred, .cr_init = gss_cred_init, + .crbind = rpcauth_generic_bind_cred, .crmatch = gss_match, .crmarshal = gss_marshal, .crrefresh = gss_refresh, @@ -1311,6 +1312,7 @@ static const struct rpc_credops gss_credops = { static const struct rpc_credops gss_nullops = { .cr_name = "AUTH_GSS", .crdestroy = gss_destroy_cred, + .crbind = rpcauth_generic_bind_cred, .crmatch = gss_match, .crmarshal = gss_marshal, .crrefresh = gss_refresh_null, diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 537d0e8589dd..3c26c18df0de 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -125,6 +125,7 @@ static const struct rpc_credops null_credops = { .cr_name = "AUTH_NULL", .crdestroy = nul_destroy_cred, + .crbind = rpcauth_generic_bind_cred, .crmatch = nul_match, .crmarshal = nul_marshal, .crrefresh = nul_refresh, diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index b763710d3dbd..04e936a56fb2 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -237,6 +237,7 @@ static const struct rpc_credops unix_credops = { .cr_name = "AUTH_UNIX", .crdestroy = unx_destroy_cred, + .crbind = rpcauth_generic_bind_cred, .crmatch = unx_match, .crmarshal = unx_marshal, .crrefresh = unx_refresh, From 98a8e3239427051f5d44f2025b398bdcc3918f37 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Mar 2008 12:25:28 -0400 Subject: [PATCH 23/80] SUNRPC: Add a helper rpcauth_lookup_generic_cred() The NFSv4 protocol allows clients to negotiate security protocols on the fly in the case where an administrator on the server changes the export settings and/or in the case where we may have a filesystem migration event. Instead of having the NFS client code cache credentials that are tied to a particular AUTH method it is therefore preferable to have a generic credential that can be converted into whatever AUTH is in use by the RPC client when the read/write/sillyrename/... is put on the wire. We do this by means of the new "generic" credential, which basically just caches the minimal information that is needed to look up an RPCSEC_GSS, AUTH_SYS, or AUTH_NULL credential. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- fs/nfs/inode.c | 2 +- fs/nfs/nfs4proc.c | 8 ++++---- fs/nfs/unlink.c | 2 +- net/sunrpc/auth.c | 1 - 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 6cea7479c5b4..d583654a0b39 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1966,7 +1966,7 @@ force_lookup: if (!NFS_PROTO(inode)->access) goto out_notsup; - cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); + cred = rpc_lookup_cred(); if (!IS_ERR(cred)) { res = nfs_do_access(inode, cred, mask); put_rpccred(cred); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c49f6d8b42d2..15f787355d27 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -613,7 +613,7 @@ int nfs_open(struct inode *inode, struct file *filp) struct nfs_open_context *ctx; struct rpc_cred *cred; - cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); + cred = rpc_lookup_cred(); if (IS_ERR(cred)) return PTR_ERR(cred); ctx = alloc_nfs_open_context(filp->f_path.mnt, filp->f_path.dentry, cred); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bbb0d58ee6ac..f38d0573be18 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1408,7 +1408,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) BUG_ON(nd->intent.open.flags & O_CREAT); } - cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); + cred = rpc_lookup_cred(); if (IS_ERR(cred)) return (struct dentry *)cred; parent = dentry->d_parent; @@ -1443,7 +1443,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st struct rpc_cred *cred; struct nfs4_state *state; - cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); + cred = rpc_lookup_cred(); if (IS_ERR(cred)) return PTR_ERR(cred); state = nfs4_do_open(dir, &path, openflags, NULL, cred); @@ -1660,7 +1660,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); - cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); + cred = rpc_lookup_cred(); if (IS_ERR(cred)) return PTR_ERR(cred); @@ -1896,7 +1896,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, struct rpc_cred *cred; int status = 0; - cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); + cred = rpc_lookup_cred(); if (IS_ERR(cred)) { status = PTR_ERR(cred); goto out; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 757415363422..3adf8b266461 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -234,7 +234,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) if (data == NULL) goto out; - data->cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); + data->cred = rpc_lookup_cred(); if (IS_ERR(data->cred)) { status = PTR_ERR(data->cred); goto out_free; diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index d65dd7940102..0632cd0a1ad7 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -356,7 +356,6 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags) put_group_info(acred.group_info); return ret; } -EXPORT_SYMBOL_GPL(rpcauth_lookupcred); void rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, From 2f42b5d043ee271d1e5d30ecd77186b6c4d4e534 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 13 Mar 2008 15:26:30 +0200 Subject: [PATCH 24/80] NFS: fix encode_fsinfo_maxsz The previous value was not taking into account space for bitmap array size. Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index db1ed9c46ede..37421dd4805d 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -110,7 +110,7 @@ static int nfs4_stat_to_errno(int); #define decode_savefh_maxsz (op_decode_hdr_maxsz) #define encode_restorefh_maxsz (op_encode_hdr_maxsz) #define decode_restorefh_maxsz (op_decode_hdr_maxsz) -#define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2) +#define encode_fsinfo_maxsz (encode_getattr_maxsz) #define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) #define decode_renew_maxsz (op_decode_hdr_maxsz) From 6d884e8fc8114dc8877218f06a9a9a1d801901e4 Mon Sep 17 00:00:00 2001 From: Fred Date: Wed, 19 Mar 2008 11:24:38 -0400 Subject: [PATCH 25/80] nfs: nfs_redirty_request Both flush functions have the same error handling routine. Pull it out as a function. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4cb88df12f83..ce40cadb15db 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -282,8 +282,6 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, spin_unlock(&inode->i_lock); if (!nfs_pageio_add_request(pgio, req)) { nfs_redirty_request(req); - nfs_end_page_writeback(page); - nfs_clear_page_tag_locked(req); return pgio->pg_error; } return 0; @@ -402,7 +400,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) } static void -nfs_redirty_request(struct nfs_page *req) +nfs_mark_request_dirty(struct nfs_page *req) { __set_page_dirty_nobuffers(req->wb_page); } @@ -456,7 +454,7 @@ int nfs_reschedule_unstable_write(struct nfs_page *req) return 1; } if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { - nfs_redirty_request(req); + nfs_mark_request_dirty(req); return 1; } return 0; @@ -847,6 +845,17 @@ static void nfs_write_rpcsetup(struct nfs_page *req, rpc_put_task(task); } +/* If a nfs_flush_* function fails, it should remove reqs from @head and + * call this on each, which will prepare them to be retried on next + * writeback using standard nfs. + */ +static void nfs_redirty_request(struct nfs_page *req) +{ + nfs_mark_request_dirty(req); + nfs_end_page_writeback(req->wb_page); + nfs_clear_page_tag_locked(req); +} + /* * Generate multiple small requests to write out a single * contiguous dirty area on one page. @@ -902,8 +911,6 @@ out_bad: nfs_writedata_release(data); } nfs_redirty_request(req); - nfs_end_page_writeback(req->wb_page); - nfs_clear_page_tag_locked(req); return -ENOMEM; } @@ -944,8 +951,6 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_redirty_request(req); - nfs_end_page_writeback(req->wb_page); - nfs_clear_page_tag_locked(req); } return -ENOMEM; } @@ -1298,7 +1303,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) } /* We have a mismatch. Write the page again */ dprintk(" mismatch\n"); - nfs_redirty_request(req); + nfs_mark_request_dirty(req); next: nfs_clear_page_tag_locked(req); } From 4af68bffac444a23f027e18ff244101e63b79227 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 19 Mar 2008 11:54:04 -0400 Subject: [PATCH 26/80] nfs: remove duplicate initializations of nfs_read_data field Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index ab2f7d233e01..d333f5fedca1 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -251,7 +251,6 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne data = nfs_readdata_alloc(1); if (!data) goto out_bad; - INIT_LIST_HEAD(&data->pages); list_add(&data->pages, &list); requests++; nbytes -= len; @@ -298,7 +297,6 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned if (!data) goto out_bad; - INIT_LIST_HEAD(&data->pages); pages = data->pagevec; while (!list_empty(head)) { req = nfs_list_entry(head->next); From caa02bd540618e4b447a1f776363ba27c4c79090 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Feb 2008 14:49:59 -0500 Subject: [PATCH 27/80] NFS: clean up short packet handling for NFSv2 readdir Currently, the NFS readdir decoders have a workaround for buggy servers that send an empty readdir response with the EOF bit unset. If the server sends a malformed response in some cases, this workaround kicks in and just returns an empty response rather than returning a proper error to the caller. This patch does 3 things: 1) have malformed responses with no entries return error (-EIO) 2) preserve existing workaround for servers that send empty responses with the EOF marker unset. 3) Add some comments to clarify the logic in nfs_xdr_readdirres(). Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs2xdr.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 1f7ea675e0c5..86a80b33ec82 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -428,7 +428,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) size_t hdrlen; unsigned int pglen, recvd; u32 len; - int status, nr; + int status, nr = 0; __be32 *end, *entry, *kaddr; if ((status = ntohl(*p++))) @@ -452,7 +452,12 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) kaddr = p = kmap_atomic(*page, KM_USER0); end = (__be32 *)((char *)p + pglen); entry = p; - for (nr = 0; *p++; nr++) { + + /* Make sure the packet actually has a value_follows and EOF entry */ + if ((entry + 1) > end) + goto short_pkt; + + for (; *p++; nr++) { if (p + 2 > end) goto short_pkt; p++; /* fileid */ @@ -467,18 +472,32 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) goto short_pkt; entry = p; } - if (!nr && (entry[0] != 0 || entry[1] == 0)) - goto short_pkt; + + /* + * Apparently some server sends responses that are a valid size, but + * contain no entries, and have value_follows==0 and EOF==0. For + * those, just set the EOF marker. + */ + if (!nr && entry[1] == 0) { + dprintk("NFS: readdir reply truncated!\n"); + entry[1] = 1; + } out: kunmap_atomic(kaddr, KM_USER0); return nr; short_pkt: + /* + * When we get a short packet there are 2 possibilities. We can + * return an error, or fix up the response to look like a valid + * response and return what we have so far. If there are no + * entries and the packet was short, then return -EIO. If there + * are valid entries in the response, return them and pretend that + * the call was successful, but incomplete. The caller can retry the + * readdir starting at the last cookie. + */ entry[0] = entry[1] = 0; - /* truncate listing ? */ - if (!nr) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } + if (!nr) + nr = -errno_NFSERR_IO; goto out; err_unmap: nr = -errno_NFSERR_IO; From 643f81115baca3630e544f6874567648b605efae Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Feb 2008 14:50:00 -0500 Subject: [PATCH 28/80] NFS: clean up short packet handling for NFSv3 readdir Currently, the NFS readdir decoders have a workaround for buggy servers that send an empty readdir response with the EOF bit unset. If the server sends a malformed response in some cases, this workaround kicks in and just returns an empty response rather than returning a proper error to the caller. This patch does 3 things: 1) have malformed responses with no entries return error (-EIO) 2) preserve existing workaround for servers that send empty responses with the EOF marker unset. 3) Add some comments to clarify the logic in nfs3_xdr_readdirres(). Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs3xdr.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 3917e2fa4e40..fb03048ac650 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -508,7 +508,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res struct page **page; size_t hdrlen; u32 len, recvd, pglen; - int status, nr; + int status, nr = 0; __be32 *entry, *end, *kaddr; status = ntohl(*p++); @@ -542,7 +542,12 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res kaddr = p = kmap_atomic(*page, KM_USER0); end = (__be32 *)((char *)p + pglen); entry = p; - for (nr = 0; *p++; nr++) { + + /* Make sure the packet actually has a value_follows and EOF entry */ + if ((entry + 1) > end) + goto short_pkt; + + for (; *p++; nr++) { if (p + 3 > end) goto short_pkt; p += 2; /* inode # */ @@ -581,18 +586,32 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res goto short_pkt; entry = p; } - if (!nr && (entry[0] != 0 || entry[1] == 0)) - goto short_pkt; + + /* + * Apparently some server sends responses that are a valid size, but + * contain no entries, and have value_follows==0 and EOF==0. For + * those, just set the EOF marker. + */ + if (!nr && entry[1] == 0) { + dprintk("NFS: readdir reply truncated!\n"); + entry[1] = 1; + } out: kunmap_atomic(kaddr, KM_USER0); return nr; short_pkt: + /* + * When we get a short packet there are 2 possibilities. We can + * return an error, or fix up the response to look like a valid + * response and return what we have so far. If there are no + * entries and the packet was short, then return -EIO. If there + * are valid entries in the response, return them and pretend that + * the call was successful, but incomplete. The caller can retry the + * readdir starting at the last cookie. + */ entry[0] = entry[1] = 0; - /* truncate listing ? */ - if (!nr) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } + if (!nr) + nr = -errno_NFSERR_IO; goto out; err_unmap: nr = -errno_NFSERR_IO; From 7bda2cdf484a00e52b0ed925e99d4bf4696b2c7a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Feb 2008 14:50:01 -0500 Subject: [PATCH 29/80] NFS: clean up short packet handling for NFSv4 readdir Currently, the NFS readdir decoders have a workaround for buggy servers that send an empty readdir response with the EOF bit unset. If the server sends a malformed response in some cases, this workaround kicks in and just returns an empty response rather than returning a proper error to the caller. This patch does 3 things: 1) have malformed responses with no entries return error (-EIO) 2) preserve existing workaround for servers that send empty responses with the EOF marker unset. 3) Add some comments to clarify the logic in decode_readdir(). Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 37421dd4805d..2b519f6325f9 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3481,7 +3481,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n size_t hdrlen; u32 recvd, pglen = rcvbuf->page_len; __be32 *end, *entry, *p, *kaddr; - unsigned int nr; + unsigned int nr = 0; int status; status = decode_op_hdr(xdr, OP_READDIR); @@ -3505,7 +3505,12 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n kaddr = p = kmap_atomic(page, KM_USER0); end = p + ((pglen + readdir->pgbase) >> 2); entry = p; - for (nr = 0; *p++; nr++) { + + /* Make sure the packet actually has a value_follows and EOF entry */ + if ((entry + 1) > end) + goto short_pkt; + + for (; *p++; nr++) { u32 len, attrlen, xlen; if (end - p < 3) goto short_pkt; @@ -3532,20 +3537,32 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n p += attrlen; /* attributes */ entry = p; } - if (!nr && (entry[0] != 0 || entry[1] == 0)) - goto short_pkt; + /* + * Apparently some server sends responses that are a valid size, but + * contain no entries, and have value_follows==0 and EOF==0. For + * those, just set the EOF marker. + */ + if (!nr && entry[1] == 0) { + dprintk("NFS: readdir reply truncated!\n"); + entry[1] = 1; + } out: kunmap_atomic(kaddr, KM_USER0); return 0; short_pkt: + /* + * When we get a short packet there are 2 possibilities. We can + * return an error, or fix up the response to look like a valid + * response and return what we have so far. If there are no + * entries and the packet was short, then return -EIO. If there + * are valid entries in the response, return them and pretend that + * the call was successful, but incomplete. The caller can retry the + * readdir starting at the last cookie. + */ dprintk("%s: short packet at entry %d\n", __FUNCTION__, nr); entry[0] = entry[1] = 0; - /* truncate listing ? */ - if (!nr) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } - goto out; + if (nr) + goto out; err_unmap: kunmap_atomic(kaddr, KM_USER0); return -errno_NFSERR_IO; From 2d76743227028a26f884284aade03d1e43f4f249 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:10:08 -0400 Subject: [PATCH 30/80] NFS: numeric mount parameters are unsigned Clean up: use %u instead of %d when displaying NFS mount options. Nit: Fix reporting of "namlen=" option in nfs_show_mount_stats. The mount option is called "namlen" without the "e". Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index fcf4b982c885..53a67c6d4d2a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -463,17 +463,17 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, const struct proc_nfs_info *nfs_infop; struct nfs_client *clp = nfss->nfs_client; - seq_printf(m, ",vers=%d", clp->rpc_ops->version); - seq_printf(m, ",rsize=%d", nfss->rsize); - seq_printf(m, ",wsize=%d", nfss->wsize); + seq_printf(m, ",vers=%u", clp->rpc_ops->version); + seq_printf(m, ",rsize=%u", nfss->rsize); + seq_printf(m, ",wsize=%u", nfss->wsize); if (nfss->acregmin != 3*HZ || showdefaults) - seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ); + seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); if (nfss->acregmax != 60*HZ || showdefaults) - seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ); + seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ); if (nfss->acdirmin != 30*HZ || showdefaults) - seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ); + seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); if (nfss->acdirmax != 60*HZ || showdefaults) - seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ); + seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { if (nfss->flags & nfs_infop->flag) seq_puts(m, nfs_infop->str); @@ -529,10 +529,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, "\n\tcaps:\t"); seq_printf(m, "caps=0x%x", nfss->caps); - seq_printf(m, ",wtmult=%d", nfss->wtmult); - seq_printf(m, ",dtsize=%d", nfss->dtsize); - seq_printf(m, ",bsize=%d", nfss->bsize); - seq_printf(m, ",namelen=%d", nfss->namelen); + seq_printf(m, ",wtmult=%u", nfss->wtmult); + seq_printf(m, ",dtsize=%u", nfss->dtsize); + seq_printf(m, ",bsize=%u", nfss->bsize); + seq_printf(m, ",namlen=%u", nfss->namelen); #ifdef CONFIG_NFS_V4 if (nfss->nfs_client->rpc_ops->version == 4) { @@ -546,9 +546,9 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) /* * Display security flavor in effect for this mount */ - seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor); + seq_printf(m, "\n\tsec:\tflavor=%u", auth->au_ops->au_flavor); if (auth->au_flavor) - seq_printf(m, ",pseudoflavor=%d", auth->au_flavor); + seq_printf(m, ",pseudoflavor=%u", auth->au_flavor); /* * Display superblock I/O counters From 78fa701f341564e60461de91cd08ff5f7fb09b31 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:10:15 -0400 Subject: [PATCH 31/80] NFS: Fix up data types of fields in nfs_parsed_mount_options Clean up: make data types of fields in nfs_parsed_mount_options more consistent with other uses. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 4c1122a13844..e89688a955bf 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -46,9 +46,9 @@ struct nfs_parsed_mount_data { struct sockaddr_storage address; size_t addrlen; char *hostname; - unsigned int version; + u32 version; unsigned short port; - int protocol; + unsigned short protocol; } mount_server; struct { @@ -56,7 +56,7 @@ struct nfs_parsed_mount_data { size_t addrlen; char *hostname; char *export_path; - int protocol; + unsigned short protocol; } nfs_server; struct security_mnt_opts lsm_opts; From f22d6d79fe227245363a8849ea8c85fe6c6598c3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:10:22 -0400 Subject: [PATCH 32/80] NFS: Save the value of the "port=" mount option During a remount based on the mount options displayed in /proc/mounts, we want to preserve the original behavior of the mount request. Let's save the original setting of the "port=" mount option in the mount's nfs_server structure. This allows us to simplify the default behavior of port setting for NFSv4 mounts: by default, NFSv2/3 mounts first try an RPC bind to determine the NFS server's port, unless the user specified the "port=" mount option; Users can force the client to skip the RPC bind by explicitly specifying "port=". NFSv4, by contrast, assumes the NFS server port is 2049 and skips the RPC bind, unless the user specifies "port=". Users can force an RPC bind for NFSv4 by explicitly specifying "port=0". I added a couple of extra comments to clarify this behavior. Signed-off-by: Chuck Lever Cc: Miklos Szeredi Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 4 ++++ fs/nfs/internal.h | 1 + fs/nfs/super.c | 37 ++++++------------------------------- include/linux/nfs_fs_sb.h | 1 + 4 files changed, 12 insertions(+), 31 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 06f064d8fbbe..874018113d0e 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -682,6 +682,8 @@ static int nfs_init_server(struct nfs_server *server, if (error < 0) goto error; + server->port = data->nfs_server.port; + error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); if (error < 0) goto error; @@ -1064,6 +1066,8 @@ static int nfs4_init_server(struct nfs_server *server, server->acdirmin = data->acdirmin * HZ; server->acdirmax = data->acdirmax * HZ; + server->port = data->nfs_server.port; + error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); error: diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e89688a955bf..999ad8ee0645 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -56,6 +56,7 @@ struct nfs_parsed_mount_data { size_t addrlen; char *hostname; char *export_path; + unsigned short port; unsigned short protocol; } nfs_server; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 53a67c6d4d2a..3c6f53aa7317 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -685,7 +685,6 @@ static int nfs_parse_mount_options(char *raw, struct nfs_parsed_mount_data *mnt) { char *p, *string, *secdata; - unsigned short port = 0; int rc; if (!raw) { @@ -800,7 +799,7 @@ static int nfs_parse_mount_options(char *raw, return 0; if (option < 0 || option > 65535) return 0; - port = option; + mnt->nfs_server.port = option; break; case Opt_rsize: if (match_int(args, &mnt->rsize)) @@ -1050,7 +1049,8 @@ static int nfs_parse_mount_options(char *raw, } } - nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, port); + nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, + mnt->nfs_server.port); return 1; @@ -1171,7 +1171,9 @@ static int nfs_validate_mount_data(void *options, args->acregmax = 60; args->acdirmin = 30; args->acdirmax = 60; + args->mount_server.port = 0; /* autobind unless user sets port */ args->mount_server.protocol = XPRT_TRANSPORT_UDP; + args->nfs_server.port = 0; /* autobind unless user sets port */ args->nfs_server.protocol = XPRT_TRANSPORT_TCP; switch (data->version) { @@ -1707,28 +1709,6 @@ static void nfs4_fill_super(struct super_block *sb) nfs_initialise_sb(sb); } -/* - * If the user didn't specify a port, set the port number to - * the NFS version 4 default port. - */ -static void nfs4_default_port(struct sockaddr *sap) -{ - switch (sap->sa_family) { - case AF_INET: { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - if (ap->sin_port == 0) - ap->sin_port = htons(NFS_PORT); - break; - } - case AF_INET6: { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - if (ap->sin6_port == 0) - ap->sin6_port = htons(NFS_PORT); - break; - } - } -} - /* * Validate NFSv4 mount options */ @@ -1753,6 +1733,7 @@ static int nfs4_validate_mount_data(void *options, args->acregmax = 60; args->acdirmin = 30; args->acdirmax = 60; + args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ args->nfs_server.protocol = XPRT_TRANSPORT_TCP; switch (data->version) { @@ -1769,9 +1750,6 @@ static int nfs4_validate_mount_data(void *options, &args->nfs_server.address)) goto out_no_address; - nfs4_default_port((struct sockaddr *) - &args->nfs_server.address); - switch (data->auth_flavourlen) { case 0: args->auth_flavors[0] = RPC_AUTH_UNIX; @@ -1829,9 +1807,6 @@ static int nfs4_validate_mount_data(void *options, &args->nfs_server.address)) return -EINVAL; - nfs4_default_port((struct sockaddr *) - &args->nfs_server.address); - switch (args->auth_flavor_len) { case 0: args->auth_flavors[0] = RPC_AUTH_UNIX; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 3423c6761bf7..670e5c7222d0 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -93,6 +93,7 @@ struct nfs_server { unsigned int wpages; /* write size (in pages) */ unsigned int wtmult; /* server disk block size */ unsigned int dtsize; /* readdir size */ + unsigned short port; /* "port=" setting */ unsigned int bsize; /* server block size */ unsigned int acregmin; /* attr cache timeouts */ unsigned int acregmax; From 3f8400d1f1f9d5fb175bdbf6236e564dde454f28 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:10:30 -0400 Subject: [PATCH 33/80] NFS: Save the values of the "mount*=" mount options Save the value of the mountproto= mountport= mountvers= and mountaddr= options so that these values can be displayed later via nfs_show_options(). This preserves the intent of the original mount options, should the file system need to be remounted based on what's displayed in /proc/mounts. Signed-off-by: Chuck Lever Cc: Miklos Szeredi Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 10 ++++++++++ include/linux/nfs_fs_sb.h | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 874018113d0e..93dfd75aba7c 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -688,6 +688,16 @@ static int nfs_init_server(struct nfs_server *server, if (error < 0) goto error; + /* Preserve the values of mount_server-related mount options */ + if (data->mount_server.addrlen) { + memcpy(&server->mountd_address, &data->mount_server.address, + data->mount_server.addrlen); + server->mountd_addrlen = data->mount_server.addrlen; + } + server->mountd_version = data->mount_server.version; + server->mountd_port = data->mount_server.port; + server->mountd_protocol = data->mount_server.protocol; + server->namelen = data->namlen; /* Create a client RPC handle for the NFSv3 ACL management interface */ nfs_init_server_aclclient(server); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 670e5c7222d0..ac7e4fb943ea 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -118,6 +118,13 @@ struct nfs_server { atomic_t active; /* Keep trace of any activity to this server */ wait_queue_head_t active_wq; /* Wait for any activity to stop */ + + /* mountd-related mount options */ + struct sockaddr_storage mountd_address; + size_t mountd_addrlen; + u32 mountd_version; + unsigned short mountd_port; + unsigned short mountd_protocol; }; /* Server capabilities */ From 82d101d58a2312297ee79f96d44c1d8c7fe1032d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:10:37 -0400 Subject: [PATCH 34/80] NFS: Show most mount options via nfs_show_options() Display all mount options in /proc/mount which may be needed to reconstruct a previous mount. Signed-off-by: Chuck Lever Cc: Miklos Szeredi Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 3c6f53aa7317..c99ca1f992ce 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -441,10 +441,52 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) return sec_flavours[i].str; } +static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address; + + switch (sap->sa_family) { + case AF_INET: { + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + seq_printf(m, ",mountaddr=" NIPQUAD_FMT, + NIPQUAD(sin->sin_addr.s_addr)); + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + seq_printf(m, ",mountaddr=" NIP6_FMT, + NIP6(sin6->sin6_addr)); + break; + } + default: + if (showdefaults) + seq_printf(m, ",mountaddr=unspecified"); + } + + if (nfss->mountd_version || showdefaults) + seq_printf(m, ",mountvers=%u", nfss->mountd_version); + if (nfss->mountd_port || showdefaults) + seq_printf(m, ",mountport=%u", nfss->mountd_port); + + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, ",mountproto=udp"); + break; + case IPPROTO_TCP: + seq_printf(m, ",mountproto=tcp"); + break; + default: + if (showdefaults) + seq_printf(m, ",mountproto=auto"); + } +} + /* * Describe the mount options in force on this server representation */ -static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults) +static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) { static const struct proc_nfs_info { int flag; @@ -452,6 +494,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, const char *nostr; } nfs_info[] = { { NFS_MOUNT_SOFT, ",soft", ",hard" }, + { NFS_MOUNT_INTR, ",intr", ",nointr" }, + { NFS_MOUNT_POSIX, ",posix", "" }, { NFS_MOUNT_NOCTO, ",nocto", "" }, { NFS_MOUNT_NOAC, ",noac", "" }, { NFS_MOUNT_NONLM, ",nolock", "" }, @@ -462,10 +506,14 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, }; const struct proc_nfs_info *nfs_infop; struct nfs_client *clp = nfss->nfs_client; + u32 version = clp->rpc_ops->version; - seq_printf(m, ",vers=%u", clp->rpc_ops->version); + seq_printf(m, ",vers=%u", version); seq_printf(m, ",rsize=%u", nfss->rsize); seq_printf(m, ",wsize=%u", nfss->wsize); + if (nfss->bsize != 0) + seq_printf(m, ",bsize=%u", nfss->bsize); + seq_printf(m, ",namlen=%u", nfss->namelen); if (nfss->acregmin != 3*HZ || showdefaults) seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); if (nfss->acregmax != 60*HZ || showdefaults) @@ -482,9 +530,24 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, } seq_printf(m, ",proto=%s", rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO)); + if (version == 4) { + if (nfss->port != NFS_PORT) + seq_printf(m, ",port=%u", nfss->port); + } else + if (nfss->port) + seq_printf(m, ",port=%u", nfss->port); + seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ); seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries); seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); + + if (version != 4) + nfs_show_mountd_options(m, nfss, showdefaults); + +#ifdef CONFIG_NFS_V4 + if (clp->rpc_ops->version == 4) + seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); +#endif } /* From ecfc555a8327ff09b07066d73a98c04115007eec Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:14:56 -0400 Subject: [PATCH 35/80] NFS: Always enable NFS direct I/O Since O_DIRECT is a standard feature that is enabled in most distros, eliminate the CONFIG_NFS_DIRECTIO build option, and change the fs/nfs/Makefile to always build in the NFS direct I/O engine. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/Kconfig | 24 ------------------------ fs/nfs/Makefile | 3 +-- fs/nfs/file.c | 6 ------ fs/nfs/internal.h | 5 ----- 4 files changed, 1 insertion(+), 37 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index d7312825592b..55c1a0374e13 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1637,30 +1637,6 @@ config NFS_V4 If unsure, say N. -config NFS_DIRECTIO - bool "Allow direct I/O on NFS files" - depends on NFS_FS - help - This option enables applications to perform uncached I/O on files - in NFS file systems using the O_DIRECT open() flag. When O_DIRECT - is set for a file, its data is not cached in the system's page - cache. Data is moved to and from user-level application buffers - directly. Unlike local disk-based file systems, NFS O_DIRECT has - no alignment restrictions. - - Unless your program is designed to use O_DIRECT properly, you are - much better off allowing the NFS client to manage data caching for - you. Misusing O_DIRECT can cause poor server performance or network - storms. This kernel build option defaults OFF to avoid exposing - system administrators unwittingly to a potentially hazardous - feature. - - For more details on NFS O_DIRECT, see fs/nfs/direct.c. - - If unsure, say N. This reduces the size of the NFS client, and - causes open() to return EINVAL if a file residing in NFS is - opened with the O_DIRECT flag. - config NFSD tristate "NFS server support" depends on INET diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index df0f41e09885..ac6170c594a3 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ - pagelist.o proc.o read.o symlink.o unlink.o \ + direct.o pagelist.o proc.o read.o symlink.o unlink.o \ write.o namespace.o mount_clnt.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o @@ -14,5 +14,4 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o -nfs-$(CONFIG_NFS_DIRECTIO) += direct.o nfs-$(CONFIG_SYSCTL) += sysctl.o diff --git a/fs/nfs/file.c b/fs/nfs/file.c index ef57a5ae5904..10e8b807e7ff 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -234,10 +234,8 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, ssize_t result; size_t count = iov_length(iov, nr_segs); -#ifdef CONFIG_NFS_DIRECTIO if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_read(iocb, iov, nr_segs, pos); -#endif dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, @@ -383,9 +381,7 @@ const struct address_space_operations nfs_file_aops = { .write_end = nfs_write_end, .invalidatepage = nfs_invalidate_page, .releasepage = nfs_release_page, -#ifdef CONFIG_NFS_DIRECTIO .direct_IO = nfs_direct_IO, -#endif .launder_page = nfs_launder_page, }; @@ -443,10 +439,8 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, ssize_t result; size_t count = iov_length(iov, nr_segs); -#ifdef CONFIG_NFS_DIRECTIO if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_write(iocb, iov, nr_segs, pos); -#endif dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n", dentry->d_parent->d_name.name, dentry->d_name.name, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 999ad8ee0645..04ae867dddba 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -116,13 +116,8 @@ extern void nfs_destroy_readpagecache(void); extern int __init nfs_init_writepagecache(void); extern void nfs_destroy_writepagecache(void); -#ifdef CONFIG_NFS_DIRECTIO extern int __init nfs_init_directcache(void); extern void nfs_destroy_directcache(void); -#else -#define nfs_init_directcache() (0) -#define nfs_destroy_directcache() do {} while(0) -#endif /* nfs2xdr.c */ extern int nfs_stat_to_errno(int); From 327a299d8abd0f4580098cbd6c58b0f017417005 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:15:11 -0400 Subject: [PATCH 36/80] SUNRPC: Update help Kconfig text Clean up: refresh the help text for Kconfig items related to the sunrpc module. Remove obsolete URLs, and make the language consistent among the options. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/Kconfig | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index 55c1a0374e13..83201856b362 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1757,15 +1757,32 @@ config SUNRPC_XPRT_RDMA tristate depends on SUNRPC && INFINIBAND && EXPERIMENTAL default SUNRPC && INFINIBAND + help + This option enables an RPC client transport capability that + allows the NFS client to mount servers via an RDMA-enabled + transport. + + To compile RPC client RDMA transport support as a module, + choose M here: the module will be called xprtrdma. + + If unsure, say N. config SUNRPC_BIND34 bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL help - Provides kernel support for querying rpcbind servers via versions 3 - and 4 of the rpcbind protocol. The kernel automatically falls back - to version 2 if a remote rpcbind service does not support versions - 3 or 4. + RPC requests over IPv6 networks require support for larger + addresses when performing an RPC bind. Sun added support for + IPv6 addressing by creating two new versions of the rpcbind + protocol (RFC 1833). + + This option enables support in the kernel RPC client for + querying rpcbind servers via versions 3 and 4 of the rpcbind + protocol. The kernel automatically falls back to version 2 + if a remote rpcbind service does not support versions 3 or 4. + By themselves, these new versions do not provide support for + RPC over IPv6, but the new protocol versions are necessary to + support it. If unsure, say N to get traditional behavior (version 2 rpcbind requests only). @@ -1779,12 +1796,13 @@ config RPCSEC_GSS_KRB5 select CRYPTO_DES select CRYPTO_CBC help - Provides for secure RPC calls by means of a gss-api - mechanism based on Kerberos V5. This is required for - NFSv4. + Choose Y here to enable Secure RPC using the Kerberos version 5 + GSS-API mechanism (RFC 1964). - Note: Requires an auxiliary userspace daemon which may be found on - http://www.citi.umich.edu/projects/nfsv4/ + Secure RPC calls with Kerberos require an auxiliary user-space + daemon which may be found in the Linux nfs-utils package + available from http://linux-nfs.org/. In addition, user-space + Kerberos support should be installed. If unsure, say N. @@ -1798,11 +1816,12 @@ config RPCSEC_GSS_SPKM3 select CRYPTO_CAST5 select CRYPTO_CBC help - Provides for secure RPC calls by means of a gss-api - mechanism based on the SPKM3 public-key mechanism. + Choose Y here to enable Secure RPC using the SPKM3 public key + GSS-API mechansim (RFC 2025). - Note: Requires an auxiliary userspace daemon which may be found on - http://www.citi.umich.edu/projects/nfsv4/ + Secure RPC calls with SPKM3 require an auxiliary userspace + daemon which may be found in the Linux nfs-utils package + available from http://linux-nfs.org/. If unsure, say N. From 1e40316bc44851c5920490df01482ed862e4dbc4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:15:18 -0400 Subject: [PATCH 37/80] SUNRPC: Add a default setting for CONFIG_SUNRPC_BIND34 Most distros will want support for rpcbind protocols 3 and 4 to default off until they have integrated user-space support for the new rpcbind daemon which supports IPv6 RPC services. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/Kconfig b/fs/Kconfig index 83201856b362..4115deb567e1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1770,6 +1770,7 @@ config SUNRPC_XPRT_RDMA config SUNRPC_BIND34 bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL + default n help RPC requests over IPv6 networks require support for larger addresses when performing an RPC bind. Sun added support for From 90d5b18061656993410dfd57ddb88aa5a3f34563 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:18:30 -0400 Subject: [PATCH 38/80] NLM: LOCKD fails to load if CONFIG_SYSCTL is not set Bruce Fields says: "By the way, we've got another config-related nit here: http://bugzilla.linux-nfs.org/show_bug.cgi?id=156 You can build lockd without CONFIG_SYSCTL set, but then the module will fail to load." For now, disable the sysctl registration calls in lockd if CONFIG_SYSCTL is not enabled. This allows the kernel to build properly if PROC_FS or SYSCTL is not enabled, but an NFS client is desired. In the long run, we would like to be able to build the kernel with an NFS client but without lockd. This makes sense, for example, if you want an NFSv4-only NFS client, as NFSv4 doesn't use NLM at all. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/svc.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 1ed8bd4de941..38c2f0b1dd7d 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -74,7 +74,9 @@ static const unsigned long nlm_timeout_min = 3; static const unsigned long nlm_timeout_max = 20; static const int nlm_port_min = 0, nlm_port_max = 65535; +#ifdef CONFIG_SYSCTL static struct ctl_table_header * nlm_sysctl_table; +#endif static unsigned long get_lockd_grace_period(void) { @@ -359,6 +361,8 @@ out: } EXPORT_SYMBOL(lockd_down); +#ifdef CONFIG_SYSCTL + /* * Sysctl parameters (same as module parameters, different interface). */ @@ -443,6 +447,8 @@ static ctl_table nlm_sysctl_root[] = { { .ctl_name = 0 } }; +#endif /* CONFIG_SYSCTL */ + /* * Module (and sysfs) parameters. */ @@ -516,15 +522,21 @@ module_param(nsm_use_hostnames, bool, 0644); static int __init init_nlm(void) { +#ifdef CONFIG_SYSCTL nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); return nlm_sysctl_table ? 0 : -ENOMEM; +#else + return 0; +#endif } static void __exit exit_nlm(void) { /* FIXME: delete all NLM clients */ nlm_shutdown_hosts(); +#ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); +#endif } module_init(init_nlm); From eb18860e1385bfc7f08fcb7ba362e4a5156c8324 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:18:37 -0400 Subject: [PATCH 39/80] NLM: NLM protocol version numbers are u32 Clean up: RPC protocol version numbers are u32. Make sure we use an appropriate type for NLM version numbers when calling nlm_lookup_host(). Eliminates a harmless mixed sign comparison in nlm_host_lookup(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 20 +++++++++++--------- include/linux/lockd/lockd.h | 6 ++++-- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index f1ef49fff118..f23750db1650 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -42,11 +42,12 @@ static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, /* * Common host lookup routine for server & client */ -static struct nlm_host * -nlm_lookup_host(int server, const struct sockaddr_in *sin, - int proto, int version, const char *hostname, - unsigned int hostname_len, - const struct sockaddr_in *ssin) +static struct nlm_host *nlm_lookup_host(int server, + const struct sockaddr_in *sin, + int proto, u32 version, + const char *hostname, + unsigned int hostname_len, + const struct sockaddr_in *ssin) { struct hlist_head *chain; struct hlist_node *pos; @@ -55,7 +56,7 @@ nlm_lookup_host(int server, const struct sockaddr_in *sin, int hash; dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT - ", p=%d, v=%d, my role=%s, name=%.*s)\n", + ", p=%d, v=%u, my role=%s, name=%.*s)\n", NIPQUAD(ssin->sin_addr.s_addr), NIPQUAD(sin->sin_addr.s_addr), proto, version, server? "server" : "client", @@ -175,9 +176,10 @@ nlm_destroy_host(struct nlm_host *host) /* * Find an NLM server handle in the cache. If there is none, create it. */ -struct nlm_host * -nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version, - const char *hostname, unsigned int hostname_len) +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, + int proto, u32 version, + const char *hostname, + unsigned int hostname_len) { struct sockaddr_in ssin = {0}; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 4babb2a129ac..d15595013051 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -173,8 +173,10 @@ void nlmclnt_next_cookie(struct nlm_cookie *); /* * Host cache */ -struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *, int, int, - const char *, unsigned int); +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, + int proto, u32 version, + const char *hostname, + unsigned int hostname_len); struct nlm_host *nlmsvc_lookup_host(struct svc_rqst *, const char *, unsigned int); struct rpc_clnt * nlm_bind_host(struct nlm_host *); From f34ec991ae0f015f5cdc51ad46c3a317ffae2466 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:18:45 -0400 Subject: [PATCH 40/80] lockd: bring a few function declarations up to date Clean-up: replace __inline__ and use up-to-date function declaration conventions. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/lockd/lockd.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index d15595013051..acf39e1e3a3d 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -219,8 +219,7 @@ void nlmsvc_mark_resources(void); void nlmsvc_free_host_resources(struct nlm_host *); void nlmsvc_invalidate_all(void); -static __inline__ struct inode * -nlmsvc_file_inode(struct nlm_file *file) +static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) { return file->f_file->f_path.dentry->d_inode; } @@ -228,8 +227,8 @@ nlmsvc_file_inode(struct nlm_file *file) /* * Compare two host addresses (needs modifying for ipv6) */ -static __inline__ int -nlm_cmp_addr(const struct sockaddr_in *sin1, const struct sockaddr_in *sin2) +static inline int nlm_cmp_addr(const struct sockaddr_in *sin1, + const struct sockaddr_in *sin2) { return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; } @@ -238,8 +237,8 @@ nlm_cmp_addr(const struct sockaddr_in *sin1, const struct sockaddr_in *sin2) * Compare two NLM locks. * When the second lock is of type F_UNLCK, this acts like a wildcard. */ -static __inline__ int -nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2) +static inline int nlm_compare_locks(const struct file_lock *fl1, + const struct file_lock *fl2) { return fl1->fl_pid == fl2->fl_pid && fl1->fl_owner == fl2->fl_owner From 099bd05f27ff24a3041d54e7ed42d2eb681484b9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:25:32 -0400 Subject: [PATCH 41/80] lockd: Ensure NSM strings aren't longer than protocol allows Introduce a special helper function to check the length of NSM strings before they are placed on the wire. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/mon.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 908b23fadd05..84fd84cb67b7 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -149,6 +149,15 @@ nsm_create(void) * XDR functions for NSM. */ +static __be32 *xdr_encode_nsm_string(__be32 *p, char *string) +{ + size_t len = strlen(string); + + if (len > SM_MAXSTRLEN) + len = SM_MAXSTRLEN; + return xdr_encode_opaque(p, string, len); +} + static __be32 * xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) { From 496951743100b2d2ccd8b268257e79425e489851 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:25:39 -0400 Subject: [PATCH 42/80] lockd: refactor SM_MON mon_name argument encoder Clean up: introduce a new XDR encoder specifically for the mon_name argument of SM_MON requests. This will be updated later to support IPv6 addresses in addition to IPv4 addresses. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/mon.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 84fd84cb67b7..87bd4e4d65dd 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -18,6 +18,8 @@ #define NLMDBG_FACILITY NLMDBG_MONITOR +#define XDR_ADDRBUF_LEN (20) + static struct rpc_clnt * nsm_create(void); static struct rpc_program nsm_program; @@ -158,6 +160,29 @@ static __be32 *xdr_encode_nsm_string(__be32 *p, char *string) return xdr_encode_opaque(p, string, len); } +/* + * "mon_name" specifies the host to be monitored. + * + * Linux uses a text version of the IP address of the remote + * host as the host identifier (the "mon_name" argument). + * + * Linux statd always looks up the canonical hostname first for + * whatever remote hostname it receives, so this works alright. + */ +static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp) +{ + char buffer[XDR_ADDRBUF_LEN + 1]; + char *name = argp->mon_name; + + if (!nsm_use_hostnames) { + snprintf(buffer, XDR_ADDRBUF_LEN, + NIPQUAD_FMT, NIPQUAD(argp->addr)); + name = buffer; + } + + return xdr_encode_nsm_string(p, name); +} + static __be32 * xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) { From 850c95fd07b63704d06909c9a081c51272d32db1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:25:46 -0400 Subject: [PATCH 43/80] lockd: refactor SM_MON my_id argument encoder Clean up: introduce a new XDR encoder specifically for the my_id argument of SM_MON requests. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/mon.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 87bd4e4d65dd..06216d6715f7 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -183,6 +183,25 @@ static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp) return xdr_encode_nsm_string(p, name); } +/* + * The "my_id" argument specifies the hostname and RPC procedure + * to be called when the status manager receives notification + * (via the SM_NOTIFY call) that the state of host "mon_name" + * has changed. + */ +static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp) +{ + p = xdr_encode_nsm_string(p, utsname()->nodename); + if (!p) + return ERR_PTR(-EIO); + + *p++ = htonl(argp->prog); + *p++ = htonl(argp->vers); + *p++ = htonl(argp->proc); + + return p; +} + static __be32 * xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) { From ea72a7f170e686baf00ceee57b6197bef686889c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:25:53 -0400 Subject: [PATCH 44/80] lockd: document use of mon_id argument in SM_MON requests Clean up: document the argument type that xdr_encode_common() is marshalling by introducing a new function. The new function will replace xdr_encode_common() in just a sec. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/mon.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 06216d6715f7..3935d7b604f5 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -202,6 +202,19 @@ static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp) return p; } +/* + * The "mon_id" argument specifies the non-private arguments + * of an SM_MON or SM_UNMON call. + */ +static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp) +{ + p = xdr_encode_mon_name(p, argp); + if (!p) + return ERR_PTR(-EIO); + + return xdr_encode_my_id(p, argp); +} + static __be32 * xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) { From 2ca7754d4c96d68e1475690422a202ba5f0443d8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:26:01 -0400 Subject: [PATCH 45/80] lockd: Fix up incorrect RPC buffer size calculations. Switch to using the new mon_id encoder function. Now that we've refactored the encoding of SM_MON requests, we've discovered that the pre-computed buffer length maximums are incorrect! Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/mon.c | 39 ++++++++------------------------------- 1 file changed, 8 insertions(+), 31 deletions(-) diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 3935d7b604f5..f2507fec7c6f 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -149,6 +149,9 @@ nsm_create(void) /* * XDR functions for NSM. + * + * See http://www.opengroup.org/ for details on the Network + * Status Monitor wire protocol. */ static __be32 *xdr_encode_nsm_string(__be32 *p, char *string) @@ -215,37 +218,10 @@ static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp) return xdr_encode_my_id(p, argp); } -static __be32 * -xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) -{ - char buffer[20], *name; - - /* - * Use the dotted-quad IP address of the remote host as - * identifier. Linux statd always looks up the canonical - * hostname first for whatever remote hostname it receives, - * so this works alright. - */ - if (nsm_use_hostnames) { - name = argp->mon_name; - } else { - sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr)); - name = buffer; - } - if (!(p = xdr_encode_string(p, name)) - || !(p = xdr_encode_string(p, utsname()->nodename))) - return ERR_PTR(-EIO); - *p++ = htonl(argp->prog); - *p++ = htonl(argp->vers); - *p++ = htonl(argp->proc); - - return p; -} - static int xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) { - p = xdr_encode_common(rqstp, p, argp); + p = xdr_encode_mon_id(p, argp); if (IS_ERR(p)) return PTR_ERR(p); @@ -261,7 +237,7 @@ xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) static int xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) { - p = xdr_encode_common(rqstp, p, argp); + p = xdr_encode_mon_id(p, argp); if (IS_ERR(p)) return PTR_ERR(p); rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); @@ -286,8 +262,9 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) } #define SM_my_name_sz (1+XDR_QUADLEN(SM_MAXSTRLEN)) -#define SM_my_id_sz (3+1+SM_my_name_sz) -#define SM_mon_id_sz (1+XDR_QUADLEN(20)+SM_my_id_sz) +#define SM_my_id_sz (SM_my_name_sz+3) +#define SM_mon_name_sz (1+XDR_QUADLEN(SM_MAXSTRLEN)) +#define SM_mon_id_sz (SM_mon_name_sz+SM_my_id_sz) #define SM_mon_sz (SM_mon_id_sz+4) #define SM_monres_sz 2 #define SM_unmonres_sz 1 From 0490a54a00c14212f22c5948c8c13a4553d745bd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Mar 2008 14:26:08 -0400 Subject: [PATCH 46/80] lockd: introduce new function to encode private argument in SM_MON requests Clean up: refactor the encoding of the opaque 16-byte private argument in xdr_encode_mon(). This will be updated later to support IPv6 addresses. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/mon.c | 30 ++++++++++++++++++++++++------ include/linux/lockd/sm_inter.h | 1 + 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index f2507fec7c6f..e4d563543b11 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -218,6 +218,24 @@ static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp) return xdr_encode_my_id(p, argp); } +/* + * The "priv" argument may contain private information required + * by the SM_MON call. This information will be supplied in the + * SM_NOTIFY call. + * + * Linux provides the raw IP address of the monitored host, + * left in network byte order. + */ +static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp) +{ + *p++ = argp->addr; + *p++ = 0; + *p++ = 0; + *p++ = 0; + + return p; +} + static int xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) { @@ -225,11 +243,10 @@ xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) if (IS_ERR(p)) return PTR_ERR(p); - /* Surprise - there may even be room for an IPv6 address now */ - *p++ = argp->addr; - *p++ = 0; - *p++ = 0; - *p++ = 0; + p = xdr_encode_priv(p, argp); + if (IS_ERR(p)) + return PTR_ERR(p); + rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); return 0; } @@ -265,7 +282,8 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) #define SM_my_id_sz (SM_my_name_sz+3) #define SM_mon_name_sz (1+XDR_QUADLEN(SM_MAXSTRLEN)) #define SM_mon_id_sz (SM_mon_name_sz+SM_my_id_sz) -#define SM_mon_sz (SM_mon_id_sz+4) +#define SM_priv_sz (XDR_QUADLEN(SM_PRIV_SIZE)) +#define SM_mon_sz (SM_mon_id_sz+SM_priv_sz) #define SM_monres_sz 2 #define SM_unmonres_sz 1 diff --git a/include/linux/lockd/sm_inter.h b/include/linux/lockd/sm_inter.h index 22a645828f26..5a5448bdb17d 100644 --- a/include/linux/lockd/sm_inter.h +++ b/include/linux/lockd/sm_inter.h @@ -19,6 +19,7 @@ #define SM_NOTIFY 6 #define SM_MAXSTRLEN 1024 +#define SM_PRIV_SIZE 16 /* * Arguments for all calls to statd From 24b74bf0c9e08cbda74d3c64af69ad402ed54e04 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 19 Apr 2008 13:15:47 -0400 Subject: [PATCH 47/80] SUNRPC: Fix a bug in call_decode() call_verify() can, under certain circumstances, free the RPC slot. In that case, our cached pointer 'req = task->tk_rqstp' is invalid. Bug was introduced in commit 220bcc2afd7011b3e0569fc178331fa983c92c1b (SUNRPC: Don't call xprt_release in call refresh). Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index ea14314331b0..522b06849f86 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1240,10 +1240,13 @@ call_decode(struct rpc_task *task) task->tk_status); return; out_retry: - req->rq_received = req->rq_private_buf.len = 0; task->tk_status = 0; - if (task->tk_client->cl_discrtry) - xprt_force_disconnect(task->tk_xprt); + /* Note: call_verify() may have freed the RPC slot */ + if (task->tk_rqstp == req) { + req->rq_received = req->rq_private_buf.len = 0; + if (task->tk_client->cl_discrtry) + xprt_force_disconnect(task->tk_xprt); + } } /* From b6ddf64ffe9d59577a9176856bb6fe69a539f573 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 17 Apr 2008 18:52:19 -0400 Subject: [PATCH 48/80] SUNRPC: Fix up xprt_write_space() The rest of the networking layer uses SOCK_ASYNC_NOSPACE to signal whether or not we have someone waiting for buffer memory. Convert the SUNRPC layer to use the same idiom. Remove the unlikely()s in xs_udp_write_space and xs_tcp_write_space. In fact, the most common case will be that there is nobody waiting for buffer space. SOCK_NOSPACE is there to tell the TCP layer whether or not the cwnd was limited by the application window. Ensure that we follow the same idiom as the rest of the networking layer here too. Finally, ensure that we clear SOCK_ASYNC_NOSPACE once we wake up, so that write_space() doesn't keep waking things up on xprt->pending. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 +- net/sunrpc/xprt.c | 4 +-- net/sunrpc/xprtsock.c | 59 +++++++++++++++++++++++++------------ 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index b3ff9a815e6f..8a0629abb86a 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -232,7 +232,7 @@ int xprt_unregister_transport(struct xprt_class *type); void xprt_set_retrans_timeout_def(struct rpc_task *task); void xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); -void xprt_wait_for_buffer_space(struct rpc_task *task); +void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action); void xprt_write_space(struct rpc_xprt *xprt); void xprt_update_rtt(struct rpc_task *task); void xprt_adjust_cwnd(struct rpc_task *task, int result); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 85199c647022..3ba64f9f84ba 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -447,13 +447,13 @@ EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks); * @task: task to be put to sleep * */ -void xprt_wait_for_buffer_space(struct rpc_task *task) +void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; task->tk_timeout = req->rq_timeout; - rpc_sleep_on(&xprt->pending, task, NULL); + rpc_sleep_on(&xprt->pending, task, action); } EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 8bd3b0f73ac0..4c2462e2f2b2 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -516,6 +516,14 @@ out: return sent; } +static void xs_nospace_callback(struct rpc_task *task) +{ + struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); + + transport->inet->sk_write_pending--; + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); +} + /** * xs_nospace - place task on wait queue if transmit was incomplete * @task: task to put to sleep @@ -531,20 +539,27 @@ static void xs_nospace(struct rpc_task *task) task->tk_pid, req->rq_slen - req->rq_bytes_sent, req->rq_slen); - if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { - /* Protect against races with write_space */ - spin_lock_bh(&xprt->transport_lock); + /* Protect against races with write_space */ + spin_lock_bh(&xprt->transport_lock); - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (test_bit(SOCK_NOSPACE, &transport->sock->flags)) - xprt_wait_for_buffer_space(task); + /* Don't race with disconnect */ + if (xprt_connected(xprt)) { + if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { + /* + * Notify TCP that we're limited by the application + * window size + */ + set_bit(SOCK_NOSPACE, &transport->sock->flags); + transport->inet->sk_write_pending++; + /* ...and wait for more buffer space */ + xprt_wait_for_buffer_space(task, xs_nospace_callback); + } + } else { + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + task->tk_status = -ENOTCONN; + } - spin_unlock_bh(&xprt->transport_lock); - } else - /* Keep holding the socket if it is blocked */ - rpc_delay(task, HZ>>4); + spin_unlock_bh(&xprt->transport_lock); } /** @@ -588,19 +603,20 @@ static int xs_udp_send_request(struct rpc_task *task) } switch (status) { + case -EAGAIN: + xs_nospace(task); + break; case -ENETUNREACH: case -EPIPE: case -ECONNREFUSED: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ - break; - case -EAGAIN: - xs_nospace(task); + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); break; default: + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); - break; } return status; @@ -695,12 +711,13 @@ static int xs_tcp_send_request(struct rpc_task *task) case -ENOTCONN: case -EPIPE: status = -ENOTCONN; + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); xs_tcp_shutdown(xprt); - break; } return status; @@ -1189,9 +1206,11 @@ static void xs_udp_write_space(struct sock *sk) if (unlikely(!(sock = sk->sk_socket))) goto out; + clear_bit(SOCK_NOSPACE, &sock->flags); + if (unlikely(!(xprt = xprt_from_sock(sk)))) goto out; - if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags))) + if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0) goto out; xprt_write_space(xprt); @@ -1222,9 +1241,11 @@ static void xs_tcp_write_space(struct sock *sk) if (unlikely(!(sock = sk->sk_socket))) goto out; + clear_bit(SOCK_NOSPACE, &sock->flags); + if (unlikely(!(xprt = xprt_from_sock(sk)))) goto out; - if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags))) + if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0) goto out; xprt_write_space(xprt); From 080a1f148df0615f7a610e4776dd8f3fb706f54f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 19 Apr 2008 14:22:31 -0400 Subject: [PATCH 49/80] SUNRPC: Don't attempt to destroy expired RPCSEC_GSS credentials.. ..and always destroy using a 'soft' RPC call. Destroying GSS credentials isn't mandatory; the server can always cope with a few credentials not getting destroyed in a timely fashion. This actually fixes a hang situation. Basically, some servers will decide that the client is crazy if it tries to destroy an RPC context for which they have sent an RPCSEC_GSS_CREDPROBLEM, and so will refuse to talk to it for a while. The regression therefor probably was introduced by commit 0df7fb74fbb709591301871a38aac7735a1d6583. Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index d34f6dfc7516..55948cd5ea55 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -710,7 +710,7 @@ gss_destroying_context(struct rpc_cred *cred) struct rpc_task *task; if (gss_cred->gc_ctx == NULL || - gss_cred->gc_ctx->gc_proc == RPC_GSS_PROC_DESTROY) + test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0) return 0; gss_cred->gc_ctx->gc_proc = RPC_GSS_PROC_DESTROY; @@ -720,7 +720,7 @@ gss_destroying_context(struct rpc_cred *cred) * by the RPC call or by the put_rpccred() below */ get_rpccred(cred); - task = rpc_call_null(gss_auth->client, cred, RPC_TASK_ASYNC); + task = rpc_call_null(gss_auth->client, cred, RPC_TASK_ASYNC|RPC_TASK_SOFT); if (!IS_ERR(task)) rpc_put_task(task); From 73e3302f60c0e11a0db0b34b903f591139c4f937 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 11 Apr 2008 16:03:54 -0400 Subject: [PATCH 50/80] NFS: Fix nfs_wb_page() to always exit with an error or a clean page It is possible for nfs_wb_page() to sometimes exit with 0 return value, yet the page is left in a dirty state. For instance in the case where the server rebooted, and the COMMIT request failed, then all the previously "clean" pages which were cached by the server, but were not guaranteed to have been writted out to disk, have to be redirtied and resent to the server. The fix is to have nfs_wb_page_priority() check that the page is clean before it exits... This fixes a condition that triggers the BUG_ON(PagePrivate(page)) in nfs_create_request() when we're in the nfs_readpage() path. Also eliminate a redundant BUG_ON(!PageLocked(page)) while we're at it. It turns out that clear_page_dirty_for_io() has the exact same test. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ce40cadb15db..997b42aa3702 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1493,18 +1493,19 @@ static int nfs_wb_page_priority(struct inode *inode, struct page *page, }; int ret; - BUG_ON(!PageLocked(page)); - if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc); + do { + if (clear_page_dirty_for_io(page)) { + ret = nfs_writepage_locked(page, &wbc); + if (ret < 0) + goto out_error; + } else if (!PagePrivate(page)) + break; + ret = nfs_sync_mapping_wait(page->mapping, &wbc, how); if (ret < 0) - goto out; - } - if (!PagePrivate(page)) - return 0; - ret = nfs_sync_mapping_wait(page->mapping, &wbc, how); - if (ret >= 0) - return 0; -out: + goto out_error; + } while (PagePrivate(page)); + return 0; +out_error: __mark_inode_dirty(inode, I_DIRTY_PAGES); return ret; } From fdd1e74c89fe39259a29c494209abad63ff76f82 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 15 Apr 2008 16:33:58 -0400 Subject: [PATCH 51/80] NFS: Ensure that the read code cleans up properly when rpc_run_task() fails In the case of readpage() we need to ensure that the pages get unlocked, and that the error is flagged. In the case of O_DIRECT, we need to ensure that the pages are all released. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 19 ++++++++++++------ fs/nfs/read.c | 53 ++++++++++++++++++++++++++++++------------------- 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e44200579c8d..9d9085b93a32 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -229,14 +229,20 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) static void nfs_direct_read_result(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; - if (nfs_readpage_result(task, data) != 0) - return; + nfs_readpage_result(task, data); +} + +static void nfs_direct_read_release(void *calldata) +{ + + struct nfs_read_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; spin_lock(&dreq->lock); - if (unlikely(task->tk_status < 0)) { - dreq->error = task->tk_status; + if (unlikely(status < 0)) { + dreq->error = status; spin_unlock(&dreq->lock); } else { dreq->count += data->res.count; @@ -249,11 +255,12 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata) if (put_dreq(dreq)) nfs_direct_complete(dreq); + nfs_readdata_release(calldata); } static const struct rpc_call_ops nfs_read_direct_ops = { .rpc_call_done = nfs_direct_read_result, - .rpc_release = nfs_readdata_release, + .rpc_release = nfs_direct_read_release, }; /* diff --git a/fs/nfs/read.c b/fs/nfs/read.c index d333f5fedca1..6f9208a549a0 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -338,26 +338,25 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static int nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) +static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) { struct nfs_readargs *argp = &data->args; struct nfs_readres *resp = &data->res; if (resp->eof || resp->count == argp->count) - return 0; + return; /* This is a short read! */ nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); /* Has the server at least made some progress? */ if (resp->count == 0) - return 0; + return; /* Yes, so retry the read at the end of the data */ argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; rpc_restart_call(task); - return -EAGAIN; } /* @@ -366,29 +365,37 @@ static int nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; - struct nfs_page *req = data->req; - struct page *page = req->wb_page; if (nfs_readpage_result(task, data) != 0) return; + if (task->tk_status < 0) + return; - if (likely(task->tk_status >= 0)) { - nfs_readpage_truncate_uninitialised_page(data); - if (nfs_readpage_retry(task, data) != 0) - return; - } - if (unlikely(task->tk_status < 0)) + nfs_readpage_truncate_uninitialised_page(data); + nfs_readpage_retry(task, data); +} + +static void nfs_readpage_release_partial(void *calldata) +{ + struct nfs_read_data *data = calldata; + struct nfs_page *req = data->req; + struct page *page = req->wb_page; + int status = data->task.tk_status; + + if (status < 0) SetPageError(page); + if (atomic_dec_and_test(&req->wb_complete)) { if (!PageError(page)) SetPageUptodate(page); nfs_readpage_release(req); } + nfs_readdata_release(calldata); } static const struct rpc_call_ops nfs_read_partial_ops = { .rpc_call_done = nfs_readpage_result_partial, - .rpc_release = nfs_readdata_release, + .rpc_release = nfs_readpage_release_partial, }; static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) @@ -423,29 +430,35 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) if (nfs_readpage_result(task, data) != 0) return; + if (task->tk_status < 0) + return; /* * Note: nfs_readpage_retry may change the values of * data->args. In the multi-page case, we therefore need * to ensure that we call nfs_readpage_set_pages_uptodate() * first. */ - if (likely(task->tk_status >= 0)) { - nfs_readpage_truncate_uninitialised_page(data); - nfs_readpage_set_pages_uptodate(data); - if (nfs_readpage_retry(task, data) != 0) - return; - } + nfs_readpage_truncate_uninitialised_page(data); + nfs_readpage_set_pages_uptodate(data); + nfs_readpage_retry(task, data); +} + +static void nfs_readpage_release_full(void *calldata) +{ + struct nfs_read_data *data = calldata; + while (!list_empty(&data->pages)) { struct nfs_page *req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); nfs_readpage_release(req); } + nfs_readdata_release(calldata); } static const struct rpc_call_ops nfs_read_full_ops = { .rpc_call_done = nfs_readpage_result_full, - .rpc_release = nfs_readdata_release, + .rpc_release = nfs_readpage_release_full, }; /* From c9d8f89d9816c1d16ada492aa547a4d692508c0d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 15 Apr 2008 16:56:39 -0400 Subject: [PATCH 52/80] NFS: Ensure that the write code cleans up properly when rpc_run_task() fails Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 50 +++++++++++++++++-------------- fs/nfs/write.c | 67 +++++++++++++++++++++++++++--------------- include/linux/nfs_fs.h | 4 +-- 3 files changed, 73 insertions(+), 48 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 9d9085b93a32..abf8e0286e35 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -508,27 +508,34 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) static void nfs_direct_commit_result(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; /* Call the NFS version-specific code */ - if (NFS_PROTO(data->inode)->commit_done(task, data) != 0) - return; - if (unlikely(task->tk_status < 0)) { + NFS_PROTO(data->inode)->commit_done(task, data); +} + +static void nfs_direct_commit_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; + + if (status < 0) { dprintk("NFS: %5u commit failed with error %d.\n", - task->tk_pid, task->tk_status); + data->task.tk_pid, status); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { - dprintk("NFS: %5u commit verify failed\n", task->tk_pid); + dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; } - dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status); + dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); nfs_direct_write_complete(dreq, data->inode); + nfs_commitdata_release(calldata); } static const struct rpc_call_ops nfs_commit_direct_ops = { .rpc_call_done = nfs_direct_commit_result, - .rpc_release = nfs_commit_release, + .rpc_release = nfs_direct_commit_release, }; static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) @@ -596,7 +603,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode static void nfs_alloc_commit_data(struct nfs_direct_req *dreq) { - dreq->commit_data = nfs_commit_alloc(); + dreq->commit_data = nfs_commitdata_alloc(); if (dreq->commit_data != NULL) dreq->commit_data->req = (struct nfs_page *) dreq; } @@ -617,11 +624,20 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode static void nfs_direct_write_result(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; - int status = task->tk_status; if (nfs_writeback_done(task, data) != 0) return; +} + +/* + * NB: Return the value of the first error return code. Subsequent + * errors after the first one are ignored. + */ +static void nfs_direct_write_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; spin_lock(&dreq->lock); @@ -643,23 +659,13 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata) break; case NFS_ODIRECT_DO_COMMIT: if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) { - dprintk("NFS: %5u write verify failed\n", task->tk_pid); + dprintk("NFS: %5u write verify failed\n", data->task.tk_pid); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; } } } out_unlock: spin_unlock(&dreq->lock); -} - -/* - * NB: Return the value of the first error return code. Subsequent - * errors after the first one are ignored. - */ -static void nfs_direct_write_release(void *calldata) -{ - struct nfs_write_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; if (put_dreq(dreq)) nfs_direct_write_complete(dreq, data->inode); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 997b42aa3702..31681bb5e4c1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -48,7 +48,7 @@ static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; static mempool_t *nfs_commit_mempool; -struct nfs_write_data *nfs_commit_alloc(void) +struct nfs_write_data *nfs_commitdata_alloc(void) { struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); @@ -973,7 +973,6 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; struct nfs_page *req = data->req; - struct page *page = req->wb_page; dprintk("NFS: write (%s/%Ld %d@%Ld)", req->wb_context->path.dentry->d_inode->i_sb->s_id, @@ -981,13 +980,20 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) req->wb_bytes, (long long)req_offset(req)); - if (nfs_writeback_done(task, data) != 0) - return; + nfs_writeback_done(task, data); +} - if (task->tk_status < 0) { +static void nfs_writeback_release_partial(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_page *req = data->req; + struct page *page = req->wb_page; + int status = data->task.tk_status; + + if (status < 0) { nfs_set_pageerror(page); - nfs_context_set_write_error(req->wb_context, task->tk_status); - dprintk(", error = %d\n", task->tk_status); + nfs_context_set_write_error(req->wb_context, status); + dprintk(", error = %d\n", status); goto out; } @@ -1012,11 +1018,12 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) out: if (atomic_dec_and_test(&req->wb_complete)) nfs_writepage_release(req); + nfs_writedata_release(calldata); } static const struct rpc_call_ops nfs_write_partial_ops = { .rpc_call_done = nfs_writeback_done_partial, - .rpc_release = nfs_writedata_release, + .rpc_release = nfs_writeback_release_partial, }; /* @@ -1029,17 +1036,21 @@ static const struct rpc_call_ops nfs_write_partial_ops = { static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - struct nfs_page *req; - struct page *page; - if (nfs_writeback_done(task, data) != 0) - return; + nfs_writeback_done(task, data); +} + +static void nfs_writeback_release_full(void *calldata) +{ + struct nfs_write_data *data = calldata; + int status = data->task.tk_status; /* Update attributes as result of writeback. */ while (!list_empty(&data->pages)) { - req = nfs_list_entry(data->pages.next); + struct nfs_page *req = nfs_list_entry(data->pages.next); + struct page *page = req->wb_page; + nfs_list_remove_request(req); - page = req->wb_page; dprintk("NFS: write (%s/%Ld %d@%Ld)", req->wb_context->path.dentry->d_inode->i_sb->s_id, @@ -1047,10 +1058,10 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) req->wb_bytes, (long long)req_offset(req)); - if (task->tk_status < 0) { + if (status < 0) { nfs_set_pageerror(page); - nfs_context_set_write_error(req->wb_context, task->tk_status); - dprintk(", error = %d\n", task->tk_status); + nfs_context_set_write_error(req->wb_context, status); + dprintk(", error = %d\n", status); goto remove_request; } @@ -1070,11 +1081,12 @@ remove_request: next: nfs_clear_page_tag_locked(req); } + nfs_writedata_release(calldata); } static const struct rpc_call_ops nfs_write_full_ops = { .rpc_call_done = nfs_writeback_done_full, - .rpc_release = nfs_writedata_release, + .rpc_release = nfs_writeback_release_full, }; @@ -1160,7 +1172,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -void nfs_commit_release(void *data) +void nfs_commitdata_release(void *data) { struct nfs_write_data *wdata = data; @@ -1233,7 +1245,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) struct nfs_write_data *data; struct nfs_page *req; - data = nfs_commit_alloc(); + data = nfs_commitdata_alloc(); if (!data) goto out_bad; @@ -1261,7 +1273,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) static void nfs_commit_done(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - struct nfs_page *req; dprintk("NFS: %5u nfs_commit_done (status %d)\n", task->tk_pid, task->tk_status); @@ -1269,6 +1280,13 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) /* Call the NFS version-specific code */ if (NFS_PROTO(data->inode)->commit_done(task, data) != 0) return; +} + +static void nfs_commit_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_page *req; + int status = data->task.tk_status; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); @@ -1283,10 +1301,10 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); - if (task->tk_status < 0) { - nfs_context_set_write_error(req->wb_context, task->tk_status); + if (status < 0) { + nfs_context_set_write_error(req->wb_context, status); nfs_inode_remove_request(req); - dprintk(", error = %d\n", task->tk_status); + dprintk(", error = %d\n", status); goto next; } @@ -1307,6 +1325,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) next: nfs_clear_page_tag_locked(req); } + nfs_commitdata_release(calldata); } static const struct rpc_call_ops nfs_commit_ops = { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f4a0e4c218df..7f0602c87712 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -466,9 +466,9 @@ extern int nfs_wb_page(struct inode *inode, struct page* page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) extern int nfs_commit_inode(struct inode *, int); -extern struct nfs_write_data *nfs_commit_alloc(void); +extern struct nfs_write_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_write_data *wdata); -extern void nfs_commit_release(void *wdata); +extern void nfs_commitdata_release(void *wdata); #else static inline int nfs_commit_inode(struct inode *inode, int how) From dbae4c73f08b8a7980cc912954ade3d4c1fb6147 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 14 Apr 2008 14:54:53 -0400 Subject: [PATCH 53/80] NFS: Ensure that rpc_run_task() errors are propagated back to the caller Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 10 ++++++---- fs/nfs/read.c | 23 +++++++++++++++-------- fs/nfs/write.c | 33 +++++++++++++++++++-------------- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index abf8e0286e35..4757a2b326a1 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -347,8 +347,9 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, NFS_PROTO(inode)->read_setup(data, &msg); task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + if (IS_ERR(task)) + break; + rpc_put_task(task); dprintk("NFS: %5u initiated direct read call " "(req %s/%Ld, %zu bytes @ offset %Lu)\n", @@ -763,8 +764,9 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, NFS_PROTO(inode)->write_setup(data, &msg); task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + if (IS_ERR(task)) + break; + rpc_put_task(task); dprintk("NFS: %5u initiated direct write call " "(req %s/%Ld, %zu bytes @ offset %Lu)\n", diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 6f9208a549a0..16f57e0af999 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -153,7 +153,7 @@ static void nfs_readpage_release(struct nfs_page *req) /* * Set up the NFS read request struct */ -static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, +static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, const struct rpc_call_ops *call_ops, unsigned int count, unsigned int offset) { @@ -202,8 +202,10 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, (unsigned long long)data->args.offset); task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; } static void @@ -240,6 +242,7 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne size_t rsize = NFS_SERVER(inode)->rsize, nbytes; unsigned int offset; int requests = 0; + int ret = 0; LIST_HEAD(list); nfs_list_remove_request(req); @@ -261,6 +264,8 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne offset = 0; nbytes = count; do { + int ret2; + data = list_entry(list.next, struct nfs_read_data, pages); list_del_init(&data->pages); @@ -268,13 +273,15 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne if (nbytes < rsize) rsize = nbytes; - nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, + ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, rsize, offset); + if (ret == 0) + ret = ret2; offset += rsize; nbytes -= rsize; } while (nbytes != 0); - return 0; + return ret; out_bad: while (!list_empty(&list)) { @@ -292,6 +299,7 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned struct nfs_page *req; struct page **pages; struct nfs_read_data *data; + int ret = -ENOMEM; data = nfs_readdata_alloc(npages); if (!data) @@ -307,11 +315,10 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned } req = nfs_list_entry(data->pages.next); - nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); - return 0; + return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); out_bad: nfs_async_read_error(head); - return -ENOMEM; + return ret; } /* diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 31681bb5e4c1..1ade11d1ba07 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -778,7 +778,7 @@ static int flush_task_priority(int how) /* * Set up the argument/result storage required for the RPC call. */ -static void nfs_write_rpcsetup(struct nfs_page *req, +static int nfs_write_rpcsetup(struct nfs_page *req, struct nfs_write_data *data, const struct rpc_call_ops *call_ops, unsigned int count, unsigned int offset, @@ -841,8 +841,10 @@ static void nfs_write_rpcsetup(struct nfs_page *req, (unsigned long long)data->args.offset); task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; } /* If a nfs_flush_* function fails, it should remove reqs from @head and @@ -868,6 +870,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned size_t wsize = NFS_SERVER(inode)->wsize, nbytes; unsigned int offset; int requests = 0; + int ret = 0; LIST_HEAD(list); nfs_list_remove_request(req); @@ -889,6 +892,8 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned offset = 0; nbytes = count; do { + int ret2; + data = list_entry(list.next, struct nfs_write_data, pages); list_del_init(&data->pages); @@ -896,13 +901,15 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned if (nbytes < wsize) wsize = nbytes; - nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, + ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, wsize, offset, how); + if (ret == 0) + ret = ret2; offset += wsize; nbytes -= wsize; } while (nbytes != 0); - return 0; + return ret; out_bad: while (!list_empty(&list)) { @@ -943,9 +950,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i req = nfs_list_entry(data->pages.next); /* Set up the argument struct */ - nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); - - return 0; + return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); out_bad: while (!list_empty(head)) { req = nfs_list_entry(head->next); @@ -1183,7 +1188,7 @@ void nfs_commitdata_release(void *data) /* * Set up the argument/result storage required for the RPC call. */ -static void nfs_commit_rpcsetup(struct list_head *head, +static int nfs_commit_rpcsetup(struct list_head *head, struct nfs_write_data *data, int how) { @@ -1232,8 +1237,10 @@ static void nfs_commit_rpcsetup(struct list_head *head, dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; } /* @@ -1251,9 +1258,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) goto out_bad; /* Set up the argument struct */ - nfs_commit_rpcsetup(head, data, how); - - return 0; + return nfs_commit_rpcsetup(head, data, how); out_bad: while (!list_empty(head)) { req = nfs_list_entry(head->next); From 35d05778e25ee16dbddb60331be0bc1309efba19 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 5 Apr 2008 15:54:17 -0400 Subject: [PATCH 54/80] NFSv4: Remove bogus call to nfs4_drop_state_owner() in _nfs4_open_expired() There should be no need to invalidate a perfectly good state owner just because of a stale filehandle. Doing so can cause the state recovery code to break, since nfs4_get_renew_cred() and nfs4_get_setclientid_cred() rely on finding active state owners. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f38d0573be18..424aa206470f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -982,11 +982,8 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s if (IS_ERR(opendata)) return PTR_ERR(opendata); ret = nfs4_open_recover(opendata, state); - if (ret == -ESTALE) { - /* Invalidate the state owner so we don't ever use it again */ - nfs4_drop_state_owner(state->owner); + if (ret == -ESTALE) d_drop(ctx->path.dentry); - } nfs4_opendata_put(opendata); return ret; } From c1d519312dcdf11532fed9f99a8ecc3547ffd9d6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 7 Apr 2008 13:20:54 -0400 Subject: [PATCH 55/80] NFSv4: Only increment the sequence id if the server saw it It is quite possible that the OPEN, CLOSE, LOCK, LOCKU,... compounds fail before the actual stateful operation has been executed (for instance in the PUTFH call). There is no way to tell from the overall status result which operations were executed from the COMPOUND. The fix is to move incrementing of the sequence id into the XDR layer, so that we do it as we process the results from the stateful operation. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 12 ++++++------ fs/nfs/nfs4xdr.c | 18 +++++++++++++++++- include/linux/nfs_xdr.h | 10 ++++++++-- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 424aa206470f..9f2759da74eb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -240,6 +240,8 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) { p->o_res.f_attr = &p->f_attr; p->o_res.dir_attr = &p->dir_attr; + p->o_res.seqid = p->o_arg.seqid; + p->c_res.seqid = p->c_arg.seqid; p->o_res.server = p->o_arg.server; nfs_fattr_init(&p->f_attr); nfs_fattr_init(&p->dir_attr); @@ -730,7 +732,6 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) renew_lease(data->o_res.server, data->timestamp); data->rpc_done = 1; } - nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid); } static void nfs4_open_confirm_release(void *calldata) @@ -860,7 +861,6 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)) nfs_confirm_seqid(&data->owner->so_seqid, 0); } - nfs_increment_open_seqid(data->rpc_status, data->o_arg.seqid); data->rpc_done = 1; } @@ -1226,7 +1226,6 @@ static void nfs4_close_done(struct rpc_task *task, void *data) /* hmm. we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors */ - nfs_increment_open_seqid(task->tk_status, calldata->arg.seqid); switch (task->tk_status) { case 0: nfs_set_open_stateid(state, &calldata->res.stateid, 0); @@ -1333,6 +1332,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) goto out_free_calldata; calldata->arg.bitmask = server->attr_bitmask; calldata->res.fattr = &calldata->fattr; + calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; calldata->path.mnt = mntget(path->mnt); calldata->path.dentry = dget(path->dentry); @@ -3159,6 +3159,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; p->arg.seqid = seqid; + p->res.seqid = seqid; p->arg.stateid = &lsp->ls_stateid; p->lsp = lsp; atomic_inc(&lsp->ls_count); @@ -3184,7 +3185,6 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) if (RPC_ASSASSINATED(task)) return; - nfs_increment_lock_seqid(task->tk_status, calldata->arg.seqid); switch (task->tk_status) { case 0: memcpy(calldata->lsp->ls_stateid.data, @@ -3322,6 +3322,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->arg.lock_stateid = &lsp->ls_stateid; p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; p->arg.lock_owner.id = lsp->ls_id.id; + p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); @@ -3348,6 +3349,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) return; data->arg.open_stateid = &state->stateid; data->arg.new_lock_owner = 1; + data->res.open_seqid = data->arg.open_seqid; } else data->arg.new_lock_owner = 0; data->timestamp = jiffies; @@ -3365,7 +3367,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) if (RPC_ASSASSINATED(task)) goto out; if (data->arg.new_lock_owner != 0) { - nfs_increment_open_seqid(data->rpc_status, data->arg.open_seqid); if (data->rpc_status == 0) nfs_confirm_seqid(&data->lsp->ls_seqid, 0); else @@ -3377,7 +3378,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); } - nfs_increment_lock_seqid(data->rpc_status, data->arg.lock_seqid); out: dprintk("%s: done, ret = %d!\n", __FUNCTION__, data->rpc_status); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 2b519f6325f9..b8b2e391d183 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3005,6 +3005,8 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) int status; status = decode_op_hdr(xdr, OP_CLOSE); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); if (status) return status; READ_BUF(NFS4_STATEID_SIZE); @@ -3296,11 +3298,17 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) int status; status = decode_op_hdr(xdr, OP_LOCK); + if (status == -EIO) + goto out; if (status == 0) { READ_BUF(NFS4_STATEID_SIZE); COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); } else if (status == -NFS4ERR_DENIED) - return decode_lock_denied(xdr, NULL); + status = decode_lock_denied(xdr, NULL); + if (res->open_seqid != NULL) + nfs_increment_open_seqid(status, res->open_seqid); + nfs_increment_lock_seqid(status, res->lock_seqid); +out: return status; } @@ -3319,6 +3327,8 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) int status; status = decode_op_hdr(xdr, OP_LOCKU); + if (status != -EIO) + nfs_increment_lock_seqid(status, res->seqid); if (status == 0) { READ_BUF(NFS4_STATEID_SIZE); COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); @@ -3384,6 +3394,8 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) int status; status = decode_op_hdr(xdr, OP_OPEN); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); if (status) return status; READ_BUF(NFS4_STATEID_SIZE); @@ -3416,6 +3428,8 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre int status; status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); if (status) return status; READ_BUF(NFS4_STATEID_SIZE); @@ -3429,6 +3443,8 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re int status; status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); if (status) return status; READ_BUF(NFS4_STATEID_SIZE); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index f301d0b8babc..24263bb8e0be 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -140,6 +140,7 @@ struct nfs_openres { __u32 rflags; struct nfs_fattr * f_attr; struct nfs_fattr * dir_attr; + struct nfs_seqid * seqid; const struct nfs_server *server; int delegation_type; nfs4_stateid delegation; @@ -159,6 +160,7 @@ struct nfs_open_confirmargs { struct nfs_open_confirmres { nfs4_stateid stateid; + struct nfs_seqid * seqid; }; /* @@ -175,6 +177,7 @@ struct nfs_closeargs { struct nfs_closeres { nfs4_stateid stateid; struct nfs_fattr * fattr; + struct nfs_seqid * seqid; const struct nfs_server *server; }; /* @@ -199,7 +202,9 @@ struct nfs_lock_args { }; struct nfs_lock_res { - nfs4_stateid stateid; + nfs4_stateid stateid; + struct nfs_seqid * lock_seqid; + struct nfs_seqid * open_seqid; }; struct nfs_locku_args { @@ -210,7 +215,8 @@ struct nfs_locku_args { }; struct nfs_locku_res { - nfs4_stateid stateid; + nfs4_stateid stateid; + struct nfs_seqid * seqid; }; struct nfs_lockt_args { From 1e799b673c6b82b336ab13c48b5651d511ca3000 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 21 Mar 2008 16:19:41 -0400 Subject: [PATCH 56/80] SUNRPC: Fix read ordering problems with req->rq_private_buf.len We want to ensure that req->rq_private_buf.len is updated before req->rq_received, so that call_decode() doesn't use an old value for req->rq_rcv_buf.len. In 'call_decode()' itself, instead of using task->tk_status (which is set using req->rq_received) must use the actual value of req->rq_private_buf.len when deciding whether or not the received RPC reply is too short. Finally ensure that we set req->rq_rcv_buf.len to zero when retrying a request. A typo meant that we were resetting req->rq_private_buf.len in call_decode(), and then clobbering that value with the old rq_rcv_buf.len again in xprt_transmit(). Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 26 +++++++++++++------------- net/sunrpc/xprt.c | 3 ++- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 522b06849f86..3ae560464513 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1199,18 +1199,6 @@ call_decode(struct rpc_task *task) task->tk_flags &= ~RPC_CALL_MAJORSEEN; } - if (task->tk_status < 12) { - if (!RPC_IS_SOFT(task)) { - task->tk_action = call_bind; - clnt->cl_stats->rpcretrans++; - goto out_retry; - } - dprintk("RPC: %s: too small RPC reply size (%d bytes)\n", - clnt->cl_protname, task->tk_status); - task->tk_action = call_timeout; - goto out_retry; - } - /* * Ensure that we see all writes made by xprt_complete_rqst() * before it changed req->rq_received. @@ -1222,6 +1210,18 @@ call_decode(struct rpc_task *task) WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf, sizeof(req->rq_rcv_buf)) != 0); + if (req->rq_rcv_buf.len < 12) { + if (!RPC_IS_SOFT(task)) { + task->tk_action = call_bind; + clnt->cl_stats->rpcretrans++; + goto out_retry; + } + dprintk("RPC: %s: too small RPC reply size (%d bytes)\n", + clnt->cl_protname, task->tk_status); + task->tk_action = call_timeout; + goto out_retry; + } + /* Verify the RPC header */ p = call_verify(task); if (IS_ERR(p)) { @@ -1243,7 +1243,7 @@ out_retry: task->tk_status = 0; /* Note: call_verify() may have freed the RPC slot */ if (task->tk_rqstp == req) { - req->rq_received = req->rq_private_buf.len = 0; + req->rq_received = req->rq_rcv_buf.len = 0; if (task->tk_client->cl_discrtry) xprt_force_disconnect(task->tk_xprt); } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3ba64f9f84ba..5110a4ea7fdf 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -757,9 +757,10 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) task->tk_rtt = (long)jiffies - req->rq_xtime; list_del_init(&req->rq_list); + req->rq_private_buf.len = copied; /* Ensure all writes are done before we update req->rq_received */ smp_wmb(); - req->rq_received = req->rq_private_buf.len = copied; + req->rq_received = copied; rpc_wake_up_queued_task(&xprt->pending, task); } EXPORT_SYMBOL_GPL(xprt_complete_rqst); From 4a9af59fee0701d9db99bc148d87b8852d6d6dd8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Apr 2008 18:57:06 -0400 Subject: [PATCH 57/80] NLM/lockd: Ensure we don't corrupt fl->fl_flags in nlmclnt_unlock() Also fix up nlmclnt_lock() so that it doesn't pass modified versions of fl->fl_flags to nlmclnt_cancel() and other helpers. Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index b6b74a60e1eb..4e1c0123b45d 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -493,6 +493,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) } fl->fl_flags |= FL_ACCESS; status = do_vfs_lock(fl); + fl->fl_flags = fl_flags; if (status < 0) goto out; @@ -530,10 +531,11 @@ again: goto again; } /* Ensure the resulting lock will get added to granted list */ - fl->fl_flags = fl_flags | FL_SLEEP; + fl->fl_flags |= FL_SLEEP; if (do_vfs_lock(fl) < 0) printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__); up_read(&host->h_rwsem); + fl->fl_flags = fl_flags; } status = nlm_stat_to_errno(resp->status); out_unblock: @@ -543,7 +545,6 @@ out_unblock: nlmclnt_cancel(host, req->a_args.block, fl); out: nlm_release_call(req); - fl->fl_flags = fl_flags; return status; } @@ -598,7 +599,8 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) { struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; - int status = 0; + int status; + unsigned char fl_flags = fl->fl_flags; /* * Note: the server is supposed to either grant us the unlock @@ -607,11 +609,13 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) */ fl->fl_flags |= FL_EXISTS; down_read(&host->h_rwsem); - if (do_vfs_lock(fl) == -ENOENT) { - up_read(&host->h_rwsem); + status = do_vfs_lock(fl); + up_read(&host->h_rwsem); + fl->fl_flags = fl_flags; + if (status == -ENOENT) { + status = 0; goto out; } - up_read(&host->h_rwsem); if (req->a_flags & RPC_TASK_ASYNC) return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); From 536ff0f809b0f4d56e1c41e66768d330668e0a55 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 4 Apr 2008 15:08:02 -0400 Subject: [PATCH 58/80] NFSv4: Ensure we don't corrupt fl->fl_flags in nfs4_proc_unlck Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9f2759da74eb..a1069325b876 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3263,6 +3263,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * struct nfs4_lock_state *lsp; struct rpc_task *task; int status = 0; + unsigned char fl_flags = request->fl_flags; status = nfs4_set_lock_state(state, request); /* Unlock _before_ we do the RPC call */ @@ -3286,6 +3287,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = nfs4_wait_for_completion_rpc_task(task); rpc_put_task(task); out: + request->fl_flags = fl_flags; return status; } From 5e7f37a76fa5b604949020b7317962262812b2dd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Apr 2008 18:58:49 -0400 Subject: [PATCH 59/80] NLM/lockd: Add a reference counter to struct nlm_rqst When we replace the existing synchronous RPC calls with asynchronous calls, the reference count will be needed in order to allow us to examine the result of the RPC call. Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 3 +++ include/linux/lockd/lockd.h | 1 + 2 files changed, 4 insertions(+) diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 4e1c0123b45d..749eb5328cb0 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -221,6 +221,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) for(;;) { call = kzalloc(sizeof(*call), GFP_KERNEL); if (call != NULL) { + atomic_set(&call->a_count, 1); locks_init_lock(&call->a_args.lock.fl); locks_init_lock(&call->a_res.lock.fl); call->a_host = host; @@ -237,6 +238,8 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) void nlm_release_call(struct nlm_rqst *call) { + if (!atomic_dec_and_test(&call->a_count)) + return; nlm_release_host(call->a_host); nlmclnt_release_lockargs(call); kfree(call); diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index acf39e1e3a3d..94649a8da014 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -91,6 +91,7 @@ struct nlm_wait; */ #define NLMCLNT_OHSIZE ((__NEW_UTS_LEN) + 10u) struct nlm_rqst { + atomic_t a_count; unsigned int a_flags; /* initial RPC task flags */ struct nlm_host * a_host; /* host handle */ struct nlm_args a_args; /* arguments */ From dc9d8d048168ff61c458bec06b28996cb90b182a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 28 Mar 2008 16:04:36 -0400 Subject: [PATCH 60/80] NLM/lockd: convert __nlm_async_call to use rpc_run_task() Peter Staubach comments: > In the course of investigating testing failures in the locking phase of > the Connectathon testsuite, I discovered a couple of things. One was > that one of the tests in the locking tests was racy when it didn't seem > to need to be and two, that the NFS client asynchronously releases locks > when a process is exiting. ... > The Single UNIX Specification Version 3 specifies that: "All locks > associated with a file for a given process shall be removed when a file > descriptor for that file is closed by that process or the process holding > that file descriptor terminates.". > > This does not specify whether those locks must be released prior to the > completion of the exit processing for the process or not. However, > general assumptions seem to be that those locks will be released. This > leads to more deterministic behavior under normal circumstances. The following patch converts the NFSv2/v3 locking code to use the same mechanism as NFSv4 for sending asynchronous RPC calls and then waiting for them to complete. This ensures that the UNLOCK and CANCEL RPC calls will complete even if the user interrupts the call, yet satisfies the above request for synchronous behaviour on process exit. Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 64 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 749eb5328cb0..a34b709006a1 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -346,10 +346,16 @@ in_grace_period: /* * Generic NLM call, async version. */ -static int __nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops) +static struct rpc_task *__nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops) { struct nlm_host *host = req->a_host; struct rpc_clnt *clnt; + struct rpc_task_setup task_setup_data = { + .rpc_message = msg, + .callback_ops = tk_ops, + .callback_data = req, + .flags = RPC_TASK_ASYNC, + }; dprintk("lockd: call procedure %d on %s (async)\n", (int)proc, host->h_name); @@ -359,21 +365,36 @@ static int __nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message * if (clnt == NULL) goto out_err; msg->rpc_proc = &clnt->cl_procinfo[proc]; + task_setup_data.rpc_client = clnt; /* bootstrap and kick off the async RPC call */ - return rpc_call_async(clnt, msg, RPC_TASK_ASYNC, tk_ops, req); + return rpc_run_task(&task_setup_data); out_err: tk_ops->rpc_release(req); - return -ENOLCK; + return ERR_PTR(-ENOLCK); } +static int nlm_do_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops) +{ + struct rpc_task *task; + + task = __nlm_async_call(req, proc, msg, tk_ops); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} + +/* + * NLM asynchronous call. + */ int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) { struct rpc_message msg = { .rpc_argp = &req->a_args, .rpc_resp = &req->a_res, }; - return __nlm_async_call(req, proc, &msg, tk_ops); + return nlm_do_async_call(req, proc, &msg, tk_ops); } int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) @@ -381,7 +402,32 @@ int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *t struct rpc_message msg = { .rpc_argp = &req->a_res, }; - return __nlm_async_call(req, proc, &msg, tk_ops); + return nlm_do_async_call(req, proc, &msg, tk_ops); +} + +/* + * NLM client asynchronous call. + * + * Note that although the calls are asynchronous, and are therefore + * guaranteed to complete, we still always attempt to wait for + * completion in order to be able to correctly track the lock + * state. + */ +static int nlmclnt_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +{ + struct rpc_message msg = { + .rpc_argp = &req->a_args, + .rpc_resp = &req->a_res, + }; + struct rpc_task *task; + int err; + + task = __nlm_async_call(req, proc, &msg, tk_ops); + if (IS_ERR(task)) + return PTR_ERR(task); + err = rpc_wait_for_completion_task(task); + rpc_put_task(task); + return err; } /* @@ -620,10 +666,8 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) goto out; } - if (req->a_flags & RPC_TASK_ASYNC) - return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); - - status = nlmclnt_call(req, NLMPROC_UNLOCK); + atomic_inc(&req->a_count); + status = nlmclnt_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); if (status < 0) goto out; @@ -697,7 +741,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl nlmclnt_setlockargs(req, fl); req->a_args.block = block; - status = nlm_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); + status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); spin_lock_irqsave(¤t->sighand->siglock, flags); current->blocked = oldset; From 8ec7ff74448f65ac963e330795d771ab14ec8408 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 28 Mar 2008 16:04:51 -0400 Subject: [PATCH 61/80] NLM: Remove the signal masking in nlmclnt_proc/nlmclnt_cancel The signal masks have been rendered obsolete by the preceding patch. Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 42 +----------------------------------------- 1 file changed, 1 insertion(+), 41 deletions(-) diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index a34b709006a1..5f13e0363b28 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -155,8 +155,6 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req) int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) { struct nlm_rqst *call; - sigset_t oldset; - unsigned long flags; int status; nlm_get_host(host); @@ -168,22 +166,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); - /* Keep the old signal mask */ - spin_lock_irqsave(¤t->sighand->siglock, flags); - oldset = current->blocked; - - /* If we're cleaning up locks because the process is exiting, - * perform the RPC call asynchronously. */ - if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) - && fl->fl_type == F_UNLCK - && (current->flags & PF_EXITING)) { - sigfillset(¤t->blocked); /* Mask all signals */ - recalc_sigpending(); - - call->a_flags = RPC_TASK_ASYNC; - } - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { if (fl->fl_type != F_UNLCK) { call->a_args.block = IS_SETLKW(cmd) ? 1 : 0; @@ -198,11 +180,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) fl->fl_ops->fl_release_private(fl); fl->fl_ops = NULL; - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->blocked = oldset; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - dprintk("lockd: clnt proc returns %d\n", status); return status; } @@ -722,16 +699,6 @@ static const struct rpc_call_ops nlmclnt_unlock_ops = { static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl) { struct nlm_rqst *req; - unsigned long flags; - sigset_t oldset; - int status; - - /* Block all signals while setting up call */ - spin_lock_irqsave(¤t->sighand->siglock, flags); - oldset = current->blocked; - sigfillset(¤t->blocked); - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); req = nlm_alloc_call(nlm_get_host(host)); if (!req) @@ -741,14 +708,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl nlmclnt_setlockargs(req, fl); req->a_args.block = block; - status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->blocked = oldset; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - - return status; + return nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); } static void nlmclnt_cancel_callback(struct rpc_task *task, void *data) From 6b4b3a752b3464f2fd9fe2837fb19270c23c1d6b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 2 Apr 2008 14:40:53 -0400 Subject: [PATCH 62/80] NLM/lockd: Ensure that nlmclnt_cancel() returns results of the CANCEL call Currently, it returns success as long as the RPC call was sent. We'd like to know if the CANCEL operation succeeded on the server. Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 5f13e0363b28..ea1a6940af22 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -699,6 +699,10 @@ static const struct rpc_call_ops nlmclnt_unlock_ops = { static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl) { struct nlm_rqst *req; + int status; + + dprintk("lockd: blocking lock attempt was interrupted by a signal.\n" + " Attempting to cancel lock.\n"); req = nlm_alloc_call(nlm_get_host(host)); if (!req) @@ -708,7 +712,12 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl nlmclnt_setlockargs(req, fl); req->a_args.block = block; - return nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); + atomic_inc(&req->a_count); + status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); + if (status == 0 && req->a_res.status == nlm_lck_denied) + status = -ENOLCK; + nlm_release_call(req); + return status; } static void nlmclnt_cancel_callback(struct rpc_task *task, void *data) From 5f50c0c6d644d6c8180d9079c13c5d9de3adeb34 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Apr 2008 20:26:22 -0400 Subject: [PATCH 63/80] NLM/lockd: Fix a race when cancelling a blocking lock We shouldn't remove the lock from the list of blocked locks until the CANCEL call has completed since we may be racing with a GRANTED callback. Also ensure that we send an UNLOCK if the CANCEL request failed. Normally that should only happen if the process gets hit with a fatal signal. Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index ea1a6940af22..37d1aa20a607 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -510,6 +510,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) struct nlm_res *resp = &req->a_res; struct nlm_wait *block = NULL; unsigned char fl_flags = fl->fl_flags; + unsigned char fl_type; int status = -ENOLCK; if (nsm_monitor(host) < 0) { @@ -525,13 +526,16 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) block = nlmclnt_prepare_block(host, fl); again: + /* + * Initialise resp->status to a valid non-zero value, + * since 0 == nlm_lck_granted + */ + resp->status = nlm_lck_blocked; for(;;) { /* Reboot protection */ fl->fl_u.nfs_fl.state = host->h_state; status = nlmclnt_call(req, NLMPROC_LOCK); if (status < 0) - goto out_unblock; - if (!req->a_args.block) break; /* Did a reclaimer thread notify us of a server reboot? */ if (resp->status == nlm_lck_denied_grace_period) @@ -540,15 +544,22 @@ again: break; /* Wait on an NLM blocking lock */ status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); - /* if we were interrupted. Send a CANCEL request to the server - * and exit - */ if (status < 0) - goto out_unblock; + break; if (resp->status != nlm_lck_blocked) break; } + /* if we were interrupted while blocking, then cancel the lock request + * and exit + */ + if (resp->status == nlm_lck_blocked) { + if (!req->a_args.block) + goto out_unlock; + if (nlmclnt_cancel(host, req->a_args.block, fl) == 0) + goto out_unblock; + } + if (resp->status == nlm_granted) { down_read(&host->h_rwsem); /* Check whether or not the server has rebooted */ @@ -562,16 +573,30 @@ again: printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__); up_read(&host->h_rwsem); fl->fl_flags = fl_flags; + status = 0; } + if (status < 0) + goto out_unlock; status = nlm_stat_to_errno(resp->status); out_unblock: nlmclnt_finish_block(block); - /* Cancel the blocked request if it is still pending */ - if (resp->status == nlm_lck_blocked) - nlmclnt_cancel(host, req->a_args.block, fl); out: nlm_release_call(req); return status; +out_unlock: + /* Fatal error: ensure that we remove the lock altogether */ + dprintk("lockd: lock attempt ended in fatal error.\n" + " Attempting to unlock.\n"); + nlmclnt_finish_block(block); + fl_type = fl->fl_type; + fl->fl_type = F_UNLCK; + down_read(&host->h_rwsem); + do_vfs_lock(fl); + up_read(&host->h_rwsem); + fl->fl_type = fl_type; + fl->fl_flags = fl_flags; + nlmclnt_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); + return status; } /* From c4d7c402b788b73dc24f1e54a57f89d3dc5eb7bc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Apr 2008 20:26:52 -0400 Subject: [PATCH 64/80] NFS: Remove the buggy lock-if-signalled case from do_setlk() Both NLM and NFSv4 should be able to clean up adequately in the case where the user interrupts the RPC call... Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 10e8b807e7ff..742cb745cb47 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -566,17 +566,9 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) lock_kernel(); /* Use local locking if mounted with "-onolock" */ - if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) { + if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) status = NFS_PROTO(inode)->lock(filp, cmd, fl); - /* If we were signalled we still need to ensure that - * we clean up any state on the server. We therefore - * record the lock call as having succeeded in order to - * ensure that locks_remove_posix() cleans it out when - * the process exits. - */ - if (status == -EINTR || status == -ERESTARTSYS) - do_vfs_lock(filp, fl); - } else + else status = do_vfs_lock(filp, fl); unlock_kernel(); if (status < 0) From d11d10cc05c94a32632d6928d15a1034200dd9a5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 2 Apr 2008 14:44:05 -0400 Subject: [PATCH 65/80] NLM/lockd: Ensure client locking calls use correct credentials Now that we've added the 'generic' credentials (that are independent of the rpc_client) to the nfs_open_context, we can use those in the NLM client to ensure that the lock/unlock requests are authenticated to whoever originally opened the file. Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 37d1aa20a607..40b16f23e49a 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -247,7 +247,7 @@ static int nlm_wait_on_grace(wait_queue_head_t *queue) * Generic NLM call */ static int -nlmclnt_call(struct nlm_rqst *req, u32 proc) +nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc) { struct nlm_host *host = req->a_host; struct rpc_clnt *clnt; @@ -256,6 +256,7 @@ nlmclnt_call(struct nlm_rqst *req, u32 proc) struct rpc_message msg = { .rpc_argp = argp, .rpc_resp = resp, + .rpc_cred = cred, }; int status; @@ -390,11 +391,12 @@ int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *t * completion in order to be able to correctly track the lock * state. */ -static int nlmclnt_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +static int nlmclnt_async_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) { struct rpc_message msg = { .rpc_argp = &req->a_args, .rpc_resp = &req->a_res, + .rpc_cred = cred, }; struct rpc_task *task; int err; @@ -415,7 +417,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) { int status; - status = nlmclnt_call(req, NLMPROC_TEST); + status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_TEST); if (status < 0) goto out; @@ -506,6 +508,7 @@ static int do_vfs_lock(struct file_lock *fl) static int nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) { + struct rpc_cred *cred = nfs_file_cred(fl->fl_file); struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; struct nlm_wait *block = NULL; @@ -534,7 +537,7 @@ again: for(;;) { /* Reboot protection */ fl->fl_u.nfs_fl.state = host->h_state; - status = nlmclnt_call(req, NLMPROC_LOCK); + status = nlmclnt_call(cred, req, NLMPROC_LOCK); if (status < 0) break; /* Did a reclaimer thread notify us of a server reboot? */ @@ -595,7 +598,7 @@ out_unlock: up_read(&host->h_rwsem); fl->fl_type = fl_type; fl->fl_flags = fl_flags; - nlmclnt_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); + nlmclnt_async_call(cred, req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); return status; } @@ -619,8 +622,8 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl) nlmclnt_setlockargs(req, fl); req->a_args.reclaim = 1; - if ((status = nlmclnt_call(req, NLMPROC_LOCK)) >= 0 - && req->a_res.status == nlm_granted) + status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_LOCK); + if (status >= 0 && req->a_res.status == nlm_granted) return 0; printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d " @@ -669,7 +672,8 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) } atomic_inc(&req->a_count); - status = nlmclnt_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); + status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, + NLMPROC_UNLOCK, &nlmclnt_unlock_ops); if (status < 0) goto out; @@ -738,7 +742,8 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl req->a_args.block = block; atomic_inc(&req->a_count); - status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); + status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, + NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status == 0 && req->a_res.status == nlm_lck_denied) status = -ENOLCK; nlm_release_call(req); From 856dff3d3875bdc8b88e4a65779873af76776a69 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Mon, 31 Mar 2008 17:39:06 +0300 Subject: [PATCH 66/80] nfs: return negative error value from nfs{,4}_stat_to_errno All use sites for nfs{,4}_stat_to_errno negate their return value. It's more efficient to return a negative error from the stat_to_errno convertors rather than negating its return value everywhere. This also produces slightly smaller code. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs2xdr.c | 76 ++++++++++++++++++++++----------------------- fs/nfs/nfs3xdr.c | 34 ++++++++++---------- fs/nfs/nfs4xdr.c | 80 ++++++++++++++++++++++++------------------------ 3 files changed, 95 insertions(+), 95 deletions(-) diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 86a80b33ec82..28bab67d1519 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -267,7 +267,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) int status; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); p = xdr_decode_fattr(p, res->fattr); count = ntohl(*p++); @@ -432,7 +432,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) __be32 *end, *entry, *kaddr; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); hdrlen = (u8 *) p - (u8 *) iov->iov_base; if (iov->iov_len < hdrlen) { @@ -537,7 +537,7 @@ nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy) int status; if ((status = ntohl(*p++)) != 0) - status = -nfs_stat_to_errno(status); + status = nfs_stat_to_errno(status); return status; } @@ -551,7 +551,7 @@ nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) int status; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); xdr_decode_fattr(p, fattr); return 0; } @@ -566,7 +566,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res) int status; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); p = xdr_decode_fhandle(p, res->fh); xdr_decode_fattr(p, res->fattr); return 0; @@ -604,7 +604,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy) int status; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); /* Convert length of symlink */ len = ntohl(*p++); if (len >= rcvbuf->page_len) { @@ -653,7 +653,7 @@ nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res) int status; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); res->tsize = ntohl(*p++); res->bsize = ntohl(*p++); @@ -672,39 +672,39 @@ static struct { int errno; } nfs_errtbl[] = { { NFS_OK, 0 }, - { NFSERR_PERM, EPERM }, - { NFSERR_NOENT, ENOENT }, - { NFSERR_IO, errno_NFSERR_IO }, - { NFSERR_NXIO, ENXIO }, -/* { NFSERR_EAGAIN, EAGAIN }, */ - { NFSERR_ACCES, EACCES }, - { NFSERR_EXIST, EEXIST }, - { NFSERR_XDEV, EXDEV }, - { NFSERR_NODEV, ENODEV }, - { NFSERR_NOTDIR, ENOTDIR }, - { NFSERR_ISDIR, EISDIR }, - { NFSERR_INVAL, EINVAL }, - { NFSERR_FBIG, EFBIG }, - { NFSERR_NOSPC, ENOSPC }, - { NFSERR_ROFS, EROFS }, - { NFSERR_MLINK, EMLINK }, - { NFSERR_NAMETOOLONG, ENAMETOOLONG }, - { NFSERR_NOTEMPTY, ENOTEMPTY }, - { NFSERR_DQUOT, EDQUOT }, - { NFSERR_STALE, ESTALE }, - { NFSERR_REMOTE, EREMOTE }, + { NFSERR_PERM, -EPERM }, + { NFSERR_NOENT, -ENOENT }, + { NFSERR_IO, -errno_NFSERR_IO}, + { NFSERR_NXIO, -ENXIO }, +/* { NFSERR_EAGAIN, -EAGAIN }, */ + { NFSERR_ACCES, -EACCES }, + { NFSERR_EXIST, -EEXIST }, + { NFSERR_XDEV, -EXDEV }, + { NFSERR_NODEV, -ENODEV }, + { NFSERR_NOTDIR, -ENOTDIR }, + { NFSERR_ISDIR, -EISDIR }, + { NFSERR_INVAL, -EINVAL }, + { NFSERR_FBIG, -EFBIG }, + { NFSERR_NOSPC, -ENOSPC }, + { NFSERR_ROFS, -EROFS }, + { NFSERR_MLINK, -EMLINK }, + { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFSERR_NOTEMPTY, -ENOTEMPTY }, + { NFSERR_DQUOT, -EDQUOT }, + { NFSERR_STALE, -ESTALE }, + { NFSERR_REMOTE, -EREMOTE }, #ifdef EWFLUSH - { NFSERR_WFLUSH, EWFLUSH }, + { NFSERR_WFLUSH, -EWFLUSH }, #endif - { NFSERR_BADHANDLE, EBADHANDLE }, - { NFSERR_NOT_SYNC, ENOTSYNC }, - { NFSERR_BAD_COOKIE, EBADCOOKIE }, - { NFSERR_NOTSUPP, ENOTSUPP }, - { NFSERR_TOOSMALL, ETOOSMALL }, - { NFSERR_SERVERFAULT, ESERVERFAULT }, - { NFSERR_BADTYPE, EBADTYPE }, - { NFSERR_JUKEBOX, EJUKEBOX }, - { -1, EIO } + { NFSERR_BADHANDLE, -EBADHANDLE }, + { NFSERR_NOT_SYNC, -ENOTSYNC }, + { NFSERR_BAD_COOKIE, -EBADCOOKIE }, + { NFSERR_NOTSUPP, -ENOTSUPP }, + { NFSERR_TOOSMALL, -ETOOSMALL }, + { NFSERR_SERVERFAULT, -ESERVERFAULT }, + { NFSERR_BADTYPE, -EBADTYPE }, + { NFSERR_JUKEBOX, -EJUKEBOX }, + { -1, -EIO } }; /* diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index fb03048ac650..11cdddec1432 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -515,7 +515,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res /* Decode post_op_attrs */ p = xdr_decode_post_op_attr(p, res->dir_attr); if (status) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); /* Decode verifier cookie */ if (res->verf) { res->verf[0] = *p++; @@ -751,7 +751,7 @@ nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) int status; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); xdr_decode_fattr(p, fattr); return 0; } @@ -766,7 +766,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) int status; if ((status = ntohl(*p++))) - status = -nfs_stat_to_errno(status); + status = nfs_stat_to_errno(status); xdr_decode_wcc_data(p, fattr); return status; } @@ -786,7 +786,7 @@ nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) int status; if ((status = ntohl(*p++))) { - status = -nfs_stat_to_errno(status); + status = nfs_stat_to_errno(status); } else { if (!(p = xdr_decode_fhandle(p, res->fh))) return -errno_NFSERR_IO; @@ -806,7 +806,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) p = xdr_decode_post_op_attr(p, res->fattr); if (status) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); res->access = ntohl(*p++); return 0; } @@ -843,7 +843,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) p = xdr_decode_post_op_attr(p, fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); /* Convert length of symlink */ len = ntohl(*p++); @@ -891,7 +891,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); /* Decode reply count and EOF flag. NFSv3 is somewhat redundant * in that it puts the count both in the res struct and in the @@ -941,7 +941,7 @@ nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) p = xdr_decode_wcc_data(p, res->fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); res->count = ntohl(*p++); res->verf->committed = (enum nfs3_stable_how)ntohl(*p++); @@ -972,7 +972,7 @@ nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) res->fattr->valid = 0; } } else { - status = -nfs_stat_to_errno(status); + status = nfs_stat_to_errno(status); } p = xdr_decode_wcc_data(p, res->dir_attr); return status; @@ -987,7 +987,7 @@ nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res) int status; if ((status = ntohl(*p++)) != 0) - status = -nfs_stat_to_errno(status); + status = nfs_stat_to_errno(status); p = xdr_decode_wcc_data(p, res->fromattr); p = xdr_decode_wcc_data(p, res->toattr); return status; @@ -1002,7 +1002,7 @@ nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res) int status; if ((status = ntohl(*p++)) != 0) - status = -nfs_stat_to_errno(status); + status = nfs_stat_to_errno(status); p = xdr_decode_post_op_attr(p, res->fattr); p = xdr_decode_wcc_data(p, res->dir_attr); return status; @@ -1020,7 +1020,7 @@ nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res) p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); p = xdr_decode_hyper(p, &res->tbytes); p = xdr_decode_hyper(p, &res->fbytes); @@ -1045,7 +1045,7 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); res->rtmax = ntohl(*p++); res->rtpref = ntohl(*p++); @@ -1073,7 +1073,7 @@ nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res) p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); res->max_link = ntohl(*p++); res->max_namelen = ntohl(*p++); @@ -1092,7 +1092,7 @@ nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) status = ntohl(*p++); p = xdr_decode_wcc_data(p, res->fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); res->verf->verifier[0] = *p++; res->verf->verifier[1] = *p++; @@ -1114,7 +1114,7 @@ nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p, int err, base; if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); p = xdr_decode_post_op_attr(p, res->fattr); res->mask = ntohl(*p++); if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) @@ -1141,7 +1141,7 @@ nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) int status = ntohl(*p++); if (status) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); xdr_decode_post_op_attr(p, fattr); return 0; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index b8b2e391d183..809ef0d787e3 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2241,7 +2241,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) } READ32(nfserr); if (nfserr != NFS_OK) - return -nfs4_stat_to_errno(nfserr); + return nfs4_stat_to_errno(nfserr); return 0; } @@ -3760,7 +3760,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) READ_BUF(len); return -NFSERR_CLID_INUSE; } else - return -nfs4_stat_to_errno(nfserr); + return nfs4_stat_to_errno(nfserr); return 0; } @@ -4422,7 +4422,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf if (!status) status = decode_fsinfo(&xdr, fsinfo); if (!status) - status = -nfs4_stat_to_errno(hdr.status); + status = nfs4_stat_to_errno(hdr.status); return status; } @@ -4512,7 +4512,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, if (!status) status = decode_setclientid(&xdr, clp); if (!status) - status = -nfs4_stat_to_errno(hdr.status); + status = nfs4_stat_to_errno(hdr.status); return status; } @@ -4534,7 +4534,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str if (!status) status = decode_fsinfo(&xdr, fsinfo); if (!status) - status = -nfs4_stat_to_errno(hdr.status); + status = nfs4_stat_to_errno(hdr.status); return status; } @@ -4644,42 +4644,42 @@ static struct { int errno; } nfs_errtbl[] = { { NFS4_OK, 0 }, - { NFS4ERR_PERM, EPERM }, - { NFS4ERR_NOENT, ENOENT }, - { NFS4ERR_IO, errno_NFSERR_IO }, - { NFS4ERR_NXIO, ENXIO }, - { NFS4ERR_ACCESS, EACCES }, - { NFS4ERR_EXIST, EEXIST }, - { NFS4ERR_XDEV, EXDEV }, - { NFS4ERR_NOTDIR, ENOTDIR }, - { NFS4ERR_ISDIR, EISDIR }, - { NFS4ERR_INVAL, EINVAL }, - { NFS4ERR_FBIG, EFBIG }, - { NFS4ERR_NOSPC, ENOSPC }, - { NFS4ERR_ROFS, EROFS }, - { NFS4ERR_MLINK, EMLINK }, - { NFS4ERR_NAMETOOLONG, ENAMETOOLONG }, - { NFS4ERR_NOTEMPTY, ENOTEMPTY }, - { NFS4ERR_DQUOT, EDQUOT }, - { NFS4ERR_STALE, ESTALE }, - { NFS4ERR_BADHANDLE, EBADHANDLE }, - { NFS4ERR_BADOWNER, EINVAL }, - { NFS4ERR_BADNAME, EINVAL }, - { NFS4ERR_BAD_COOKIE, EBADCOOKIE }, - { NFS4ERR_NOTSUPP, ENOTSUPP }, - { NFS4ERR_TOOSMALL, ETOOSMALL }, - { NFS4ERR_SERVERFAULT, ESERVERFAULT }, - { NFS4ERR_BADTYPE, EBADTYPE }, - { NFS4ERR_LOCKED, EAGAIN }, - { NFS4ERR_RESOURCE, EREMOTEIO }, - { NFS4ERR_SYMLINK, ELOOP }, - { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP }, - { NFS4ERR_DEADLOCK, EDEADLK }, - { NFS4ERR_WRONGSEC, EPERM }, /* FIXME: this needs + { NFS4ERR_PERM, -EPERM }, + { NFS4ERR_NOENT, -ENOENT }, + { NFS4ERR_IO, -errno_NFSERR_IO}, + { NFS4ERR_NXIO, -ENXIO }, + { NFS4ERR_ACCESS, -EACCES }, + { NFS4ERR_EXIST, -EEXIST }, + { NFS4ERR_XDEV, -EXDEV }, + { NFS4ERR_NOTDIR, -ENOTDIR }, + { NFS4ERR_ISDIR, -EISDIR }, + { NFS4ERR_INVAL, -EINVAL }, + { NFS4ERR_FBIG, -EFBIG }, + { NFS4ERR_NOSPC, -ENOSPC }, + { NFS4ERR_ROFS, -EROFS }, + { NFS4ERR_MLINK, -EMLINK }, + { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, + { NFS4ERR_DQUOT, -EDQUOT }, + { NFS4ERR_STALE, -ESTALE }, + { NFS4ERR_BADHANDLE, -EBADHANDLE }, + { NFS4ERR_BADOWNER, -EINVAL }, + { NFS4ERR_BADNAME, -EINVAL }, + { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, + { NFS4ERR_NOTSUPP, -ENOTSUPP }, + { NFS4ERR_TOOSMALL, -ETOOSMALL }, + { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, + { NFS4ERR_BADTYPE, -EBADTYPE }, + { NFS4ERR_LOCKED, -EAGAIN }, + { NFS4ERR_RESOURCE, -EREMOTEIO }, + { NFS4ERR_SYMLINK, -ELOOP }, + { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, + { NFS4ERR_DEADLOCK, -EDEADLK }, + { NFS4ERR_WRONGSEC, -EPERM }, /* FIXME: this needs * to be handled by a * middle-layer. */ - { -1, EIO } + { -1, -EIO } }; /* @@ -4696,14 +4696,14 @@ nfs4_stat_to_errno(int stat) } if (stat <= 10000 || stat > 10100) { /* The server is looney tunes. */ - return ESERVERFAULT; + return -ESERVERFAULT; } /* If we cannot translate the error, the recovery routines should * handle it. * Note: remaining NFSv4 error codes have values > 10000, so should * not conflict with native Linux error codes. */ - return stat; + return -stat; } #define PROC(proc, argtype, restype) \ From 441092415770ddec648800701895913c4bfd60c1 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 2 Apr 2008 15:21:15 +0300 Subject: [PATCH 67/80] nfs: fix printout of multiword bitfields Benny points out that zero-padding of multiword bitfields is necessary, and that delimiting each word is nice to avoid endianess confusion. bhalevy: without zero padding output can be ambiguous. Also, since the printed array of two 32-bit unsigned integers is not a 64-bit number, delimiting the output with a semicolon makes more sense. Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 809ef0d787e3..5a2d64927b35 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1191,8 +1191,8 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; WRITE32(attrs[0] & readdir->bitmask[0]); WRITE32(attrs[1] & readdir->bitmask[1]); - dprintk("%s: cookie = %Lu, verifier = 0x%x%x, bitmap = 0x%x%x\n", - __FUNCTION__, + dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", + __func__, (unsigned long long)readdir->cookie, ((u32 *)readdir->verifier.data)[0], ((u32 *)readdir->verifier.data)[1], @@ -2291,7 +2291,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; } else bitmask[0] = bitmask[1] = 0; - dprintk("%s: bitmask=0x%x%x\n", __FUNCTION__, bitmask[0], bitmask[1]); + dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); return 0; } @@ -3505,8 +3505,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n return status; READ_BUF(8); COPYMEM(readdir->verifier.data, 8); - dprintk("%s: verifier = 0x%x%x\n", - __FUNCTION__, + dprintk("%s: verifier = %08x:%08x\n", + __func__, ((u32 *)readdir->verifier.data)[0], ((u32 *)readdir->verifier.data)[1]); From 78ea323be6380a9313e87fe241809e912e8ae401 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 7 Apr 2008 20:49:28 -0400 Subject: [PATCH 68/80] NFSv4: Don't use cred->cr_ops->cr_name in nfs4_proc_setclientid() With the recent change to generic creds, we can no longer use cred->cr_ops->cr_name to distinguish between RPCSEC_GSS principals and AUTH_SYS/AUTH_NULL identities. Replace it with the rpc_authops->au_name instead... Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- include/linux/sunrpc/auth.h | 2 -- net/sunrpc/auth_generic.c | 2 -- net/sunrpc/auth_gss/auth_gss.c | 2 -- net/sunrpc/auth_null.c | 2 -- net/sunrpc/auth_unix.c | 2 -- 6 files changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a1069325b876..dbc09271af02 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2885,7 +2885,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po RPC_DISPLAY_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO), - cred->cr_ops->cr_name, + clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_id_uniquifier); setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, sizeof(setclientid.sc_netid), diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index e93cd8aa3eb6..a19c3af933ca 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -96,9 +96,7 @@ struct rpc_auth { struct rpc_authops { struct module *owner; rpc_authflavor_t au_flavor; /* flavor (RPC_AUTH_*) */ -#ifdef RPC_DEBUG char * au_name; -#endif struct rpc_auth * (*create)(struct rpc_clnt *, rpc_authflavor_t); void (*destroy)(struct rpc_auth *); diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 6a3f77c9e4d3..b6f124c85072 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -136,9 +136,7 @@ static struct rpc_cred_cache generic_cred_cache = { static const struct rpc_authops generic_auth_ops = { .owner = THIS_MODULE, -#ifdef RPC_DEBUG .au_name = "Generic", -#endif .lookup_cred = generic_lookup_cred, .crcreate = generic_create_cred, }; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 55948cd5ea55..7567eb95823b 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1287,9 +1287,7 @@ out: static const struct rpc_authops authgss_ops = { .owner = THIS_MODULE, .au_flavor = RPC_AUTH_GSS, -#ifdef RPC_DEBUG .au_name = "RPCSEC_GSS", -#endif .create = gss_create, .destroy = gss_destroy, .lookup_cred = gss_lookup_cred, diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 3c26c18df0de..c70dd7f5258e 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -104,9 +104,7 @@ nul_validate(struct rpc_task *task, __be32 *p) const struct rpc_authops authnull_ops = { .owner = THIS_MODULE, .au_flavor = RPC_AUTH_NULL, -#ifdef RPC_DEBUG .au_name = "NULL", -#endif .create = nul_create, .destroy = nul_destroy, .lookup_cred = nul_lookup_cred, diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 04e936a56fb2..44920b90bdc4 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -210,9 +210,7 @@ void __init rpc_init_authunix(void) const struct rpc_authops authunix_ops = { .owner = THIS_MODULE, .au_flavor = RPC_AUTH_UNIX, -#ifdef RPC_DEBUG .au_name = "UNIX", -#endif .create = unx_create, .destroy = unx_destroy, .lookup_cred = unx_lookup_cred, From 7c67db3a8a98045744f06fcd6d8f476d9df0ba5c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 7 Apr 2008 20:50:11 -0400 Subject: [PATCH 69/80] NFSv4: Reintroduce machine creds We need to try to ensure that we always use the same credentials whenever we re-establish the clientid on the server. If not, the server won't recognise that we're the same client, and so may not allow us to recover state. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 7 +++++++ include/linux/nfs_fs_sb.h | 2 ++ include/linux/sunrpc/auth.h | 2 ++ include/linux/sunrpc/auth_gss.h | 1 + net/sunrpc/auth_generic.c | 26 ++++++++++++++++++++++++-- net/sunrpc/auth_gss/auth_gss.c | 12 +++++++++++- 6 files changed, 47 insertions(+), 3 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 93dfd75aba7c..f2f3b284e6dd 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -112,6 +112,7 @@ struct nfs_client_initdata { static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp; + struct rpc_cred *cred; if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) goto error_0; @@ -150,6 +151,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_boot_time = CURRENT_TIME; clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; #endif + cred = rpc_lookup_machine_cred(); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; return clp; @@ -191,6 +195,9 @@ static void nfs_free_client(struct nfs_client *clp) if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) nfs_callback_down(); + if (clp->cl_machine_cred != NULL) + put_rpccred(clp->cl_machine_cred); + kfree(clp->cl_hostname); kfree(clp); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index ac7e4fb943ea..c9beacd16c00 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -32,6 +32,8 @@ struct nfs_client { const struct nfs_rpc_ops *rpc_ops; /* NFS protocol vector */ int cl_proto; /* Network transport protocol */ + struct rpc_cred *cl_machine_cred; + #ifdef CONFIG_NFS_V4 u64 cl_clientid; /* constant */ nfs4_verifier cl_confirm; diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index a19c3af933ca..3f632182d8eb 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -26,6 +26,7 @@ struct auth_cred { uid_t uid; gid_t gid; struct group_info *group_info; + unsigned char machine_cred : 1; }; /* @@ -130,6 +131,7 @@ void __exit rpcauth_remove_module(void); void __exit rpc_destroy_generic_auth(void); struct rpc_cred * rpc_lookup_cred(void); +struct rpc_cred * rpc_lookup_machine_cred(void); int rpcauth_register(const struct rpc_authops *); int rpcauth_unregister(const struct rpc_authops *); struct rpc_auth * rpcauth_create(rpc_authflavor_t, struct rpc_clnt *); diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h index 67658e17a375..fec6899bf355 100644 --- a/include/linux/sunrpc/auth_gss.h +++ b/include/linux/sunrpc/auth_gss.h @@ -84,6 +84,7 @@ struct gss_cred { enum rpc_gss_svc gc_service; struct gss_cl_ctx *gc_ctx; struct gss_upcall_msg *gc_upcall; + unsigned char gc_machine_cred : 1; }; #endif /* __KERNEL__ */ diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index b6f124c85072..d927d9f57412 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -17,6 +17,9 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif +#define RPC_ANONYMOUS_USERID ((uid_t)-2) +#define RPC_ANONYMOUS_GROUPID ((gid_t)-2) + struct generic_cred { struct rpc_cred gc_base; struct auth_cred acred; @@ -35,6 +38,22 @@ struct rpc_cred *rpc_lookup_cred(void) } EXPORT_SYMBOL_GPL(rpc_lookup_cred); +/* + * Public call interface for looking up machine creds. + */ +struct rpc_cred *rpc_lookup_machine_cred(void) +{ + struct auth_cred acred = { + .uid = RPC_ANONYMOUS_USERID, + .gid = RPC_ANONYMOUS_GROUPID, + .machine_cred = 1, + }; + + dprintk("RPC: looking up machine cred\n"); + return generic_auth.au_ops->lookup_cred(&generic_auth, &acred, 0); +} +EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred); + static void generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred) { @@ -75,8 +94,10 @@ generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) gcred->acred.group_info = acred->group_info; if (gcred->acred.group_info != NULL) get_group_info(gcred->acred.group_info); + gcred->acred.machine_cred = acred->machine_cred; - dprintk("RPC: allocated generic cred %p for uid %d gid %d\n", + dprintk("RPC: allocated %s cred %p for uid %d gid %d\n", + gcred->acred.machine_cred ? "machine" : "generic", gcred, acred->uid, acred->gid); return &gcred->gc_base; } @@ -115,7 +136,8 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) if (gcred->acred.uid != acred->uid || gcred->acred.gid != acred->gid || - gcred->acred.group_info != acred->group_info) + gcred->acred.group_info != acred->group_info || + gcred->acred.machine_cred != acred->machine_cred) return 0; return 1; } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 7567eb95823b..46f7ec800af9 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -371,9 +371,16 @@ gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid) static struct gss_upcall_msg * gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cred *cred) { + struct gss_cred *gss_cred = container_of(cred, + struct gss_cred, gc_base); struct gss_upcall_msg *gss_new, *gss_msg; + uid_t uid = cred->cr_uid; - gss_new = gss_alloc_msg(gss_auth, cred->cr_uid); + /* Special case: rpc.gssd assumes that uid == 0 implies machine creds */ + if (gss_cred->gc_machine_cred != 0) + uid = 0; + + gss_new = gss_alloc_msg(gss_auth, uid); if (gss_new == NULL) return ERR_PTR(-ENOMEM); gss_msg = gss_add_msg(gss_auth, gss_new); @@ -818,6 +825,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) */ cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW; cred->gc_service = gss_auth->service; + cred->gc_machine_cred = acred->machine_cred; kref_get(&gss_auth->kref); return &cred->gc_base; @@ -855,6 +863,8 @@ gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags) if (gss_cred->gc_ctx && time_after(jiffies, gss_cred->gc_ctx->gc_expiry)) return 0; out: + if (acred->machine_cred != gss_cred->gc_machine_cred) + return 0; return (rc->cr_uid == acred->uid); } From a2b2bb8822c78806930bbb4d4c5bb3ae69648fd0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Apr 2008 16:02:17 -0400 Subject: [PATCH 70/80] NFSv4: Attempt to use machine credentials in SETCLIENTID calls Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 7775435ea7a5..46eb624e4f16 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -71,6 +71,29 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) return status; } +static struct rpc_cred *nfs4_get_machine_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred = NULL; + + spin_lock(&clp->cl_lock); + if (clp->cl_machine_cred != NULL) + cred = get_rpccred(clp->cl_machine_cred); + spin_unlock(&clp->cl_lock); + return cred; +} + +static void nfs4_clear_machine_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred; + + spin_lock(&clp->cl_lock); + cred = clp->cl_machine_cred; + clp->cl_machine_cred = NULL; + spin_unlock(&clp->cl_lock); + if (cred != NULL) + put_rpccred(cred); +} + struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) { struct nfs4_state_owner *sp; @@ -91,13 +114,18 @@ static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) { struct nfs4_state_owner *sp; struct rb_node *pos; + struct rpc_cred *cred; + cred = nfs4_get_machine_cred(clp); + if (cred != NULL) + goto out; pos = rb_first(&clp->cl_state_owners); if (pos != NULL) { sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); - return get_rpccred(sp->so_cred); + cred = get_rpccred(sp->so_cred); } - return NULL; +out: + return cred; } static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new, @@ -924,10 +952,10 @@ restart_loop: if (cred != NULL) { /* Yes there are: try to renew the old lease */ status = nfs4_proc_renew(clp, cred); + put_rpccred(cred); switch (status) { case 0: case -NFS4ERR_CB_PATH_DOWN: - put_rpccred(cred); goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_LEASE_MOVED: @@ -936,14 +964,19 @@ restart_loop: } else { /* "reboot" to ensure we clear all state on the server */ clp->cl_boot_time = CURRENT_TIME; - cred = nfs4_get_setclientid_cred(clp); } /* We're going to have to re-establish a clientid */ nfs4_state_mark_reclaim(clp); status = -ENOENT; + cred = nfs4_get_setclientid_cred(clp); if (cred != NULL) { status = nfs4_init_client(clp, cred); put_rpccred(cred); + /* Handle case where the user hasn't set up machine creds */ + if (status == -EACCES && cred == clp->cl_machine_cred) { + nfs4_clear_machine_cred(clp); + goto restart_loop; + } } if (status) goto out_error; From d2b831416365e8b1f27809b62d5e4260883956cc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 14 Apr 2008 18:13:37 -0400 Subject: [PATCH 71/80] SUNRPC: Protect creds against early garbage collection Signed-off-by: Trond Myklebust --- net/sunrpc/auth.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 0632cd0a1ad7..6bfea9ed6869 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -220,6 +220,9 @@ rpcauth_destroy_credcache(struct rpc_auth *auth) } EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache); + +#define RPC_AUTH_EXPIRY_MORATORIUM (60 * HZ) + /* * Remove stale credentials. Avoid sleeping inside the loop. */ @@ -228,6 +231,7 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) { spinlock_t *cache_lock; struct rpc_cred *cred; + unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM; while (!list_empty(&cred_unused)) { cred = list_entry(cred_unused.next, struct rpc_cred, cr_lru); @@ -235,6 +239,10 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) number_cred_unused--; if (atomic_read(&cred->cr_count) != 0) continue; + /* Enforce a 5 second garbage collection moratorium */ + if (time_in_range(cred->cr_expire, expired, jiffies) && + test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) + continue; cache_lock = &cred->cr_auth->au_credcache->lock; spin_lock(cache_lock); if (atomic_read(&cred->cr_count) == 0) { From 06b4b681ababc20596aa947595714710f557131d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Apr 2008 16:51:38 -0400 Subject: [PATCH 72/80] SUNRPC: remove XS_SENDMSG_RETRY The condition for exiting from the loop in xs_tcp_send_request() should be that we find we're not making progress (i.e. number of bytes sent is 0). Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4c2462e2f2b2..4a567a93e6ad 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -135,12 +135,6 @@ static ctl_table sunrpc_table[] = { #endif -/* - * How many times to try sending a request on a socket before waiting - * for the socket buffer to clear. - */ -#define XS_SENDMSG_RETRY (10U) - /* * Time out for an RPC UDP socket connect. UDP socket connects are * synchronous, but we set a timeout anyway in case of resource @@ -666,7 +660,6 @@ static int xs_tcp_send_request(struct rpc_task *task) struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; int status; - unsigned int retry = 0; xs_encode_tcp_record_marker(&req->rq_snd_buf); @@ -697,9 +690,10 @@ static int xs_tcp_send_request(struct rpc_task *task) return 0; } + if (status != 0) + continue; status = -EAGAIN; - if (retry++ > XS_SENDMSG_RETRY) - break; + break; } switch (status) { From 636ac43318ce6939c1698fb67e714d421314ed71 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 17 Apr 2008 14:00:47 -0400 Subject: [PATCH 73/80] SUNRPC: Remove the unused export of xprt_force_disconnect Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 5110a4ea7fdf..a0646a3b4a39 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -605,7 +605,6 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock_bh(&xprt->transport_lock); } -EXPORT_SYMBOL_GPL(xprt_force_disconnect); static void xprt_init_autodisconnect(unsigned long data) From 7c1d71cf56feebfb5b98219b9d11dfc3a2feca62 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 17 Apr 2008 16:52:57 -0400 Subject: [PATCH 74/80] SUNRPC: Don't disconnect more than once if retransmitting NFSv4 requests NFSv4 requires us to ensure that we break the TCP connection before we're allowed to retransmit a request. However in the case where we're retransmitting several requests that have been sent on the same connection, we need to ensure that we don't interfere with the attempt to reconnect and/or break the connection again once it has been established. We therefore introduce a 'connection' cookie that is bumped every time a connection is broken. This allows requests to track if they need to force a disconnection. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 8 ++++++++ net/sunrpc/clnt.c | 6 ++++-- net/sunrpc/xprt.c | 29 +++++++++++++++++++++++++++++ net/sunrpc/xprtsock.c | 2 ++ 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 8a0629abb86a..4d80a118d538 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -86,6 +86,10 @@ struct rpc_rqst { unsigned long rq_majortimeo; /* major timeout alarm */ unsigned long rq_timeout; /* Current timeout value */ unsigned int rq_retries; /* # of retries */ + unsigned int rq_connect_cookie; + /* A cookie used to track the + state of the transport + connection */ /* * Partial send handling @@ -152,6 +156,9 @@ struct rpc_xprt { unsigned long connect_timeout, bind_timeout, reestablish_timeout; + unsigned int connect_cookie; /* A cookie that gets bumped + every time the transport + is reconnected */ /* * Disconnection of idle transports @@ -241,6 +248,7 @@ void xprt_complete_rqst(struct rpc_task *task, int copied); void xprt_release_rqst_cong(struct rpc_task *task); void xprt_disconnect_done(struct rpc_xprt *xprt); void xprt_force_disconnect(struct rpc_xprt *xprt); +void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); /* * Reserved bit positions in xprt->state diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 3ae560464513..8773b4342c92 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1120,7 +1120,8 @@ call_status(struct rpc_task *task) case -ETIMEDOUT: task->tk_action = call_timeout; if (task->tk_client->cl_discrtry) - xprt_force_disconnect(task->tk_xprt); + xprt_conditional_disconnect(task->tk_xprt, + req->rq_connect_cookie); break; case -ECONNREFUSED: case -ENOTCONN: @@ -1245,7 +1246,8 @@ out_retry: if (task->tk_rqstp == req) { req->rq_received = req->rq_rcv_buf.len = 0; if (task->tk_client->cl_discrtry) - xprt_force_disconnect(task->tk_xprt); + xprt_conditional_disconnect(task->tk_xprt, + req->rq_connect_cookie); } } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a0646a3b4a39..75d748eee0eb 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -606,6 +606,34 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) spin_unlock_bh(&xprt->transport_lock); } +/** + * xprt_conditional_disconnect - force a transport to disconnect + * @xprt: transport to disconnect + * @cookie: 'connection cookie' + * + * This attempts to break the connection if and only if 'cookie' matches + * the current transport 'connection cookie'. It ensures that we don't + * try to break the connection more than once when we need to retransmit + * a batch of RPC requests. + * + */ +void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) +{ + /* Don't race with the test_bit() in xprt_clear_locked() */ + spin_lock_bh(&xprt->transport_lock); + if (cookie != xprt->connect_cookie) + goto out; + if (test_bit(XPRT_CLOSING, &xprt->state) || !xprt_connected(xprt)) + goto out; + set_bit(XPRT_CLOSE_WAIT, &xprt->state); + /* Try to schedule an autoclose RPC call */ + if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) + queue_work(rpciod_workqueue, &xprt->task_cleanup); + xprt_wake_pending_tasks(xprt, -ENOTCONN); +out: + spin_unlock_bh(&xprt->transport_lock); +} + static void xprt_init_autodisconnect(unsigned long data) { @@ -849,6 +877,7 @@ void xprt_transmit(struct rpc_task *task) } else if (!req->rq_bytes_sent) return; + req->rq_connect_cookie = xprt->connect_cookie; status = xprt->ops->send_request(task); if (status == 0) { dprintk("RPC: %5u xmit complete\n", task->tk_pid); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4a567a93e6ad..63d79e347c00 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1142,6 +1142,7 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_FIN_WAIT1: /* The client initiated a shutdown of the socket */ + xprt->connect_cookie++; xprt->reestablish_timeout = 0; set_bit(XPRT_CLOSING, &xprt->state); smp_mb__before_clear_bit(); @@ -1154,6 +1155,7 @@ static void xs_tcp_state_change(struct sock *sk) set_bit(XPRT_CLOSING, &xprt->state); xprt_force_disconnect(xprt); case TCP_SYN_SENT: + xprt->connect_cookie++; case TCP_CLOSING: /* * If the server closed down the connection, make sure that From 7b6962b0a6000df48c8a5fd967d262f77704101b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 17 Apr 2008 16:53:01 -0400 Subject: [PATCH 75/80] SUNRPC: Fix a race in gss_refresh_upcall() If the downcall completes before we get the spin_lock then we currently fail to refresh the credential. Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 46f7ec800af9..6f1b4e2f5e81 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -117,6 +117,7 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) struct gss_cl_ctx *old; old = gss_cred->gc_ctx; + gss_get_ctx(ctx); rcu_assign_pointer(gss_cred->gc_ctx, ctx); set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); @@ -340,7 +341,7 @@ gss_upcall_callback(struct rpc_task *task) spin_lock(&inode->i_lock); if (gss_msg->ctx) - gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_get_ctx(gss_msg->ctx)); + gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_msg->ctx); else task->tk_status = gss_msg->msg.errno; gss_cred->gc_upcall = NULL; @@ -417,7 +418,11 @@ gss_refresh_upcall(struct rpc_task *task) spin_lock(&inode->i_lock); if (gss_cred->gc_upcall != NULL) rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL); - else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { + else if (gss_msg->ctx != NULL) { + gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_msg->ctx); + gss_cred->gc_upcall = NULL; + rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); + } else if (gss_msg->msg.errno >= 0) { task->tk_timeout = 0; gss_cred->gc_upcall = gss_msg; /* gss_upcall_callback will release the reference to gss_upcall_msg */ @@ -462,7 +467,7 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) schedule(); } if (gss_msg->ctx) - gss_cred_set_ctx(cred, gss_get_ctx(gss_msg->ctx)); + gss_cred_set_ctx(cred, gss_msg->ctx); else err = gss_msg->msg.errno; spin_unlock(&inode->i_lock); From cd019f7517206a74d8cdb64d5c82b1f76be608cc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 17 Apr 2008 17:03:58 -0400 Subject: [PATCH 76/80] SUNRPC: Don't change the RPCSEC_GSS context on a credential that is in use When a server rejects our credential with an AUTH_REJECTEDCRED or similar, we need to refresh the credential and then retry the request. However, we do want to allow any requests that are in flight to finish executing, so that we can at least attempt to process the replies that depend on this instance of the credential. The solution is to ensure that gss_refresh() looks up an entirely new RPCSEC_GSS credential instead of attempting to create a context for the existing invalid credential. Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 69 +++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 6f1b4e2f5e81..621c07f322c4 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -114,28 +114,14 @@ static void gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); - struct gss_cl_ctx *old; - old = gss_cred->gc_ctx; + if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) + return; gss_get_ctx(ctx); rcu_assign_pointer(gss_cred->gc_ctx, ctx); set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); + smp_mb__before_clear_bit(); clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); - if (old) - gss_put_ctx(old); -} - -static int -gss_cred_is_uptodate_ctx(struct rpc_cred *cred) -{ - struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); - int res = 0; - - rcu_read_lock(); - if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) && gss_cred->gc_ctx) - res = 1; - rcu_read_unlock(); - return res; } static const void * @@ -857,15 +843,12 @@ gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); - /* - * If the searchflags have set RPCAUTH_LOOKUP_NEW, then - * we don't really care if the credential has expired or not, - * since the caller should be prepared to reinitialise it. - */ - if ((flags & RPCAUTH_LOOKUP_NEW) && test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags)) + if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags)) goto out; /* Don't match with creds that have expired. */ - if (gss_cred->gc_ctx && time_after(jiffies, gss_cred->gc_ctx->gc_expiry)) + if (time_after(jiffies, gss_cred->gc_ctx->gc_expiry)) + return 0; + if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags)) return 0; out: if (acred->machine_cred != gss_cred->gc_machine_cred) @@ -933,16 +916,48 @@ out_put_ctx: return NULL; } +static int gss_renew_cred(struct rpc_task *task) +{ + struct rpc_cred *oldcred = task->tk_msg.rpc_cred; + struct gss_cred *gss_cred = container_of(oldcred, + struct gss_cred, + gc_base); + struct rpc_auth *auth = oldcred->cr_auth; + struct auth_cred acred = { + .uid = oldcred->cr_uid, + .machine_cred = gss_cred->gc_machine_cred, + }; + struct rpc_cred *new; + + new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW); + if (IS_ERR(new)) + return PTR_ERR(new); + task->tk_msg.rpc_cred = new; + put_rpccred(oldcred); + return 0; +} + /* * Refresh credentials. XXX - finish */ static int gss_refresh(struct rpc_task *task) { + struct rpc_cred *cred = task->tk_msg.rpc_cred; + int ret = 0; - if (!gss_cred_is_uptodate_ctx(task->tk_msg.rpc_cred)) - return gss_refresh_upcall(task); - return 0; + if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && + !test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) { + ret = gss_renew_cred(task); + if (ret < 0) + goto out; + cred = task->tk_msg.rpc_cred; + } + + if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) + ret = gss_refresh_upcall(task); +out: + return ret; } /* Dummy refresh routine: used only when destroying the context */ From 63649bd7080a6a50fabcb1935f4b7c4e64155066 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 17 Apr 2008 20:42:09 +0400 Subject: [PATCH 77/80] NFS - fix potential NULL pointer dereference v2 There is possible NULL pointer dereference if kstr[n]dup failed. So fix them for safety. Signed-off-by: Cyrill Gorcunov Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c99ca1f992ce..2215bcd24bd2 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1297,6 +1297,8 @@ static int nfs_validate_mount_data(void *options, args->namlen = data->namlen; args->bsize = data->bsize; args->auth_flavors[0] = data->pseudoflavor; + if (!args->nfs_server.hostname) + goto out_nomem; /* * The legacy version 6 binary mount data from userspace has a @@ -1343,6 +1345,8 @@ static int nfs_validate_mount_data(void *options, len = c - dev_name; /* N.B. caller will free nfs_server.hostname in all cases */ args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); + if (!args->nfs_server.hostname) + goto out_nomem; c++; if (strlen(c) > NFS_MAXPATHLEN) @@ -1386,6 +1390,10 @@ out_v3_not_compiled: return -EPROTONOSUPPORT; #endif /* !CONFIG_NFS_V3 */ +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); + return -ENOMEM; + out_no_address: dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); return -EINVAL; @@ -1892,12 +1900,16 @@ static int nfs4_validate_mount_data(void *options, return -ENAMETOOLONG; /* N.B. caller will free nfs_server.hostname in all cases */ args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); + if (!args->nfs_server.hostname) + goto out_nomem; c++; /* step over the ':' */ len = strlen(c); if (len > NFS4_MAXPATHLEN) return -ENAMETOOLONG; args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL); + if (!args->nfs_server.export_path) + goto out_nomem; dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path); @@ -1919,6 +1931,10 @@ out_inval_auth: data->auth_flavourlen); return -EINVAL; +out_nomem: + dfprintk(MOUNT, "NFS4: not enough memory to handle mount options\n"); + return -ENOMEM; + out_no_address: dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); return -EINVAL; From daa7da5fd3040e08e3d7878cd0667c1f4cfd338a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Apr 2008 11:50:58 -0400 Subject: [PATCH 78/80] NFS: remove duplicate flags assignment from nfs_validate_mount_data Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2215bcd24bd2..88a0ecd84c65 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1275,7 +1275,6 @@ static int nfs_validate_mount_data(void *options, args->flags = data->flags; args->rsize = data->rsize; args->wsize = data->wsize; - args->flags = data->flags; args->timeo = data->timeo; args->retrans = data->retrans; args->acregmin = data->acregmin; From a3dab293539031b0970585b9b355cebbc91ecbd4 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 14 Apr 2008 21:41:32 +0300 Subject: [PATCH 79/80] make nfs_automount_list static nfs_automount_list can now become static. Signed-off-by: Adrian Bunk Signed-off-by: Trond Myklebust --- fs/nfs/namespace.c | 2 +- include/linux/nfs_fs.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 607f6eb9cdb5..af4d0f1e402c 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -20,7 +20,7 @@ static void nfs_expire_automounts(struct work_struct *work); -LIST_HEAD(nfs_automount_list); +static LIST_HEAD(nfs_automount_list); static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); int nfs_mountpoint_expiry_timeout = 500 * HZ; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 7f0602c87712..27d6a8d98cef 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -430,7 +430,6 @@ extern void nfs_unregister_sysctl(void); /* * linux/fs/nfs/namespace.c */ -extern struct list_head nfs_automount_list; extern const struct inode_operations nfs_mountpoint_inode_operations; extern const struct inode_operations nfs_referral_inode_operations; extern int nfs_mountpoint_expiry_timeout; From b48633bd086d21f4a2a5bea96c7e6c7ba58eb60c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Apr 2008 16:47:55 -0400 Subject: [PATCH 80/80] SUNRPC: Invalidate the RPCSEC_GSS session if the server dropped the request RFC 2203 requires the server to drop the request if it believes the RPCSEC_GSS context is out of sequence. The problem is that we have no way on the client to know why the server dropped the request. In order to avoid spinning forever trying to resend the request, the safe approach is therefore to always invalidate the RPCSEC_GSS context on every major timeout. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8773b4342c92..c6efb982057b 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1173,6 +1173,11 @@ call_timeout(struct rpc_task *task) clnt->cl_protname, clnt->cl_server); } rpc_force_rebind(clnt); + /* + * Did our request time out due to an RPCSEC_GSS out-of-sequence + * event? RFC2203 requires the server to drop all such requests. + */ + rpcauth_invalcred(task); retry: clnt->cl_stats->rpcretrans++;