OpenCloudOS-Kernel/fs/lockd/clntproc.c

885 lines
22 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/fs/lockd/clntproc.c
*
* RPC procedures for the client side NLM implementation
*
* Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
*/
#include <linux/module.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/nfs_fs.h>
#include <linux/utsname.h>
#include <linux/freezer.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
#include "trace.h"
#define NLMDBG_FACILITY NLMDBG_CLIENT
#define NLMCLNT_GRACE_WAIT (5*HZ)
#define NLMCLNT_POLL_TIMEOUT (30*HZ)
#define NLMCLNT_MAX_RETRIES 3
static int nlmclnt_test(struct nlm_rqst *, struct file_lock *);
static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *);
static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *);
static int nlm_stat_to_errno(__be32 stat);
static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host);
static int nlmclnt_cancel(struct nlm_host *, int , struct file_lock *);
static const struct rpc_call_ops nlmclnt_unlock_ops;
static const struct rpc_call_ops nlmclnt_cancel_ops;
/*
* Cookie counter for NLM requests
*/
static atomic_t nlm_cookie = ATOMIC_INIT(0x1234);
void nlmclnt_next_cookie(struct nlm_cookie *c)
{
u32 cookie = atomic_inc_return(&nlm_cookie);
memcpy(c->data, &cookie, 4);
c->len=4;
}
static struct nlm_lockowner *
nlmclnt_get_lockowner(struct nlm_lockowner *lockowner)
{
lockd: convert nlm_lockowner.count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nlm_lockowner.count is used as pure reference counter. Convert it to refcount_t and fix up the operations. **Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the nlm_lockowner.count it might make a difference in following places: - nlm_put_lockowner(): decrement in refcount_dec_and_lock() only provides RELEASE ordering, control dependency on success and holds a spin lock on success vs. fully ordered atomic counterpart. No changes in spin lock guarantees. Suggested-by: Kees Cook <keescook@chromium.org> Reviewed-by: David Windsor <dwindsor@gmail.com> Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
2017-11-29 19:15:45 +08:00
refcount_inc(&lockowner->count);
return lockowner;
}
static void nlmclnt_put_lockowner(struct nlm_lockowner *lockowner)
{
lockd: convert nlm_lockowner.count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nlm_lockowner.count is used as pure reference counter. Convert it to refcount_t and fix up the operations. **Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the nlm_lockowner.count it might make a difference in following places: - nlm_put_lockowner(): decrement in refcount_dec_and_lock() only provides RELEASE ordering, control dependency on success and holds a spin lock on success vs. fully ordered atomic counterpart. No changes in spin lock guarantees. Suggested-by: Kees Cook <keescook@chromium.org> Reviewed-by: David Windsor <dwindsor@gmail.com> Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
2017-11-29 19:15:45 +08:00
if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
return;
list_del(&lockowner->list);
spin_unlock(&lockowner->host->h_lock);
lockd: Create client-side nlm_host cache NFS clients don't need the garbage collection processing that is performed on nlm_host structures. The client picks up an nlm_host at mount time and holds a reference to it until the file system is unmounted. Servers, on the other hand, don't have a precise way to tell when an nlm_host is no longer being used, so zero refcount nlm_host entries are left to expire in the cache after a time. Basically there's nothing holding a reference to an nlm_host between individual server-side NLM requests, but we can't afford the expense of recreating them for every new NLM request from a client. The nlm_host cache adds some lifetime hysteresis to entries in the cache so the next time a particular nlm_host is needed, it's likely to be discovered by a lookup rather than created from whole cloth. With the new implementation, client nlm_host cache items are no longer garbage collected, and are destroyed directly by a new release function specialized for client entries, nlmclnt_release_host(). They are cached in their own data structure, and have their own lookup logic, simplified and specialized for client nlm_host entries. However, the client nlm_host cache still shares reboot recovery logic with the server nlm_host cache. The NSM "peer rebooted" downcall for clients and servers still come through the same RPC call. This is a legacy formal API that would be difficult to alter, and besides, the user space NSM implementation can't tell the difference between peers that are clients or servers. For this reason, the client cache continues to share the nlm_host_mutex (and reboot recovery logic) with the server cache. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
2010-12-14 23:05:52 +08:00
nlmclnt_release_host(lockowner->host);
kfree(lockowner);
}
static inline int nlm_pidbusy(struct nlm_host *host, uint32_t pid)
{
struct nlm_lockowner *lockowner;
list_for_each_entry(lockowner, &host->h_lockowners, list) {
if (lockowner->pid == pid)
return -EBUSY;
}
return 0;
}
static inline uint32_t __nlm_alloc_pid(struct nlm_host *host)
{
uint32_t res;
do {
res = host->h_pidcount++;
} while (nlm_pidbusy(host, res) < 0);
return res;
}
static struct nlm_lockowner *__nlmclnt_find_lockowner(struct nlm_host *host, fl_owner_t owner)
{
struct nlm_lockowner *lockowner;
list_for_each_entry(lockowner, &host->h_lockowners, list) {
if (lockowner->owner != owner)
continue;
return nlmclnt_get_lockowner(lockowner);
}
return NULL;
}
static struct nlm_lockowner *nlmclnt_find_lockowner(struct nlm_host *host, fl_owner_t owner)
{
struct nlm_lockowner *res, *new = NULL;
spin_lock(&host->h_lock);
res = __nlmclnt_find_lockowner(host, owner);
if (res == NULL) {
spin_unlock(&host->h_lock);
new = kmalloc(sizeof(*new), GFP_KERNEL);
spin_lock(&host->h_lock);
res = __nlmclnt_find_lockowner(host, owner);
if (res == NULL && new != NULL) {
res = new;
lockd: convert nlm_lockowner.count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nlm_lockowner.count is used as pure reference counter. Convert it to refcount_t and fix up the operations. **Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the nlm_lockowner.count it might make a difference in following places: - nlm_put_lockowner(): decrement in refcount_dec_and_lock() only provides RELEASE ordering, control dependency on success and holds a spin lock on success vs. fully ordered atomic counterpart. No changes in spin lock guarantees. Suggested-by: Kees Cook <keescook@chromium.org> Reviewed-by: David Windsor <dwindsor@gmail.com> Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
2017-11-29 19:15:45 +08:00
refcount_set(&new->count, 1);
new->owner = owner;
new->pid = __nlm_alloc_pid(host);
new->host = nlm_get_host(host);
list_add(&new->list, &host->h_lockowners);
new = NULL;
}
}
spin_unlock(&host->h_lock);
kfree(new);
return res;
}
/*
* Initialize arguments for TEST/LOCK/UNLOCK/CANCEL calls
*/
static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
{
struct nlm_args *argp = &req->a_args;
struct nlm_lock *lock = &argp->lock;
char *nodename = req->a_host->h_rpcclnt->cl_nodename;
nlmclnt_next_cookie(&argp->cookie);
memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh));
lock->caller = nodename;
lock->oh.data = req->a_owner;
lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
(unsigned int)fl->fl_u.nfs_fl.owner->pid,
nodename);
lock->svid = fl->fl_u.nfs_fl.owner->pid;
lock->fl.fl_start = fl->fl_start;
lock->fl.fl_end = fl->fl_end;
lock->fl.fl_type = fl->fl_type;
}
static void nlmclnt_release_lockargs(struct nlm_rqst *req)
{
WARN_ON_ONCE(req->a_args.lock.fl.fl_ops != NULL);
}
/**
* nlmclnt_proc - Perform a single client-side lock request
* @host: address of a valid nlm_host context representing the NLM server
* @cmd: fcntl-style file lock operation to perform
* @fl: address of arguments for the lock operation
* @data: address of data to be sent to callback operations
*
*/
int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data)
{
struct nlm_rqst *call;
int status;
const struct nlmclnt_operations *nlmclnt_ops = host->h_nlmclnt_ops;
call = nlm_alloc_call(host);
if (call == NULL)
return -ENOMEM;
if (nlmclnt_ops && nlmclnt_ops->nlmclnt_alloc_call)
nlmclnt_ops->nlmclnt_alloc_call(data);
nlmclnt_locks_init_private(fl, host);
if (!fl->fl_u.nfs_fl.owner) {
/* lockowner allocation has failed */
nlmclnt_release_call(call);
return -ENOMEM;
}
/* Set up the argument struct */
nlmclnt_setlockargs(call, fl);
call->a_callback_data = data;
if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
if (fl->fl_type != F_UNLCK) {
call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
status = nlmclnt_lock(call, fl);
} else
status = nlmclnt_unlock(call, fl);
} else if (IS_GETLK(cmd))
status = nlmclnt_test(call, fl);
else
status = -EINVAL;
fl->fl_ops->fl_release_private(fl);
fl->fl_ops = NULL;
dprintk("lockd: clnt proc returns %d\n", status);
return status;
}
EXPORT_SYMBOL_GPL(nlmclnt_proc);
/*
* Allocate an NLM RPC call struct
*/
struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
{
struct nlm_rqst *call;
for(;;) {
call = kzalloc(sizeof(*call), GFP_KERNEL);
if (call != NULL) {
lockd: convert nlm_rqst.a_count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nlm_rqst.a_count is used as pure reference counter. Convert it to refcount_t and fix up the operations. **Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the nlm_rqst.a_count it might make a difference in following places: - nlmclnt_release_call() and nlmsvc_release_call(): decrement in refcount_dec_and_test() only provides RELEASE ordering and control dependency on success vs. fully ordered atomic counterpart Suggested-by: Kees Cook <keescook@chromium.org> Reviewed-by: David Windsor <dwindsor@gmail.com> Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
2017-11-29 19:15:46 +08:00
refcount_set(&call->a_count, 1);
locks_init_lock(&call->a_args.lock.fl);
locks_init_lock(&call->a_res.lock.fl);
call->a_host = nlm_get_host(host);
return call;
}
if (signalled())
break;
printk("nlm_alloc_call: failed, waiting for memory\n");
schedule_timeout_interruptible(5*HZ);
}
return NULL;
}
void nlmclnt_release_call(struct nlm_rqst *call)
{
const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops;
lockd: convert nlm_rqst.a_count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nlm_rqst.a_count is used as pure reference counter. Convert it to refcount_t and fix up the operations. **Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the nlm_rqst.a_count it might make a difference in following places: - nlmclnt_release_call() and nlmsvc_release_call(): decrement in refcount_dec_and_test() only provides RELEASE ordering and control dependency on success vs. fully ordered atomic counterpart Suggested-by: Kees Cook <keescook@chromium.org> Reviewed-by: David Windsor <dwindsor@gmail.com> Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
2017-11-29 19:15:46 +08:00
if (!refcount_dec_and_test(&call->a_count))
return;
if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call)
nlmclnt_ops->nlmclnt_release_call(call->a_callback_data);
lockd: Create client-side nlm_host cache NFS clients don't need the garbage collection processing that is performed on nlm_host structures. The client picks up an nlm_host at mount time and holds a reference to it until the file system is unmounted. Servers, on the other hand, don't have a precise way to tell when an nlm_host is no longer being used, so zero refcount nlm_host entries are left to expire in the cache after a time. Basically there's nothing holding a reference to an nlm_host between individual server-side NLM requests, but we can't afford the expense of recreating them for every new NLM request from a client. The nlm_host cache adds some lifetime hysteresis to entries in the cache so the next time a particular nlm_host is needed, it's likely to be discovered by a lookup rather than created from whole cloth. With the new implementation, client nlm_host cache items are no longer garbage collected, and are destroyed directly by a new release function specialized for client entries, nlmclnt_release_host(). They are cached in their own data structure, and have their own lookup logic, simplified and specialized for client nlm_host entries. However, the client nlm_host cache still shares reboot recovery logic with the server nlm_host cache. The NSM "peer rebooted" downcall for clients and servers still come through the same RPC call. This is a legacy formal API that would be difficult to alter, and besides, the user space NSM implementation can't tell the difference between peers that are clients or servers. For this reason, the client cache continues to share the nlm_host_mutex (and reboot recovery logic) with the server cache. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
2010-12-14 23:05:52 +08:00
nlmclnt_release_host(call->a_host);
nlmclnt_release_lockargs(call);
kfree(call);
}
static void nlmclnt_rpc_release(void *data)
{
nlmclnt_release_call(data);
}
static int nlm_wait_on_grace(wait_queue_head_t *queue)
{
DEFINE_WAIT(wait);
int status = -EINTR;
prepare_to_wait(queue, &wait, TASK_INTERRUPTIBLE);
if (!signalled ()) {
schedule_timeout(NLMCLNT_GRACE_WAIT);
try_to_freeze();
if (!signalled ())
status = 0;
}
finish_wait(queue, &wait);
return status;
}
/*
* Generic NLM call
*/
static int
nlmclnt_call(const struct cred *cred, struct nlm_rqst *req, u32 proc)
{
struct nlm_host *host = req->a_host;
struct rpc_clnt *clnt;
struct nlm_args *argp = &req->a_args;
struct nlm_res *resp = &req->a_res;
struct rpc_message msg = {
.rpc_argp = argp,
.rpc_resp = resp,
.rpc_cred = cred,
};
int status;
dprintk("lockd: call procedure %d on %s\n",
(int)proc, host->h_name);
do {
if (host->h_reclaiming && !argp->reclaim)
goto in_grace_period;
/* If we have no RPC client yet, create one. */
if ((clnt = nlm_bind_host(host)) == NULL)
return -ENOLCK;
msg.rpc_proc = &clnt->cl_procinfo[proc];
/* Perform the RPC call. If an error occurs, try again */
if ((status = rpc_call_sync(clnt, &msg, 0)) < 0) {
dprintk("lockd: rpc_call returned error %d\n", -status);
switch (status) {
case -EPROTONOSUPPORT:
status = -EINVAL;
break;
case -ECONNREFUSED:
case -ETIMEDOUT:
case -ENOTCONN:
nlm_rebind_host(host);
status = -EAGAIN;
break;
case -ERESTARTSYS:
return signalled () ? -EINTR : status;
default:
break;
}
break;
} else
if (resp->status == nlm_lck_denied_grace_period) {
dprintk("lockd: server in grace period\n");
if (argp->reclaim) {
printk(KERN_WARNING
"lockd: spurious grace period reject?!\n");
return -ENOLCK;
}
} else {
if (!argp->reclaim) {
/* We appear to be out of the grace period */
wake_up_all(&host->h_gracewait);
}
dprintk("lockd: server returns status %d\n",
ntohl(resp->status));
return 0; /* Okay, call complete */
}
in_grace_period:
/*
* The server has rebooted and appears to be in the grace
* period during which locks are only allowed to be
* reclaimed.
* We can only back off and try again later.
*/
status = nlm_wait_on_grace(&host->h_gracewait);
} while (status == 0);
return status;
}
/*
* Generic NLM call, async version.
*/
static struct rpc_task *__nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops)
{
struct nlm_host *host = req->a_host;
struct rpc_clnt *clnt;
struct rpc_task_setup task_setup_data = {
.rpc_message = msg,
.callback_ops = tk_ops,
.callback_data = req,
.flags = RPC_TASK_ASYNC,
};
dprintk("lockd: call procedure %d on %s (async)\n",
(int)proc, host->h_name);
/* If we have no RPC client yet, create one. */
clnt = nlm_bind_host(host);
if (clnt == NULL)
goto out_err;
msg->rpc_proc = &clnt->cl_procinfo[proc];
task_setup_data.rpc_client = clnt;
/* bootstrap and kick off the async RPC call */
return rpc_run_task(&task_setup_data);
out_err:
tk_ops->rpc_release(req);
return ERR_PTR(-ENOLCK);
}
static int nlm_do_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops)
{
struct rpc_task *task;
task = __nlm_async_call(req, proc, msg, tk_ops);
if (IS_ERR(task))
return PTR_ERR(task);
rpc_put_task(task);
return 0;
}
/*
* NLM asynchronous call.
*/
int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
{
struct rpc_message msg = {
.rpc_argp = &req->a_args,
.rpc_resp = &req->a_res,
};
return nlm_do_async_call(req, proc, &msg, tk_ops);
}
int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
{
struct rpc_message msg = {
.rpc_argp = &req->a_res,
};
return nlm_do_async_call(req, proc, &msg, tk_ops);
}
/*
* NLM client asynchronous call.
*
* Note that although the calls are asynchronous, and are therefore
* guaranteed to complete, we still always attempt to wait for
* completion in order to be able to correctly track the lock
* state.
*/
static int nlmclnt_async_call(const struct cred *cred, struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
{
struct rpc_message msg = {
.rpc_argp = &req->a_args,
.rpc_resp = &req->a_res,
.rpc_cred = cred,
};
struct rpc_task *task;
int err;
task = __nlm_async_call(req, proc, &msg, tk_ops);
if (IS_ERR(task))
return PTR_ERR(task);
err = rpc_wait_for_completion_task(task);
rpc_put_task(task);
return err;
}
/*
* TEST for the presence of a conflicting lock
*/
static int
nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
{
int status;
status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_TEST);
if (status < 0)
goto out;
switch (req->a_res.status) {
case nlm_granted:
fl->fl_type = F_UNLCK;
break;
case nlm_lck_denied:
/*
* Report the conflicting lock back to the application.
*/
fl->fl_start = req->a_res.lock.fl.fl_start;
fl->fl_end = req->a_res.lock.fl.fl_end;
fl->fl_type = req->a_res.lock.fl.fl_type;
fl->fl_pid = -req->a_res.lock.fl.fl_pid;
break;
default:
status = nlm_stat_to_errno(req->a_res.status);
}
out:
trace_nlmclnt_test(&req->a_args.lock,
(const struct sockaddr *)&req->a_host->h_addr,
req->a_host->h_addrlen, req->a_res.status);
nlmclnt_release_call(req);
return status;
}
static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state;
new->fl_u.nfs_fl.owner = nlmclnt_get_lockowner(fl->fl_u.nfs_fl.owner);
list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted);
spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
}
static void nlmclnt_locks_release_private(struct file_lock *fl)
{
spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
list_del(&fl->fl_u.nfs_fl.list);
spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
nlmclnt_put_lockowner(fl->fl_u.nfs_fl.owner);
}
static const struct file_lock_operations nlmclnt_lock_ops = {
.fl_copy_lock = nlmclnt_locks_copy_lock,
.fl_release_private = nlmclnt_locks_release_private,
};
static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host)
{
fl->fl_u.nfs_fl.state = 0;
fl->fl_u.nfs_fl.owner = nlmclnt_find_lockowner(host, fl->fl_owner);
INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list);
fl->fl_ops = &nlmclnt_lock_ops;
}
static int do_vfs_lock(struct file_lock *fl)
{
return locks_lock_file_wait(fl->fl_file, fl);
}
/*
* LOCK: Try to create a lock
*
* Programmer Harassment Alert
*
* When given a blocking lock request in a sync RPC call, the HPUX lockd
* will faithfully return LCK_BLOCKED but never cares to notify us when
* the lock could be granted. This way, our local process could hang
* around forever waiting for the callback.
*
* Solution A: Implement busy-waiting
* Solution B: Use the async version of the call (NLM_LOCK_{MSG,RES})
*
* For now I am implementing solution A, because I hate the idea of
* re-implementing lockd for a third time in two months. The async
* calls shouldn't be too hard to do, however.
*
* This is one of the lovely things about standards in the NFS area:
* they're so soft and squishy you can't really blame HP for doing this.
*/
static int
nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
{
const struct cred *cred = nfs_file_cred(fl->fl_file);
struct nlm_host *host = req->a_host;
struct nlm_res *resp = &req->a_res;
struct nlm_wait block;
unsigned char fl_flags = fl->fl_flags;
unsigned char fl_type;
__be32 b_status;
int status = -ENOLCK;
if (nsm_monitor(host) < 0)
goto out;
lockd: Update NSM state from SM_MON replies When rpc.statd starts up in user space at boot time, it attempts to write the latest NSM local state number into /proc/sys/fs/nfs/nsm_local_state. If lockd.ko isn't loaded yet (as is the case in most configurations), that file doesn't exist, thus the kernel's NSM state remains set to its initial value of zero during lockd operation. This is a problem because rpc.statd and lockd use the NSM state number to prevent repeated lock recovery on rebooted hosts. If lockd sends a zero NSM state, but then a delayed SM_NOTIFY with a real NSM state number is received, there is no way for lockd or rpc.statd to distinguish that stale SM_NOTIFY from an actual reboot. Thus lock recovery could be performed after the rebooted host has already started reclaiming locks, and those locks will be lost. We could change /etc/init.d/nfslock so it always modprobes lockd.ko before starting rpc.statd. However, if lockd.ko is ever unloaded and reloaded, we are back at square one, since the NSM state is not preserved across an unload/reload cycle. This may happen frequently on clients that use automounter. A period of NFS inactivity causes lockd.ko to be unloaded, and the kernel loses its NSM state setting. Instead, let's use the fact that rpc.statd plants the local system's NSM state in every SM_MON (and SM_UNMON) reply. lockd performs a synchronous SM_MON upcall to the local rpc.statd _before_ sending its first NLM request to a new remote. This would permit rpc.statd to provide the current NSM state to lockd, even after lockd.ko had been unloaded and reloaded. Note that NLMPROC_LOCK arguments are constructed before the nsm_monitor() call, so we have to rearrange argument construction very slightly to make this all work out. And, the kernel appears to treat NSM state as a u32 (see struct nlm_args and nsm_res). Make nsm_local_state a u32 as well, to ensure we don't get bogus comparison results. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
2009-06-18 09:02:10 +08:00
req->a_args.state = nsm_local_state;
fl->fl_flags |= FL_ACCESS;
status = do_vfs_lock(fl);
fl->fl_flags = fl_flags;
if (status < 0)
goto out;
nlmclnt_prepare_block(&block, host, fl);
again:
/*
* Initialise resp->status to a valid non-zero value,
* since 0 == nlm_lck_granted
*/
resp->status = nlm_lck_blocked;
/*
* A GRANTED callback can come at any time -- even before the reply
* to the LOCK request arrives, so we queue the wait before
* requesting the lock.
*/
nlmclnt_queue_block(&block);
for (;;) {
/* Reboot protection */
fl->fl_u.nfs_fl.state = host->h_state;
status = nlmclnt_call(cred, req, NLMPROC_LOCK);
if (status < 0)
break;
/* Did a reclaimer thread notify us of a server reboot? */
if (resp->status == nlm_lck_denied_grace_period)
continue;
if (resp->status != nlm_lck_blocked)
break;
/* Wait on an NLM blocking lock */
status = nlmclnt_wait(&block, req, NLMCLNT_POLL_TIMEOUT);
if (status < 0)
break;
if (block.b_status != nlm_lck_blocked)
break;
}
b_status = nlmclnt_dequeue_block(&block);
if (resp->status == nlm_lck_blocked)
resp->status = b_status;
/* if we were interrupted while blocking, then cancel the lock request
* and exit
*/
if (resp->status == nlm_lck_blocked) {
if (!req->a_args.block)
goto out_unlock;
if (nlmclnt_cancel(host, req->a_args.block, fl) == 0)
goto out;
}
if (resp->status == nlm_granted) {
down_read(&host->h_rwsem);
/* Check whether or not the server has rebooted */
if (fl->fl_u.nfs_fl.state != host->h_state) {
up_read(&host->h_rwsem);
goto again;
}
/* Ensure the resulting lock will get added to granted list */
fl->fl_flags |= FL_SLEEP;
if (do_vfs_lock(fl) < 0)
printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
up_read(&host->h_rwsem);
fl->fl_flags = fl_flags;
status = 0;
}
if (status < 0)
goto out_unlock;
/*
* EAGAIN doesn't make sense for sleeping locks, and in some
* cases NLM_LCK_DENIED is returned for a permanent error. So
* turn it into an ENOLCK.
*/
if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP))
status = -ENOLCK;
else
status = nlm_stat_to_errno(resp->status);
out:
trace_nlmclnt_lock(&req->a_args.lock,
(const struct sockaddr *)&req->a_host->h_addr,
req->a_host->h_addrlen, req->a_res.status);
nlmclnt_release_call(req);
return status;
out_unlock:
/* Fatal error: ensure that we remove the lock altogether */
trace_nlmclnt_lock(&req->a_args.lock,
(const struct sockaddr *)&req->a_host->h_addr,
req->a_host->h_addrlen, req->a_res.status);
dprintk("lockd: lock attempt ended in fatal error.\n"
" Attempting to unlock.\n");
fl_type = fl->fl_type;
fl->fl_type = F_UNLCK;
down_read(&host->h_rwsem);
do_vfs_lock(fl);
up_read(&host->h_rwsem);
fl->fl_type = fl_type;
fl->fl_flags = fl_flags;
nlmclnt_async_call(cred, req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
return status;
}
/*
* RECLAIM: Try to reclaim a lock
*/
int
nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl,
struct nlm_rqst *req)
{
int status;
memset(req, 0, sizeof(*req));
locks_init_lock(&req->a_args.lock.fl);
locks_init_lock(&req->a_res.lock.fl);
req->a_host = host;
/* Set up the argument struct */
nlmclnt_setlockargs(req, fl);
req->a_args.reclaim = 1;
status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_LOCK);
if (status >= 0 && req->a_res.status == nlm_granted)
return 0;
printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d "
"(errno %d, status %d)\n", fl->fl_pid,
status, ntohl(req->a_res.status));
/*
* FIXME: This is a serious failure. We can
*
* a. Ignore the problem
* b. Send the owning process some signal (Linux doesn't have
* SIGLOST, though...)
* c. Retry the operation
*
* Until someone comes up with a simple implementation
* for b or c, I'll choose option a.
*/
return -ENOLCK;
}
/*
* UNLOCK: remove an existing lock
*/
static int
nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
{
struct nlm_host *host = req->a_host;
struct nlm_res *resp = &req->a_res;
int status;
unsigned char fl_flags = fl->fl_flags;
/*
* Note: the server is supposed to either grant us the unlock
* request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
* case, we want to unlock.
*/
fl->fl_flags |= FL_EXISTS;
down_read(&host->h_rwsem);
status = do_vfs_lock(fl);
up_read(&host->h_rwsem);
fl->fl_flags = fl_flags;
if (status == -ENOENT) {
status = 0;
goto out;
}
lockd: convert nlm_rqst.a_count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nlm_rqst.a_count is used as pure reference counter. Convert it to refcount_t and fix up the operations. **Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the nlm_rqst.a_count it might make a difference in following places: - nlmclnt_release_call() and nlmsvc_release_call(): decrement in refcount_dec_and_test() only provides RELEASE ordering and control dependency on success vs. fully ordered atomic counterpart Suggested-by: Kees Cook <keescook@chromium.org> Reviewed-by: David Windsor <dwindsor@gmail.com> Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
2017-11-29 19:15:46 +08:00
refcount_inc(&req->a_count);
status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
if (status < 0)
goto out;
if (resp->status == nlm_granted)
goto out;
if (resp->status != nlm_lck_denied_nolocks)
printk("lockd: unexpected unlock status: %d\n",
ntohl(resp->status));
/* What to do now? I'm out of my depth... */
status = -ENOLCK;
out:
trace_nlmclnt_unlock(&req->a_args.lock,
(const struct sockaddr *)&req->a_host->h_addr,
req->a_host->h_addrlen, req->a_res.status);
nlmclnt_release_call(req);
return status;
}
static void nlmclnt_unlock_prepare(struct rpc_task *task, void *data)
{
struct nlm_rqst *req = data;
const struct nlmclnt_operations *nlmclnt_ops = req->a_host->h_nlmclnt_ops;
bool defer_call = false;
if (nlmclnt_ops && nlmclnt_ops->nlmclnt_unlock_prepare)
defer_call = nlmclnt_ops->nlmclnt_unlock_prepare(task, req->a_callback_data);
if (!defer_call)
rpc_call_start(task);
}
static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
{
struct nlm_rqst *req = data;
u32 status = ntohl(req->a_res.status);
if (RPC_SIGNALLED(task))
goto die;
if (task->tk_status < 0) {
dprintk("lockd: unlock failed (err = %d)\n", -task->tk_status);
switch (task->tk_status) {
case -EACCES:
case -EIO:
goto die;
default:
goto retry_rebind;
}
}
if (status == NLM_LCK_DENIED_GRACE_PERIOD) {
rpc_delay(task, NLMCLNT_GRACE_WAIT);
goto retry_unlock;
}
if (status != NLM_LCK_GRANTED)
printk(KERN_WARNING "lockd: unexpected unlock status: %d\n", status);
die:
return;
retry_rebind:
nlm_rebind_host(req->a_host);
retry_unlock:
rpc_restart_call(task);
}
static const struct rpc_call_ops nlmclnt_unlock_ops = {
.rpc_call_prepare = nlmclnt_unlock_prepare,
.rpc_call_done = nlmclnt_unlock_callback,
.rpc_release = nlmclnt_rpc_release,
};
/*
* Cancel a blocked lock request.
* We always use an async RPC call for this in order not to hang a
* process that has been Ctrl-C'ed.
*/
static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl)
{
struct nlm_rqst *req;
int status;
dprintk("lockd: blocking lock attempt was interrupted by a signal.\n"
" Attempting to cancel lock.\n");
req = nlm_alloc_call(host);
if (!req)
return -ENOMEM;
req->a_flags = RPC_TASK_ASYNC;
nlmclnt_setlockargs(req, fl);
req->a_args.block = block;
lockd: convert nlm_rqst.a_count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nlm_rqst.a_count is used as pure reference counter. Convert it to refcount_t and fix up the operations. **Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. The full comparison can be seen in https://lkml.org/lkml/2017/11/15/57 and it is hopefully soon in state to be merged to the documentation tree. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the nlm_rqst.a_count it might make a difference in following places: - nlmclnt_release_call() and nlmsvc_release_call(): decrement in refcount_dec_and_test() only provides RELEASE ordering and control dependency on success vs. fully ordered atomic counterpart Suggested-by: Kees Cook <keescook@chromium.org> Reviewed-by: David Windsor <dwindsor@gmail.com> Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
2017-11-29 19:15:46 +08:00
refcount_inc(&req->a_count);
status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
NLMPROC_CANCEL, &nlmclnt_cancel_ops);
if (status == 0 && req->a_res.status == nlm_lck_denied)
status = -ENOLCK;
nlmclnt_release_call(req);
return status;
}
static void nlmclnt_cancel_callback(struct rpc_task *task, void *data)
{
struct nlm_rqst *req = data;
u32 status = ntohl(req->a_res.status);
if (RPC_SIGNALLED(task))
goto die;
if (task->tk_status < 0) {
dprintk("lockd: CANCEL call error %d, retrying.\n",
task->tk_status);
goto retry_cancel;
}
switch (status) {
case NLM_LCK_GRANTED:
case NLM_LCK_DENIED_GRACE_PERIOD:
case NLM_LCK_DENIED:
/* Everything's good */
break;
case NLM_LCK_DENIED_NOLOCKS:
dprintk("lockd: CANCEL failed (server has no locks)\n");
goto retry_cancel;
default:
printk(KERN_NOTICE "lockd: weird return %d for CANCEL call\n",
status);
}
die:
return;
retry_cancel:
/* Don't ever retry more than 3 times */
if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
goto die;
nlm_rebind_host(req->a_host);
rpc_restart_call(task);
rpc_delay(task, 30 * HZ);
}
static const struct rpc_call_ops nlmclnt_cancel_ops = {
.rpc_call_done = nlmclnt_cancel_callback,
.rpc_release = nlmclnt_rpc_release,
};
/*
* Convert an NLM status code to a generic kernel errno
*/
static int
nlm_stat_to_errno(__be32 status)
{
switch(ntohl(status)) {
case NLM_LCK_GRANTED:
return 0;
case NLM_LCK_DENIED:
return -EAGAIN;
case NLM_LCK_DENIED_NOLOCKS:
case NLM_LCK_DENIED_GRACE_PERIOD:
return -ENOLCK;
case NLM_LCK_BLOCKED:
printk(KERN_NOTICE "lockd: unexpected status NLM_BLOCKED\n");
return -ENOLCK;
#ifdef CONFIG_LOCKD_V4
case NLM_DEADLCK:
return -EDEADLK;
case NLM_ROFS:
return -EROFS;
case NLM_STALE_FH:
return -ESTALE;
case NLM_FBIG:
return -EOVERFLOW;
case NLM_FAILED:
return -ENOLCK;
#endif
}
printk(KERN_NOTICE "lockd: unexpected server status %d\n",
ntohl(status));
return -ENOLCK;
}