Another RDMA update from Chuck Lever, and a bunch of miscellaneous
bugfixes. -----BEGIN PGP SIGNATURE----- iQIcBAABAgAGBQJZE2UeAAoJECebzXlCjuG+St8P/0vG+ps9sY012E6Wh9gy4Ev4 BtxG/c3CtcxrbNzW+cFhdEloBGtC0VvcrKNCozJTK4LdaPYErkyRBpjgXvIggT9I GWY4ftpH3eJ6uByN9Okgc3/1la2poDflJO/nYhdRed3YHOnXTtx/746tu1xAnVCV tFtDGrbJZTprt5c3zETtdtquCUSy2aMT5ZbrdU3yWBCwQMNSIufN3an8epfB++xx Ct+G0HTRffcWAdYuLT0N1HKqm8pkncdNMFpm7mVw0hMCRy552G3fuj8LtkhVTvKE 1KN3zXY4jhaUYWD5Yt6AJcpLEro65b8swYk4e9FP2TNUpCmuRdXT9cb9vE8YztxC 8s4N23RHaEx9I6pC3OU64a2HfhiQM/oOIvjlhTBsjojXsQcqZFD1vsoSYA8Byl0w m9EQWqPqge4m6yEYl7uAyL6xSthbrhcU1Ks5jvNXGcWzEQj7BATnynJANsfZ+y6r ZoVcsRNX49m1BG+p9br+9DFffPiNFUMqxbfr73L9HRep3OsPeFKazFG0bKd3hOqA E6L/AnBd9soSqTuTvbisWrGWbomhtd5G/fAa1uHrWTPHMXUWCmkguiau51FNfcHu xcJlBBVCvUmmd5u3wF6QeiyjPs4KEBzQzsOUsWKHRxDBp6s+5PX/lHuXRBlDP+fN TQq0KbvBtea1OyMaRtoV =Rtl/ -----END PGP SIGNATURE----- Merge tag 'nfsd-4.12' of git://linux-nfs.org/~bfields/linux Pull nfsd updates from Bruce Fields: "Another RDMA update from Chuck Lever, and a bunch of miscellaneous bugfixes" * tag 'nfsd-4.12' of git://linux-nfs.org/~bfields/linux: (26 commits) nfsd: Fix up the "supattr_exclcreat" attributes nfsd: encoders mustn't use unitialized values in error cases nfsd: fix undefined behavior in nfsd4_layout_verify lockd: fix lockd shutdown race NFSv4: Fix callback server shutdown SUNRPC: Refactor svc_set_num_threads() NFSv4.x/callback: Create the callback service through svc_create_pooled lockd: remove redundant check on block svcrdma: Clean out old XDR encoders svcrdma: Remove the req_map cache svcrdma: Remove unused RDMA Write completion handler svcrdma: Reduce size of sge array in struct svc_rdma_op_ctxt svcrdma: Clean up RPC-over-RDMA backchannel reply processing svcrdma: Report Write/Reply chunk overruns svcrdma: Clean up RDMA_ERROR path svcrdma: Use rdma_rw API in RPC reply path svcrdma: Introduce local rdma_rw API helpers svcrdma: Clean up svc_rdma_get_inv_rkey() svcrdma: Add helper to save pages under I/O svcrdma: Eliminate RPCRDMA_SQ_DEPTH_MULT ...
This commit is contained in:
commit
c70422f760
|
@ -132,6 +132,8 @@ lockd(void *vrqstp)
|
|||
{
|
||||
int err = 0;
|
||||
struct svc_rqst *rqstp = vrqstp;
|
||||
struct net *net = &init_net;
|
||||
struct lockd_net *ln = net_generic(net, lockd_net_id);
|
||||
|
||||
/* try_to_freeze() is called from svc_recv() */
|
||||
set_freezable();
|
||||
|
@ -176,6 +178,8 @@ lockd(void *vrqstp)
|
|||
if (nlmsvc_ops)
|
||||
nlmsvc_invalidate_all();
|
||||
nlm_shutdown_hosts();
|
||||
cancel_delayed_work_sync(&ln->grace_period_end);
|
||||
locks_end_grace(&ln->lockd_manager);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -270,8 +274,6 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
|
|||
if (ln->nlmsvc_users) {
|
||||
if (--ln->nlmsvc_users == 0) {
|
||||
nlm_shutdown_hosts_net(net);
|
||||
cancel_delayed_work_sync(&ln->grace_period_end);
|
||||
locks_end_grace(&ln->lockd_manager);
|
||||
svc_shutdown_net(serv, net);
|
||||
dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
|
||||
}
|
||||
|
|
|
@ -870,16 +870,16 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status)
|
|||
if (!(block = nlmsvc_find_block(cookie)))
|
||||
return;
|
||||
|
||||
if (block) {
|
||||
if (status == nlm_lck_denied_grace_period) {
|
||||
/* Try again in a couple of seconds */
|
||||
nlmsvc_insert_block(block, 10 * HZ);
|
||||
} else {
|
||||
/* Lock is now held by client, or has been rejected.
|
||||
* In both cases, the block should be removed. */
|
||||
/*
|
||||
* Lock is now held by client, or has been rejected.
|
||||
* In both cases, the block should be removed.
|
||||
*/
|
||||
nlmsvc_unlink_block(block);
|
||||
}
|
||||
}
|
||||
nlmsvc_release_block(block);
|
||||
}
|
||||
|
||||
|
|
|
@ -76,7 +76,10 @@ nfs4_callback_svc(void *vrqstp)
|
|||
|
||||
set_freezable();
|
||||
|
||||
while (!kthread_should_stop()) {
|
||||
while (!kthread_freezable_should_stop(NULL)) {
|
||||
|
||||
if (signal_pending(current))
|
||||
flush_signals(current);
|
||||
/*
|
||||
* Listen for a request on the socket
|
||||
*/
|
||||
|
@ -85,6 +88,8 @@ nfs4_callback_svc(void *vrqstp)
|
|||
continue;
|
||||
svc_process(rqstp);
|
||||
}
|
||||
svc_exit_thread(rqstp);
|
||||
module_put_and_exit(0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -103,9 +108,10 @@ nfs41_callback_svc(void *vrqstp)
|
|||
|
||||
set_freezable();
|
||||
|
||||
while (!kthread_should_stop()) {
|
||||
if (try_to_freeze())
|
||||
continue;
|
||||
while (!kthread_freezable_should_stop(NULL)) {
|
||||
|
||||
if (signal_pending(current))
|
||||
flush_signals(current);
|
||||
|
||||
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
|
||||
spin_lock_bh(&serv->sv_cb_lock);
|
||||
|
@ -121,11 +127,13 @@ nfs41_callback_svc(void *vrqstp)
|
|||
error);
|
||||
} else {
|
||||
spin_unlock_bh(&serv->sv_cb_lock);
|
||||
if (!kthread_should_stop())
|
||||
schedule();
|
||||
finish_wait(&serv->sv_cb_waitq, &wq);
|
||||
}
|
||||
flush_signals(current);
|
||||
}
|
||||
svc_exit_thread(rqstp);
|
||||
module_put_and_exit(0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -221,14 +229,14 @@ err_bind:
|
|||
static struct svc_serv_ops nfs40_cb_sv_ops = {
|
||||
.svo_function = nfs4_callback_svc,
|
||||
.svo_enqueue_xprt = svc_xprt_do_enqueue,
|
||||
.svo_setup = svc_set_num_threads,
|
||||
.svo_setup = svc_set_num_threads_sync,
|
||||
.svo_module = THIS_MODULE,
|
||||
};
|
||||
#if defined(CONFIG_NFS_V4_1)
|
||||
static struct svc_serv_ops nfs41_cb_sv_ops = {
|
||||
.svo_function = nfs41_callback_svc,
|
||||
.svo_enqueue_xprt = svc_xprt_do_enqueue,
|
||||
.svo_setup = svc_set_num_threads,
|
||||
.svo_setup = svc_set_num_threads_sync,
|
||||
.svo_module = THIS_MODULE,
|
||||
};
|
||||
|
||||
|
@ -280,7 +288,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
|
|||
printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
|
||||
cb_info->users);
|
||||
|
||||
serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
|
||||
serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
|
||||
if (!serv) {
|
||||
printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
|
|
@ -334,8 +334,11 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
|
|||
if (!p)
|
||||
return 0;
|
||||
p = xdr_decode_hyper(p, &args->offset);
|
||||
|
||||
args->count = ntohl(*p++);
|
||||
|
||||
if (!xdr_argsize_check(rqstp, p))
|
||||
return 0;
|
||||
|
||||
len = min(args->count, max_blocksize);
|
||||
|
||||
/* set up the kvec */
|
||||
|
@ -349,7 +352,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
|
|||
v++;
|
||||
}
|
||||
args->vlen = v;
|
||||
return xdr_argsize_check(rqstp, p);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -541,9 +544,11 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
|
|||
p = decode_fh(p, &args->fh);
|
||||
if (!p)
|
||||
return 0;
|
||||
if (!xdr_argsize_check(rqstp, p))
|
||||
return 0;
|
||||
args->buffer = page_address(*(rqstp->rq_next_page++));
|
||||
|
||||
return xdr_argsize_check(rqstp, p);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -569,10 +574,14 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
|
|||
args->verf = p; p += 2;
|
||||
args->dircount = ~0;
|
||||
args->count = ntohl(*p++);
|
||||
|
||||
if (!xdr_argsize_check(rqstp, p))
|
||||
return 0;
|
||||
|
||||
args->count = min_t(u32, args->count, PAGE_SIZE);
|
||||
args->buffer = page_address(*(rqstp->rq_next_page++));
|
||||
|
||||
return xdr_argsize_check(rqstp, p);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -590,6 +599,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
|
|||
args->dircount = ntohl(*p++);
|
||||
args->count = ntohl(*p++);
|
||||
|
||||
if (!xdr_argsize_check(rqstp, p))
|
||||
return 0;
|
||||
|
||||
len = args->count = min(args->count, max_blocksize);
|
||||
while (len > 0) {
|
||||
struct page *p = *(rqstp->rq_next_page++);
|
||||
|
@ -597,8 +609,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
|
|||
args->buffer = page_address(p);
|
||||
len -= PAGE_SIZE;
|
||||
}
|
||||
|
||||
return xdr_argsize_check(rqstp, p);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int
|
||||
|
|
|
@ -1259,7 +1259,8 @@ nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (!(exp->ex_layout_types & (1 << layout_type))) {
|
||||
if (layout_type >= LAYOUT_TYPE_MAX ||
|
||||
!(exp->ex_layout_types & (1 << layout_type))) {
|
||||
dprintk("%s: layout type %d not supported\n",
|
||||
__func__, layout_type);
|
||||
return NULL;
|
||||
|
|
|
@ -1912,28 +1912,15 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
|
|||
target->cl_clientid.cl_id = source->cl_clientid.cl_id;
|
||||
}
|
||||
|
||||
int strdup_if_nonnull(char **target, char *source)
|
||||
{
|
||||
if (source) {
|
||||
*target = kstrdup(source, GFP_KERNEL);
|
||||
if (!*target)
|
||||
return -ENOMEM;
|
||||
} else
|
||||
*target = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int copy_cred(struct svc_cred *target, struct svc_cred *source)
|
||||
{
|
||||
int ret;
|
||||
target->cr_principal = kstrdup(source->cr_principal, GFP_KERNEL);
|
||||
target->cr_raw_principal = kstrdup(source->cr_raw_principal,
|
||||
GFP_KERNEL);
|
||||
if ((source->cr_principal && ! target->cr_principal) ||
|
||||
(source->cr_raw_principal && ! target->cr_raw_principal))
|
||||
return -ENOMEM;
|
||||
|
||||
ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal);
|
||||
if (ret)
|
||||
return ret;
|
||||
ret = strdup_if_nonnull(&target->cr_raw_principal,
|
||||
source->cr_raw_principal);
|
||||
if (ret)
|
||||
return ret;
|
||||
target->cr_flavor = source->cr_flavor;
|
||||
target->cr_uid = source->cr_uid;
|
||||
target->cr_gid = source->cr_gid;
|
||||
|
|
|
@ -2831,9 +2831,14 @@ out_acl:
|
|||
}
|
||||
#endif /* CONFIG_NFSD_PNFS */
|
||||
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
|
||||
status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0,
|
||||
NFSD_SUPPATTR_EXCLCREAT_WORD1,
|
||||
NFSD_SUPPATTR_EXCLCREAT_WORD2);
|
||||
u32 supp[3];
|
||||
|
||||
memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
|
||||
supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
|
||||
supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
|
||||
supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
|
||||
|
||||
status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]);
|
||||
if (status)
|
||||
goto out;
|
||||
}
|
||||
|
@ -4119,8 +4124,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
|
|||
struct nfsd4_getdeviceinfo *gdev)
|
||||
{
|
||||
struct xdr_stream *xdr = &resp->xdr;
|
||||
const struct nfsd4_layout_ops *ops =
|
||||
nfsd4_layout_ops[gdev->gd_layout_type];
|
||||
const struct nfsd4_layout_ops *ops;
|
||||
u32 starting_len = xdr->buf->len, needed_len;
|
||||
__be32 *p;
|
||||
|
||||
|
@ -4137,6 +4141,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
|
|||
|
||||
/* If maxcount is 0 then just update notifications */
|
||||
if (gdev->gd_maxcount != 0) {
|
||||
ops = nfsd4_layout_ops[gdev->gd_layout_type];
|
||||
nfserr = ops->encode_getdeviceinfo(xdr, gdev);
|
||||
if (nfserr) {
|
||||
/*
|
||||
|
@ -4189,8 +4194,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
|
|||
struct nfsd4_layoutget *lgp)
|
||||
{
|
||||
struct xdr_stream *xdr = &resp->xdr;
|
||||
const struct nfsd4_layout_ops *ops =
|
||||
nfsd4_layout_ops[lgp->lg_layout_type];
|
||||
const struct nfsd4_layout_ops *ops;
|
||||
__be32 *p;
|
||||
|
||||
dprintk("%s: err %d\n", __func__, nfserr);
|
||||
|
@ -4213,6 +4217,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
|
|||
*p++ = cpu_to_be32(lgp->lg_seg.iomode);
|
||||
*p++ = cpu_to_be32(lgp->lg_layout_type);
|
||||
|
||||
ops = nfsd4_layout_ops[lgp->lg_layout_type];
|
||||
nfserr = ops->encode_layoutget(xdr, lgp);
|
||||
out:
|
||||
kfree(lgp->lg_content);
|
||||
|
|
|
@ -257,6 +257,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
|
|||
len = args->count = ntohl(*p++);
|
||||
p++; /* totalcount - unused */
|
||||
|
||||
if (!xdr_argsize_check(rqstp, p))
|
||||
return 0;
|
||||
|
||||
len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2);
|
||||
|
||||
/* set up somewhere to store response.
|
||||
|
@ -272,7 +275,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
|
|||
v++;
|
||||
}
|
||||
args->vlen = v;
|
||||
return xdr_argsize_check(rqstp, p);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -362,9 +365,11 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli
|
|||
p = decode_fh(p, &args->fh);
|
||||
if (!p)
|
||||
return 0;
|
||||
if (!xdr_argsize_check(rqstp, p))
|
||||
return 0;
|
||||
args->buffer = page_address(*(rqstp->rq_next_page++));
|
||||
|
||||
return xdr_argsize_check(rqstp, p);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -402,9 +407,11 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
|
|||
args->cookie = ntohl(*p++);
|
||||
args->count = ntohl(*p++);
|
||||
args->count = min_t(u32, args->count, PAGE_SIZE);
|
||||
if (!xdr_argsize_check(rqstp, p))
|
||||
return 0;
|
||||
args->buffer = page_address(*(rqstp->rq_next_page++));
|
||||
|
||||
return xdr_argsize_check(rqstp, p);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -94,6 +94,12 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
|
|||
err = follow_down(&path);
|
||||
if (err < 0)
|
||||
goto out;
|
||||
if (path.mnt == exp->ex_path.mnt && path.dentry == dentry &&
|
||||
nfsd_mountpoint(dentry, exp) == 2) {
|
||||
/* This is only a mountpoint in some other namespace */
|
||||
path_put(&path);
|
||||
goto out;
|
||||
}
|
||||
|
||||
exp2 = rqst_exp_get_by_name(rqstp, &path);
|
||||
if (IS_ERR(exp2)) {
|
||||
|
@ -167,16 +173,26 @@ static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, st
|
|||
/*
|
||||
* For nfsd purposes, we treat V4ROOT exports as though there was an
|
||||
* export at *every* directory.
|
||||
* We return:
|
||||
* '1' if this dentry *must* be an export point,
|
||||
* '2' if it might be, if there is really a mount here, and
|
||||
* '0' if there is no chance of an export point here.
|
||||
*/
|
||||
int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
|
||||
{
|
||||
if (d_mountpoint(dentry))
|
||||
if (!d_inode(dentry))
|
||||
return 0;
|
||||
if (exp->ex_flags & NFSEXP_V4ROOT)
|
||||
return 1;
|
||||
if (nfsd4_is_junction(dentry))
|
||||
return 1;
|
||||
if (!(exp->ex_flags & NFSEXP_V4ROOT))
|
||||
if (d_mountpoint(dentry))
|
||||
/*
|
||||
* Might only be a mountpoint in a different namespace,
|
||||
* but we need to check.
|
||||
*/
|
||||
return 2;
|
||||
return 0;
|
||||
return d_inode(dentry) != NULL;
|
||||
}
|
||||
|
||||
__be32
|
||||
|
|
|
@ -143,6 +143,9 @@ enum rpcrdma_proc {
|
|||
#define rdma_done cpu_to_be32(RDMA_DONE)
|
||||
#define rdma_error cpu_to_be32(RDMA_ERROR)
|
||||
|
||||
#define err_vers cpu_to_be32(ERR_VERS)
|
||||
#define err_chunk cpu_to_be32(ERR_CHUNK)
|
||||
|
||||
/*
|
||||
* Private extension to RPC-over-RDMA Version One.
|
||||
* Message passed during RDMA-CM connection set-up.
|
||||
|
|
|
@ -336,8 +336,7 @@ xdr_argsize_check(struct svc_rqst *rqstp, __be32 *p)
|
|||
{
|
||||
char *cp = (char *)p;
|
||||
struct kvec *vec = &rqstp->rq_arg.head[0];
|
||||
return cp >= (char*)vec->iov_base
|
||||
&& cp <= (char*)vec->iov_base + vec->iov_len;
|
||||
return cp == (char *)vec->iov_base + vec->iov_len;
|
||||
}
|
||||
|
||||
static inline int
|
||||
|
@ -474,6 +473,7 @@ void svc_pool_map_put(void);
|
|||
struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int,
|
||||
struct svc_serv_ops *);
|
||||
int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
|
||||
int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int);
|
||||
int svc_pool_stats_open(struct svc_serv *serv, struct file *file);
|
||||
void svc_destroy(struct svc_serv *);
|
||||
void svc_shutdown_net(struct svc_serv *, struct net *);
|
||||
|
|
|
@ -48,6 +48,12 @@
|
|||
#include <rdma/rdma_cm.h>
|
||||
#define SVCRDMA_DEBUG
|
||||
|
||||
/* Default and maximum inline threshold sizes */
|
||||
enum {
|
||||
RPCRDMA_DEF_INLINE_THRESH = 4096,
|
||||
RPCRDMA_MAX_INLINE_THRESH = 65536
|
||||
};
|
||||
|
||||
/* RPC/RDMA parameters and stats */
|
||||
extern unsigned int svcrdma_ord;
|
||||
extern unsigned int svcrdma_max_requests;
|
||||
|
@ -85,27 +91,11 @@ struct svc_rdma_op_ctxt {
|
|||
enum dma_data_direction direction;
|
||||
int count;
|
||||
unsigned int mapped_sges;
|
||||
struct ib_sge sge[RPCSVC_MAXPAGES];
|
||||
struct ib_send_wr send_wr;
|
||||
struct ib_sge sge[1 + RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE];
|
||||
struct page *pages[RPCSVC_MAXPAGES];
|
||||
};
|
||||
|
||||
/*
|
||||
* NFS_ requests are mapped on the client side by the chunk lists in
|
||||
* the RPCRDMA header. During the fetching of the RPC from the client
|
||||
* and the writing of the reply to the client, the memory in the
|
||||
* client and the memory in the server must be mapped as contiguous
|
||||
* vaddr/len for access by the hardware. These data structures keep
|
||||
* these mappings.
|
||||
*
|
||||
* For an RDMA_WRITE, the 'sge' maps the RPC REPLY. For RDMA_READ, the
|
||||
* 'sge' in the svc_rdma_req_map maps the server side RPC reply and the
|
||||
* 'ch' field maps the read-list of the RPCRDMA header to the 'sge'
|
||||
* mapping of the reply.
|
||||
*/
|
||||
struct svc_rdma_chunk_sge {
|
||||
int start; /* sge no for this chunk */
|
||||
int count; /* sge count for this chunk */
|
||||
};
|
||||
struct svc_rdma_fastreg_mr {
|
||||
struct ib_mr *mr;
|
||||
struct scatterlist *sg;
|
||||
|
@ -114,15 +104,7 @@ struct svc_rdma_fastreg_mr {
|
|||
enum dma_data_direction direction;
|
||||
struct list_head frmr_list;
|
||||
};
|
||||
struct svc_rdma_req_map {
|
||||
struct list_head free;
|
||||
unsigned long count;
|
||||
union {
|
||||
struct kvec sge[RPCSVC_MAXPAGES];
|
||||
struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES];
|
||||
unsigned long lkey[RPCSVC_MAXPAGES];
|
||||
};
|
||||
};
|
||||
|
||||
#define RDMACTXT_F_LAST_CTXT 2
|
||||
|
||||
#define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */
|
||||
|
@ -144,14 +126,15 @@ struct svcxprt_rdma {
|
|||
u32 sc_max_requests; /* Max requests */
|
||||
u32 sc_max_bc_requests;/* Backward credits */
|
||||
int sc_max_req_size; /* Size of each RQ WR buf */
|
||||
u8 sc_port_num;
|
||||
|
||||
struct ib_pd *sc_pd;
|
||||
|
||||
spinlock_t sc_ctxt_lock;
|
||||
struct list_head sc_ctxts;
|
||||
int sc_ctxt_used;
|
||||
spinlock_t sc_map_lock;
|
||||
struct list_head sc_maps;
|
||||
spinlock_t sc_rw_ctxt_lock;
|
||||
struct list_head sc_rw_ctxts;
|
||||
|
||||
struct list_head sc_rq_dto_q;
|
||||
spinlock_t sc_rq_dto_lock;
|
||||
|
@ -181,9 +164,7 @@ struct svcxprt_rdma {
|
|||
/* The default ORD value is based on two outstanding full-size writes with a
|
||||
* page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */
|
||||
#define RPCRDMA_ORD (64/4)
|
||||
#define RPCRDMA_SQ_DEPTH_MULT 8
|
||||
#define RPCRDMA_MAX_REQUESTS 32
|
||||
#define RPCRDMA_MAX_REQ_SIZE 4096
|
||||
|
||||
/* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our
|
||||
* current NFSv4.1 implementation supports one backchannel slot.
|
||||
|
@ -201,19 +182,11 @@ static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma,
|
|||
|
||||
/* svc_rdma_backchannel.c */
|
||||
extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt,
|
||||
struct rpcrdma_msg *rmsgp,
|
||||
__be32 *rdma_resp,
|
||||
struct xdr_buf *rcvbuf);
|
||||
|
||||
/* svc_rdma_marshal.c */
|
||||
extern int svc_rdma_xdr_decode_req(struct xdr_buf *);
|
||||
extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
|
||||
struct rpcrdma_msg *,
|
||||
enum rpcrdma_errcode, __be32 *);
|
||||
extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int);
|
||||
extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int);
|
||||
extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int,
|
||||
__be32, __be64, u32);
|
||||
extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp);
|
||||
|
||||
/* svc_rdma_recvfrom.c */
|
||||
extern int svc_rdma_recvfrom(struct svc_rqst *);
|
||||
|
@ -224,16 +197,25 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *,
|
|||
struct svc_rdma_op_ctxt *, int *, u32 *,
|
||||
u32, u32, u64, bool);
|
||||
|
||||
/* svc_rdma_rw.c */
|
||||
extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma);
|
||||
extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
|
||||
__be32 *wr_ch, struct xdr_buf *xdr);
|
||||
extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
|
||||
__be32 *rp_ch, bool writelist,
|
||||
struct xdr_buf *xdr);
|
||||
|
||||
/* svc_rdma_sendto.c */
|
||||
extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *,
|
||||
struct svc_rdma_req_map *, bool);
|
||||
extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma,
|
||||
struct svc_rdma_op_ctxt *ctxt,
|
||||
__be32 *rdma_resp, unsigned int len);
|
||||
extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma,
|
||||
struct svc_rdma_op_ctxt *ctxt,
|
||||
int num_sge, u32 inv_rkey);
|
||||
extern int svc_rdma_sendto(struct svc_rqst *);
|
||||
extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *,
|
||||
int);
|
||||
|
||||
/* svc_rdma_transport.c */
|
||||
extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *);
|
||||
extern void svc_rdma_wc_write(struct ib_cq *, struct ib_wc *);
|
||||
extern void svc_rdma_wc_reg(struct ib_cq *, struct ib_wc *);
|
||||
extern void svc_rdma_wc_read(struct ib_cq *, struct ib_wc *);
|
||||
extern void svc_rdma_wc_inv(struct ib_cq *, struct ib_wc *);
|
||||
|
@ -244,9 +226,6 @@ extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
|
|||
extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
|
||||
extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
|
||||
extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt);
|
||||
extern struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *);
|
||||
extern void svc_rdma_put_req_map(struct svcxprt_rdma *,
|
||||
struct svc_rdma_req_map *);
|
||||
extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *);
|
||||
extern void svc_rdma_put_frmr(struct svcxprt_rdma *,
|
||||
struct svc_rdma_fastreg_mr *);
|
||||
|
|
|
@ -22,6 +22,8 @@
|
|||
#ifndef _NFSD_CLD_H
|
||||
#define _NFSD_CLD_H
|
||||
|
||||
#include <linux/types.h>
|
||||
|
||||
/* latest upcall version available */
|
||||
#define CLD_UPCALL_VERSION 1
|
||||
|
||||
|
@ -37,18 +39,18 @@ enum cld_command {
|
|||
|
||||
/* representation of long-form NFSv4 client ID */
|
||||
struct cld_name {
|
||||
uint16_t cn_len; /* length of cm_id */
|
||||
__u16 cn_len; /* length of cm_id */
|
||||
unsigned char cn_id[NFS4_OPAQUE_LIMIT]; /* client-provided */
|
||||
} __attribute__((packed));
|
||||
|
||||
/* message struct for communication with userspace */
|
||||
struct cld_msg {
|
||||
uint8_t cm_vers; /* upcall version */
|
||||
uint8_t cm_cmd; /* upcall command */
|
||||
int16_t cm_status; /* return code */
|
||||
uint32_t cm_xid; /* transaction id */
|
||||
__u8 cm_vers; /* upcall version */
|
||||
__u8 cm_cmd; /* upcall command */
|
||||
__s16 cm_status; /* return code */
|
||||
__u32 cm_xid; /* transaction id */
|
||||
union {
|
||||
int64_t cm_gracetime; /* grace period start time */
|
||||
__s64 cm_gracetime; /* grace period start time */
|
||||
struct cld_name cm_name;
|
||||
} __attribute__((packed)) cm_u;
|
||||
} __attribute__((packed));
|
||||
|
|
|
@ -52,6 +52,7 @@ config SUNRPC_XPRT_RDMA
|
|||
tristate "RPC-over-RDMA transport"
|
||||
depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
|
||||
default SUNRPC && INFINIBAND
|
||||
select SG_POOL
|
||||
help
|
||||
This option allows the NFS client and server to use RDMA
|
||||
transports (InfiniBand, iWARP, or RoCE).
|
||||
|
|
146
net/sunrpc/svc.c
146
net/sunrpc/svc.c
|
@ -702,6 +702,65 @@ found_pool:
|
|||
return task;
|
||||
}
|
||||
|
||||
/* create new threads */
|
||||
static int
|
||||
svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
|
||||
{
|
||||
struct svc_rqst *rqstp;
|
||||
struct task_struct *task;
|
||||
struct svc_pool *chosen_pool;
|
||||
unsigned int state = serv->sv_nrthreads-1;
|
||||
int node;
|
||||
|
||||
do {
|
||||
nrservs--;
|
||||
chosen_pool = choose_pool(serv, pool, &state);
|
||||
|
||||
node = svc_pool_map_get_node(chosen_pool->sp_id);
|
||||
rqstp = svc_prepare_thread(serv, chosen_pool, node);
|
||||
if (IS_ERR(rqstp))
|
||||
return PTR_ERR(rqstp);
|
||||
|
||||
__module_get(serv->sv_ops->svo_module);
|
||||
task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
|
||||
node, "%s", serv->sv_name);
|
||||
if (IS_ERR(task)) {
|
||||
module_put(serv->sv_ops->svo_module);
|
||||
svc_exit_thread(rqstp);
|
||||
return PTR_ERR(task);
|
||||
}
|
||||
|
||||
rqstp->rq_task = task;
|
||||
if (serv->sv_nrpools > 1)
|
||||
svc_pool_map_set_cpumask(task, chosen_pool->sp_id);
|
||||
|
||||
svc_sock_update_bufs(serv);
|
||||
wake_up_process(task);
|
||||
} while (nrservs > 0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* destroy old threads */
|
||||
static int
|
||||
svc_signal_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
|
||||
{
|
||||
struct task_struct *task;
|
||||
unsigned int state = serv->sv_nrthreads-1;
|
||||
|
||||
/* destroy old threads */
|
||||
do {
|
||||
task = choose_victim(serv, pool, &state);
|
||||
if (task == NULL)
|
||||
break;
|
||||
send_sig(SIGINT, task, 1);
|
||||
nrservs++;
|
||||
} while (nrservs < 0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create or destroy enough new threads to make the number
|
||||
* of threads the given number. If `pool' is non-NULL, applies
|
||||
|
@ -719,13 +778,6 @@ found_pool:
|
|||
int
|
||||
svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
|
||||
{
|
||||
struct svc_rqst *rqstp;
|
||||
struct task_struct *task;
|
||||
struct svc_pool *chosen_pool;
|
||||
int error = 0;
|
||||
unsigned int state = serv->sv_nrthreads-1;
|
||||
int node;
|
||||
|
||||
if (pool == NULL) {
|
||||
/* The -1 assumes caller has done a svc_get() */
|
||||
nrservs -= (serv->sv_nrthreads-1);
|
||||
|
@ -735,46 +787,52 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
|
|||
spin_unlock_bh(&pool->sp_lock);
|
||||
}
|
||||
|
||||
/* create new threads */
|
||||
while (nrservs > 0) {
|
||||
nrservs--;
|
||||
chosen_pool = choose_pool(serv, pool, &state);
|
||||
|
||||
node = svc_pool_map_get_node(chosen_pool->sp_id);
|
||||
rqstp = svc_prepare_thread(serv, chosen_pool, node);
|
||||
if (IS_ERR(rqstp)) {
|
||||
error = PTR_ERR(rqstp);
|
||||
break;
|
||||
}
|
||||
|
||||
__module_get(serv->sv_ops->svo_module);
|
||||
task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
|
||||
node, "%s", serv->sv_name);
|
||||
if (IS_ERR(task)) {
|
||||
error = PTR_ERR(task);
|
||||
module_put(serv->sv_ops->svo_module);
|
||||
svc_exit_thread(rqstp);
|
||||
break;
|
||||
}
|
||||
|
||||
rqstp->rq_task = task;
|
||||
if (serv->sv_nrpools > 1)
|
||||
svc_pool_map_set_cpumask(task, chosen_pool->sp_id);
|
||||
|
||||
svc_sock_update_bufs(serv);
|
||||
wake_up_process(task);
|
||||
}
|
||||
/* destroy old threads */
|
||||
while (nrservs < 0 &&
|
||||
(task = choose_victim(serv, pool, &state)) != NULL) {
|
||||
send_sig(SIGINT, task, 1);
|
||||
nrservs++;
|
||||
}
|
||||
|
||||
return error;
|
||||
if (nrservs > 0)
|
||||
return svc_start_kthreads(serv, pool, nrservs);
|
||||
if (nrservs < 0)
|
||||
return svc_signal_kthreads(serv, pool, nrservs);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(svc_set_num_threads);
|
||||
|
||||
/* destroy old threads */
|
||||
static int
|
||||
svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
|
||||
{
|
||||
struct task_struct *task;
|
||||
unsigned int state = serv->sv_nrthreads-1;
|
||||
|
||||
/* destroy old threads */
|
||||
do {
|
||||
task = choose_victim(serv, pool, &state);
|
||||
if (task == NULL)
|
||||
break;
|
||||
kthread_stop(task);
|
||||
nrservs++;
|
||||
} while (nrservs < 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
|
||||
{
|
||||
if (pool == NULL) {
|
||||
/* The -1 assumes caller has done a svc_get() */
|
||||
nrservs -= (serv->sv_nrthreads-1);
|
||||
} else {
|
||||
spin_lock_bh(&pool->sp_lock);
|
||||
nrservs -= pool->sp_nrthreads;
|
||||
spin_unlock_bh(&pool->sp_lock);
|
||||
}
|
||||
|
||||
if (nrservs > 0)
|
||||
return svc_start_kthreads(serv, pool, nrservs);
|
||||
if (nrservs < 0)
|
||||
return svc_stop_kthreads(serv, pool, nrservs);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(svc_set_num_threads_sync);
|
||||
|
||||
/*
|
||||
* Called from a server thread as it's exiting. Caller must hold the "service
|
||||
* mutex" for the service.
|
||||
|
|
|
@ -4,5 +4,5 @@ rpcrdma-y := transport.o rpc_rdma.o verbs.o \
|
|||
fmr_ops.o frwr_ops.o \
|
||||
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
|
||||
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
|
||||
module.o
|
||||
svc_rdma_rw.o module.o
|
||||
rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
|
||||
|
|
|
@ -58,9 +58,9 @@ unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
|
|||
unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
|
||||
static unsigned int min_max_requests = 4;
|
||||
static unsigned int max_max_requests = 16384;
|
||||
unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
|
||||
static unsigned int min_max_inline = 4096;
|
||||
static unsigned int max_max_inline = 65536;
|
||||
unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH;
|
||||
static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH;
|
||||
static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH;
|
||||
|
||||
atomic_t rdma_stat_recv;
|
||||
atomic_t rdma_stat_read;
|
||||
|
@ -247,8 +247,6 @@ int svc_rdma_init(void)
|
|||
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
|
||||
dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
|
||||
dprintk("\tmax_requests : %u\n", svcrdma_max_requests);
|
||||
dprintk("\tsq_depth : %u\n",
|
||||
svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
|
||||
dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
|
||||
dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
|
||||
|
||||
|
|
|
@ -12,7 +12,17 @@
|
|||
|
||||
#undef SVCRDMA_BACKCHANNEL_DEBUG
|
||||
|
||||
int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
|
||||
/**
|
||||
* svc_rdma_handle_bc_reply - Process incoming backchannel reply
|
||||
* @xprt: controlling backchannel transport
|
||||
* @rdma_resp: pointer to incoming transport header
|
||||
* @rcvbuf: XDR buffer into which to decode the reply
|
||||
*
|
||||
* Returns:
|
||||
* %0 if @rcvbuf is filled in, xprt_complete_rqst called,
|
||||
* %-EAGAIN if server should call ->recvfrom again.
|
||||
*/
|
||||
int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
|
||||
struct xdr_buf *rcvbuf)
|
||||
{
|
||||
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
|
||||
|
@ -27,13 +37,13 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
|
|||
|
||||
p = (__be32 *)src->iov_base;
|
||||
len = src->iov_len;
|
||||
xid = rmsgp->rm_xid;
|
||||
xid = *rdma_resp;
|
||||
|
||||
#ifdef SVCRDMA_BACKCHANNEL_DEBUG
|
||||
pr_info("%s: xid=%08x, length=%zu\n",
|
||||
__func__, be32_to_cpu(xid), len);
|
||||
pr_info("%s: RPC/RDMA: %*ph\n",
|
||||
__func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp);
|
||||
__func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp);
|
||||
pr_info("%s: RPC: %*ph\n",
|
||||
__func__, (int)len, p);
|
||||
#endif
|
||||
|
@ -53,7 +63,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
|
|||
goto out_unlock;
|
||||
memcpy(dst->iov_base, p, len);
|
||||
|
||||
credits = be32_to_cpu(rmsgp->rm_credit);
|
||||
credits = be32_to_cpup(rdma_resp + 2);
|
||||
if (credits == 0)
|
||||
credits = 1; /* don't deadlock */
|
||||
else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
|
||||
|
@ -90,9 +100,9 @@ out_notfound:
|
|||
* Caller holds the connection's mutex and has already marshaled
|
||||
* the RPC/RDMA request.
|
||||
*
|
||||
* This is similar to svc_rdma_reply, but takes an rpc_rqst
|
||||
* instead, does not support chunks, and avoids blocking memory
|
||||
* allocation.
|
||||
* This is similar to svc_rdma_send_reply_msg, but takes a struct
|
||||
* rpc_rqst instead, does not support chunks, and avoids blocking
|
||||
* memory allocation.
|
||||
*
|
||||
* XXX: There is still an opportunity to block in svc_rdma_send()
|
||||
* if there are no SQ entries to post the Send. This may occur if
|
||||
|
@ -101,59 +111,36 @@ out_notfound:
|
|||
static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
|
||||
struct rpc_rqst *rqst)
|
||||
{
|
||||
struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
|
||||
struct svc_rdma_op_ctxt *ctxt;
|
||||
struct svc_rdma_req_map *vec;
|
||||
struct ib_send_wr send_wr;
|
||||
int ret;
|
||||
|
||||
vec = svc_rdma_get_req_map(rdma);
|
||||
ret = svc_rdma_map_xdr(rdma, sndbuf, vec, false);
|
||||
if (ret)
|
||||
ctxt = svc_rdma_get_context(rdma);
|
||||
|
||||
/* rpcrdma_bc_send_request builds the transport header and
|
||||
* the backchannel RPC message in the same buffer. Thus only
|
||||
* one SGE is needed to send both.
|
||||
*/
|
||||
ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer,
|
||||
rqst->rq_snd_buf.len);
|
||||
if (ret < 0)
|
||||
goto out_err;
|
||||
|
||||
ret = svc_rdma_repost_recv(rdma, GFP_NOIO);
|
||||
if (ret)
|
||||
goto out_err;
|
||||
|
||||
ctxt = svc_rdma_get_context(rdma);
|
||||
ctxt->pages[0] = virt_to_page(rqst->rq_buffer);
|
||||
ctxt->count = 1;
|
||||
|
||||
ctxt->direction = DMA_TO_DEVICE;
|
||||
ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
|
||||
ctxt->sge[0].length = sndbuf->len;
|
||||
ctxt->sge[0].addr =
|
||||
ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0,
|
||||
sndbuf->len, DMA_TO_DEVICE);
|
||||
if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) {
|
||||
ret = -EIO;
|
||||
ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0);
|
||||
if (ret)
|
||||
goto out_unmap;
|
||||
}
|
||||
svc_rdma_count_mappings(rdma, ctxt);
|
||||
|
||||
memset(&send_wr, 0, sizeof(send_wr));
|
||||
ctxt->cqe.done = svc_rdma_wc_send;
|
||||
send_wr.wr_cqe = &ctxt->cqe;
|
||||
send_wr.sg_list = ctxt->sge;
|
||||
send_wr.num_sge = 1;
|
||||
send_wr.opcode = IB_WR_SEND;
|
||||
send_wr.send_flags = IB_SEND_SIGNALED;
|
||||
|
||||
ret = svc_rdma_send(rdma, &send_wr);
|
||||
if (ret) {
|
||||
ret = -EIO;
|
||||
goto out_unmap;
|
||||
}
|
||||
|
||||
out_err:
|
||||
svc_rdma_put_req_map(rdma, vec);
|
||||
dprintk("svcrdma: %s returns %d\n", __func__, ret);
|
||||
return ret;
|
||||
|
||||
out_unmap:
|
||||
svc_rdma_unmap_dma(ctxt);
|
||||
svc_rdma_put_context(ctxt, 1);
|
||||
ret = -EIO;
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
|
|
|
@ -166,92 +166,3 @@ out_inval:
|
|||
dprintk("svcrdma: failed to parse transport header\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
|
||||
struct rpcrdma_msg *rmsgp,
|
||||
enum rpcrdma_errcode err, __be32 *va)
|
||||
{
|
||||
__be32 *startp = va;
|
||||
|
||||
*va++ = rmsgp->rm_xid;
|
||||
*va++ = rmsgp->rm_vers;
|
||||
*va++ = xprt->sc_fc_credits;
|
||||
*va++ = rdma_error;
|
||||
*va++ = cpu_to_be32(err);
|
||||
if (err == ERR_VERS) {
|
||||
*va++ = rpcrdma_version;
|
||||
*va++ = rpcrdma_version;
|
||||
}
|
||||
|
||||
return (int)((unsigned long)va - (unsigned long)startp);
|
||||
}
|
||||
|
||||
/**
|
||||
* svc_rdma_xdr_get_reply_hdr_len - Get length of Reply transport header
|
||||
* @rdma_resp: buffer containing Reply transport header
|
||||
*
|
||||
* Returns length of transport header, in bytes.
|
||||
*/
|
||||
unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
|
||||
{
|
||||
unsigned int nsegs;
|
||||
__be32 *p;
|
||||
|
||||
p = rdma_resp;
|
||||
|
||||
/* RPC-over-RDMA V1 replies never have a Read list. */
|
||||
p += rpcrdma_fixed_maxsz + 1;
|
||||
|
||||
/* Skip Write list. */
|
||||
while (*p++ != xdr_zero) {
|
||||
nsegs = be32_to_cpup(p++);
|
||||
p += nsegs * rpcrdma_segment_maxsz;
|
||||
}
|
||||
|
||||
/* Skip Reply chunk. */
|
||||
if (*p++ != xdr_zero) {
|
||||
nsegs = be32_to_cpup(p++);
|
||||
p += nsegs * rpcrdma_segment_maxsz;
|
||||
}
|
||||
|
||||
return (unsigned long)p - (unsigned long)rdma_resp;
|
||||
}
|
||||
|
||||
void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
|
||||
{
|
||||
struct rpcrdma_write_array *ary;
|
||||
|
||||
/* no read-list */
|
||||
rmsgp->rm_body.rm_chunks[0] = xdr_zero;
|
||||
|
||||
/* write-array discrim */
|
||||
ary = (struct rpcrdma_write_array *)
|
||||
&rmsgp->rm_body.rm_chunks[1];
|
||||
ary->wc_discrim = xdr_one;
|
||||
ary->wc_nchunks = cpu_to_be32(chunks);
|
||||
|
||||
/* write-list terminator */
|
||||
ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
|
||||
|
||||
/* reply-array discriminator */
|
||||
ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
|
||||
}
|
||||
|
||||
void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
|
||||
int chunks)
|
||||
{
|
||||
ary->wc_discrim = xdr_one;
|
||||
ary->wc_nchunks = cpu_to_be32(chunks);
|
||||
}
|
||||
|
||||
void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
|
||||
int chunk_no,
|
||||
__be32 rs_handle,
|
||||
__be64 rs_offset,
|
||||
u32 write_len)
|
||||
{
|
||||
struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
|
||||
seg->rs_handle = rs_handle;
|
||||
seg->rs_offset = rs_offset;
|
||||
seg->rs_length = cpu_to_be32(write_len);
|
||||
}
|
||||
|
|
|
@ -558,33 +558,85 @@ static void rdma_read_complete(struct svc_rqst *rqstp,
|
|||
rqstp->rq_arg.buflen = head->arg.buflen;
|
||||
}
|
||||
|
||||
static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
|
||||
__be32 *rdma_argp, int status)
|
||||
{
|
||||
struct svc_rdma_op_ctxt *ctxt;
|
||||
__be32 *p, *err_msgp;
|
||||
unsigned int length;
|
||||
struct page *page;
|
||||
int ret;
|
||||
|
||||
ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
|
||||
if (ret)
|
||||
return;
|
||||
|
||||
page = alloc_page(GFP_KERNEL);
|
||||
if (!page)
|
||||
return;
|
||||
err_msgp = page_address(page);
|
||||
|
||||
p = err_msgp;
|
||||
*p++ = *rdma_argp;
|
||||
*p++ = *(rdma_argp + 1);
|
||||
*p++ = xprt->sc_fc_credits;
|
||||
*p++ = rdma_error;
|
||||
if (status == -EPROTONOSUPPORT) {
|
||||
*p++ = err_vers;
|
||||
*p++ = rpcrdma_version;
|
||||
*p++ = rpcrdma_version;
|
||||
} else {
|
||||
*p++ = err_chunk;
|
||||
}
|
||||
length = (unsigned long)p - (unsigned long)err_msgp;
|
||||
|
||||
/* Map transport header; no RPC message payload */
|
||||
ctxt = svc_rdma_get_context(xprt);
|
||||
ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length);
|
||||
if (ret) {
|
||||
dprintk("svcrdma: Error %d mapping send for protocol error\n",
|
||||
ret);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0);
|
||||
if (ret) {
|
||||
dprintk("svcrdma: Error %d posting send for protocol error\n",
|
||||
ret);
|
||||
svc_rdma_unmap_dma(ctxt);
|
||||
svc_rdma_put_context(ctxt, 1);
|
||||
}
|
||||
}
|
||||
|
||||
/* By convention, backchannel calls arrive via rdma_msg type
|
||||
* messages, and never populate the chunk lists. This makes
|
||||
* the RPC/RDMA header small and fixed in size, so it is
|
||||
* straightforward to check the RPC header's direction field.
|
||||
*/
|
||||
static bool
|
||||
svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp)
|
||||
static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt,
|
||||
__be32 *rdma_resp)
|
||||
{
|
||||
__be32 *p = (__be32 *)rmsgp;
|
||||
__be32 *p;
|
||||
|
||||
if (!xprt->xpt_bc_xprt)
|
||||
return false;
|
||||
|
||||
if (rmsgp->rm_type != rdma_msg)
|
||||
return false;
|
||||
if (rmsgp->rm_body.rm_chunks[0] != xdr_zero)
|
||||
return false;
|
||||
if (rmsgp->rm_body.rm_chunks[1] != xdr_zero)
|
||||
return false;
|
||||
if (rmsgp->rm_body.rm_chunks[2] != xdr_zero)
|
||||
p = rdma_resp + 3;
|
||||
if (*p++ != rdma_msg)
|
||||
return false;
|
||||
|
||||
/* sanity */
|
||||
if (p[7] != rmsgp->rm_xid)
|
||||
if (*p++ != xdr_zero)
|
||||
return false;
|
||||
if (*p++ != xdr_zero)
|
||||
return false;
|
||||
if (*p++ != xdr_zero)
|
||||
return false;
|
||||
|
||||
/* XID sanity */
|
||||
if (*p++ != *rdma_resp)
|
||||
return false;
|
||||
/* call direction */
|
||||
if (p[8] == cpu_to_be32(RPC_CALL))
|
||||
if (*p == cpu_to_be32(RPC_CALL))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
|
@ -650,8 +702,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
|
|||
goto out_drop;
|
||||
rqstp->rq_xprt_hlen = ret;
|
||||
|
||||
if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) {
|
||||
ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp,
|
||||
if (svc_rdma_is_backchannel_reply(xprt, &rmsgp->rm_xid)) {
|
||||
ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt,
|
||||
&rmsgp->rm_xid,
|
||||
&rqstp->rq_arg);
|
||||
svc_rdma_put_context(ctxt, 0);
|
||||
if (ret)
|
||||
|
@ -686,7 +739,7 @@ complete:
|
|||
return ret;
|
||||
|
||||
out_err:
|
||||
svc_rdma_send_error(rdma_xprt, rmsgp, ret);
|
||||
svc_rdma_send_error(rdma_xprt, &rmsgp->rm_xid, ret);
|
||||
svc_rdma_put_context(ctxt, 0);
|
||||
return 0;
|
||||
|
||||
|
|
|
@ -0,0 +1,512 @@
|
|||
/*
|
||||
* Copyright (c) 2016 Oracle. All rights reserved.
|
||||
*
|
||||
* Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
|
||||
*/
|
||||
|
||||
#include <linux/sunrpc/rpc_rdma.h>
|
||||
#include <linux/sunrpc/svc_rdma.h>
|
||||
#include <linux/sunrpc/debug.h>
|
||||
|
||||
#include <rdma/rw.h>
|
||||
|
||||
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
|
||||
|
||||
/* Each R/W context contains state for one chain of RDMA Read or
|
||||
* Write Work Requests.
|
||||
*
|
||||
* Each WR chain handles a single contiguous server-side buffer,
|
||||
* because scatterlist entries after the first have to start on
|
||||
* page alignment. xdr_buf iovecs cannot guarantee alignment.
|
||||
*
|
||||
* Each WR chain handles only one R_key. Each RPC-over-RDMA segment
|
||||
* from a client may contain a unique R_key, so each WR chain moves
|
||||
* up to one segment at a time.
|
||||
*
|
||||
* The scatterlist makes this data structure over 4KB in size. To
|
||||
* make it less likely to fail, and to handle the allocation for
|
||||
* smaller I/O requests without disabling bottom-halves, these
|
||||
* contexts are created on demand, but cached and reused until the
|
||||
* controlling svcxprt_rdma is destroyed.
|
||||
*/
|
||||
struct svc_rdma_rw_ctxt {
|
||||
struct list_head rw_list;
|
||||
struct rdma_rw_ctx rw_ctx;
|
||||
int rw_nents;
|
||||
struct sg_table rw_sg_table;
|
||||
struct scatterlist rw_first_sgl[0];
|
||||
};
|
||||
|
||||
static inline struct svc_rdma_rw_ctxt *
|
||||
svc_rdma_next_ctxt(struct list_head *list)
|
||||
{
|
||||
return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt,
|
||||
rw_list);
|
||||
}
|
||||
|
||||
static struct svc_rdma_rw_ctxt *
|
||||
svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
|
||||
{
|
||||
struct svc_rdma_rw_ctxt *ctxt;
|
||||
|
||||
spin_lock(&rdma->sc_rw_ctxt_lock);
|
||||
|
||||
ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts);
|
||||
if (ctxt) {
|
||||
list_del(&ctxt->rw_list);
|
||||
spin_unlock(&rdma->sc_rw_ctxt_lock);
|
||||
} else {
|
||||
spin_unlock(&rdma->sc_rw_ctxt_lock);
|
||||
ctxt = kmalloc(sizeof(*ctxt) +
|
||||
SG_CHUNK_SIZE * sizeof(struct scatterlist),
|
||||
GFP_KERNEL);
|
||||
if (!ctxt)
|
||||
goto out;
|
||||
INIT_LIST_HEAD(&ctxt->rw_list);
|
||||
}
|
||||
|
||||
ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
|
||||
if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
|
||||
ctxt->rw_sg_table.sgl)) {
|
||||
kfree(ctxt);
|
||||
ctxt = NULL;
|
||||
}
|
||||
out:
|
||||
return ctxt;
|
||||
}
|
||||
|
||||
static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
|
||||
struct svc_rdma_rw_ctxt *ctxt)
|
||||
{
|
||||
sg_free_table_chained(&ctxt->rw_sg_table, true);
|
||||
|
||||
spin_lock(&rdma->sc_rw_ctxt_lock);
|
||||
list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts);
|
||||
spin_unlock(&rdma->sc_rw_ctxt_lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts
|
||||
* @rdma: transport about to be destroyed
|
||||
*
|
||||
*/
|
||||
void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
|
||||
{
|
||||
struct svc_rdma_rw_ctxt *ctxt;
|
||||
|
||||
while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) {
|
||||
list_del(&ctxt->rw_list);
|
||||
kfree(ctxt);
|
||||
}
|
||||
}
|
||||
|
||||
/* A chunk context tracks all I/O for moving one Read or Write
|
||||
* chunk. This is a set of rdma_rw's that handle data movement
|
||||
* for all segments of one chunk.
|
||||
*
|
||||
* These are small, acquired with a single allocator call, and
|
||||
* no more than one is needed per chunk. They are allocated on
|
||||
* demand, and not cached.
|
||||
*/
|
||||
struct svc_rdma_chunk_ctxt {
|
||||
struct ib_cqe cc_cqe;
|
||||
struct svcxprt_rdma *cc_rdma;
|
||||
struct list_head cc_rwctxts;
|
||||
int cc_sqecount;
|
||||
enum dma_data_direction cc_dir;
|
||||
};
|
||||
|
||||
static void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
|
||||
struct svc_rdma_chunk_ctxt *cc,
|
||||
enum dma_data_direction dir)
|
||||
{
|
||||
cc->cc_rdma = rdma;
|
||||
svc_xprt_get(&rdma->sc_xprt);
|
||||
|
||||
INIT_LIST_HEAD(&cc->cc_rwctxts);
|
||||
cc->cc_sqecount = 0;
|
||||
cc->cc_dir = dir;
|
||||
}
|
||||
|
||||
static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc)
|
||||
{
|
||||
struct svcxprt_rdma *rdma = cc->cc_rdma;
|
||||
struct svc_rdma_rw_ctxt *ctxt;
|
||||
|
||||
while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
|
||||
list_del(&ctxt->rw_list);
|
||||
|
||||
rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
|
||||
rdma->sc_port_num, ctxt->rw_sg_table.sgl,
|
||||
ctxt->rw_nents, cc->cc_dir);
|
||||
svc_rdma_put_rw_ctxt(rdma, ctxt);
|
||||
}
|
||||
svc_xprt_put(&rdma->sc_xprt);
|
||||
}
|
||||
|
||||
/* State for sending a Write or Reply chunk.
|
||||
* - Tracks progress of writing one chunk over all its segments
|
||||
* - Stores arguments for the SGL constructor functions
|
||||
*/
|
||||
struct svc_rdma_write_info {
|
||||
/* write state of this chunk */
|
||||
unsigned int wi_seg_off;
|
||||
unsigned int wi_seg_no;
|
||||
unsigned int wi_nsegs;
|
||||
__be32 *wi_segs;
|
||||
|
||||
/* SGL constructor arguments */
|
||||
struct xdr_buf *wi_xdr;
|
||||
unsigned char *wi_base;
|
||||
unsigned int wi_next_off;
|
||||
|
||||
struct svc_rdma_chunk_ctxt wi_cc;
|
||||
};
|
||||
|
||||
static struct svc_rdma_write_info *
|
||||
svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
|
||||
{
|
||||
struct svc_rdma_write_info *info;
|
||||
|
||||
info = kmalloc(sizeof(*info), GFP_KERNEL);
|
||||
if (!info)
|
||||
return info;
|
||||
|
||||
info->wi_seg_off = 0;
|
||||
info->wi_seg_no = 0;
|
||||
info->wi_nsegs = be32_to_cpup(++chunk);
|
||||
info->wi_segs = ++chunk;
|
||||
svc_rdma_cc_init(rdma, &info->wi_cc, DMA_TO_DEVICE);
|
||||
return info;
|
||||
}
|
||||
|
||||
static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
|
||||
{
|
||||
svc_rdma_cc_release(&info->wi_cc);
|
||||
kfree(info);
|
||||
}
|
||||
|
||||
/**
|
||||
* svc_rdma_write_done - Write chunk completion
|
||||
* @cq: controlling Completion Queue
|
||||
* @wc: Work Completion
|
||||
*
|
||||
* Pages under I/O are freed by a subsequent Send completion.
|
||||
*/
|
||||
static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
|
||||
{
|
||||
struct ib_cqe *cqe = wc->wr_cqe;
|
||||
struct svc_rdma_chunk_ctxt *cc =
|
||||
container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
|
||||
struct svcxprt_rdma *rdma = cc->cc_rdma;
|
||||
struct svc_rdma_write_info *info =
|
||||
container_of(cc, struct svc_rdma_write_info, wi_cc);
|
||||
|
||||
atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
|
||||
wake_up(&rdma->sc_send_wait);
|
||||
|
||||
if (unlikely(wc->status != IB_WC_SUCCESS)) {
|
||||
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
|
||||
if (wc->status != IB_WC_WR_FLUSH_ERR)
|
||||
pr_err("svcrdma: write ctx: %s (%u/0x%x)\n",
|
||||
ib_wc_status_msg(wc->status),
|
||||
wc->status, wc->vendor_err);
|
||||
}
|
||||
|
||||
svc_rdma_write_info_free(info);
|
||||
}
|
||||
|
||||
/* This function sleeps when the transport's Send Queue is congested.
|
||||
*
|
||||
* Assumptions:
|
||||
* - If ib_post_send() succeeds, only one completion is expected,
|
||||
* even if one or more WRs are flushed. This is true when posting
|
||||
* an rdma_rw_ctx or when posting a single signaled WR.
|
||||
*/
|
||||
static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
|
||||
{
|
||||
struct svcxprt_rdma *rdma = cc->cc_rdma;
|
||||
struct svc_xprt *xprt = &rdma->sc_xprt;
|
||||
struct ib_send_wr *first_wr, *bad_wr;
|
||||
struct list_head *tmp;
|
||||
struct ib_cqe *cqe;
|
||||
int ret;
|
||||
|
||||
first_wr = NULL;
|
||||
cqe = &cc->cc_cqe;
|
||||
list_for_each(tmp, &cc->cc_rwctxts) {
|
||||
struct svc_rdma_rw_ctxt *ctxt;
|
||||
|
||||
ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list);
|
||||
first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp,
|
||||
rdma->sc_port_num, cqe, first_wr);
|
||||
cqe = NULL;
|
||||
}
|
||||
|
||||
do {
|
||||
if (atomic_sub_return(cc->cc_sqecount,
|
||||
&rdma->sc_sq_avail) > 0) {
|
||||
ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
|
||||
if (ret)
|
||||
break;
|
||||
return 0;
|
||||
}
|
||||
|
||||
atomic_inc(&rdma_stat_sq_starve);
|
||||
atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
|
||||
wait_event(rdma->sc_send_wait,
|
||||
atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
|
||||
} while (1);
|
||||
|
||||
pr_err("svcrdma: ib_post_send failed (%d)\n", ret);
|
||||
set_bit(XPT_CLOSE, &xprt->xpt_flags);
|
||||
|
||||
/* If even one was posted, there will be a completion. */
|
||||
if (bad_wr != first_wr)
|
||||
return 0;
|
||||
|
||||
atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
|
||||
wake_up(&rdma->sc_send_wait);
|
||||
return -ENOTCONN;
|
||||
}
|
||||
|
||||
/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
|
||||
*/
|
||||
static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
|
||||
unsigned int len,
|
||||
struct svc_rdma_rw_ctxt *ctxt)
|
||||
{
|
||||
struct scatterlist *sg = ctxt->rw_sg_table.sgl;
|
||||
|
||||
sg_set_buf(&sg[0], info->wi_base, len);
|
||||
info->wi_base += len;
|
||||
|
||||
ctxt->rw_nents = 1;
|
||||
}
|
||||
|
||||
/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
|
||||
*/
|
||||
static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
|
||||
unsigned int remaining,
|
||||
struct svc_rdma_rw_ctxt *ctxt)
|
||||
{
|
||||
unsigned int sge_no, sge_bytes, page_off, page_no;
|
||||
struct xdr_buf *xdr = info->wi_xdr;
|
||||
struct scatterlist *sg;
|
||||
struct page **page;
|
||||
|
||||
page_off = (info->wi_next_off + xdr->page_base) & ~PAGE_MASK;
|
||||
page_no = (info->wi_next_off + xdr->page_base) >> PAGE_SHIFT;
|
||||
page = xdr->pages + page_no;
|
||||
info->wi_next_off += remaining;
|
||||
sg = ctxt->rw_sg_table.sgl;
|
||||
sge_no = 0;
|
||||
do {
|
||||
sge_bytes = min_t(unsigned int, remaining,
|
||||
PAGE_SIZE - page_off);
|
||||
sg_set_page(sg, *page, sge_bytes, page_off);
|
||||
|
||||
remaining -= sge_bytes;
|
||||
sg = sg_next(sg);
|
||||
page_off = 0;
|
||||
sge_no++;
|
||||
page++;
|
||||
} while (remaining);
|
||||
|
||||
ctxt->rw_nents = sge_no;
|
||||
}
|
||||
|
||||
/* Construct RDMA Write WRs to send a portion of an xdr_buf containing
|
||||
* an RPC Reply.
|
||||
*/
|
||||
static int
|
||||
svc_rdma_build_writes(struct svc_rdma_write_info *info,
|
||||
void (*constructor)(struct svc_rdma_write_info *info,
|
||||
unsigned int len,
|
||||
struct svc_rdma_rw_ctxt *ctxt),
|
||||
unsigned int remaining)
|
||||
{
|
||||
struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
|
||||
struct svcxprt_rdma *rdma = cc->cc_rdma;
|
||||
struct svc_rdma_rw_ctxt *ctxt;
|
||||
__be32 *seg;
|
||||
int ret;
|
||||
|
||||
cc->cc_cqe.done = svc_rdma_write_done;
|
||||
seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
|
||||
do {
|
||||
unsigned int write_len;
|
||||
u32 seg_length, seg_handle;
|
||||
u64 seg_offset;
|
||||
|
||||
if (info->wi_seg_no >= info->wi_nsegs)
|
||||
goto out_overflow;
|
||||
|
||||
seg_handle = be32_to_cpup(seg);
|
||||
seg_length = be32_to_cpup(seg + 1);
|
||||
xdr_decode_hyper(seg + 2, &seg_offset);
|
||||
seg_offset += info->wi_seg_off;
|
||||
|
||||
write_len = min(remaining, seg_length - info->wi_seg_off);
|
||||
ctxt = svc_rdma_get_rw_ctxt(rdma,
|
||||
(write_len >> PAGE_SHIFT) + 2);
|
||||
if (!ctxt)
|
||||
goto out_noctx;
|
||||
|
||||
constructor(info, write_len, ctxt);
|
||||
ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp,
|
||||
rdma->sc_port_num, ctxt->rw_sg_table.sgl,
|
||||
ctxt->rw_nents, 0, seg_offset,
|
||||
seg_handle, DMA_TO_DEVICE);
|
||||
if (ret < 0)
|
||||
goto out_initerr;
|
||||
|
||||
list_add(&ctxt->rw_list, &cc->cc_rwctxts);
|
||||
cc->cc_sqecount += ret;
|
||||
if (write_len == seg_length - info->wi_seg_off) {
|
||||
seg += 4;
|
||||
info->wi_seg_no++;
|
||||
info->wi_seg_off = 0;
|
||||
} else {
|
||||
info->wi_seg_off += write_len;
|
||||
}
|
||||
remaining -= write_len;
|
||||
} while (remaining);
|
||||
|
||||
return 0;
|
||||
|
||||
out_overflow:
|
||||
dprintk("svcrdma: inadequate space in Write chunk (%u)\n",
|
||||
info->wi_nsegs);
|
||||
return -E2BIG;
|
||||
|
||||
out_noctx:
|
||||
dprintk("svcrdma: no R/W ctxs available\n");
|
||||
return -ENOMEM;
|
||||
|
||||
out_initerr:
|
||||
svc_rdma_put_rw_ctxt(rdma, ctxt);
|
||||
pr_err("svcrdma: failed to map pagelist (%d)\n", ret);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
/* Send one of an xdr_buf's kvecs by itself. To send a Reply
|
||||
* chunk, the whole RPC Reply is written back to the client.
|
||||
* This function writes either the head or tail of the xdr_buf
|
||||
* containing the Reply.
|
||||
*/
|
||||
static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
|
||||
struct kvec *vec)
|
||||
{
|
||||
info->wi_base = vec->iov_base;
|
||||
return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
|
||||
vec->iov_len);
|
||||
}
|
||||
|
||||
/* Send an xdr_buf's page list by itself. A Write chunk is
|
||||
* just the page list. A Reply chunk is the head, page list,
|
||||
* and tail. This function is shared between the two types
|
||||
* of chunk.
|
||||
*/
|
||||
static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
|
||||
struct xdr_buf *xdr)
|
||||
{
|
||||
info->wi_xdr = xdr;
|
||||
info->wi_next_off = 0;
|
||||
return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
|
||||
xdr->page_len);
|
||||
}
|
||||
|
||||
/**
|
||||
* svc_rdma_send_write_chunk - Write all segments in a Write chunk
|
||||
* @rdma: controlling RDMA transport
|
||||
* @wr_ch: Write chunk provided by client
|
||||
* @xdr: xdr_buf containing the data payload
|
||||
*
|
||||
* Returns a non-negative number of bytes the chunk consumed, or
|
||||
* %-E2BIG if the payload was larger than the Write chunk,
|
||||
* %-ENOMEM if rdma_rw context pool was exhausted,
|
||||
* %-ENOTCONN if posting failed (connection is lost),
|
||||
* %-EIO if rdma_rw initialization failed (DMA mapping, etc).
|
||||
*/
|
||||
int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
|
||||
struct xdr_buf *xdr)
|
||||
{
|
||||
struct svc_rdma_write_info *info;
|
||||
int ret;
|
||||
|
||||
if (!xdr->page_len)
|
||||
return 0;
|
||||
|
||||
info = svc_rdma_write_info_alloc(rdma, wr_ch);
|
||||
if (!info)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = svc_rdma_send_xdr_pagelist(info, xdr);
|
||||
if (ret < 0)
|
||||
goto out_err;
|
||||
|
||||
ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
|
||||
if (ret < 0)
|
||||
goto out_err;
|
||||
return xdr->page_len;
|
||||
|
||||
out_err:
|
||||
svc_rdma_write_info_free(info);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
|
||||
* @rdma: controlling RDMA transport
|
||||
* @rp_ch: Reply chunk provided by client
|
||||
* @writelist: true if client provided a Write list
|
||||
* @xdr: xdr_buf containing an RPC Reply
|
||||
*
|
||||
* Returns a non-negative number of bytes the chunk consumed, or
|
||||
* %-E2BIG if the payload was larger than the Reply chunk,
|
||||
* %-ENOMEM if rdma_rw context pool was exhausted,
|
||||
* %-ENOTCONN if posting failed (connection is lost),
|
||||
* %-EIO if rdma_rw initialization failed (DMA mapping, etc).
|
||||
*/
|
||||
int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
|
||||
bool writelist, struct xdr_buf *xdr)
|
||||
{
|
||||
struct svc_rdma_write_info *info;
|
||||
int consumed, ret;
|
||||
|
||||
info = svc_rdma_write_info_alloc(rdma, rp_ch);
|
||||
if (!info)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
|
||||
if (ret < 0)
|
||||
goto out_err;
|
||||
consumed = xdr->head[0].iov_len;
|
||||
|
||||
/* Send the page list in the Reply chunk only if the
|
||||
* client did not provide Write chunks.
|
||||
*/
|
||||
if (!writelist && xdr->page_len) {
|
||||
ret = svc_rdma_send_xdr_pagelist(info, xdr);
|
||||
if (ret < 0)
|
||||
goto out_err;
|
||||
consumed += xdr->page_len;
|
||||
}
|
||||
|
||||
if (xdr->tail[0].iov_len) {
|
||||
ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
|
||||
if (ret < 0)
|
||||
goto out_err;
|
||||
consumed += xdr->tail[0].iov_len;
|
||||
}
|
||||
|
||||
ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
|
||||
if (ret < 0)
|
||||
goto out_err;
|
||||
return consumed;
|
||||
|
||||
out_err:
|
||||
svc_rdma_write_info_free(info);
|
||||
return ret;
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -272,85 +272,6 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
|
|||
}
|
||||
}
|
||||
|
||||
static struct svc_rdma_req_map *alloc_req_map(gfp_t flags)
|
||||
{
|
||||
struct svc_rdma_req_map *map;
|
||||
|
||||
map = kmalloc(sizeof(*map), flags);
|
||||
if (map)
|
||||
INIT_LIST_HEAD(&map->free);
|
||||
return map;
|
||||
}
|
||||
|
||||
static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
/* One for each receive buffer on this connection. */
|
||||
i = xprt->sc_max_requests;
|
||||
|
||||
while (i--) {
|
||||
struct svc_rdma_req_map *map;
|
||||
|
||||
map = alloc_req_map(GFP_KERNEL);
|
||||
if (!map) {
|
||||
dprintk("svcrdma: No memory for request map\n");
|
||||
return false;
|
||||
}
|
||||
list_add(&map->free, &xprt->sc_maps);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt)
|
||||
{
|
||||
struct svc_rdma_req_map *map = NULL;
|
||||
|
||||
spin_lock(&xprt->sc_map_lock);
|
||||
if (list_empty(&xprt->sc_maps))
|
||||
goto out_empty;
|
||||
|
||||
map = list_first_entry(&xprt->sc_maps,
|
||||
struct svc_rdma_req_map, free);
|
||||
list_del_init(&map->free);
|
||||
spin_unlock(&xprt->sc_map_lock);
|
||||
|
||||
out:
|
||||
map->count = 0;
|
||||
return map;
|
||||
|
||||
out_empty:
|
||||
spin_unlock(&xprt->sc_map_lock);
|
||||
|
||||
/* Pre-allocation amount was incorrect */
|
||||
map = alloc_req_map(GFP_NOIO);
|
||||
if (map)
|
||||
goto out;
|
||||
|
||||
WARN_ONCE(1, "svcrdma: empty request map list?\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void svc_rdma_put_req_map(struct svcxprt_rdma *xprt,
|
||||
struct svc_rdma_req_map *map)
|
||||
{
|
||||
spin_lock(&xprt->sc_map_lock);
|
||||
list_add(&map->free, &xprt->sc_maps);
|
||||
spin_unlock(&xprt->sc_map_lock);
|
||||
}
|
||||
|
||||
static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt)
|
||||
{
|
||||
while (!list_empty(&xprt->sc_maps)) {
|
||||
struct svc_rdma_req_map *map;
|
||||
|
||||
map = list_first_entry(&xprt->sc_maps,
|
||||
struct svc_rdma_req_map, free);
|
||||
list_del(&map->free);
|
||||
kfree(map);
|
||||
}
|
||||
}
|
||||
|
||||
/* QP event handler */
|
||||
static void qp_event_handler(struct ib_event *event, void *context)
|
||||
{
|
||||
|
@ -473,24 +394,6 @@ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
|
|||
svc_rdma_put_context(ctxt, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* svc_rdma_wc_write - Invoked by RDMA provider for each polled Write WC
|
||||
* @cq: completion queue
|
||||
* @wc: completed WR
|
||||
*
|
||||
*/
|
||||
void svc_rdma_wc_write(struct ib_cq *cq, struct ib_wc *wc)
|
||||
{
|
||||
struct ib_cqe *cqe = wc->wr_cqe;
|
||||
struct svc_rdma_op_ctxt *ctxt;
|
||||
|
||||
svc_rdma_send_wc_common_put(cq, wc, "write");
|
||||
|
||||
ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
|
||||
svc_rdma_unmap_dma(ctxt);
|
||||
svc_rdma_put_context(ctxt, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC
|
||||
* @cq: completion queue
|
||||
|
@ -561,14 +464,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
|
|||
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
|
||||
INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
|
||||
INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
|
||||
INIT_LIST_HEAD(&cma_xprt->sc_maps);
|
||||
INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
|
||||
init_waitqueue_head(&cma_xprt->sc_send_wait);
|
||||
|
||||
spin_lock_init(&cma_xprt->sc_lock);
|
||||
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
|
||||
spin_lock_init(&cma_xprt->sc_frmr_q_lock);
|
||||
spin_lock_init(&cma_xprt->sc_ctxt_lock);
|
||||
spin_lock_init(&cma_xprt->sc_map_lock);
|
||||
spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
|
||||
|
||||
/*
|
||||
* Note that this implies that the underlying transport support
|
||||
|
@ -999,6 +902,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
|
|||
newxprt, newxprt->sc_cm_id);
|
||||
|
||||
dev = newxprt->sc_cm_id->device;
|
||||
newxprt->sc_port_num = newxprt->sc_cm_id->port_num;
|
||||
|
||||
/* Qualify the transport resource defaults with the
|
||||
* capabilities of this particular device */
|
||||
|
@ -1014,13 +918,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
|
|||
svcrdma_max_bc_requests);
|
||||
newxprt->sc_rq_depth = newxprt->sc_max_requests +
|
||||
newxprt->sc_max_bc_requests;
|
||||
newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
|
||||
newxprt->sc_sq_depth = newxprt->sc_rq_depth;
|
||||
atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
|
||||
|
||||
if (!svc_rdma_prealloc_ctxts(newxprt))
|
||||
goto errout;
|
||||
if (!svc_rdma_prealloc_maps(newxprt))
|
||||
goto errout;
|
||||
|
||||
/*
|
||||
* Limit ORD based on client limit, local device limit, and
|
||||
|
@ -1050,6 +952,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
|
|||
memset(&qp_attr, 0, sizeof qp_attr);
|
||||
qp_attr.event_handler = qp_event_handler;
|
||||
qp_attr.qp_context = &newxprt->sc_xprt;
|
||||
qp_attr.port_num = newxprt->sc_cm_id->port_num;
|
||||
qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests;
|
||||
qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
|
||||
qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
|
||||
qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
|
||||
|
@ -1248,8 +1152,8 @@ static void __svc_rdma_free(struct work_struct *work)
|
|||
}
|
||||
|
||||
rdma_dealloc_frmr_q(rdma);
|
||||
svc_rdma_destroy_rw_ctxts(rdma);
|
||||
svc_rdma_destroy_ctxts(rdma);
|
||||
svc_rdma_destroy_maps(rdma);
|
||||
|
||||
/* Destroy the QP if present (not a listener) */
|
||||
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
|
||||
|
|
Loading…
Reference in New Issue