2007-09-11 01:50:12 +08:00
|
|
|
/*
|
2007-09-11 01:50:42 +08:00
|
|
|
* Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
|
|
|
|
*
|
|
|
|
* This software is available to you under a choice of one of two
|
|
|
|
* licenses. You may choose to be licensed under the terms of the GNU
|
|
|
|
* General Public License (GPL) Version 2, available from the file
|
|
|
|
* COPYING in the main directory of this source tree, or the BSD-type
|
|
|
|
* license below:
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
*
|
|
|
|
* Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
*
|
|
|
|
* Redistributions in binary form must reproduce the above
|
|
|
|
* copyright notice, this list of conditions and the following
|
|
|
|
* disclaimer in the documentation and/or other materials provided
|
|
|
|
* with the distribution.
|
|
|
|
*
|
|
|
|
* Neither the name of the Network Appliance, Inc. nor the names of
|
|
|
|
* its contributors may be used to endorse or promote products
|
|
|
|
* derived from this software without specific prior written
|
|
|
|
* permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
|
|
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
|
|
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* rpc_rdma.c
|
|
|
|
*
|
|
|
|
* This file contains the guts of the RPC RDMA protocol, and
|
|
|
|
* does marshaling/unmarshaling, etc. It is also where interfacing
|
|
|
|
* to the Linux RPC framework lives.
|
2007-09-11 01:50:12 +08:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include "xprt_rdma.h"
|
|
|
|
|
2007-09-11 01:50:42 +08:00
|
|
|
#include <linux/highmem.h>
|
|
|
|
|
2014-11-18 05:58:04 +08:00
|
|
|
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
|
2007-09-11 01:50:42 +08:00
|
|
|
# define RPCDBG_FACILITY RPCDBG_TRANS
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Human-readable names for chunk encodings, presumably indexed by
 * enum rpcrdma_chunktype — used in debugging output. Keep order in
 * sync with that enum (TODO: confirm against xprt_rdma.h).
 */
static const char transfertypes[][12] = {
	"inline",	/* no chunks */
	"read list",	/* some argument via rdma read */
	"*read list",	/* entire request via rdma read */
	"write list",	/* some result via rdma write */
	"reply chunk"	/* entire reply via rdma write */
};
|
2016-05-03 02:41:05 +08:00
|
|
|
|
|
|
|
/* Returns size of largest RPC-over-RDMA header in a Call message
|
|
|
|
*
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
* The largest Call header contains a full-size Read list and a
|
|
|
|
* minimal Reply chunk.
|
2016-05-03 02:41:05 +08:00
|
|
|
*/
|
|
|
|
static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
|
|
|
|
{
|
|
|
|
unsigned int size;
|
|
|
|
|
|
|
|
/* Fixed header fields and list discriminators */
|
|
|
|
size = RPCRDMA_HDRLEN_MIN;
|
|
|
|
|
|
|
|
/* Maximum Read list size */
|
|
|
|
maxsegs += 2; /* segment for head and tail buffers */
|
|
|
|
size = maxsegs * sizeof(struct rpcrdma_read_chunk);
|
|
|
|
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
/* Minimal Read chunk size */
|
|
|
|
size += sizeof(__be32); /* segment count */
|
|
|
|
size += sizeof(struct rpcrdma_segment);
|
|
|
|
size += sizeof(__be32); /* list discriminator */
|
|
|
|
|
2016-05-03 02:41:05 +08:00
|
|
|
dprintk("RPC: %s: max call header size = %u\n",
|
|
|
|
__func__, size);
|
|
|
|
return size;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Returns size of largest RPC-over-RDMA header in a Reply message
|
|
|
|
*
|
|
|
|
* There is only one Write list or one Reply chunk per Reply
|
|
|
|
* message. The larger list is the Write list.
|
|
|
|
*/
|
|
|
|
static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
|
|
|
|
{
|
|
|
|
unsigned int size;
|
|
|
|
|
|
|
|
/* Fixed header fields and list discriminators */
|
|
|
|
size = RPCRDMA_HDRLEN_MIN;
|
|
|
|
|
|
|
|
/* Maximum Write list size */
|
|
|
|
maxsegs += 2; /* segment for head and tail buffers */
|
|
|
|
size = sizeof(__be32); /* segment count */
|
|
|
|
size += maxsegs * sizeof(struct rpcrdma_segment);
|
|
|
|
size += sizeof(__be32); /* list discriminator */
|
|
|
|
|
|
|
|
dprintk("RPC: %s: max reply header size = %u\n",
|
|
|
|
__func__, size);
|
|
|
|
return size;
|
|
|
|
}
|
|
|
|
|
2016-09-15 22:57:07 +08:00
|
|
|
void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
|
2016-05-03 02:41:05 +08:00
|
|
|
{
|
2016-09-15 22:57:07 +08:00
|
|
|
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
|
|
|
|
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
|
|
|
|
unsigned int maxsegs = ia->ri_max_segs;
|
|
|
|
|
2016-05-03 02:41:05 +08:00
|
|
|
ia->ri_max_inline_write = cdata->inline_wsize -
|
|
|
|
rpcrdma_max_call_header_size(maxsegs);
|
|
|
|
ia->ri_max_inline_read = cdata->inline_rsize -
|
|
|
|
rpcrdma_max_reply_header_size(maxsegs);
|
|
|
|
}
|
2007-09-11 01:50:42 +08:00
|
|
|
|
2015-08-04 01:03:49 +08:00
|
|
|
/* The client can send a request inline as long as the RPCRDMA header
|
|
|
|
* plus the RPC call fit under the transport's inline limit. If the
|
|
|
|
* combined call message size exceeds that limit, the client must use
|
|
|
|
* the read chunk list for this operation.
|
|
|
|
*/
|
2016-05-03 02:41:05 +08:00
|
|
|
static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
|
|
|
|
struct rpc_rqst *rqst)
|
2015-08-04 01:03:49 +08:00
|
|
|
{
|
2016-05-03 02:41:05 +08:00
|
|
|
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
|
2015-08-04 01:03:49 +08:00
|
|
|
|
2016-05-03 02:41:05 +08:00
|
|
|
return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
|
2015-08-04 01:03:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* The client can't know how large the actual reply will be. Thus it
|
|
|
|
* plans for the largest possible reply for that particular ULP
|
|
|
|
* operation. If the maximum combined reply message size exceeds that
|
|
|
|
* limit, the client must provide a write list or a reply chunk for
|
|
|
|
* this request.
|
|
|
|
*/
|
2016-05-03 02:41:05 +08:00
|
|
|
static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
|
|
|
|
struct rpc_rqst *rqst)
|
2015-08-04 01:03:49 +08:00
|
|
|
{
|
2016-05-03 02:41:05 +08:00
|
|
|
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
|
2015-08-04 01:03:49 +08:00
|
|
|
|
2016-05-03 02:41:05 +08:00
|
|
|
return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
|
2015-08-04 01:03:49 +08:00
|
|
|
}
|
|
|
|
|
xprtrdma: Segment head and tail XDR buffers on page boundaries
A single memory allocation is used for the pair of buffers wherein
the RPC client builds an RPC call message and decodes its matching
reply. These buffers are sized based on the maximum possible size
of the RPC call and reply messages for the operation in progress.
This means that as the call buffer increases in size, the start of
the reply buffer is pushed farther into the memory allocation.
RPC requests are growing in size. It used to be that both the call
and reply buffers fit inside a single page.
But these days, thanks to NFSv4 (and especially security labels in
NFSv4.2) the maximum call and reply sizes are large. NFSv4.0 OPEN,
for example, now requires a 6KB allocation for a pair of call and
reply buffers, and NFSv4 LOOKUP is not far behind.
As the maximum size of a call increases, the reply buffer is pushed
far enough into the buffer's memory allocation that a page boundary
can appear in the middle of it.
When the maximum possible reply size is larger than the client's
RDMA receive buffers (currently 1KB), the client has to register a
Reply chunk for the server to RDMA Write the reply into.
The logic in rpcrdma_convert_iovs() assumes that xdr_buf head and
tail buffers would always be contained on a single page. It supplies
just one segment for the head and one for the tail.
FMR, for example, registers up to a page boundary (only a portion of
the reply buffer in the OPEN case above). But without additional
segments, it doesn't register the rest of the buffer.
When the server tries to write the OPEN reply, the RDMA Write fails
with a remote access error since the client registered only part of
the Reply chunk.
rpcrdma_convert_iovs() must split the XDR buffer into multiple
segments, each of which are guaranteed not to contain a page
boundary. That way fmr_op_map is given the proper number of segments
to register the whole reply buffer.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Devesh Sharma <devesh.sharma@broadcom.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-03-05 00:27:52 +08:00
|
|
|
/* Split "vec" on page boundaries into segments. FMR registers pages,
|
|
|
|
* not a byte range. Other modes coalesce these segments into a single
|
|
|
|
* MR when they can.
|
|
|
|
*/
|
|
|
|
static int
|
2016-06-30 01:54:25 +08:00
|
|
|
rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
|
xprtrdma: Segment head and tail XDR buffers on page boundaries
A single memory allocation is used for the pair of buffers wherein
the RPC client builds an RPC call message and decodes its matching
reply. These buffers are sized based on the maximum possible size
of the RPC call and reply messages for the operation in progress.
This means that as the call buffer increases in size, the start of
the reply buffer is pushed farther into the memory allocation.
RPC requests are growing in size. It used to be that both the call
and reply buffers fit inside a single page.
But these days, thanks to NFSv4 (and especially security labels in
NFSv4.2) the maximum call and reply sizes are large. NFSv4.0 OPEN,
for example, now requires a 6KB allocation for a pair of call and
reply buffers, and NFSv4 LOOKUP is not far behind.
As the maximum size of a call increases, the reply buffer is pushed
far enough into the buffer's memory allocation that a page boundary
can appear in the middle of it.
When the maximum possible reply size is larger than the client's
RDMA receive buffers (currently 1KB), the client has to register a
Reply chunk for the server to RDMA Write the reply into.
The logic in rpcrdma_convert_iovs() assumes that xdr_buf head and
tail buffers would always be contained on a single page. It supplies
just one segment for the head and one for the tail.
FMR, for example, registers up to a page boundary (only a portion of
the reply buffer in the OPEN case above). But without additional
segments, it doesn't register the rest of the buffer.
When the server tries to write the OPEN reply, the RDMA Write fails
with a remote access error since the client registered only part of
the Reply chunk.
rpcrdma_convert_iovs() must split the XDR buffer into multiple
segments, each of which are guaranteed not to contain a page
boundary. That way fmr_op_map is given the proper number of segments
to register the whole reply buffer.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Devesh Sharma <devesh.sharma@broadcom.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-03-05 00:27:52 +08:00
|
|
|
{
|
|
|
|
size_t page_offset;
|
|
|
|
u32 remaining;
|
|
|
|
char *base;
|
|
|
|
|
|
|
|
base = vec->iov_base;
|
|
|
|
page_offset = offset_in_page(base);
|
|
|
|
remaining = vec->iov_len;
|
2016-06-30 01:54:25 +08:00
|
|
|
while (remaining && n < RPCRDMA_MAX_SEGS) {
|
xprtrdma: Segment head and tail XDR buffers on page boundaries
A single memory allocation is used for the pair of buffers wherein
the RPC client builds an RPC call message and decodes its matching
reply. These buffers are sized based on the maximum possible size
of the RPC call and reply messages for the operation in progress.
This means that as the call buffer increases in size, the start of
the reply buffer is pushed farther into the memory allocation.
RPC requests are growing in size. It used to be that both the call
and reply buffers fit inside a single page.
But these days, thanks to NFSv4 (and especially security labels in
NFSv4.2) the maximum call and reply sizes are large. NFSv4.0 OPEN,
for example, now requires a 6KB allocation for a pair of call and
reply buffers, and NFSv4 LOOKUP is not far behind.
As the maximum size of a call increases, the reply buffer is pushed
far enough into the buffer's memory allocation that a page boundary
can appear in the middle of it.
When the maximum possible reply size is larger than the client's
RDMA receive buffers (currently 1KB), the client has to register a
Reply chunk for the server to RDMA Write the reply into.
The logic in rpcrdma_convert_iovs() assumes that xdr_buf head and
tail buffers would always be contained on a single page. It supplies
just one segment for the head and one for the tail.
FMR, for example, registers up to a page boundary (only a portion of
the reply buffer in the OPEN case above). But without additional
segments, it doesn't register the rest of the buffer.
When the server tries to write the OPEN reply, the RDMA Write fails
with a remote access error since the client registered only part of
the Reply chunk.
rpcrdma_convert_iovs() must split the XDR buffer into multiple
segments, each of which are guaranteed not to contain a page
boundary. That way fmr_op_map is given the proper number of segments
to register the whole reply buffer.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Devesh Sharma <devesh.sharma@broadcom.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-03-05 00:27:52 +08:00
|
|
|
seg[n].mr_page = NULL;
|
|
|
|
seg[n].mr_offset = base;
|
|
|
|
seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
|
|
|
|
remaining -= seg[n].mr_len;
|
|
|
|
base += seg[n].mr_len;
|
|
|
|
++n;
|
|
|
|
page_offset = 0;
|
|
|
|
}
|
|
|
|
return n;
|
|
|
|
}
|
|
|
|
|
2007-09-11 01:50:42 +08:00
|
|
|
/*
 * Chunk assembly from upper layer xdr_buf.
 *
 * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
 * elements. Segments are then coalesced when registered, if possible
 * within the selected memreg mode.
 *
 * @xdrbuf: the XDR buffer (head kvec, page list, tail kvec) to convert
 * @pos: XDR position of the data; 0 means the head kvec is included
 * @type: which chunk list this conversion is for (read/write/reply)
 * @seg: output array of at most RPCRDMA_MAX_SEGS segments
 * @reminv_expected: when encoding a Write list, skip the tail so the
 *	upper layer's extra tail space can round out the chunk
 *	(NOTE(review): semantics inferred from the comment below —
 *	confirm against callers)
 *
 * Returns positive number of segments converted, or a negative errno
 * (-EAGAIN on page allocation failure, -EIO on segment overflow).
 */

static int
rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg,
	bool reminv_expected)
{
	int len, n, p, page_base;
	struct page **ppages;

	n = 0;
	/* pos == 0 means the RPC header itself travels in the chunk,
	 * so the head kvec must be converted too. */
	if (pos == 0) {
		n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n);
		if (n == RPCRDMA_MAX_SEGS)
			goto out_overflow;
	}

	/* Walk the page list, one segment per (partial) page. */
	len = xdrbuf->page_len;
	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
	page_base = xdrbuf->page_base & ~PAGE_MASK;
	p = 0;
	while (len && n < RPCRDMA_MAX_SEGS) {
		if (!ppages[p]) {
			/* alloc the pagelist for receiving buffer */
			ppages[p] = alloc_page(GFP_ATOMIC);
			if (!ppages[p])
				return -EAGAIN;
		}
		seg[n].mr_page = ppages[p];
		seg[n].mr_offset = (void *)(unsigned long) page_base;
		seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
		/* Defensive check: a single segment must never span
		 * more than one page. */
		if (seg[n].mr_len > PAGE_SIZE)
			goto out_overflow;
		len -= seg[n].mr_len;
		++n;
		++p;
		page_base = 0;	/* page offset only applies to first page */
	}

	/* Message overflows the seg array */
	if (len && n == RPCRDMA_MAX_SEGS)
		goto out_overflow;

	/* When encoding the read list, the tail is always sent inline */
	if (type == rpcrdma_readch)
		return n;

	/* When encoding the Write list, some servers need to see an extra
	 * segment for odd-length Write chunks. The upper layer provides
	 * space in the tail iovec for this purpose.
	 */
	if (type == rpcrdma_writech && reminv_expected)
		return n;

	if (xdrbuf->tail[0].iov_len) {
		/* the rpcrdma protocol allows us to omit any trailing
		 * xdr pad bytes, saving the server an RDMA operation. */
		if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
			return n;
		n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
		if (n == RPCRDMA_MAX_SEGS)
			goto out_overflow;
	}

	return n;

out_overflow:
	pr_err("rpcrdma: segment array overflow\n");
	return -EIO;
}
|
|
|
|
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
/* XDR-encode one RDMA segment — the HLOO quad (Handle32, Length32,
 * Offset64) — from a registered memory window at @iptr.
 *
 * Returns a pointer to the XDR word just past the encoded segment.
 */
static inline __be32 *
xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
{
	*iptr++ = cpu_to_be32(mw->mw_handle);	/* Handle32 */
	*iptr++ = cpu_to_be32(mw->mw_length);	/* Length32 */
	return xdr_encode_hyper(iptr, mw->mw_offset);	/* Offset64 */
}
|
|
|
|
|
|
|
|
/* XDR-encode the Read list. Supports encoding a list of read
|
|
|
|
* segments that belong to a single read chunk.
|
|
|
|
*
|
|
|
|
* Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
|
|
|
|
*
|
|
|
|
* Read chunklist (a linked list):
|
|
|
|
* N elements, position P (same P for all chunks of same arg!):
|
|
|
|
* 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
|
|
|
|
*
|
|
|
|
* Returns a pointer to the XDR word in the RDMA header following
|
|
|
|
* the end of the Read list, or an error pointer.
|
|
|
|
*/
|
|
|
|
static __be32 *
|
|
|
|
rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
|
|
|
|
struct rpcrdma_req *req, struct rpc_rqst *rqst,
|
|
|
|
__be32 *iptr, enum rpcrdma_chunktype rtype)
|
|
|
|
{
|
2016-06-30 01:54:25 +08:00
|
|
|
struct rpcrdma_mr_seg *seg;
|
2016-06-30 01:54:16 +08:00
|
|
|
struct rpcrdma_mw *mw;
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
unsigned int pos;
|
|
|
|
int n, nsegs;
|
|
|
|
|
|
|
|
if (rtype == rpcrdma_noch) {
|
|
|
|
*iptr++ = xdr_zero; /* item not present */
|
|
|
|
return iptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
pos = rqst->rq_snd_buf.head[0].iov_len;
|
|
|
|
if (rtype == rpcrdma_areadch)
|
|
|
|
pos = 0;
|
2016-06-30 01:54:25 +08:00
|
|
|
seg = req->rl_segments;
|
2016-09-15 22:57:16 +08:00
|
|
|
nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false);
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
if (nsegs < 0)
|
|
|
|
return ERR_PTR(nsegs);
|
|
|
|
|
|
|
|
do {
|
2016-06-30 01:54:16 +08:00
|
|
|
n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
|
|
|
|
false, &mw);
|
2016-06-30 01:53:52 +08:00
|
|
|
if (n < 0)
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
return ERR_PTR(n);
|
2016-06-30 01:54:16 +08:00
|
|
|
list_add(&mw->mw_list, &req->rl_registered);
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
|
|
|
|
*iptr++ = xdr_one; /* item present */
|
|
|
|
|
|
|
|
/* All read segments in this chunk
|
|
|
|
* have the same "position".
|
|
|
|
*/
|
|
|
|
*iptr++ = cpu_to_be32(pos);
|
2016-06-30 01:54:16 +08:00
|
|
|
iptr = xdr_encode_rdma_segment(iptr, mw);
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
|
2016-06-30 01:54:16 +08:00
|
|
|
dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n",
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
rqst->rq_task->tk_pid, __func__, pos,
|
2016-06-30 01:54:16 +08:00
|
|
|
mw->mw_length, (unsigned long long)mw->mw_offset,
|
|
|
|
mw->mw_handle, n < nsegs ? "more" : "last");
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
|
|
|
|
r_xprt->rx_stats.read_chunk_count++;
|
|
|
|
seg += n;
|
|
|
|
nsegs -= n;
|
|
|
|
} while (nsegs);
|
|
|
|
|
|
|
|
/* Finish Read list */
|
|
|
|
*iptr++ = xdr_zero; /* Next item not present */
|
|
|
|
return iptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* XDR-encode the Write list. Supports encoding a list containing
|
|
|
|
* one array of plain segments that belong to a single write chunk.
|
|
|
|
*
|
|
|
|
* Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
|
|
|
|
*
|
|
|
|
* Write chunklist (a list of (one) counted array):
|
|
|
|
* N elements:
|
|
|
|
* 1 - N - HLOO - HLOO - ... - HLOO - 0
|
|
|
|
*
|
|
|
|
* Returns a pointer to the XDR word in the RDMA header following
|
|
|
|
* the end of the Write list, or an error pointer.
|
|
|
|
*/
|
|
|
|
static __be32 *
|
|
|
|
rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
|
|
|
|
struct rpc_rqst *rqst, __be32 *iptr,
|
|
|
|
enum rpcrdma_chunktype wtype)
|
|
|
|
{
|
2016-06-30 01:54:25 +08:00
|
|
|
struct rpcrdma_mr_seg *seg;
|
2016-06-30 01:54:16 +08:00
|
|
|
struct rpcrdma_mw *mw;
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
int n, nsegs, nchunks;
|
|
|
|
__be32 *segcount;
|
|
|
|
|
|
|
|
if (wtype != rpcrdma_writech) {
|
|
|
|
*iptr++ = xdr_zero; /* no Write list present */
|
|
|
|
return iptr;
|
|
|
|
}
|
|
|
|
|
2016-06-30 01:54:25 +08:00
|
|
|
seg = req->rl_segments;
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
|
|
|
|
rqst->rq_rcv_buf.head[0].iov_len,
|
2016-09-15 22:57:16 +08:00
|
|
|
wtype, seg,
|
|
|
|
r_xprt->rx_ia.ri_reminv_expected);
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
if (nsegs < 0)
|
|
|
|
return ERR_PTR(nsegs);
|
|
|
|
|
|
|
|
*iptr++ = xdr_one; /* Write list present */
|
|
|
|
segcount = iptr++; /* save location of segment count */
|
|
|
|
|
|
|
|
nchunks = 0;
|
|
|
|
do {
|
2016-06-30 01:54:16 +08:00
|
|
|
n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
|
|
|
|
true, &mw);
|
2016-06-30 01:53:52 +08:00
|
|
|
if (n < 0)
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
return ERR_PTR(n);
|
2016-06-30 01:54:16 +08:00
|
|
|
list_add(&mw->mw_list, &req->rl_registered);
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
|
2016-06-30 01:54:16 +08:00
|
|
|
iptr = xdr_encode_rdma_segment(iptr, mw);
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
|
2016-06-30 01:54:16 +08:00
|
|
|
dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n",
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
rqst->rq_task->tk_pid, __func__,
|
2016-06-30 01:54:16 +08:00
|
|
|
mw->mw_length, (unsigned long long)mw->mw_offset,
|
|
|
|
mw->mw_handle, n < nsegs ? "more" : "last");
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
|
|
|
|
r_xprt->rx_stats.write_chunk_count++;
|
|
|
|
r_xprt->rx_stats.total_rdma_request += seg->mr_len;
|
|
|
|
nchunks++;
|
|
|
|
seg += n;
|
|
|
|
nsegs -= n;
|
|
|
|
} while (nsegs);
|
|
|
|
|
|
|
|
/* Update count of segments in this Write chunk */
|
|
|
|
*segcount = cpu_to_be32(nchunks);
|
|
|
|
|
|
|
|
/* Finish Write list */
|
|
|
|
*iptr++ = xdr_zero; /* Next item not present */
|
|
|
|
return iptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* XDR-encode the Reply chunk. Supports encoding an array of plain
|
|
|
|
* segments that belong to a single write (reply) chunk.
|
|
|
|
*
|
|
|
|
* Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
|
|
|
|
*
|
|
|
|
* Reply chunk (a counted array):
|
|
|
|
* N elements:
|
|
|
|
* 1 - N - HLOO - HLOO - ... - HLOO
|
|
|
|
*
|
|
|
|
* Returns a pointer to the XDR word in the RDMA header following
|
|
|
|
* the end of the Reply chunk, or an error pointer.
|
|
|
|
*/
|
|
|
|
static __be32 *
|
|
|
|
rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
|
|
|
|
struct rpcrdma_req *req, struct rpc_rqst *rqst,
|
|
|
|
__be32 *iptr, enum rpcrdma_chunktype wtype)
|
|
|
|
{
|
2016-06-30 01:54:25 +08:00
|
|
|
struct rpcrdma_mr_seg *seg;
|
2016-06-30 01:54:16 +08:00
|
|
|
struct rpcrdma_mw *mw;
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
int n, nsegs, nchunks;
|
|
|
|
__be32 *segcount;
|
|
|
|
|
|
|
|
if (wtype != rpcrdma_replych) {
|
|
|
|
*iptr++ = xdr_zero; /* no Reply chunk present */
|
|
|
|
return iptr;
|
|
|
|
}
|
|
|
|
|
2016-06-30 01:54:25 +08:00
|
|
|
seg = req->rl_segments;
|
2016-09-15 22:57:16 +08:00
|
|
|
nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
|
|
|
|
r_xprt->rx_ia.ri_reminv_expected);
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
if (nsegs < 0)
|
|
|
|
return ERR_PTR(nsegs);
|
|
|
|
|
|
|
|
*iptr++ = xdr_one; /* Reply chunk present */
|
|
|
|
segcount = iptr++; /* save location of segment count */
|
|
|
|
|
|
|
|
nchunks = 0;
|
|
|
|
do {
|
2016-06-30 01:54:16 +08:00
|
|
|
n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
|
|
|
|
true, &mw);
|
2016-06-30 01:53:52 +08:00
|
|
|
if (n < 0)
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
return ERR_PTR(n);
|
2016-06-30 01:54:16 +08:00
|
|
|
list_add(&mw->mw_list, &req->rl_registered);
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
|
2016-06-30 01:54:16 +08:00
|
|
|
iptr = xdr_encode_rdma_segment(iptr, mw);
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
|
2016-06-30 01:54:16 +08:00
|
|
|
dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
rqst->rq_task->tk_pid, __func__,
|
2016-06-30 01:54:16 +08:00
|
|
|
mw->mw_length, (unsigned long long)mw->mw_offset,
|
|
|
|
mw->mw_handle, n < nsegs ? "more" : "last");
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
|
|
|
|
r_xprt->rx_stats.reply_chunk_count++;
|
|
|
|
r_xprt->rx_stats.total_rdma_request += seg->mr_len;
|
|
|
|
nchunks++;
|
|
|
|
seg += n;
|
|
|
|
nsegs -= n;
|
|
|
|
} while (nsegs);
|
|
|
|
|
|
|
|
/* Update count of segments in the Reply chunk */
|
|
|
|
*segcount = cpu_to_be32(nchunks);
|
|
|
|
|
|
|
|
return iptr;
|
|
|
|
}
|
|
|
|
|
xprtrdma: Use gathered Send for large inline messages
An RPC Call message that is sent inline but that has a data payload
(ie, one or more items in rq_snd_buf's page list) must be "pulled
up:"
- call_allocate has to reserve enough RPC Call buffer space to
accommodate the data payload
- call_transmit has to memcopy the rq_snd_buf's page list and tail
into its head iovec before it is sent
As the inline threshold is increased beyond its current 1KB default,
however, this means data payloads of more than a few KB are copied
by the host CPU. For example, if the inline threshold is increased
just to 4KB, then NFS WRITE requests up to 4KB would involve a
memcpy of the NFS WRITE's payload data into the RPC Call buffer.
This is an undesirable amount of participation by the host CPU.
The inline threshold may be much larger than 4KB in the future,
after negotiation with a peer server.
Instead of copying the components of rq_snd_buf into its head iovec,
construct a gather list of these components, and send them all in
place. The same approach is already used in the Linux server's
RPC-over-RDMA reply path.
This mechanism also eliminates the need for rpcrdma_tail_pullup,
which is used to manage the XDR pad and trailing inline content when
a Read list is present.
This requires that the pages in rq_snd_buf's page list be DMA-mapped
during marshaling, and unmapped when a data-bearing RPC is
completed. This is slightly less efficient for very small I/O
payloads, but significantly more efficient as data payload size and
inline threshold increase past a kilobyte.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-09-15 22:57:24 +08:00
|
|
|
/* Prepare the RPC-over-RDMA header SGE.
|
2007-09-11 01:50:42 +08:00
|
|
|
*/
|
xprtrdma: Use gathered Send for large inline messages
An RPC Call message that is sent inline but that has a data payload
(ie, one or more items in rq_snd_buf's page list) must be "pulled
up:"
- call_allocate has to reserve enough RPC Call buffer space to
accommodate the data payload
- call_transmit has to memcopy the rq_snd_buf's page list and tail
into its head iovec before it is sent
As the inline threshold is increased beyond its current 1KB default,
however, this means data payloads of more than a few KB are copied
by the host CPU. For example, if the inline threshold is increased
just to 4KB, then NFS WRITE requests up to 4KB would involve a
memcpy of the NFS WRITE's payload data into the RPC Call buffer.
This is an undesirable amount of participation by the host CPU.
The inline threshold may be much larger than 4KB in the future,
after negotiation with a peer server.
Instead of copying the components of rq_snd_buf into its head iovec,
construct a gather list of these components, and send them all in
place. The same approach is already used in the Linux server's
RPC-over-RDMA reply path.
This mechanism also eliminates the need for rpcrdma_tail_pullup,
which is used to manage the XDR pad and trailing inline content when
a Read list is present.
This requires that the pages in rq_snd_buf's page list be DMA-mapped
during marshaling, and unmapped when a data-bearing RPC is
completed. This is slightly less efficient for very small I/O
payloads, but significantly more efficient as data payload size and
inline threshold increase past a kilobyte.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-09-15 22:57:24 +08:00
|
|
|
static bool
|
|
|
|
rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
|
|
|
|
u32 len)
|
2007-09-11 01:50:42 +08:00
|
|
|
{
|
xprtrdma: Use gathered Send for large inline messages
An RPC Call message that is sent inline but that has a data payload
(ie, one or more items in rq_snd_buf's page list) must be "pulled
up:"
- call_allocate has to reserve enough RPC Call buffer space to
accommodate the data payload
- call_transmit has to memcopy the rq_snd_buf's page list and tail
into its head iovec before it is sent
As the inline threshold is increased beyond its current 1KB default,
however, this means data payloads of more than a few KB are copied
by the host CPU. For example, if the inline threshold is increased
just to 4KB, then NFS WRITE requests up to 4KB would involve a
memcpy of the NFS WRITE's payload data into the RPC Call buffer.
This is an undesirable amount of participation by the host CPU.
The inline threshold may be much larger than 4KB in the future,
after negotiation with a peer server.
Instead of copying the components of rq_snd_buf into its head iovec,
construct a gather list of these components, and send them all in
place. The same approach is already used in the Linux server's
RPC-over-RDMA reply path.
This mechanism also eliminates the need for rpcrdma_tail_pullup,
which is used to manage the XDR pad and trailing inline content when
a Read list is present.
This requires that the pages in rq_snd_buf's page list be DMA-mapped
during marshaling, and unmapped when a data-bearing RPC is
completed. This is slightly less efficient for very small I/O
payloads, but significantly more efficient as data payload size and
inline threshold increase past a kilobyte.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-09-15 22:57:24 +08:00
|
|
|
struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
|
|
|
|
struct ib_sge *sge = &req->rl_send_sge[0];
|
|
|
|
|
|
|
|
if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) {
|
|
|
|
if (!__rpcrdma_dma_map_regbuf(ia, rb))
|
|
|
|
return false;
|
|
|
|
sge->addr = rdmab_addr(rb);
|
|
|
|
sge->lkey = rdmab_lkey(rb);
|
|
|
|
}
|
|
|
|
sge->length = len;
|
|
|
|
|
|
|
|
ib_dma_sync_single_for_device(ia->ri_device, sge->addr,
|
|
|
|
sge->length, DMA_TO_DEVICE);
|
|
|
|
req->rl_send_wr.num_sge++;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Prepare the Send SGEs. The head and tail iovec, and each entry
 * in the page list, gets its own SGE.
 *
 * Returns true if all SGEs were prepared and req->rl_send_wr.num_sge
 * was updated; false on DMA mapping failure or SGE overflow. Each
 * page successfully mapped here is counted in req->rl_mapped_sges
 * so it can be unmapped later (see rpcrdma_unmap_sges, which walks
 * the SGEs starting at rl_send_sge[2]).
 */
static bool
rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
			 struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
{
	unsigned int sge_no, page_base, len, remaining;
	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
	struct ib_device *device = ia->ri_device;
	struct ib_sge *sge = req->rl_send_sge;
	u32 lkey = ia->ri_pd->local_dma_lkey;
	struct page *page, **ppages;

	/* The head iovec is straightforward, as it is already
	 * DMA-mapped. Sync the content that has changed.
	 */
	if (!rpcrdma_dma_map_regbuf(ia, rb))
		return false;
	/* sge[0] carries the transport header (prepared elsewhere);
	 * message SGEs begin at index 1.
	 */
	sge_no = 1;
	sge[sge_no].addr = rdmab_addr(rb);
	sge[sge_no].length = xdr->head[0].iov_len;
	sge[sge_no].lkey = rdmab_lkey(rb);
	ib_dma_sync_single_for_device(device, sge[sge_no].addr,
				      sge[sge_no].length, DMA_TO_DEVICE);

	/* If there is a Read chunk, the page list is being handled
	 * via explicit RDMA, and thus is skipped here. However, the
	 * tail iovec may include an XDR pad for the page list, as
	 * well as additional content, and may not reside in the
	 * same page as the head iovec.
	 */
	if (rtype == rpcrdma_readch) {
		len = xdr->tail[0].iov_len;

		/* Do not include the tail if it is only an XDR pad */
		if (len < 4)
			goto out;

		page = virt_to_page(xdr->tail[0].iov_base);
		page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;

		/* If the content in the page list is an odd length,
		 * xdr_write_pages() has added a pad at the beginning
		 * of the tail iovec. Force the tail's non-pad content
		 * to land at the next XDR position in the Send message.
		 */
		page_base += len & 3;
		len -= len & 3;
		goto map_tail;
	}

	/* If there is a page list present, temporarily DMA map
	 * and prepare an SGE for each page to be sent.
	 */
	if (xdr->page_len) {
		ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
		page_base = xdr->page_base & ~PAGE_MASK;
		remaining = xdr->page_len;
		while (remaining) {
			sge_no++;
			/* Reserve room for the tail SGE that may still
			 * follow this loop (hence "- 2" rather than "- 1").
			 */
			if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
				goto out_mapping_overflow;

			len = min_t(u32, PAGE_SIZE - page_base, remaining);
			sge[sge_no].addr = ib_dma_map_page(device, *ppages,
							   page_base, len,
							   DMA_TO_DEVICE);
			if (ib_dma_mapping_error(device, sge[sge_no].addr))
				goto out_mapping_err;
			sge[sge_no].length = len;
			sge[sge_no].lkey = lkey;

			req->rl_mapped_sges++;
			ppages++;
			remaining -= len;
			/* Only the first page may start mid-page */
			page_base = 0;
		}
	}

	/* The tail iovec is not always constructed in the same
	 * page where the head iovec resides (see, for example,
	 * gss_wrap_req_priv). To neatly accommodate that case,
	 * DMA map it separately.
	 */
	if (xdr->tail[0].iov_len) {
		page = virt_to_page(xdr->tail[0].iov_base);
		page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
		len = xdr->tail[0].iov_len;

map_tail:
		sge_no++;
		sge[sge_no].addr = ib_dma_map_page(device, page,
						   page_base, len,
						   DMA_TO_DEVICE);
		if (ib_dma_mapping_error(device, sge[sge_no].addr))
			goto out_mapping_err;
		sge[sge_no].length = len;
		sge[sge_no].lkey = lkey;
		req->rl_mapped_sges++;
	}

out:
	req->rl_send_wr.num_sge = sge_no + 1;
	return true;

out_mapping_overflow:
	pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
	return false;

out_mapping_err:
	/* NOTE(review): pages mapped before the failure remain counted
	 * in rl_mapped_sges; presumably the caller's error path unmaps
	 * them — confirm against rpcrdma_marshal_req's out_unmap path.
	 */
	pr_err("rpcrdma: Send mapping error\n");
	return false;
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
rpcrdma_prepare_send_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
|
|
|
|
u32 hdrlen, struct xdr_buf *xdr,
|
|
|
|
enum rpcrdma_chunktype rtype)
|
|
|
|
{
|
|
|
|
req->rl_send_wr.num_sge = 0;
|
|
|
|
req->rl_mapped_sges = 0;
|
|
|
|
|
|
|
|
if (!rpcrdma_prepare_hdr_sge(ia, req, hdrlen))
|
|
|
|
goto out_map;
|
|
|
|
|
|
|
|
if (rtype != rpcrdma_areadch)
|
|
|
|
if (!rpcrdma_prepare_msg_sges(ia, req, xdr, rtype))
|
|
|
|
goto out_map;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
out_map:
|
|
|
|
pr_err("rpcrdma: failed to DMA map a Send buffer\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
|
|
|
|
{
|
|
|
|
struct ib_device *device = ia->ri_device;
|
|
|
|
struct ib_sge *sge;
|
|
|
|
int count;
|
|
|
|
|
|
|
|
sge = &req->rl_send_sge[2];
|
|
|
|
for (count = req->rl_mapped_sges; count--; sge++)
|
|
|
|
ib_dma_unmap_page(device, sge->addr, sge->length,
|
|
|
|
DMA_TO_DEVICE);
|
|
|
|
req->rl_mapped_sges = 0;
|
2007-09-11 01:50:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Marshal a request: the primary job of this routine is to choose
|
|
|
|
* the transfer modes. See comments below.
|
|
|
|
*
|
2014-05-28 22:35:14 +08:00
|
|
|
* Returns zero on success, otherwise a negative errno.
|
2007-09-11 01:50:42 +08:00
|
|
|
*/
|
|
|
|
|
|
|
|
int
|
|
|
|
rpcrdma_marshal_req(struct rpc_rqst *rqst)
|
|
|
|
{
|
2013-01-08 22:10:21 +08:00
|
|
|
struct rpc_xprt *xprt = rqst->rq_xprt;
|
2007-09-11 01:50:42 +08:00
|
|
|
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
|
|
|
|
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
|
2015-03-31 02:33:53 +08:00
|
|
|
enum rpcrdma_chunktype rtype, wtype;
|
2007-09-11 01:50:42 +08:00
|
|
|
struct rpcrdma_msg *headerp;
|
2016-06-30 01:55:06 +08:00
|
|
|
bool ddp_allowed;
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
ssize_t hdrlen;
|
|
|
|
size_t rpclen;
|
|
|
|
__be32 *iptr;
|
2007-09-11 01:50:42 +08:00
|
|
|
|
2015-10-25 05:27:59 +08:00
|
|
|
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
|
|
|
|
if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
|
|
|
|
return rpcrdma_bc_marshal_reply(rqst);
|
|
|
|
#endif
|
|
|
|
|
2015-01-22 00:04:16 +08:00
|
|
|
headerp = rdmab_to_msg(req->rl_rdmabuf);
|
2015-01-22 00:02:13 +08:00
|
|
|
/* don't byte-swap XID, it's already done in request */
|
2007-09-11 01:50:42 +08:00
|
|
|
headerp->rm_xid = rqst->rq_xid;
|
2015-01-22 00:02:13 +08:00
|
|
|
headerp->rm_vers = rpcrdma_version;
|
|
|
|
headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
|
|
|
|
headerp->rm_type = rdma_msg;
|
2007-09-11 01:50:42 +08:00
|
|
|
|
2016-06-30 01:55:06 +08:00
|
|
|
/* When the ULP employs a GSS flavor that guarantees integrity
|
|
|
|
* or privacy, direct data placement of individual data items
|
|
|
|
* is not allowed.
|
|
|
|
*/
|
|
|
|
ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
|
|
|
|
RPCAUTH_AUTH_DATATOUCH);
|
|
|
|
|
2007-09-11 01:50:42 +08:00
|
|
|
/*
|
|
|
|
* Chunks needed for results?
|
|
|
|
*
|
|
|
|
* o If the expected result is under the inline threshold, all ops
|
2015-08-04 01:04:08 +08:00
|
|
|
* return as inline.
|
2016-05-03 02:41:14 +08:00
|
|
|
* o Large read ops return data as write chunk(s), header as
|
|
|
|
* inline.
|
2007-09-11 01:50:42 +08:00
|
|
|
* o Large non-read ops return as a single reply chunk.
|
|
|
|
*/
|
2016-05-03 02:41:14 +08:00
|
|
|
if (rpcrdma_results_inline(r_xprt, rqst))
|
2015-08-04 01:03:58 +08:00
|
|
|
wtype = rpcrdma_noch;
|
2016-06-30 01:55:06 +08:00
|
|
|
else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ)
|
2016-05-03 02:41:14 +08:00
|
|
|
wtype = rpcrdma_writech;
|
2007-09-11 01:50:42 +08:00
|
|
|
else
|
2015-03-31 02:33:53 +08:00
|
|
|
wtype = rpcrdma_replych;
|
2007-09-11 01:50:42 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Chunks needed for arguments?
|
|
|
|
*
|
|
|
|
* o If the total request is under the inline threshold, all ops
|
|
|
|
* are sent as inline.
|
|
|
|
* o Large write ops transmit data as read chunk(s), header as
|
|
|
|
* inline.
|
2015-08-04 01:04:26 +08:00
|
|
|
* o Large non-write ops are sent with the entire message as a
|
|
|
|
* single read chunk (protocol 0-position special case).
|
2007-09-11 01:50:42 +08:00
|
|
|
*
|
2015-08-04 01:04:26 +08:00
|
|
|
* This assumes that the upper layer does not present a request
|
|
|
|
* that both has a data payload, and whose non-data arguments
|
|
|
|
* by themselves are larger than the inline threshold.
|
2007-09-11 01:50:42 +08:00
|
|
|
*/
|
2016-05-03 02:41:05 +08:00
|
|
|
if (rpcrdma_args_inline(r_xprt, rqst)) {
|
2015-03-31 02:33:53 +08:00
|
|
|
rtype = rpcrdma_noch;
|
xprtrdma: Use gathered Send for large inline messages
An RPC Call message that is sent inline but that has a data payload
(ie, one or more items in rq_snd_buf's page list) must be "pulled
up:"
- call_allocate has to reserve enough RPC Call buffer space to
accommodate the data payload
- call_transmit has to memcopy the rq_snd_buf's page list and tail
into its head iovec before it is sent
As the inline threshold is increased beyond its current 1KB default,
however, this means data payloads of more than a few KB are copied
by the host CPU. For example, if the inline threshold is increased
just to 4KB, then NFS WRITE requests up to 4KB would involve a
memcpy of the NFS WRITE's payload data into the RPC Call buffer.
This is an undesirable amount of participation by the host CPU.
The inline threshold may be much larger than 4KB in the future,
after negotiation with a peer server.
Instead of copying the components of rq_snd_buf into its head iovec,
construct a gather list of these components, and send them all in
place. The same approach is already used in the Linux server's
RPC-over-RDMA reply path.
This mechanism also eliminates the need for rpcrdma_tail_pullup,
which is used to manage the XDR pad and trailing inline content when
a Read list is present.
This requires that the pages in rq_snd_buf's page list be DMA-mapped
during marshaling, and unmapped when a data-bearing RPC is
completed. This is slightly less efficient for very small I/O
payloads, but significantly more efficient as data payload size and
inline threshold increase past a kilobyte.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-09-15 22:57:24 +08:00
|
|
|
rpclen = rqst->rq_snd_buf.len;
|
2016-06-30 01:55:06 +08:00
|
|
|
} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
|
2015-03-31 02:33:53 +08:00
|
|
|
rtype = rpcrdma_readch;
|
xprtrdma: Use gathered Send for large inline messages
An RPC Call message that is sent inline but that has a data payload
(ie, one or more items in rq_snd_buf's page list) must be "pulled
up:"
- call_allocate has to reserve enough RPC Call buffer space to
accommodate the data payload
- call_transmit has to memcopy the rq_snd_buf's page list and tail
into its head iovec before it is sent
As the inline threshold is increased beyond its current 1KB default,
however, this means data payloads of more than a few KB are copied
by the host CPU. For example, if the inline threshold is increased
just to 4KB, then NFS WRITE requests up to 4KB would involve a
memcpy of the NFS WRITE's payload data into the RPC Call buffer.
This is an undesirable amount of participation by the host CPU.
The inline threshold may be much larger than 4KB in the future,
after negotiation with a peer server.
Instead of copying the components of rq_snd_buf into its head iovec,
construct a gather list of these components, and send them all in
place. The same approach is already used in the Linux server's
RPC-over-RDMA reply path.
This mechanism also eliminates the need for rpcrdma_tail_pullup,
which is used to manage the XDR pad and trailing inline content when
a Read list is present.
This requires that the pages in rq_snd_buf's page list be DMA-mapped
during marshaling, and unmapped when a data-bearing RPC is
completed. This is slightly less efficient for very small I/O
payloads, but significantly more efficient as data payload size and
inline threshold increase past a kilobyte.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-09-15 22:57:24 +08:00
|
|
|
rpclen = rqst->rq_snd_buf.head[0].iov_len +
|
|
|
|
rqst->rq_snd_buf.tail[0].iov_len;
|
2015-08-04 01:04:26 +08:00
|
|
|
} else {
|
2015-08-04 01:04:45 +08:00
|
|
|
r_xprt->rx_stats.nomsg_call_count++;
|
2015-08-04 01:04:26 +08:00
|
|
|
headerp->rm_type = htonl(RDMA_NOMSG);
|
|
|
|
rtype = rpcrdma_areadch;
|
|
|
|
rpclen = 0;
|
|
|
|
}
|
2007-09-11 01:50:42 +08:00
|
|
|
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
/* This implementation supports the following combinations
|
|
|
|
* of chunk lists in one RPC-over-RDMA Call message:
|
|
|
|
*
|
|
|
|
* - Read list
|
|
|
|
* - Write list
|
|
|
|
* - Reply chunk
|
|
|
|
* - Read list + Reply chunk
|
|
|
|
*
|
|
|
|
* It might not yet support the following combinations:
|
|
|
|
*
|
|
|
|
* - Read list + Write list
|
|
|
|
*
|
|
|
|
* It does not support the following combinations:
|
|
|
|
*
|
|
|
|
* - Write list + Reply chunk
|
|
|
|
* - Read list + Write list + Reply chunk
|
|
|
|
*
|
|
|
|
* This implementation supports only a single chunk in each
|
|
|
|
* Read or Write list. Thus for example the client cannot
|
|
|
|
* send a Call message with a Position Zero Read chunk and a
|
|
|
|
* regular Read chunk at the same time.
|
2007-09-11 01:50:42 +08:00
|
|
|
*/
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
iptr = headerp->rm_body.rm_chunks;
|
|
|
|
iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
|
|
|
|
if (IS_ERR(iptr))
|
|
|
|
goto out_unmap;
|
|
|
|
iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
|
|
|
|
if (IS_ERR(iptr))
|
|
|
|
goto out_unmap;
|
|
|
|
iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
|
|
|
|
if (IS_ERR(iptr))
|
|
|
|
goto out_unmap;
|
|
|
|
hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
|
2007-09-11 01:50:42 +08:00
|
|
|
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
|
|
|
|
rqst->rq_task->tk_pid, __func__,
|
|
|
|
transfertypes[rtype], transfertypes[wtype],
|
|
|
|
hdrlen, rpclen);
|
2007-09-11 01:50:42 +08:00
|
|
|
|
xprtrdma: Use gathered Send for large inline messages
An RPC Call message that is sent inline but that has a data payload
(ie, one or more items in rq_snd_buf's page list) must be "pulled
up:"
- call_allocate has to reserve enough RPC Call buffer space to
accommodate the data payload
- call_transmit has to memcopy the rq_snd_buf's page list and tail
into its head iovec before it is sent
As the inline threshold is increased beyond its current 1KB default,
however, this means data payloads of more than a few KB are copied
by the host CPU. For example, if the inline threshold is increased
just to 4KB, then NFS WRITE requests up to 4KB would involve a
memcpy of the NFS WRITE's payload data into the RPC Call buffer.
This is an undesirable amount of participation by the host CPU.
The inline threshold may be much larger than 4KB in the future,
after negotiation with a peer server.
Instead of copying the components of rq_snd_buf into its head iovec,
construct a gather list of these components, and send them all in
place. The same approach is already used in the Linux server's
RPC-over-RDMA reply path.
This mechanism also eliminates the need for rpcrdma_tail_pullup,
which is used to manage the XDR pad and trailing inline content when
a Read list is present.
This requires that the pages in rq_snd_buf's page list be DMA-mapped
during marshaling, and unmapped when a data-bearing RPC is
completed. This is slightly less efficient for very small I/O
payloads, but significantly more efficient as data payload size and
inline threshold increase past a kilobyte.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-09-15 22:57:24 +08:00
|
|
|
if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
|
|
|
|
&rqst->rq_snd_buf, rtype)) {
|
|
|
|
iptr = ERR_PTR(-EIO);
|
|
|
|
goto out_unmap;
|
|
|
|
}
|
2007-09-11 01:50:42 +08:00
|
|
|
return 0;
|
2016-05-03 02:41:05 +08:00
|
|
|
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
out_unmap:
|
2016-05-03 02:42:46 +08:00
|
|
|
r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
|
xprtrdma: Allow Read list and Reply chunk simultaneously
rpcrdma_marshal_req() makes a simplifying assumption: that NFS
operations with large Call messages have small Reply messages, and
vice versa. Therefore with RPC-over-RDMA, only one chunk type is
ever needed for each Call/Reply pair, because one direction needs
chunks, the other direction will always fit inline.
In fact, this assumption is asserted in the code:
if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
}
But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p
perform data transformation on RPC messages before they are
transmitted, direct data placement techniques cannot be used, thus
RPC messages must be sent via a Long call in both directions.
All such calls are sent with a Position Zero Read chunk, and all
such replies are handled with a Reply chunk. Thus the client must
provide every Call/Reply pair with both a Read list and a Reply
chunk.
Without any special security in effect, NFSv4 WRITEs may now also
use the Read list and provide a Reply chunk. The marshal_req
logic was preventing that, meaning an NFSv4 WRITE with a large
payload that included a GETATTR result larger than the inline
threshold would fail.
The code that encodes each chunk list is now completely contained in
its own function. There is some code duplication, but the trade-off
is that the overall logic should be more clear.
Note that all three chunk lists now share the rl_segments array.
Some additional per-req accounting is necessary to track this
usage. For the same reasons that the above simplifying assumption
has held true for so long, I don't expect more array elements are
needed at this time.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-05-03 02:41:30 +08:00
|
|
|
return PTR_ERR(iptr);
|
2007-09-11 01:50:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Chase down a received write or reply chunklist to get length
|
|
|
|
* RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
|
|
|
|
*/
|
|
|
|
static int
|
2016-06-30 01:54:16 +08:00
|
|
|
rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
|
2007-09-11 01:50:42 +08:00
|
|
|
{
|
|
|
|
unsigned int i, total_len;
|
|
|
|
struct rpcrdma_write_chunk *cur_wchunk;
|
2015-01-22 00:04:25 +08:00
|
|
|
char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
|
2007-09-11 01:50:42 +08:00
|
|
|
|
2015-01-22 00:02:13 +08:00
|
|
|
i = be32_to_cpu(**iptrp);
|
2007-09-11 01:50:42 +08:00
|
|
|
cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
|
|
|
|
total_len = 0;
|
|
|
|
while (i--) {
|
|
|
|
struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
|
|
|
|
ifdebug(FACILITY) {
|
|
|
|
u64 off;
|
2007-10-29 12:37:58 +08:00
|
|
|
xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
|
2016-11-29 23:53:29 +08:00
|
|
|
dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n",
|
2007-09-11 01:50:42 +08:00
|
|
|
__func__,
|
2015-01-22 00:02:13 +08:00
|
|
|
be32_to_cpu(seg->rs_length),
|
2007-10-30 15:44:32 +08:00
|
|
|
(unsigned long long)off,
|
2015-01-22 00:02:13 +08:00
|
|
|
be32_to_cpu(seg->rs_handle));
|
2007-09-11 01:50:42 +08:00
|
|
|
}
|
2015-01-22 00:02:13 +08:00
|
|
|
total_len += be32_to_cpu(seg->rs_length);
|
2007-09-11 01:50:42 +08:00
|
|
|
++cur_wchunk;
|
|
|
|
}
|
|
|
|
/* check and adjust for properly terminated write chunk */
|
|
|
|
if (wrchunk) {
|
2007-10-29 12:37:58 +08:00
|
|
|
__be32 *w = (__be32 *) cur_wchunk;
|
2007-09-11 01:50:42 +08:00
|
|
|
if (*w++ != xdr_zero)
|
|
|
|
return -1;
|
|
|
|
cur_wchunk = (struct rpcrdma_write_chunk *) w;
|
|
|
|
}
|
2015-01-22 00:04:25 +08:00
|
|
|
if ((char *)cur_wchunk > base + rep->rr_len)
|
2007-09-11 01:50:42 +08:00
|
|
|
return -1;
|
|
|
|
|
2007-10-29 12:37:58 +08:00
|
|
|
*iptrp = (__be32 *) cur_wchunk;
|
2007-09-11 01:50:42 +08:00
|
|
|
return total_len;
|
|
|
|
}
|
|
|
|
|
xprtrdma: Do not update {head, tail}.iov_len in rpcrdma_inline_fixup()
While trying NFSv4.0/RDMA with sec=krb5p, I noticed small NFS READ
operations failed. After the client unwrapped the NFS READ reply
message, the NFS READ XDR decoder was not able to decode the reply.
The message was "Server cheating in reply", with the reported
number of received payload bytes being zero. Applications reported
a read(2) that returned -1/EIO.
The problem is rpcrdma_inline_fixup() sets the tail.iov_len to zero
when the incoming reply fits entirely in the head iovec. The zero
tail.iov_len confused xdr_buf_trim(), which then mangled the actual
reply data instead of simply removing the trailing GSS checksum.
As near as I can tell, RPC transports are not supposed to update the
head.iov_len, page_len, or tail.iov_len fields in the receive XDR
buffer when handling an incoming RPC reply message. These fields
contain the length of each component of the XDR buffer, and hence
the maximum number of bytes of reply data that can be stored in each
XDR buffer component. I've concluded this because:
- This is how xdr_partial_copy_from_skb() appears to behave
- rpcrdma_inline_fixup() already does not alter page_len
- call_decode() compares rq_private_buf and rq_rcv_buf and WARNs
if they are not exactly the same
Unfortunately, as soon as I tried the simple fix to just remove the
line that sets tail.iov_len to zero, I saw that the logic that
appends the implicit Write chunk pad inline depends on inline_fixup
setting tail.iov_len to zero.
To address this, re-organize the tail iovec handling logic to use
the same approach as with the head iovec: simply point tail.iov_base
to the correct bytes in the receive buffer.
While I remember all this, write down the conclusion in documenting
comments.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-06-30 01:54:41 +08:00
|
|
|
/**
|
|
|
|
* rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
|
|
|
|
* @rqst: controlling RPC request
|
|
|
|
* @srcp: points to RPC message payload in receive buffer
|
|
|
|
* @copy_len: remaining length of receive buffer content
|
|
|
|
* @pad: Write chunk pad bytes needed (zero for pure inline)
|
|
|
|
*
|
|
|
|
* The upper layer has set the maximum number of bytes it can
|
|
|
|
* receive in each component of rq_rcv_buf. These values are set in
|
|
|
|
* the head.iov_len, page_len, tail.iov_len, and buflen fields.
|
2016-06-30 01:54:49 +08:00
|
|
|
*
|
|
|
|
* Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
|
|
|
|
* many cases this function simply updates iov_base pointers in
|
|
|
|
* rq_rcv_buf to point directly to the received reply data, to
|
|
|
|
* avoid copying reply data.
|
2016-06-30 01:54:58 +08:00
|
|
|
*
|
|
|
|
* Returns the count of bytes which had to be memcopied.
|
2007-09-11 01:50:42 +08:00
|
|
|
*/
|
2016-06-30 01:54:58 +08:00
|
|
|
static unsigned long
|
2008-10-10 03:01:11 +08:00
|
|
|
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
|
2007-09-11 01:50:42 +08:00
|
|
|
{
|
2016-06-30 01:54:58 +08:00
|
|
|
unsigned long fixup_copy_count;
|
|
|
|
int i, npages, curlen;
|
2007-09-11 01:50:42 +08:00
|
|
|
char *destp;
|
2011-02-10 03:45:28 +08:00
|
|
|
struct page **ppages;
|
|
|
|
int page_base;
|
2007-09-11 01:50:42 +08:00
|
|
|
|
xprtrdma: Do not update {head, tail}.iov_len in rpcrdma_inline_fixup()
While trying NFSv4.0/RDMA with sec=krb5p, I noticed small NFS READ
operations failed. After the client unwrapped the NFS READ reply
message, the NFS READ XDR decoder was not able to decode the reply.
The message was "Server cheating in reply", with the reported
number of received payload bytes being zero. Applications reported
a read(2) that returned -1/EIO.
The problem is rpcrdma_inline_fixup() sets the tail.iov_len to zero
when the incoming reply fits entirely in the head iovec. The zero
tail.iov_len confused xdr_buf_trim(), which then mangled the actual
reply data instead of simply removing the trailing GSS checksum.
As near as I can tell, RPC transports are not supposed to update the
head.iov_len, page_len, or tail.iov_len fields in the receive XDR
buffer when handling an incoming RPC reply message. These fields
contain the length of each component of the XDR buffer, and hence
the maximum number of bytes of reply data that can be stored in each
XDR buffer component. I've concluded this because:
- This is how xdr_partial_copy_from_skb() appears to behave
- rpcrdma_inline_fixup() already does not alter page_len
- call_decode() compares rq_private_buf and rq_rcv_buf and WARNs
if they are not exactly the same
Unfortunately, as soon as I tried the simple fix to just remove the
line that sets tail.iov_len to zero, I saw that the logic that
appends the implicit Write chunk pad inline depends on inline_fixup
setting tail.iov_len to zero.
To address this, re-organize the tail iovec handling logic to use
the same approach as with the head iovec: simply point tail.iov_base
to the correct bytes in the receive buffer.
While I remember all this, write down the conclusion in documenting
comments.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-06-30 01:54:41 +08:00
|
|
|
/* The head iovec is redirected to the RPC reply message
|
|
|
|
* in the receive buffer, to avoid a memcopy.
|
|
|
|
*/
|
|
|
|
rqst->rq_rcv_buf.head[0].iov_base = srcp;
|
2016-06-30 01:54:49 +08:00
|
|
|
rqst->rq_private_buf.head[0].iov_base = srcp;
|
xprtrdma: Do not update {head, tail}.iov_len in rpcrdma_inline_fixup()
While trying NFSv4.0/RDMA with sec=krb5p, I noticed small NFS READ
operations failed. After the client unwrapped the NFS READ reply
message, the NFS READ XDR decoder was not able to decode the reply.
The message was "Server cheating in reply", with the reported
number of received payload bytes being zero. Applications reported
a read(2) that returned -1/EIO.
The problem is rpcrdma_inline_fixup() sets the tail.iov_len to zero
when the incoming reply fits entirely in the head iovec. The zero
tail.iov_len confused xdr_buf_trim(), which then mangled the actual
reply data instead of simply removing the trailing GSS checksum.
As near as I can tell, RPC transports are not supposed to update the
head.iov_len, page_len, or tail.iov_len fields in the receive XDR
buffer when handling an incoming RPC reply message. These fields
contain the length of each component of the XDR buffer, and hence
the maximum number of bytes of reply data that can be stored in each
XDR buffer component. I've concluded this because:
- This is how xdr_partial_copy_from_skb() appears to behave
- rpcrdma_inline_fixup() already does not alter page_len
- call_decode() compares rq_private_buf and rq_rcv_buf and WARNs
if they are not exactly the same
Unfortunately, as soon as I tried the simple fix to just remove the
line that sets tail.iov_len to zero, I saw that the logic that
appends the implicit Write chunk pad inline depends on inline_fixup
setting tail.iov_len to zero.
To address this, re-organize the tail iovec handling logic to use
the same approach as with the head iovec: simply point tail.iov_base
to the correct bytes in the receive buffer.
While I remember all this, write down the conclusion in documenting
comments.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-06-30 01:54:41 +08:00
|
|
|
|
|
|
|
/* The contents of the receive buffer that follow
|
|
|
|
* head.iov_len bytes are copied into the page list.
|
|
|
|
*/
|
2007-09-11 01:50:42 +08:00
|
|
|
curlen = rqst->rq_rcv_buf.head[0].iov_len;
|
xprtrdma: Do not update {head, tail}.iov_len in rpcrdma_inline_fixup()
While trying NFSv4.0/RDMA with sec=krb5p, I noticed small NFS READ
operations failed. After the client unwrapped the NFS READ reply
message, the NFS READ XDR decoder was not able to decode the reply.
The message was "Server cheating in reply", with the reported
number of received payload bytes being zero. Applications reported
a read(2) that returned -1/EIO.
The problem is rpcrdma_inline_fixup() sets the tail.iov_len to zero
when the incoming reply fits entirely in the head iovec. The zero
tail.iov_len confused xdr_buf_trim(), which then mangled the actual
reply data instead of simply removing the trailing GSS checksum.
As near as I can tell, RPC transports are not supposed to update the
head.iov_len, page_len, or tail.iov_len fields in the receive XDR
buffer when handling an incoming RPC reply message. These fields
contain the length of each component of the XDR buffer, and hence
the maximum number of bytes of reply data that can be stored in each
XDR buffer component. I've concluded this because:
- This is how xdr_partial_copy_from_skb() appears to behave
- rpcrdma_inline_fixup() already does not alter page_len
- call_decode() compares rq_private_buf and rq_rcv_buf and WARNs
if they are not exactly the same
Unfortunately, as soon as I tried the simple fix to just remove the
line that sets tail.iov_len to zero, I saw that the logic that
appends the implicit Write chunk pad inline depends on inline_fixup
setting tail.iov_len to zero.
To address this, re-organize the tail iovec handling logic to use
the same approach as with the head iovec: simply point tail.iov_base
to the correct bytes in the receive buffer.
While I remember all this, write down the conclusion in documenting
comments.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-06-30 01:54:41 +08:00
|
|
|
if (curlen > copy_len)
|
2007-09-11 01:50:42 +08:00
|
|
|
curlen = copy_len;
|
|
|
|
dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n",
|
|
|
|
__func__, srcp, copy_len, curlen);
|
|
|
|
srcp += curlen;
|
|
|
|
copy_len -= curlen;
|
|
|
|
|
2011-02-10 03:45:28 +08:00
|
|
|
page_base = rqst->rq_rcv_buf.page_base;
|
|
|
|
ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
|
|
|
|
page_base &= ~PAGE_MASK;
|
2016-06-30 01:54:58 +08:00
|
|
|
fixup_copy_count = 0;
|
2007-09-11 01:50:42 +08:00
|
|
|
if (copy_len && rqst->rq_rcv_buf.page_len) {
|
2016-06-30 01:54:33 +08:00
|
|
|
int pagelist_len;
|
|
|
|
|
|
|
|
pagelist_len = rqst->rq_rcv_buf.page_len;
|
|
|
|
if (pagelist_len > copy_len)
|
|
|
|
pagelist_len = copy_len;
|
|
|
|
npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
|
2016-06-30 01:54:58 +08:00
|
|
|
for (i = 0; i < npages; i++) {
|
2011-02-10 03:45:28 +08:00
|
|
|
curlen = PAGE_SIZE - page_base;
|
2016-06-30 01:54:33 +08:00
|
|
|
if (curlen > pagelist_len)
|
|
|
|
curlen = pagelist_len;
|
|
|
|
|
2007-09-11 01:50:42 +08:00
|
|
|
dprintk("RPC: %s: page %d"
|
|
|
|
" srcp 0x%p len %d curlen %d\n",
|
|
|
|
__func__, i, srcp, copy_len, curlen);
|
2011-11-25 23:14:40 +08:00
|
|
|
destp = kmap_atomic(ppages[i]);
|
2011-02-10 03:45:28 +08:00
|
|
|
memcpy(destp + page_base, srcp, curlen);
|
|
|
|
flush_dcache_page(ppages[i]);
|
2011-11-25 23:14:40 +08:00
|
|
|
kunmap_atomic(destp);
|
2007-09-11 01:50:42 +08:00
|
|
|
srcp += curlen;
|
|
|
|
copy_len -= curlen;
|
2016-06-30 01:54:58 +08:00
|
|
|
fixup_copy_count += curlen;
|
2016-06-30 01:54:33 +08:00
|
|
|
pagelist_len -= curlen;
|
|
|
|
if (!pagelist_len)
|
2007-09-11 01:50:42 +08:00
|
|
|
break;
|
2011-02-10 03:45:28 +08:00
|
|
|
page_base = 0;
|
2007-09-11 01:50:42 +08:00
|
|
|
}
|
|
|
|
|
xprtrdma: Do not update {head, tail}.iov_len in rpcrdma_inline_fixup()
While trying NFSv4.0/RDMA with sec=krb5p, I noticed small NFS READ
operations failed. After the client unwrapped the NFS READ reply
message, the NFS READ XDR decoder was not able to decode the reply.
The message was "Server cheating in reply", with the reported
number of received payload bytes being zero. Applications reported
a read(2) that returned -1/EIO.
The problem is rpcrdma_inline_fixup() sets the tail.iov_len to zero
when the incoming reply fits entirely in the head iovec. The zero
tail.iov_len confused xdr_buf_trim(), which then mangled the actual
reply data instead of simply removing the trailing GSS checksum.
As near as I can tell, RPC transports are not supposed to update the
head.iov_len, page_len, or tail.iov_len fields in the receive XDR
buffer when handling an incoming RPC reply message. These fields
contain the length of each component of the XDR buffer, and hence
the maximum number of bytes of reply data that can be stored in each
XDR buffer component. I've concluded this because:
- This is how xdr_partial_copy_from_skb() appears to behave
- rpcrdma_inline_fixup() already does not alter page_len
- call_decode() compares rq_private_buf and rq_rcv_buf and WARNs
if they are not exactly the same
Unfortunately, as soon as I tried the simple fix to just remove the
line that sets tail.iov_len to zero, I saw that the logic that
appends the implicit Write chunk pad inline depends on inline_fixup
setting tail.iov_len to zero.
To address this, re-organize the tail iovec handling logic to use
the same approach as with the head iovec: simply point tail.iov_base
to the correct bytes in the receive buffer.
While I remember all this, write down the conclusion in documenting
comments.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-06-30 01:54:41 +08:00
|
|
|
/* Implicit padding for the last segment in a Write
|
|
|
|
* chunk is inserted inline at the front of the tail
|
|
|
|
* iovec. The upper layer ignores the content of
|
|
|
|
* the pad. Simply ensure inline content in the tail
|
|
|
|
* that follows the Write chunk is properly aligned.
|
|
|
|
*/
|
|
|
|
if (pad)
|
|
|
|
srcp -= pad;
|
2008-10-10 03:01:11 +08:00
|
|
|
}
|
|
|
|
|
xprtrdma: Do not update {head, tail}.iov_len in rpcrdma_inline_fixup()
While trying NFSv4.0/RDMA with sec=krb5p, I noticed small NFS READ
operations failed. After the client unwrapped the NFS READ reply
message, the NFS READ XDR decoder was not able to decode the reply.
The message was "Server cheating in reply", with the reported
number of received payload bytes being zero. Applications reported
a read(2) that returned -1/EIO.
The problem is rpcrdma_inline_fixup() sets the tail.iov_len to zero
when the incoming reply fits entirely in the head iovec. The zero
tail.iov_len confused xdr_buf_trim(), which then mangled the actual
reply data instead of simply removing the trailing GSS checksum.
As near as I can tell, RPC transports are not supposed to update the
head.iov_len, page_len, or tail.iov_len fields in the receive XDR
buffer when handling an incoming RPC reply message. These fields
contain the length of each component of the XDR buffer, and hence
the maximum number of bytes of reply data that can be stored in each
XDR buffer component. I've concluded this because:
- This is how xdr_partial_copy_from_skb() appears to behave
- rpcrdma_inline_fixup() already does not alter page_len
- call_decode() compares rq_private_buf and rq_rcv_buf and WARNs
if they are not exactly the same
Unfortunately, as soon as I tried the simple fix to just remove the
line that sets tail.iov_len to zero, I saw that the logic that
appends the implicit Write chunk pad inline depends on inline_fixup
setting tail.iov_len to zero.
To address this, re-organize the tail iovec handling logic to use
the same approach as with the head iovec: simply point tail.iov_base
to the correct bytes in the receive buffer.
While I remember all this, write down the conclusion in documenting
comments.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-06-30 01:54:41 +08:00
|
|
|
/* The tail iovec is redirected to the remaining data
|
|
|
|
* in the receive buffer, to avoid a memcopy.
|
|
|
|
*/
|
2016-06-30 01:54:49 +08:00
|
|
|
if (copy_len || pad) {
|
xprtrdma: Do not update {head, tail}.iov_len in rpcrdma_inline_fixup()
While trying NFSv4.0/RDMA with sec=krb5p, I noticed small NFS READ
operations failed. After the client unwrapped the NFS READ reply
message, the NFS READ XDR decoder was not able to decode the reply.
The message was "Server cheating in reply", with the reported
number of received payload bytes being zero. Applications reported
a read(2) that returned -1/EIO.
The problem is rpcrdma_inline_fixup() sets the tail.iov_len to zero
when the incoming reply fits entirely in the head iovec. The zero
tail.iov_len confused xdr_buf_trim(), which then mangled the actual
reply data instead of simply removing the trailing GSS checksum.
As near as I can tell, RPC transports are not supposed to update the
head.iov_len, page_len, or tail.iov_len fields in the receive XDR
buffer when handling an incoming RPC reply message. These fields
contain the length of each component of the XDR buffer, and hence
the maximum number of bytes of reply data that can be stored in each
XDR buffer component. I've concluded this because:
- This is how xdr_partial_copy_from_skb() appears to behave
- rpcrdma_inline_fixup() already does not alter page_len
- call_decode() compares rq_private_buf and rq_rcv_buf and WARNs
if they are not exactly the same
Unfortunately, as soon as I tried the simple fix to just remove the
line that sets tail.iov_len to zero, I saw that the logic that
appends the implicit Write chunk pad inline depends on inline_fixup
setting tail.iov_len to zero.
To address this, re-organize the tail iovec handling logic to use
the same approach as with the head iovec: simply point tail.iov_base
to the correct bytes in the receive buffer.
While I remember all this, write down the conclusion in documenting
comments.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-06-30 01:54:41 +08:00
|
|
|
rqst->rq_rcv_buf.tail[0].iov_base = srcp;
|
2016-06-30 01:54:49 +08:00
|
|
|
rqst->rq_private_buf.tail[0].iov_base = srcp;
|
|
|
|
}
|
xprtrdma: Do not update {head, tail}.iov_len in rpcrdma_inline_fixup()
While trying NFSv4.0/RDMA with sec=krb5p, I noticed small NFS READ
operations failed. After the client unwrapped the NFS READ reply
message, the NFS READ XDR decoder was not able to decode the reply.
The message was "Server cheating in reply", with the reported
number of received payload bytes being zero. Applications reported
a read(2) that returned -1/EIO.
The problem is rpcrdma_inline_fixup() sets the tail.iov_len to zero
when the incoming reply fits entirely in the head iovec. The zero
tail.iov_len confused xdr_buf_trim(), which then mangled the actual
reply data instead of simply removing the trailing GSS checksum.
As near as I can tell, RPC transports are not supposed to update the
head.iov_len, page_len, or tail.iov_len fields in the receive XDR
buffer when handling an incoming RPC reply message. These fields
contain the length of each component of the XDR buffer, and hence
the maximum number of bytes of reply data that can be stored in each
XDR buffer component. I've concluded this because:
- This is how xdr_partial_copy_from_skb() appears to behave
- rpcrdma_inline_fixup() already does not alter page_len
- call_decode() compares rq_private_buf and rq_rcv_buf and WARNs
if they are not exactly the same
Unfortunately, as soon as I tried the simple fix to just remove the
line that sets tail.iov_len to zero, I saw that the logic that
appends the implicit Write chunk pad inline depends on inline_fixup
setting tail.iov_len to zero.
To address this, re-organize the tail iovec handling logic to use
the same approach as with the head iovec: simply point tail.iov_base
to the correct bytes in the receive buffer.
While I remember all this, write down the conclusion in documenting
comments.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2016-06-30 01:54:41 +08:00
|
|
|
|
2016-06-30 01:54:58 +08:00
|
|
|
return fixup_copy_count;
|
2007-09-11 01:50:42 +08:00
|
|
|
}
|
|
|
|
|
2015-10-25 05:28:08 +08:00
|
|
|
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
|
|
|
|
/* By convention, backchannel calls arrive via rdma_msg type
|
|
|
|
* messages, and never populate the chunk lists. This makes
|
|
|
|
* the RPC/RDMA header small and fixed in size, so it is
|
|
|
|
* straightforward to check the RPC header's direction field.
|
|
|
|
*/
|
|
|
|
static bool
|
|
|
|
rpcrdma_is_bcall(struct rpcrdma_msg *headerp)
|
|
|
|
{
|
|
|
|
__be32 *p = (__be32 *)headerp;
|
|
|
|
|
|
|
|
if (headerp->rm_type != rdma_msg)
|
|
|
|
return false;
|
|
|
|
if (headerp->rm_body.rm_chunks[0] != xdr_zero)
|
|
|
|
return false;
|
|
|
|
if (headerp->rm_body.rm_chunks[1] != xdr_zero)
|
|
|
|
return false;
|
|
|
|
if (headerp->rm_body.rm_chunks[2] != xdr_zero)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* sanity */
|
|
|
|
if (p[7] != headerp->rm_xid)
|
|
|
|
return false;
|
|
|
|
/* call direction */
|
|
|
|
if (p[8] != cpu_to_be32(RPC_CALL))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
|
|
|
|
|
2015-10-25 05:27:10 +08:00
|
|
|
/* Process received RPC/RDMA messages.
|
|
|
|
*
|
2007-09-11 01:50:42 +08:00
|
|
|
* Errors must result in the RPC task either being awakened, or
|
|
|
|
* allowed to timeout, to discover the errors at that time.
|
|
|
|
*/
|
|
|
|
void
|
2016-09-15 22:57:57 +08:00
|
|
|
rpcrdma_reply_handler(struct work_struct *work)
|
2007-09-11 01:50:42 +08:00
|
|
|
{
|
2016-09-15 22:57:57 +08:00
|
|
|
struct rpcrdma_rep *rep =
|
|
|
|
container_of(work, struct rpcrdma_rep, rr_work);
|
2007-09-11 01:50:42 +08:00
|
|
|
struct rpcrdma_msg *headerp;
|
|
|
|
struct rpcrdma_req *req;
|
|
|
|
struct rpc_rqst *rqst;
|
2015-05-26 23:51:37 +08:00
|
|
|
struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
|
|
|
|
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
|
2007-10-29 12:37:58 +08:00
|
|
|
__be32 *iptr;
|
2016-03-05 00:28:18 +08:00
|
|
|
int rdmalen, status, rmerr;
|
2014-05-28 22:34:57 +08:00
|
|
|
unsigned long cwnd;
|
2007-09-11 01:50:42 +08:00
|
|
|
|
2015-10-25 05:26:54 +08:00
|
|
|
dprintk("RPC: %s: incoming rep %p\n", __func__, rep);
|
|
|
|
|
|
|
|
if (rep->rr_len == RPCRDMA_BAD_LEN)
|
|
|
|
goto out_badstatus;
|
2016-03-05 00:28:18 +08:00
|
|
|
if (rep->rr_len < RPCRDMA_HDRLEN_ERR)
|
2015-10-25 05:26:54 +08:00
|
|
|
goto out_shortreply;
|
|
|
|
|
2015-01-22 00:04:25 +08:00
|
|
|
headerp = rdmab_to_msg(rep->rr_rdmabuf);
|
2015-10-25 05:28:08 +08:00
|
|
|
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
|
|
|
|
if (rpcrdma_is_bcall(headerp))
|
|
|
|
goto out_bcall;
|
|
|
|
#endif
|
2007-09-11 01:50:42 +08:00
|
|
|
|
2015-10-25 05:27:10 +08:00
|
|
|
/* Match incoming rpcrdma_rep to an rpcrdma_req to
|
|
|
|
* get context for handling any incoming chunks.
|
|
|
|
*/
|
|
|
|
spin_lock_bh(&xprt->transport_lock);
|
2007-09-11 01:50:42 +08:00
|
|
|
rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
|
2015-10-25 05:26:54 +08:00
|
|
|
if (!rqst)
|
|
|
|
goto out_nomatch;
|
2007-09-11 01:50:42 +08:00
|
|
|
|
|
|
|
req = rpcr_to_rdmar(rqst);
|
2015-10-25 05:26:54 +08:00
|
|
|
if (req->rl_reply)
|
|
|
|
goto out_duplicate;
|
2007-09-11 01:50:42 +08:00
|
|
|
|
xprtrdma: Invalidate in the RPC reply handler
There is a window between the time the RPC reply handler wakes the
waiting RPC task and when xprt_release() invokes ops->buf_free.
During this time, memory regions containing the data payload may
still be accessed by a broken or malicious server, but the RPC
application has already been allowed access to the memory containing
the RPC request's data payloads.
The server should be fenced from client memory containing RPC data
payloads _before_ the RPC application is allowed to continue.
This change also more strongly enforces send queue accounting. There
is a maximum number of RPC calls allowed to be outstanding. When an
RPC/RDMA transport is set up, just enough send queue resources are
allocated to handle registration, Send, and invalidation WRs for
each those RPCs at the same time.
Before, additional RPC calls could be dispatched while invalidation
WRs were still consuming send WQEs. When invalidation WRs backed
up, dispatching additional RPCs resulted in a send queue overrun.
Now, the reply handler prevents RPC dispatch until invalidation is
complete. This prevents RPC call dispatch until there are enough
send queue resources to proceed.
Still to do: If an RPC exits early (say, ^C), the reply handler has
no opportunity to perform invalidation. Currently, xprt_rdma_free()
still frees remaining RDMA resources, which could deadlock.
Additional changes are needed to handle invalidation properly in this
case.
Reported-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Devesh Sharma <devesh.sharma@avagotech.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2015-12-17 06:23:11 +08:00
|
|
|
/* Sanity checking has passed. We are now committed
|
|
|
|
* to complete this transaction.
|
|
|
|
*/
|
|
|
|
list_del_init(&rqst->rq_list);
|
|
|
|
spin_unlock_bh(&xprt->transport_lock);
|
2016-03-05 00:27:43 +08:00
|
|
|
dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n",
|
|
|
|
__func__, rep, req, be32_to_cpu(headerp->rm_xid));
|
2007-09-11 01:50:42 +08:00
|
|
|
|
|
|
|
/* from here on, the reply is no longer an orphan */
|
|
|
|
req->rl_reply = rep;
|
2014-05-28 22:34:41 +08:00
|
|
|
xprt->reestablish_timeout = 0;
|
2007-09-11 01:50:42 +08:00
|
|
|
|
2016-03-05 00:28:18 +08:00
|
|
|
if (headerp->rm_vers != rpcrdma_version)
|
|
|
|
goto out_badversion;
|
|
|
|
|
2007-09-11 01:50:42 +08:00
|
|
|
/* check for expected message types */
|
|
|
|
/* The order of some of these tests is important. */
|
|
|
|
switch (headerp->rm_type) {
|
2015-01-22 00:02:13 +08:00
|
|
|
case rdma_msg:
|
2007-09-11 01:50:42 +08:00
|
|
|
/* never expect read chunks */
|
|
|
|
/* never expect reply chunks (two ways to check) */
|
|
|
|
/* never expect write chunks without having offered RDMA */
|
|
|
|
if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
|
|
|
|
(headerp->rm_body.rm_chunks[1] == xdr_zero &&
|
|
|
|
headerp->rm_body.rm_chunks[2] != xdr_zero) ||
|
|
|
|
(headerp->rm_body.rm_chunks[1] != xdr_zero &&
|
2016-06-30 01:54:16 +08:00
|
|
|
list_empty(&req->rl_registered)))
|
2007-09-11 01:50:42 +08:00
|
|
|
goto badheader;
|
|
|
|
if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
|
|
|
|
/* count any expected write chunks in read reply */
|
|
|
|
/* start at write chunk array count */
|
|
|
|
iptr = &headerp->rm_body.rm_chunks[2];
|
2016-06-30 01:54:16 +08:00
|
|
|
rdmalen = rpcrdma_count_chunks(rep, 1, &iptr);
|
2007-09-11 01:50:42 +08:00
|
|
|
/* check for validity, and no reply chunk after */
|
|
|
|
if (rdmalen < 0 || *iptr++ != xdr_zero)
|
|
|
|
goto badheader;
|
|
|
|
rep->rr_len -=
|
|
|
|
((unsigned char *)iptr - (unsigned char *)headerp);
|
|
|
|
status = rep->rr_len + rdmalen;
|
|
|
|
r_xprt->rx_stats.total_rdma_reply += rdmalen;
|
2008-10-10 03:01:11 +08:00
|
|
|
/* special case - last chunk may omit padding */
|
|
|
|
if (rdmalen &= 3) {
|
|
|
|
rdmalen = 4 - rdmalen;
|
|
|
|
status += rdmalen;
|
|
|
|
}
|
2007-09-11 01:50:42 +08:00
|
|
|
} else {
|
|
|
|
/* else ordinary inline */
|
2008-10-10 03:01:11 +08:00
|
|
|
rdmalen = 0;
|
2015-01-22 00:02:29 +08:00
|
|
|
iptr = (__be32 *)((unsigned char *)headerp +
|
|
|
|
RPCRDMA_HDRLEN_MIN);
|
|
|
|
rep->rr_len -= RPCRDMA_HDRLEN_MIN;
|
2007-09-11 01:50:42 +08:00
|
|
|
status = rep->rr_len;
|
|
|
|
}
|
2016-06-30 01:54:58 +08:00
|
|
|
|
|
|
|
r_xprt->rx_stats.fixup_copy_count +=
|
|
|
|
rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len,
|
|
|
|
rdmalen);
|
2007-09-11 01:50:42 +08:00
|
|
|
break;
|
|
|
|
|
2015-01-22 00:02:13 +08:00
|
|
|
case rdma_nomsg:
|
2007-09-11 01:50:42 +08:00
|
|
|
/* never expect read or write chunks, always reply chunks */
|
|
|
|
if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
|
|
|
|
headerp->rm_body.rm_chunks[1] != xdr_zero ||
|
|
|
|
headerp->rm_body.rm_chunks[2] != xdr_one ||
|
2016-06-30 01:54:16 +08:00
|
|
|
list_empty(&req->rl_registered))
|
2007-09-11 01:50:42 +08:00
|
|
|
goto badheader;
|
2015-01-22 00:02:29 +08:00
|
|
|
iptr = (__be32 *)((unsigned char *)headerp +
|
|
|
|
RPCRDMA_HDRLEN_MIN);
|
2016-06-30 01:54:16 +08:00
|
|
|
rdmalen = rpcrdma_count_chunks(rep, 0, &iptr);
|
2007-09-11 01:50:42 +08:00
|
|
|
if (rdmalen < 0)
|
|
|
|
goto badheader;
|
|
|
|
r_xprt->rx_stats.total_rdma_reply += rdmalen;
|
|
|
|
/* Reply chunk buffer already is the reply vector - no fixup. */
|
|
|
|
status = rdmalen;
|
|
|
|
break;
|
|
|
|
|
2016-03-05 00:28:18 +08:00
|
|
|
case rdma_error:
|
|
|
|
goto out_rdmaerr;
|
|
|
|
|
2007-09-11 01:50:42 +08:00
|
|
|
badheader:
|
|
|
|
default:
|
2016-06-30 01:54:16 +08:00
|
|
|
dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
|
|
|
|
rqst->rq_task->tk_pid, __func__,
|
|
|
|
be32_to_cpu(headerp->rm_type));
|
2007-09-11 01:50:42 +08:00
|
|
|
status = -EIO;
|
|
|
|
r_xprt->rx_stats.bad_reply_count++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2016-03-05 00:28:18 +08:00
|
|
|
out:
|
xprtrdma: Invalidate in the RPC reply handler
There is a window between the time the RPC reply handler wakes the
waiting RPC task and when xprt_release() invokes ops->buf_free.
During this time, memory regions containing the data payload may
still be accessed by a broken or malicious server, but the RPC
application has already been allowed access to the memory containing
the RPC request's data payloads.
The server should be fenced from client memory containing RPC data
payloads _before_ the RPC application is allowed to continue.
This change also more strongly enforces send queue accounting. There
is a maximum number of RPC calls allowed to be outstanding. When an
RPC/RDMA transport is set up, just enough send queue resources are
allocated to handle registration, Send, and invalidation WRs for
each those RPCs at the same time.
Before, additional RPC calls could be dispatched while invalidation
WRs were still consuming send WQEs. When invalidation WRs backed
up, dispatching additional RPCs resulted in a send queue overrun.
Now, the reply handler prevents RPC dispatch until invalidation is
complete. This prevents RPC call dispatch until there are enough
send queue resources to proceed.
Still to do: If an RPC exits early (say, ^C), the reply handler has
no opportunity to perform invalidation. Currently, xprt_rdma_free()
still frees remaining RDMA resources, which could deadlock.
Additional changes are needed to handle invalidation properly in this
case.
Reported-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Devesh Sharma <devesh.sharma@avagotech.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2015-12-17 06:23:11 +08:00
|
|
|
/* Invalidate and flush the data payloads before waking the
|
|
|
|
* waiting application. This guarantees the memory region is
|
|
|
|
* properly fenced from the server before the application
|
|
|
|
* accesses the data. It also ensures proper send flow
|
|
|
|
* control: waking the next RPC waits until this RPC has
|
|
|
|
* relinquished all its Send Queue entries.
|
|
|
|
*/
|
2016-06-30 01:54:16 +08:00
|
|
|
if (!list_empty(&req->rl_registered))
|
xprtrdma: Invalidate in the RPC reply handler
There is a window between the time the RPC reply handler wakes the
waiting RPC task and when xprt_release() invokes ops->buf_free.
During this time, memory regions containing the data payload may
still be accessed by a broken or malicious server, but the RPC
application has already been allowed access to the memory containing
the RPC request's data payloads.
The server should be fenced from client memory containing RPC data
payloads _before_ the RPC application is allowed to continue.
This change also more strongly enforces send queue accounting. There
is a maximum number of RPC calls allowed to be outstanding. When an
RPC/RDMA transport is set up, just enough send queue resources are
allocated to handle registration, Send, and invalidation WRs for
each those RPCs at the same time.
Before, additional RPC calls could be dispatched while invalidation
WRs were still consuming send WQEs. When invalidation WRs backed
up, dispatching additional RPCs resulted in a send queue overrun.
Now, the reply handler prevents RPC dispatch until invalidation is
complete. This prevents RPC call dispatch until there are enough
send queue resources to proceed.
Still to do: If an RPC exits early (say, ^C), the reply handler has
no opportunity to perform invalidation. Currently, xprt_rdma_free()
still frees remaining RDMA resources, which could deadlock.
Additional changes are needed to handle invalidation properly in this
case.
Reported-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Devesh Sharma <devesh.sharma@avagotech.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2015-12-17 06:23:11 +08:00
|
|
|
r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);
|
|
|
|
|
|
|
|
spin_lock_bh(&xprt->transport_lock);
|
2014-05-28 22:34:57 +08:00
|
|
|
cwnd = xprt->cwnd;
|
2016-03-05 00:28:27 +08:00
|
|
|
xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
|
2014-05-28 22:34:57 +08:00
|
|
|
if (xprt->cwnd > cwnd)
|
|
|
|
xprt_release_rqst_cong(rqst->rq_task);
|
|
|
|
|
2015-10-25 05:26:54 +08:00
|
|
|
xprt_complete_rqst(rqst->rq_task, status);
|
2015-10-25 05:27:10 +08:00
|
|
|
spin_unlock_bh(&xprt->transport_lock);
|
2007-09-11 01:50:42 +08:00
|
|
|
dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
|
|
|
|
__func__, xprt, rqst, status);
|
2015-10-25 05:26:54 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
out_badstatus:
|
|
|
|
rpcrdma_recv_buffer_put(rep);
|
|
|
|
if (r_xprt->rx_ep.rep_connected == 1) {
|
|
|
|
r_xprt->rx_ep.rep_connected = -EIO;
|
|
|
|
rpcrdma_conn_func(&r_xprt->rx_ep);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
|
2015-10-25 05:28:08 +08:00
|
|
|
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
|
|
|
|
out_bcall:
|
|
|
|
rpcrdma_bc_receive_call(r_xprt, rep);
|
|
|
|
return;
|
|
|
|
#endif
|
|
|
|
|
2016-03-05 00:28:18 +08:00
|
|
|
/* If the incoming reply terminated a pending RPC, the next
|
|
|
|
* RPC call will post a replacement receive buffer as it is
|
|
|
|
* being marshaled.
|
|
|
|
*/
|
2015-10-25 05:26:54 +08:00
|
|
|
out_badversion:
|
|
|
|
dprintk("RPC: %s: invalid version %d\n",
|
|
|
|
__func__, be32_to_cpu(headerp->rm_vers));
|
2016-03-05 00:28:18 +08:00
|
|
|
status = -EIO;
|
|
|
|
r_xprt->rx_stats.bad_reply_count++;
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
out_rdmaerr:
|
|
|
|
rmerr = be32_to_cpu(headerp->rm_body.rm_error.rm_err);
|
|
|
|
switch (rmerr) {
|
|
|
|
case ERR_VERS:
|
|
|
|
pr_err("%s: server reports header version error (%u-%u)\n",
|
|
|
|
__func__,
|
|
|
|
be32_to_cpu(headerp->rm_body.rm_error.rm_vers_low),
|
|
|
|
be32_to_cpu(headerp->rm_body.rm_error.rm_vers_high));
|
|
|
|
break;
|
|
|
|
case ERR_CHUNK:
|
|
|
|
pr_err("%s: server reports header decoding error\n",
|
|
|
|
__func__);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
pr_err("%s: server reports unknown error %d\n",
|
|
|
|
__func__, rmerr);
|
|
|
|
}
|
|
|
|
status = -EREMOTEIO;
|
|
|
|
r_xprt->rx_stats.bad_reply_count++;
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* If no pending RPC transaction was matched, post a replacement
|
|
|
|
* receive buffer before returning.
|
|
|
|
*/
|
|
|
|
out_shortreply:
|
|
|
|
dprintk("RPC: %s: short/invalid reply\n", __func__);
|
2015-10-25 05:26:54 +08:00
|
|
|
goto repost;
|
|
|
|
|
|
|
|
out_nomatch:
|
2015-10-25 05:27:10 +08:00
|
|
|
spin_unlock_bh(&xprt->transport_lock);
|
2015-10-25 05:26:54 +08:00
|
|
|
dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n",
|
|
|
|
__func__, be32_to_cpu(headerp->rm_xid),
|
|
|
|
rep->rr_len);
|
|
|
|
goto repost;
|
|
|
|
|
|
|
|
out_duplicate:
|
2015-10-25 05:27:10 +08:00
|
|
|
spin_unlock_bh(&xprt->transport_lock);
|
2015-10-25 05:26:54 +08:00
|
|
|
dprintk("RPC: %s: "
|
|
|
|
"duplicate reply %p to RPC request %p: xid 0x%08x\n",
|
|
|
|
__func__, rep, req, be32_to_cpu(headerp->rm_xid));
|
|
|
|
|
|
|
|
repost:
|
|
|
|
r_xprt->rx_stats.bad_reply_count++;
|
2016-09-15 22:56:35 +08:00
|
|
|
if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
|
2015-10-25 05:26:54 +08:00
|
|
|
rpcrdma_recv_buffer_put(rep);
|
2007-09-11 01:50:42 +08:00
|
|
|
}
|