NFS Client Updates for Linux 5.12

- New Features:
   - Support for eager writes, and the write=eager and write=wait mount options
 
 - Other Bugfixes and Cleanups:
   - Fix typos in some comments
   - Fix up fall-through warnings for Clang
   - Cleanups to the NFS readpage codepath
   - Remove FMR support in rpcrdma_convert_iovs()
   - Various other cleanups to xprtrdma
   - Fix xprtrdma pad optimization for servers that don't support RFC 8797
   - Improvements to rpcrdma tracepoints
   - Fix up nfs4_bitmask_adjust()
   - Optimize sparse writes past the end of files
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEnZ5MQTpR7cLU7KEp18tUv7ClQOsFAmAwOLwACgkQ18tUv7Cl
 QOsUfw//W2KoJ+2IQohQNFcoi+bG1OQE7jnqHtQ+tsKfpJKemcDcu8wQEAqrwALg
 vXioG1Ye0QU7P5PZtNxCorylqSTVGvJSIOrfa3lTdn/PDbI7NIgN52w56TzzfeXn
 pJ4gDwZzPwUFUblF0LBQUIhJv5IQvOXVgUsMqezbIbMXSiuLR/bjnZ96Q/woKpoL
 eg2IZ5EO9Jb0QjuQ1e9U303X7c2qOl1jzpxyQLQfD7ONnWBx3HnJk1l+3JJRi8JV
 smnae3I0L3nUZ7rBqoqsvK7YUjUchCEBvkmEMsnHT94D5tI9mxxX5OquREee6QHn
 NuJRSNbsIiCD3Ne27fkCut78d6SetoMko7jZ97T6smhyijtXJiLG/6dycMPV9rt/
 bVIudWMm9/A9AsXyY2YP5LC6Y6W6dhQRXygUjVgEPBl6kVsb2Eca8IA9QZghF9IL
 +XSEulASvxo2rWPylJJ+3aLynfqoHrowVN/Tu61svDnJWTcb+FCxQ5zyLox7erEH
 mUhraf1D0uoX9odH1069toN6favZFE6SIDvlUk1QTOjr6p3Jxmkuyl6PNs5t66/S
 550z5JVb2deIHOPQxOie7xz/Dk6dnRoaFhTNq/Ootkt9GNe0A+NqSUdoRA5XxN5m
 wW11ecLSZSehDksuXjyFmkHtkagLreFxLsHbVnaAtwEm7h/thRI=
 =Dssn
 -----END PGP SIGNATURE-----

Merge tag 'nfs-for-5.12-1' of git://git.linux-nfs.org/projects/anna/linux-nfs

Pull NFS Client Updates from Anna Schumaker:
 "New Features:
   - Support for eager writes, and the write=eager and write=wait mount
     options

- Other Bugfixes and Cleanups:
   - Fix typos in some comments
   - Fix up fall-through warnings for Clang
   - Cleanups to the NFS readpage codepath
   - Remove FMR support in rpcrdma_convert_iovs()
   - Various other cleanups to xprtrdma
   - Fix xprtrdma pad optimization for servers that don't support
     RFC 8797
   - Improvements to rpcrdma tracepoints
   - Fix up nfs4_bitmask_adjust()
   - Optimize sparse writes past the end of files"

* tag 'nfs-for-5.12-1' of git://git.linux-nfs.org/projects/anna/linux-nfs: (27 commits)
  NFS: Support the '-owrite=' option in /proc/self/mounts and mountinfo
  NFS: Set the stable writes flag when initialising the super block
  NFS: Add mount options supporting eager writes
  NFS: Add support for eager writes
  NFS: 'flags' field should be unsigned in struct nfs_server
  NFS: Don't set NFS_INO_INVALID_XATTR if there is no xattr cache
  NFS: Always clear an invalid mapping when attempting a buffered write
  NFS: Optimise sparse writes past the end of file
  NFS: Fix documenting comment for nfs_revalidate_file_size()
  NFSv4: Fixes for nfs4_bitmask_adjust()
  xprtrdma: Clean up rpcrdma_prepare_readch()
  rpcrdma: Capture bytes received in Receive completion tracepoints
  xprtrdma: Pad optimization, revisited
  rpcrdma: Fix comments about reverse-direction operation
  xprtrdma: Refactor invocations of offset_in_page()
  xprtrdma: Simplify rpcrdma_convert_kvec() and frwr_map()
  xprtrdma: Remove FMR support in rpcrdma_convert_iovs()
  NFS: Add nfs_pageio_complete_read() and remove nfs_readpage_async()
  NFS: Call readpage_async_filler() from nfs_readpage_async()
  NFS: Refactor nfs_readpage() and nfs_readpage_async() to use nfs_readdesc
  ...
This commit is contained in:
Linus Torvalds 2021-02-26 09:17:24 -08:00
commit 1c9077cdec
22 changed files with 378 additions and 294 deletions

View File

@ -89,7 +89,7 @@ nfs_file_release(struct inode *inode, struct file *filp)
EXPORT_SYMBOL_GPL(nfs_file_release);
/**
* nfs_revalidate_size - Revalidate the file size
* nfs_revalidate_file_size - Revalidate the file size
* @inode: pointer to inode struct
* @filp: pointer to struct file
*
@ -606,8 +606,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
unsigned long written = 0;
ssize_t result;
unsigned int mntflags = NFS_SERVER(inode)->flags;
ssize_t result, written;
errseq_t since;
int error;
@ -626,13 +626,13 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
/*
* O_APPEND implies that we must revalidate the file length.
*/
if (iocb->ki_flags & IOCB_APPEND) {
if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) {
result = nfs_revalidate_file_size(inode, file);
if (result)
goto out;
}
if (iocb->ki_pos > i_size_read(inode))
nfs_revalidate_mapping(inode, file->f_mapping);
nfs_clear_invalid_mapping(file->f_mapping);
since = filemap_sample_wb_err(file->f_mapping);
nfs_start_io_write(inode);
@ -648,6 +648,21 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
written = result;
iocb->ki_pos += written;
if (mntflags & NFS_MOUNT_WRITE_EAGER) {
result = filemap_fdatawrite_range(file->f_mapping,
iocb->ki_pos - written,
iocb->ki_pos - 1);
if (result < 0)
goto out;
}
if (mntflags & NFS_MOUNT_WRITE_WAIT) {
result = filemap_fdatawait_range(file->f_mapping,
iocb->ki_pos - written,
iocb->ki_pos - 1);
if (result < 0)
goto out;
}
result = generic_write_sync(iocb, written);
if (result < 0)
goto out;

View File

@ -82,6 +82,7 @@ enum nfs_param {
Opt_v,
Opt_vers,
Opt_wsize,
Opt_write,
};
enum {
@ -113,6 +114,19 @@ static const struct constant_table nfs_param_enums_lookupcache[] = {
{}
};
enum {
Opt_write_lazy,
Opt_write_eager,
Opt_write_wait,
};
static const struct constant_table nfs_param_enums_write[] = {
{ "lazy", Opt_write_lazy },
{ "eager", Opt_write_eager },
{ "wait", Opt_write_wait },
{}
};
static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_flag_no("ac", Opt_ac),
fsparam_u32 ("acdirmax", Opt_acdirmax),
@ -171,6 +185,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_flag ("v4.1", Opt_v),
fsparam_flag ("v4.2", Opt_v),
fsparam_string("vers", Opt_vers),
fsparam_enum ("write", Opt_write, nfs_param_enums_write),
fsparam_u32 ("wsize", Opt_wsize),
{}
};
@ -770,6 +785,24 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
goto out_invalid_value;
}
break;
case Opt_write:
switch (result.uint_32) {
case Opt_write_lazy:
ctx->flags &=
~(NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT);
break;
case Opt_write_eager:
ctx->flags |= NFS_MOUNT_WRITE_EAGER;
ctx->flags &= ~NFS_MOUNT_WRITE_WAIT;
break;
case Opt_write_wait:
ctx->flags |=
NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT;
break;
default:
goto out_invalid_value;
}
break;
/*
* Special options
@ -1479,6 +1512,8 @@ static int nfs_init_fs_context(struct fs_context *fc)
ctx->selected_flavor = RPC_AUTH_MAXFLAVOR;
ctx->minorversion = 0;
ctx->need_mount = true;
fc->s_iflags |= SB_I_STABLE_WRITES;
}
fc->fs_private = ctx;
fc->ops = &nfs_fs_context_ops;

View File

@ -390,10 +390,6 @@ static void nfs_readpage_from_fscache_complete(struct page *page,
if (!error) {
SetPageUptodate(page);
unlock_page(page);
} else {
error = nfs_readpage_async(context, page->mapping->host, page);
if (error)
unlock_page(page);
}
}

View File

@ -195,6 +195,18 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags)
}
EXPORT_SYMBOL_GPL(nfs_check_cache_invalid);
#ifdef CONFIG_NFS_V4_2
static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi)
{
return nfsi->xattr_cache != NULL;
}
#else
static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi)
{
return false;
}
#endif
static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
{
struct nfs_inode *nfsi = NFS_I(inode);
@ -209,6 +221,8 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
| NFS_INO_INVALID_XATTR);
}
if (!nfs_has_xattr_cache(nfsi))
flags &= ~NFS_INO_INVALID_XATTR;
if (inode->i_mapping->nrpages == 0)
flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER);
nfsi->cache_validity |= flags;
@ -1258,6 +1272,63 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
return 0;
}
/**
* nfs_clear_invalid_mapping - Conditionally clear a mapping
* @mapping: pointer to mapping
*
* If the NFS_INO_INVALID_DATA inode flag is set, clear the mapping.
*/
int nfs_clear_invalid_mapping(struct address_space *mapping)
{
struct inode *inode = mapping->host;
struct nfs_inode *nfsi = NFS_I(inode);
unsigned long *bitlock = &nfsi->flags;
int ret = 0;
/*
* We must clear NFS_INO_INVALID_DATA first to ensure that
* invalidations that come in while we're shooting down the mappings
* are respected. But, that leaves a race window where one revalidator
* can clear the flag, and then another checks it before the mapping
* gets invalidated. Fix that by serializing access to this part of
* the function.
*
* At the same time, we need to allow other tasks to see whether we
* might be in the middle of invalidating the pages, so we only set
* the bit lock here if it looks like we're going to be doing that.
*/
for (;;) {
ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING,
nfs_wait_bit_killable, TASK_KILLABLE);
if (ret)
goto out;
spin_lock(&inode->i_lock);
if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
spin_unlock(&inode->i_lock);
continue;
}
if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
break;
spin_unlock(&inode->i_lock);
goto out;
}
set_bit(NFS_INO_INVALIDATING, bitlock);
smp_wmb();
nfsi->cache_validity &=
~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER);
spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
ret = nfs_invalidate_mapping(inode, mapping);
trace_nfs_invalidate_mapping_exit(inode, ret);
clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
smp_mb__after_atomic();
wake_up_bit(bitlock, NFS_INO_INVALIDATING);
out:
return ret;
}
bool nfs_mapping_need_revalidate_inode(struct inode *inode)
{
return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) ||
@ -1290,65 +1361,19 @@ out:
* @inode: pointer to host inode
* @mapping: pointer to mapping
*/
int nfs_revalidate_mapping(struct inode *inode,
struct address_space *mapping)
int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
unsigned long *bitlock = &nfsi->flags;
int ret = 0;
/* swapfiles are not supposed to be shared. */
if (IS_SWAPFILE(inode))
goto out;
return 0;
if (nfs_mapping_need_revalidate_inode(inode)) {
ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
int ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (ret < 0)
goto out;
return ret;
}
/*
* We must clear NFS_INO_INVALID_DATA first to ensure that
* invalidations that come in while we're shooting down the mappings
* are respected. But, that leaves a race window where one revalidator
* can clear the flag, and then another checks it before the mapping
* gets invalidated. Fix that by serializing access to this part of
* the function.
*
* At the same time, we need to allow other tasks to see whether we
* might be in the middle of invalidating the pages, so we only set
* the bit lock here if it looks like we're going to be doing that.
*/
for (;;) {
ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING,
nfs_wait_bit_killable, TASK_KILLABLE);
if (ret)
goto out;
spin_lock(&inode->i_lock);
if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
spin_unlock(&inode->i_lock);
continue;
}
if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
break;
spin_unlock(&inode->i_lock);
goto out;
}
set_bit(NFS_INO_INVALIDATING, bitlock);
smp_wmb();
nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA|
NFS_INO_DATA_INVAL_DEFER);
spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
ret = nfs_invalidate_mapping(inode, mapping);
trace_nfs_invalidate_mapping_exit(inode, ret);
clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
smp_mb__after_atomic();
wake_up_bit(bitlock, NFS_INO_INVALIDATING);
out:
return ret;
return nfs_clear_invalid_mapping(mapping);
}
static bool nfs_file_has_writers(struct nfs_inode *nfsi)

View File

@ -111,6 +111,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
fallthrough;
case -ENOTSUPP:
status = -EOPNOTSUPP;
goto getout;
default:
goto getout;
}

View File

@ -609,6 +609,7 @@ found:
* changed. Schedule recovery!
*/
nfs4_schedule_path_down_recovery(pos);
goto out;
default:
goto out;
}

View File

@ -71,10 +71,6 @@
#include "nfs4trace.h"
#ifdef CONFIG_NFS_V4_2
#include "nfs42.h"
#endif /* CONFIG_NFS_V4_2 */
#define NFSDBG_FACILITY NFSDBG_PROC
#define NFS4_BITMASK_SZ 3
@ -2231,6 +2227,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
default:
printk(KERN_ERR "NFS: %s: unhandled error "
"%d.\n", __func__, err);
fallthrough;
case 0:
case -ENOENT:
case -EAGAIN:
@ -5438,15 +5435,16 @@ static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode,
if (cache_validity & NFS_INO_INVALID_ATIME)
bitmask[1] |= FATTR4_WORD1_TIME_ACCESS;
if (cache_validity & NFS_INO_INVALID_ACCESS)
bitmask[0] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER |
FATTR4_WORD1_OWNER_GROUP;
if (cache_validity & NFS_INO_INVALID_ACL)
bitmask[0] |= FATTR4_WORD0_ACL;
if (cache_validity & NFS_INO_INVALID_LABEL)
if (cache_validity & NFS_INO_INVALID_OTHER)
bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER |
FATTR4_WORD1_OWNER_GROUP |
FATTR4_WORD1_NUMLINKS;
if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL)
bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL;
if (cache_validity & NFS_INO_INVALID_CTIME)
if (cache_validity & NFS_INO_INVALID_CHANGE)
bitmask[0] |= FATTR4_WORD0_CHANGE;
if (cache_validity & NFS_INO_INVALID_CTIME)
bitmask[1] |= FATTR4_WORD1_TIME_METADATA;
if (cache_validity & NFS_INO_INVALID_MTIME)
bitmask[1] |= FATTR4_WORD1_TIME_MODIFY;
if (cache_validity & NFS_INO_INVALID_SIZE)
@ -9708,6 +9706,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
case -NFS4ERR_BADLAYOUT: /* no layout */
case -NFS4ERR_GRACE: /* loca_recalim always false */
task->tk_status = 0;
break;
case 0:
break;
default:

View File

@ -1125,6 +1125,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
" sequence-id error on an"
" unconfirmed sequence %p!\n",
seqid->sequence);
return;
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_BAD_STATEID:

View File

@ -2875,6 +2875,7 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc,
switch (trypnfs) {
case PNFS_NOT_ATTEMPTED:
pnfs_write_through_mds(desc, hdr);
break;
case PNFS_ATTEMPTED:
break;
case PNFS_TRY_AGAIN:
@ -3019,6 +3020,7 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
switch (trypnfs) {
case PNFS_NOT_ATTEMPTED:
pnfs_read_through_mds(desc, hdr);
break;
case PNFS_ATTEMPTED:
break;
case PNFS_TRY_AGAIN:

View File

@ -74,6 +74,24 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio,
struct inode *inode)
{
struct nfs_pgio_mirror *pgm;
unsigned long npages;
nfs_pageio_complete(pgio);
/* It doesn't make sense to do mirrored reads! */
WARN_ON_ONCE(pgio->pg_mirror_count != 1);
pgm = &pgio->pg_mirrors[0];
NFS_I(inode)->read_io += pgm->pg_bytes_written;
npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT;
nfs_add_stats(inode, NFSIOS_READPAGES, npages);
}
void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
{
struct nfs_pgio_mirror *mirror;
@ -114,41 +132,10 @@ static void nfs_readpage_release(struct nfs_page *req, int error)
nfs_release_request(req);
}
int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
struct page *page)
{
struct nfs_page *new;
unsigned int len;
struct nfs_readdesc {
struct nfs_pageio_descriptor pgio;
struct nfs_pgio_mirror *pgm;
len = nfs_page_length(page);
if (len == 0)
return nfs_return_empty_page(page);
new = nfs_create_request(ctx, page, 0, len);
if (IS_ERR(new)) {
unlock_page(page);
return PTR_ERR(new);
}
if (len < PAGE_SIZE)
zero_user_segment(page, len, PAGE_SIZE);
nfs_pageio_init_read(&pgio, inode, false,
&nfs_async_read_completion_ops);
if (!nfs_pageio_add_request(&pgio, new)) {
nfs_list_remove_request(new);
nfs_readpage_release(new, pgio.pg_error);
}
nfs_pageio_complete(&pgio);
/* It doesn't make sense to do mirrored reads! */
WARN_ON_ONCE(pgio.pg_mirror_count != 1);
pgm = &pgio.pg_mirrors[0];
NFS_I(inode)->read_io += pgm->pg_bytes_written;
return pgio.pg_error < 0 ? pgio.pg_error : 0;
}
struct nfs_open_context *ctx;
};
static void nfs_page_group_set_uptodate(struct nfs_page *req)
{
@ -171,8 +158,7 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
/* note: regions of the page not covered by a
* request are zeroed in nfs_readpage_async /
* readpage_async_filler */
* request are zeroed in readpage_async_filler */
if (bytes > hdr->good_bytes) {
/* nothing in this request was good, so zero
* the full extent of the request */
@ -304,78 +290,10 @@ static void nfs_readpage_result(struct rpc_task *task,
nfs_readpage_retry(task, hdr);
}
/*
* Read a page over NFS.
* We read the page synchronously in the following case:
* - The error flag is set for this page. This happens only when a
* previous async read operation failed.
*/
int nfs_readpage(struct file *file, struct page *page)
{
struct nfs_open_context *ctx;
struct inode *inode = page_file_mapping(page)->host;
int error;
dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
page, PAGE_SIZE, page_index(page));
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
nfs_add_stats(inode, NFSIOS_READPAGES, 1);
/*
* Try to flush any pending writes to the file..
*
* NOTE! Because we own the page lock, there cannot
* be any new pending writes generated at this point
* for this page (other pages can be written to).
*/
error = nfs_wb_page(inode, page);
if (error)
goto out_unlock;
if (PageUptodate(page))
goto out_unlock;
error = -ESTALE;
if (NFS_STALE(inode))
goto out_unlock;
if (file == NULL) {
error = -EBADF;
ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
if (ctx == NULL)
goto out_unlock;
} else
ctx = get_nfs_open_context(nfs_file_open_context(file));
if (!IS_SYNC(inode)) {
error = nfs_readpage_from_fscache(ctx, inode, page);
if (error == 0)
goto out;
}
xchg(&ctx->error, 0);
error = nfs_readpage_async(ctx, inode, page);
if (!error) {
error = wait_on_page_locked_killable(page);
if (!PageUptodate(page) && !error)
error = xchg(&ctx->error, 0);
}
out:
put_nfs_open_context(ctx);
return error;
out_unlock:
unlock_page(page);
return error;
}
struct nfs_readdesc {
struct nfs_pageio_descriptor *pgio;
struct nfs_open_context *ctx;
};
static int
readpage_async_filler(void *data, struct page *page)
{
struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
struct nfs_readdesc *desc = data;
struct nfs_page *new;
unsigned int len;
int error;
@ -390,9 +308,9 @@ readpage_async_filler(void *data, struct page *page)
if (len < PAGE_SIZE)
zero_user_segment(page, len, PAGE_SIZE);
if (!nfs_pageio_add_request(desc->pgio, new)) {
if (!nfs_pageio_add_request(&desc->pgio, new)) {
nfs_list_remove_request(new);
error = desc->pgio->pg_error;
error = desc->pgio.pg_error;
nfs_readpage_release(new, error);
goto out;
}
@ -404,17 +322,82 @@ out:
return error;
}
int nfs_readpages(struct file *filp, struct address_space *mapping,
/*
* Read a page over NFS.
* We read the page synchronously in the following case:
* - The error flag is set for this page. This happens only when a
* previous async read operation failed.
*/
int nfs_readpage(struct file *file, struct page *page)
{
struct nfs_readdesc desc;
struct inode *inode = page_file_mapping(page)->host;
int ret;
dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
page, PAGE_SIZE, page_index(page));
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
/*
* Try to flush any pending writes to the file..
*
* NOTE! Because we own the page lock, there cannot
* be any new pending writes generated at this point
* for this page (other pages can be written to).
*/
ret = nfs_wb_page(inode, page);
if (ret)
goto out_unlock;
if (PageUptodate(page))
goto out_unlock;
ret = -ESTALE;
if (NFS_STALE(inode))
goto out_unlock;
if (file == NULL) {
ret = -EBADF;
desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
if (desc.ctx == NULL)
goto out_unlock;
} else
desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
if (!IS_SYNC(inode)) {
ret = nfs_readpage_from_fscache(desc.ctx, inode, page);
if (ret == 0)
goto out;
}
xchg(&desc.ctx->error, 0);
nfs_pageio_init_read(&desc.pgio, inode, false,
&nfs_async_read_completion_ops);
ret = readpage_async_filler(&desc, page);
if (!ret)
nfs_pageio_complete_read(&desc.pgio, inode);
ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0;
if (!ret) {
ret = wait_on_page_locked_killable(page);
if (!PageUptodate(page) && !ret)
ret = xchg(&desc.ctx->error, 0);
}
out:
put_nfs_open_context(desc.ctx);
return ret;
out_unlock:
unlock_page(page);
return ret;
}
int nfs_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
struct nfs_pageio_descriptor pgio;
struct nfs_pgio_mirror *pgm;
struct nfs_readdesc desc = {
.pgio = &pgio,
};
struct nfs_readdesc desc;
struct inode *inode = mapping->host;
unsigned long npages;
int ret = -ESTALE;
int ret;
dprintk("NFS: nfs_readpages (%s/%Lu %d)\n",
inode->i_sb->s_id,
@ -422,15 +405,17 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
nr_pages);
nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);
ret = -ESTALE;
if (NFS_STALE(inode))
goto out;
if (filp == NULL) {
if (file == NULL) {
ret = -EBADF;
desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
if (desc.ctx == NULL)
return -EBADF;
goto out;
} else
desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
/* attempt to read as many of the pages as possible from the cache
* - this returns -ENOBUFS immediately if the cookie is negative
@ -440,20 +425,13 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
if (ret == 0)
goto read_complete; /* all pages were read */
nfs_pageio_init_read(&pgio, inode, false,
nfs_pageio_init_read(&desc.pgio, inode, false,
&nfs_async_read_completion_ops);
ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
nfs_pageio_complete(&pgio);
/* It doesn't make sense to do mirrored reads! */
WARN_ON_ONCE(pgio.pg_mirror_count != 1);
nfs_pageio_complete_read(&desc.pgio, inode);
pgm = &pgio.pg_mirrors[0];
NFS_I(inode)->read_io += pgm->pg_bytes_written;
npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >>
PAGE_SHIFT;
nfs_add_stats(inode, NFSIOS_READPAGES, npages);
read_complete:
put_nfs_open_context(desc.ctx);
out:

View File

@ -523,6 +523,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
seq_puts(m, ",local_lock=flock");
else
seq_puts(m, ",local_lock=posix");
if (nfss->flags & NFS_MOUNT_WRITE_EAGER) {
if (nfss->flags & NFS_MOUNT_WRITE_WAIT)
seq_puts(m, ",write=wait");
else
seq_puts(m, ",write=eager");
}
}
/*

View File

@ -712,16 +712,23 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
struct nfs_pageio_descriptor pgio;
struct nfs_io_completion *ioc;
struct nfs_io_completion *ioc = NULL;
unsigned int mntflags = NFS_SERVER(inode)->flags;
int priority = 0;
int err;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
ioc = nfs_io_completion_alloc(GFP_KERNEL);
if (ioc)
nfs_io_completion_init(ioc, nfs_io_completion_commit, inode);
if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate ||
wbc->for_background || wbc->for_sync || wbc->for_reclaim) {
ioc = nfs_io_completion_alloc(GFP_KERNEL);
if (ioc)
nfs_io_completion_init(ioc, nfs_io_completion_commit,
inode);
priority = wb_priority(wbc);
}
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
nfs_pageio_init_write(&pgio, inode, priority, false,
&nfs_async_write_completion_ops);
pgio.pg_io_completion = ioc;
err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
@ -1278,19 +1285,21 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
* the PageUptodate() flag. In this case, we will need to turn off
* write optimisations that depend on the page contents being correct.
*/
static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
static bool nfs_write_pageuptodate(struct page *page, struct inode *inode,
unsigned int pagelen)
{
struct nfs_inode *nfsi = NFS_I(inode);
if (nfs_have_delegated_attributes(inode))
goto out;
if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
if (nfsi->cache_validity &
(NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_SIZE))
return false;
smp_rmb();
if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags) && pagelen != 0)
return false;
out:
if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
if (nfsi->cache_validity & NFS_INO_INVALID_DATA && pagelen != 0)
return false;
return PageUptodate(page) != 0;
}
@ -1310,7 +1319,8 @@ is_whole_file_wrlock(struct file_lock *fl)
* If the file is opened for synchronous writes then we can just skip the rest
* of the checks.
*/
static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
static int nfs_can_extend_write(struct file *file, struct page *page,
struct inode *inode, unsigned int pagelen)
{
int ret;
struct file_lock_context *flctx = inode->i_flctx;
@ -1318,7 +1328,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino
if (file->f_flags & O_DSYNC)
return 0;
if (!nfs_write_pageuptodate(page, inode))
if (!nfs_write_pageuptodate(page, inode, pagelen))
return 0;
if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
return 1;
@ -1356,6 +1366,7 @@ int nfs_updatepage(struct file *file, struct page *page,
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct address_space *mapping = page_file_mapping(page);
struct inode *inode = mapping->host;
unsigned int pagelen = nfs_page_length(page);
int status = 0;
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
@ -1366,8 +1377,8 @@ int nfs_updatepage(struct file *file, struct page *page,
if (!count)
goto out;
if (nfs_can_extend_write(file, page, inode)) {
count = max(count + offset, nfs_page_length(page));
if (nfs_can_extend_write(file, page, inode, pagelen)) {
count = max(count + offset, pagelen);
offset = 0;
}

View File

@ -388,6 +388,7 @@ extern int nfs_open(struct inode *, struct file *);
extern int nfs_attribute_cache_expired(struct inode *inode);
extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
extern int nfs_clear_invalid_mapping(struct address_space *mapping);
extern bool nfs_mapping_need_revalidate_inode(struct inode *inode);
extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
extern int nfs_revalidate_mapping_rcu(struct inode *inode);
@ -571,8 +572,6 @@ nfs_have_writebacks(struct inode *inode)
extern int nfs_readpage(struct file *, struct page *);
extern int nfs_readpages(struct file *, struct address_space *,
struct list_head *, unsigned);
extern int nfs_readpage_async(struct nfs_open_context *, struct inode *,
struct page *);
/*
* inline functions

View File

@ -142,7 +142,7 @@ struct nfs_server {
struct nlm_host *nlm_host; /* NLM client handle */
struct nfs_iostats __percpu *io_stats; /* I/O statistics */
atomic_long_t writeback; /* number of writeback pages */
int flags; /* various flags */
unsigned int flags; /* various flags */
/* The following are for internal use only. Also see uapi/linux/nfs_mount.h */
#define NFS_MOUNT_LOOKUP_CACHE_NONEG 0x10000
@ -153,6 +153,8 @@ struct nfs_server {
#define NFS_MOUNT_LOCAL_FCNTL 0x200000
#define NFS_MOUNT_SOFTERR 0x400000
#define NFS_MOUNT_SOFTREVAL 0x800000
#define NFS_MOUNT_WRITE_EAGER 0x01000000
#define NFS_MOUNT_WRITE_WAIT 0x02000000
unsigned int caps; /* server capabilities */
unsigned int rsize; /* read size */

View File

@ -60,6 +60,51 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class,
), \
TP_ARGS(wc, cid))
DECLARE_EVENT_CLASS(rpcrdma_receive_completion_class,
TP_PROTO(
const struct ib_wc *wc,
const struct rpc_rdma_cid *cid
),
TP_ARGS(wc, cid),
TP_STRUCT__entry(
__field(u32, cq_id)
__field(int, completion_id)
__field(u32, received)
__field(unsigned long, status)
__field(unsigned int, vendor_err)
),
TP_fast_assign(
__entry->cq_id = cid->ci_queue_id;
__entry->completion_id = cid->ci_completion_id;
__entry->status = wc->status;
if (wc->status) {
__entry->received = 0;
__entry->vendor_err = wc->vendor_err;
} else {
__entry->received = wc->byte_len;
__entry->vendor_err = 0;
}
),
TP_printk("cq.id=%u cid=%d status=%s (%lu/0x%x) received=%u",
__entry->cq_id, __entry->completion_id,
rdma_show_wc_status(__entry->status),
__entry->status, __entry->vendor_err,
__entry->received
)
);
#define DEFINE_RECEIVE_COMPLETION_EVENT(name) \
DEFINE_EVENT(rpcrdma_receive_completion_class, name, \
TP_PROTO( \
const struct ib_wc *wc, \
const struct rpc_rdma_cid *cid \
), \
TP_ARGS(wc, cid))
DECLARE_EVENT_CLASS(xprtrdma_reply_class,
TP_PROTO(
const struct rpcrdma_rep *rep
@ -838,7 +883,8 @@ TRACE_EVENT(xprtrdma_post_linv_err,
** Completion events
**/
DEFINE_COMPLETION_EVENT(xprtrdma_wc_receive);
DEFINE_RECEIVE_COMPLETION_EVENT(xprtrdma_wc_receive);
DEFINE_COMPLETION_EVENT(xprtrdma_wc_send);
DEFINE_COMPLETION_EVENT(xprtrdma_wc_fastreg);
DEFINE_COMPLETION_EVENT(xprtrdma_wc_li);
@ -1790,7 +1836,7 @@ TRACE_EVENT(svcrdma_post_recv,
)
);
DEFINE_COMPLETION_EVENT(svcrdma_wc_receive);
DEFINE_RECEIVE_COMPLETION_EVENT(svcrdma_wc_receive);
TRACE_EVENT(svcrdma_rq_post_err,
TP_PROTO(

View File

@ -478,6 +478,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode)
inode->i_fop = &simple_dir_operations;
inode->i_op = &simple_dir_inode_operations;
inc_nlink(inode);
break;
default:
break;
}

View File

@ -2,7 +2,7 @@
/*
* Copyright (c) 2015-2020, Oracle and/or its affiliates.
*
* Support for backward direction RPCs on RPC/RDMA.
* Support for reverse-direction RPCs on RPC/RDMA.
*/
#include <linux/sunrpc/xprt.h>
@ -208,7 +208,7 @@ create_req:
}
/**
* rpcrdma_bc_receive_call - Handle a backward direction call
* rpcrdma_bc_receive_call - Handle a reverse-direction Call
* @r_xprt: transport receiving the call
* @rep: receive buffer containing the call
*

View File

@ -306,20 +306,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
if (nsegs > ep->re_max_fr_depth)
nsegs = ep->re_max_fr_depth;
for (i = 0; i < nsegs;) {
if (seg->mr_page)
sg_set_page(&mr->mr_sg[i],
seg->mr_page,
seg->mr_len,
offset_in_page(seg->mr_offset));
else
sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
seg->mr_len);
sg_set_page(&mr->mr_sg[i], seg->mr_page,
seg->mr_len, seg->mr_offset);
++seg;
++i;
if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
continue;
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
if ((i < nsegs && seg->mr_offset) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}

View File

@ -204,9 +204,7 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
return 0;
}
/* Split @vec on page boundaries into SGEs. FMR registers pages, not
* a byte range. Other modes coalesce these SGEs into a single MR
* when they can.
/* Convert @vec to a single SGL element.
*
* Returns pointer to next available SGE, and bumps the total number
* of SGEs consumed.
@ -215,22 +213,11 @@ static struct rpcrdma_mr_seg *
rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
unsigned int *n)
{
u32 remaining, page_offset;
char *base;
base = vec->iov_base;
page_offset = offset_in_page(base);
remaining = vec->iov_len;
while (remaining) {
seg->mr_page = NULL;
seg->mr_offset = base;
seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
remaining -= seg->mr_len;
base += seg->mr_len;
++seg;
++(*n);
page_offset = 0;
}
seg->mr_page = virt_to_page(vec->iov_base);
seg->mr_offset = offset_in_page(vec->iov_base);
seg->mr_len = vec->iov_len;
++seg;
++(*n);
return seg;
}
@ -259,7 +246,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
page_base = offset_in_page(xdrbuf->page_base);
while (len) {
seg->mr_page = *ppages;
seg->mr_offset = (char *)page_base;
seg->mr_offset = page_base;
seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
len -= seg->mr_len;
++ppages;
@ -268,10 +255,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
page_base = 0;
}
/* When encoding a Read chunk, the tail iovec contains an
* XDR pad and may be omitted.
*/
if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup)
if (type == rpcrdma_readch)
goto out;
/* When encoding a Write chunk, some servers need to see an
@ -283,7 +267,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
goto out;
if (xdrbuf->tail[0].iov_len)
seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);
rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);
out:
if (unlikely(n > RPCRDMA_MAX_SEGS))
@ -644,9 +628,8 @@ out_mapping_err:
return false;
}
/* The tail iovec may include an XDR pad for the page list,
* as well as additional content, and may not reside in the
* same page as the head iovec.
/* The tail iovec might not reside in the same page as the
* head iovec.
*/
static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
struct xdr_buf *xdr,
@ -764,27 +747,19 @@ static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req,
struct xdr_buf *xdr)
{
struct kvec *tail = &xdr->tail[0];
if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
return false;
/* If there is a Read chunk, the page list is being handled
/* If there is a Read chunk, the page list is handled
* via explicit RDMA, and thus is skipped here.
*/
/* Do not include the tail if it is only an XDR pad */
if (xdr->tail[0].iov_len > 3) {
unsigned int page_base, len;
/* If the content in the page list is an odd length,
* xdr_write_pages() adds a pad at the beginning of
* the tail iovec. Force the tail's non-pad content to
* land at the next XDR position in the Send message.
*/
page_base = offset_in_page(xdr->tail[0].iov_base);
len = xdr->tail[0].iov_len;
page_base += len & 3;
len -= len & 3;
if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len))
if (tail->iov_len) {
if (!rpcrdma_prepare_tail_iov(req, xdr,
offset_in_page(tail->iov_base),
tail->iov_len))
return false;
kref_get(&req->rl_kref);
}
@ -1164,14 +1139,10 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
*/
p = xdr_inline_decode(xdr, 3 * sizeof(*p));
if (unlikely(!p))
goto out_short;
return true;
rpcrdma_bc_receive_call(r_xprt, rep);
return true;
out_short:
pr_warn("RPC/RDMA short backward direction call\n");
return true;
}
#else /* CONFIG_SUNRPC_BACKCHANNEL */
{

View File

@ -2,7 +2,7 @@
/*
* Copyright (c) 2015-2018 Oracle. All rights reserved.
*
* Support for backward direction RPCs on RPC/RDMA (server-side).
* Support for reverse-direction RPCs on RPC/RDMA (server-side).
*/
#include <linux/sunrpc/svc_rdma.h>
@ -59,7 +59,7 @@ out_unlock:
spin_unlock(&xprt->queue_lock);
}
/* Send a backwards direction RPC call.
/* Send a reverse-direction RPC Call.
*
* Caller holds the connection's mutex and has already marshaled
* the RPC/RDMA request.

View File

@ -98,9 +98,9 @@ struct rpcrdma_ep {
atomic_t re_completion_ids;
};
/* Pre-allocate extra Work Requests for handling backward receives
* and sends. This is a fixed value because the Work Queues are
* allocated when the forward channel is set up, long before the
/* Pre-allocate extra Work Requests for handling reverse-direction
* Receives and Sends. This is a fixed value because the Work Queues
* are allocated when the forward channel is set up, long before the
* backchannel is provisioned. This value is two times
* NFS4_DEF_CB_SLOT_TABLE_SIZE.
*/
@ -283,10 +283,11 @@ enum {
RPCRDMA_MAX_IOV_SEGS,
};
struct rpcrdma_mr_seg { /* chunk descriptors */
u32 mr_len; /* length of chunk or segment */
struct page *mr_page; /* owning page, if any */
char *mr_offset; /* kva if no page, else offset */
/* Arguments for DMA mapping and registration */
struct rpcrdma_mr_seg {
u32 mr_len; /* length of segment */
struct page *mr_page; /* underlying struct page */
u64 mr_offset; /* IN: page offset, OUT: iova */
};
/* The Send SGE array is provisioned to send a maximum size

View File

@ -829,7 +829,7 @@ xs_stream_record_marker(struct xdr_buf *xdr)
* EAGAIN: The socket was blocked, please call again later to
* complete the request
* ENOTCONN: Caller needs to invoke connect logic then call again
* other: Some other error occured, the request was not sent
* other: Some other error occurred, the request was not sent
*/
static int xs_local_send_request(struct rpc_rqst *req)
{
@ -1665,7 +1665,7 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
* This ensures that we can continue to establish TCP
* connections even when all local ephemeral ports are already
* a part of some TCP connection. This makes no difference
* for UDP sockets, but also doens't harm them.
* for UDP sockets, but also doesn't harm them.
*
* If we're asking for any reserved port (i.e. port == 0 &&
* transport->xprt.resvport == 1) xs_get_srcport above will
@ -1875,6 +1875,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
xprt->stat.connect_time += (long)jiffies -
xprt->stat.connect_start;
xprt_set_connected(xprt);
break;
case -ENOBUFS:
break;
case -ENOENT:
@ -2276,10 +2277,8 @@ static void xs_tcp_setup_socket(struct work_struct *work)
case -EHOSTUNREACH:
case -EADDRINUSE:
case -ENOBUFS:
/*
* xs_tcp_force_close() wakes tasks with -EIO.
* We need to wake them first to ensure the
* correct error code.
/* xs_tcp_force_close() wakes tasks with a fixed error code.
* We need to wake them first to ensure the correct error code.
*/
xprt_wake_pending_tasks(xprt, status);
xs_tcp_force_close(xprt);
@ -2380,7 +2379,7 @@ static void xs_error_handle(struct work_struct *work)
}
/**
* xs_local_print_stats - display AF_LOCAL socket-specifc stats
* xs_local_print_stats - display AF_LOCAL socket-specific stats
* @xprt: rpc_xprt struct containing statistics
* @seq: output file
*
@ -2409,7 +2408,7 @@ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
}
/**
* xs_udp_print_stats - display UDP socket-specifc stats
* xs_udp_print_stats - display UDP socket-specific stats
* @xprt: rpc_xprt struct containing statistics
* @seq: output file
*
@ -2433,7 +2432,7 @@ static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
}
/**
* xs_tcp_print_stats - display TCP socket-specifc stats
* xs_tcp_print_stats - display TCP socket-specific stats
* @xprt: rpc_xprt struct containing statistics
* @seq: output file
*