diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 63940a7a70be..16ad5050e046 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -89,7 +89,7 @@ nfs_file_release(struct inode *inode, struct file *filp) EXPORT_SYMBOL_GPL(nfs_file_release); /** - * nfs_revalidate_size - Revalidate the file size + * nfs_revalidate_file_size - Revalidate the file size * @inode: pointer to inode struct * @filp: pointer to struct file * @@ -606,8 +606,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - unsigned long written = 0; - ssize_t result; + unsigned int mntflags = NFS_SERVER(inode)->flags; + ssize_t result, written; errseq_t since; int error; @@ -626,13 +626,13 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) /* * O_APPEND implies that we must revalidate the file length. */ - if (iocb->ki_flags & IOCB_APPEND) { + if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) { result = nfs_revalidate_file_size(inode, file); if (result) goto out; } - if (iocb->ki_pos > i_size_read(inode)) - nfs_revalidate_mapping(inode, file->f_mapping); + + nfs_clear_invalid_mapping(file->f_mapping); since = filemap_sample_wb_err(file->f_mapping); nfs_start_io_write(inode); @@ -648,6 +648,21 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) written = result; iocb->ki_pos += written; + + if (mntflags & NFS_MOUNT_WRITE_EAGER) { + result = filemap_fdatawrite_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); + if (result < 0) + goto out; + } + if (mntflags & NFS_MOUNT_WRITE_WAIT) { + result = filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); + if (result < 0) + goto out; + } result = generic_write_sync(iocb, written); if (result < 0) goto out; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 06894bcdea2d..971a9251c1d9 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -82,6 +82,7 @@ enum nfs_param { Opt_v, Opt_vers, Opt_wsize, + Opt_write, }; enum { @@ -113,6 +114,19 @@ static const struct constant_table nfs_param_enums_lookupcache[] = { {} }; +enum { + Opt_write_lazy, + Opt_write_eager, + Opt_write_wait, +}; + +static const struct constant_table nfs_param_enums_write[] = { + { "lazy", Opt_write_lazy }, + { "eager", Opt_write_eager }, + { "wait", Opt_write_wait }, + {} +}; + static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_flag_no("ac", Opt_ac), fsparam_u32 ("acdirmax", Opt_acdirmax), @@ -171,6 +185,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_flag ("v4.1", Opt_v), fsparam_flag ("v4.2", Opt_v), fsparam_string("vers", Opt_vers), + fsparam_enum ("write", Opt_write, nfs_param_enums_write), fsparam_u32 ("wsize", Opt_wsize), {} }; @@ -770,6 +785,24 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, goto out_invalid_value; } break; + case Opt_write: + switch (result.uint_32) { + case Opt_write_lazy: + ctx->flags &= + ~(NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT); + break; + case Opt_write_eager: + ctx->flags |= NFS_MOUNT_WRITE_EAGER; + ctx->flags &= ~NFS_MOUNT_WRITE_WAIT; + break; + case Opt_write_wait: + ctx->flags |= + NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT; + break; + default: + goto out_invalid_value; + } + break; /* * Special options @@ -1479,6 +1512,8 @@ static int nfs_init_fs_context(struct fs_context *fc) ctx->selected_flavor = RPC_AUTH_MAXFLAVOR; ctx->minorversion = 0; ctx->need_mount = true; + + fc->s_iflags |= SB_I_STABLE_WRITES; } fc->fs_private = ctx; fc->ops = &nfs_fs_context_ops; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index a60df88efc40..c4c021c6ebbd 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -390,10 +390,6 @@ static void nfs_readpage_from_fscache_complete(struct page *page, if (!error) { SetPageUptodate(page); unlock_page(page); - } else { - error = nfs_readpage_async(context, page->mapping->host, page); - if (error) - unlock_page(page); } } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 447e95974386..749bbea14d99 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -195,6 +195,18 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) } EXPORT_SYMBOL_GPL(nfs_check_cache_invalid); +#ifdef CONFIG_NFS_V4_2 +static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) +{ + return nfsi->xattr_cache != NULL; +} +#else +static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) +{ + return false; +} +#endif + static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); @@ -209,6 +221,8 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) | NFS_INO_INVALID_XATTR); } + if (!nfs_has_xattr_cache(nfsi)) + flags &= ~NFS_INO_INVALID_XATTR; if (inode->i_mapping->nrpages == 0) flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER); nfsi->cache_validity |= flags; @@ -1258,6 +1272,63 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map return 0; } +/** + * nfs_clear_invalid_mapping - Conditionally clear a mapping + * @mapping: pointer to mapping + * + * If the NFS_INO_INVALID_DATA inode flag is set, clear the mapping. + */ +int nfs_clear_invalid_mapping(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long *bitlock = &nfsi->flags; + int ret = 0; + + /* + * We must clear NFS_INO_INVALID_DATA first to ensure that + * invalidations that come in while we're shooting down the mappings + * are respected. But, that leaves a race window where one revalidator + * can clear the flag, and then another checks it before the mapping + * gets invalidated. Fix that by serializing access to this part of + * the function. + * + * At the same time, we need to allow other tasks to see whether we + * might be in the middle of invalidating the pages, so we only set + * the bit lock here if it looks like we're going to be doing that. + */ + for (;;) { + ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + goto out; + spin_lock(&inode->i_lock); + if (test_bit(NFS_INO_INVALIDATING, bitlock)) { + spin_unlock(&inode->i_lock); + continue; + } + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + break; + spin_unlock(&inode->i_lock); + goto out; + } + + set_bit(NFS_INO_INVALIDATING, bitlock); + smp_wmb(); + nfsi->cache_validity &= + ~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER); + spin_unlock(&inode->i_lock); + trace_nfs_invalidate_mapping_enter(inode); + ret = nfs_invalidate_mapping(inode, mapping); + trace_nfs_invalidate_mapping_exit(inode, ret); + + clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_INO_INVALIDATING); +out: + return ret; +} + bool nfs_mapping_need_revalidate_inode(struct inode *inode) { return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) || @@ -1290,65 +1361,19 @@ out: * @inode: pointer to host inode * @mapping: pointer to mapping */ -int nfs_revalidate_mapping(struct inode *inode, - struct address_space *mapping) +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) { - struct nfs_inode *nfsi = NFS_I(inode); - unsigned long *bitlock = &nfsi->flags; - int ret = 0; - /* swapfiles are not supposed to be shared. */ if (IS_SWAPFILE(inode)) - goto out; + return 0; if (nfs_mapping_need_revalidate_inode(inode)) { - ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + int ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (ret < 0) - goto out; + return ret; } - /* - * We must clear NFS_INO_INVALID_DATA first to ensure that - * invalidations that come in while we're shooting down the mappings - * are respected. But, that leaves a race window where one revalidator - * can clear the flag, and then another checks it before the mapping - * gets invalidated. Fix that by serializing access to this part of - * the function. - * - * At the same time, we need to allow other tasks to see whether we - * might be in the middle of invalidating the pages, so we only set - * the bit lock here if it looks like we're going to be doing that. - */ - for (;;) { - ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (ret) - goto out; - spin_lock(&inode->i_lock); - if (test_bit(NFS_INO_INVALIDATING, bitlock)) { - spin_unlock(&inode->i_lock); - continue; - } - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - break; - spin_unlock(&inode->i_lock); - goto out; - } - - set_bit(NFS_INO_INVALIDATING, bitlock); - smp_wmb(); - nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA| - NFS_INO_DATA_INVAL_DEFER); - spin_unlock(&inode->i_lock); - trace_nfs_invalidate_mapping_enter(inode); - ret = nfs_invalidate_mapping(inode, mapping); - trace_nfs_invalidate_mapping_exit(inode, ret); - - clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); - smp_mb__after_atomic(); - wake_up_bit(bitlock, NFS_INO_INVALIDATING); -out: - return ret; + return nfs_clear_invalid_mapping(mapping); } static bool nfs_file_has_writers(struct nfs_inode *nfsi) diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 5604e807fc01..bb386a691e69 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -111,6 +111,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) fallthrough; case -ENOTSUPP: status = -EOPNOTSUPP; + goto getout; default: goto getout; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 86acffe7335c..889a9f4c0310 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -609,6 +609,7 @@ found: * changed. Schedule recovery! */ nfs4_schedule_path_down_recovery(pos); + goto out; default: goto out; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a07530cf673d..74bc5120013d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -71,10 +71,6 @@ #include "nfs4trace.h" -#ifdef CONFIG_NFS_V4_2 -#include "nfs42.h" -#endif /* CONFIG_NFS_V4_2 */ - #define NFSDBG_FACILITY NFSDBG_PROC #define NFS4_BITMASK_SZ 3 @@ -2231,6 +2227,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct default: printk(KERN_ERR "NFS: %s: unhandled error " "%d.\n", __func__, err); + fallthrough; case 0: case -ENOENT: case -EAGAIN: @@ -5438,15 +5435,16 @@ static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode, if (cache_validity & NFS_INO_INVALID_ATIME) bitmask[1] |= FATTR4_WORD1_TIME_ACCESS; - if (cache_validity & NFS_INO_INVALID_ACCESS) - bitmask[0] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | - FATTR4_WORD1_OWNER_GROUP; - if (cache_validity & NFS_INO_INVALID_ACL) - bitmask[0] |= FATTR4_WORD0_ACL; - if (cache_validity & NFS_INO_INVALID_LABEL) + if (cache_validity & NFS_INO_INVALID_OTHER) + bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | + FATTR4_WORD1_OWNER_GROUP | + FATTR4_WORD1_NUMLINKS; + if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL) bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL; - if (cache_validity & NFS_INO_INVALID_CTIME) + if (cache_validity & NFS_INO_INVALID_CHANGE) bitmask[0] |= FATTR4_WORD0_CHANGE; + if (cache_validity & NFS_INO_INVALID_CTIME) + bitmask[1] |= FATTR4_WORD1_TIME_METADATA; if (cache_validity & NFS_INO_INVALID_MTIME) bitmask[1] |= FATTR4_WORD1_TIME_MODIFY; if (cache_validity & NFS_INO_INVALID_SIZE) @@ -9708,6 +9706,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) case -NFS4ERR_BADLAYOUT: /* no layout */ case -NFS4ERR_GRACE: /* loca_recalim always false */ task->tk_status = 0; + break; case 0: break; default: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 4bf10792cb5b..3a51351bdc6a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1125,6 +1125,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) " sequence-id error on an" " unconfirmed sequence %p!\n", seqid->sequence); + return; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index af64b4e6fd1f..102b66e0bdef 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2875,6 +2875,7 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc, switch (trypnfs) { case PNFS_NOT_ATTEMPTED: pnfs_write_through_mds(desc, hdr); + break; case PNFS_ATTEMPTED: break; case PNFS_TRY_AGAIN: @@ -3019,6 +3020,7 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) switch (trypnfs) { case PNFS_NOT_ATTEMPTED: pnfs_read_through_mds(desc, hdr); + break; case PNFS_ATTEMPTED: break; case PNFS_TRY_AGAIN: diff --git a/fs/nfs/read.c b/fs/nfs/read.c index eb854f1f86e2..d2b6dce1f99f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -74,6 +74,24 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); +static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + struct nfs_pgio_mirror *pgm; + unsigned long npages; + + nfs_pageio_complete(pgio); + + /* It doesn't make sense to do mirrored reads! */ + WARN_ON_ONCE(pgio->pg_mirror_count != 1); + + pgm = &pgio->pg_mirrors[0]; + NFS_I(inode)->read_io += pgm->pg_bytes_written; + npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); +} + + void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *mirror; @@ -114,41 +132,10 @@ static void nfs_readpage_release(struct nfs_page *req, int error) nfs_release_request(req); } -int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, - struct page *page) -{ - struct nfs_page *new; - unsigned int len; +struct nfs_readdesc { struct nfs_pageio_descriptor pgio; - struct nfs_pgio_mirror *pgm; - - len = nfs_page_length(page); - if (len == 0) - return nfs_return_empty_page(page); - new = nfs_create_request(ctx, page, 0, len); - if (IS_ERR(new)) { - unlock_page(page); - return PTR_ERR(new); - } - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); - - nfs_pageio_init_read(&pgio, inode, false, - &nfs_async_read_completion_ops); - if (!nfs_pageio_add_request(&pgio, new)) { - nfs_list_remove_request(new); - nfs_readpage_release(new, pgio.pg_error); - } - nfs_pageio_complete(&pgio); - - /* It doesn't make sense to do mirrored reads! */ - WARN_ON_ONCE(pgio.pg_mirror_count != 1); - - pgm = &pgio.pg_mirrors[0]; - NFS_I(inode)->read_io += pgm->pg_bytes_written; - - return pgio.pg_error < 0 ? pgio.pg_error : 0; -} + struct nfs_open_context *ctx; +}; static void nfs_page_group_set_uptodate(struct nfs_page *req) { @@ -171,8 +158,7 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { /* note: regions of the page not covered by a - * request are zeroed in nfs_readpage_async / - * readpage_async_filler */ + * request are zeroed in readpage_async_filler */ if (bytes > hdr->good_bytes) { /* nothing in this request was good, so zero * the full extent of the request */ @@ -304,78 +290,10 @@ static void nfs_readpage_result(struct rpc_task *task, nfs_readpage_retry(task, hdr); } -/* - * Read a page over NFS. - * We read the page synchronously in the following case: - * - The error flag is set for this page. This happens only when a - * previous async read operation failed. - */ -int nfs_readpage(struct file *file, struct page *page) -{ - struct nfs_open_context *ctx; - struct inode *inode = page_file_mapping(page)->host; - int error; - - dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", - page, PAGE_SIZE, page_index(page)); - nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); - nfs_add_stats(inode, NFSIOS_READPAGES, 1); - - /* - * Try to flush any pending writes to the file.. - * - * NOTE! Because we own the page lock, there cannot - * be any new pending writes generated at this point - * for this page (other pages can be written to). - */ - error = nfs_wb_page(inode, page); - if (error) - goto out_unlock; - if (PageUptodate(page)) - goto out_unlock; - - error = -ESTALE; - if (NFS_STALE(inode)) - goto out_unlock; - - if (file == NULL) { - error = -EBADF; - ctx = nfs_find_open_context(inode, NULL, FMODE_READ); - if (ctx == NULL) - goto out_unlock; - } else - ctx = get_nfs_open_context(nfs_file_open_context(file)); - - if (!IS_SYNC(inode)) { - error = nfs_readpage_from_fscache(ctx, inode, page); - if (error == 0) - goto out; - } - - xchg(&ctx->error, 0); - error = nfs_readpage_async(ctx, inode, page); - if (!error) { - error = wait_on_page_locked_killable(page); - if (!PageUptodate(page) && !error) - error = xchg(&ctx->error, 0); - } -out: - put_nfs_open_context(ctx); - return error; -out_unlock: - unlock_page(page); - return error; -} - -struct nfs_readdesc { - struct nfs_pageio_descriptor *pgio; - struct nfs_open_context *ctx; -}; - static int readpage_async_filler(void *data, struct page *page) { - struct nfs_readdesc *desc = (struct nfs_readdesc *)data; + struct nfs_readdesc *desc = data; struct nfs_page *new; unsigned int len; int error; @@ -390,9 +308,9 @@ readpage_async_filler(void *data, struct page *page) if (len < PAGE_SIZE) zero_user_segment(page, len, PAGE_SIZE); - if (!nfs_pageio_add_request(desc->pgio, new)) { + if (!nfs_pageio_add_request(&desc->pgio, new)) { nfs_list_remove_request(new); - error = desc->pgio->pg_error; + error = desc->pgio.pg_error; nfs_readpage_release(new, error); goto out; } @@ -404,17 +322,82 @@ out: return error; } -int nfs_readpages(struct file *filp, struct address_space *mapping, +/* + * Read a page over NFS. + * We read the page synchronously in the following case: + * - The error flag is set for this page. This happens only when a + * previous async read operation failed. + */ +int nfs_readpage(struct file *file, struct page *page) +{ + struct nfs_readdesc desc; + struct inode *inode = page_file_mapping(page)->host; + int ret; + + dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", + page, PAGE_SIZE, page_index(page)); + nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); + + /* + * Try to flush any pending writes to the file.. + * + * NOTE! Because we own the page lock, there cannot + * be any new pending writes generated at this point + * for this page (other pages can be written to). + */ + ret = nfs_wb_page(inode, page); + if (ret) + goto out_unlock; + if (PageUptodate(page)) + goto out_unlock; + + ret = -ESTALE; + if (NFS_STALE(inode)) + goto out_unlock; + + if (file == NULL) { + ret = -EBADF; + desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); + if (desc.ctx == NULL) + goto out_unlock; + } else + desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); + + if (!IS_SYNC(inode)) { + ret = nfs_readpage_from_fscache(desc.ctx, inode, page); + if (ret == 0) + goto out; + } + + xchg(&desc.ctx->error, 0); + nfs_pageio_init_read(&desc.pgio, inode, false, + &nfs_async_read_completion_ops); + + ret = readpage_async_filler(&desc, page); + + if (!ret) + nfs_pageio_complete_read(&desc.pgio, inode); + + ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0; + if (!ret) { + ret = wait_on_page_locked_killable(page); + if (!PageUptodate(page) && !ret) + ret = xchg(&desc.ctx->error, 0); + } +out: + put_nfs_open_context(desc.ctx); + return ret; +out_unlock: + unlock_page(page); + return ret; +} + +int nfs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct nfs_pageio_descriptor pgio; - struct nfs_pgio_mirror *pgm; - struct nfs_readdesc desc = { - .pgio = &pgio, - }; + struct nfs_readdesc desc; struct inode *inode = mapping->host; - unsigned long npages; - int ret = -ESTALE; + int ret; dprintk("NFS: nfs_readpages (%s/%Lu %d)\n", inode->i_sb->s_id, @@ -422,15 +405,17 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, nr_pages); nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); + ret = -ESTALE; if (NFS_STALE(inode)) goto out; - if (filp == NULL) { + if (file == NULL) { + ret = -EBADF; desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); if (desc.ctx == NULL) - return -EBADF; + goto out; } else - desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); + desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); /* attempt to read as many of the pages as possible from the cache * - this returns -ENOBUFS immediately if the cookie is negative @@ -440,20 +425,13 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - nfs_pageio_init_read(&pgio, inode, false, + nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); - nfs_pageio_complete(&pgio); - /* It doesn't make sense to do mirrored reads! */ - WARN_ON_ONCE(pgio.pg_mirror_count != 1); + nfs_pageio_complete_read(&desc.pgio, inode); - pgm = &pgio.pg_mirrors[0]; - NFS_I(inode)->read_io += pgm->pg_bytes_written; - npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> - PAGE_SHIFT; - nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: put_nfs_open_context(desc.ctx); out: diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c7a924580eec..94885c6f8f54 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -523,6 +523,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, seq_puts(m, ",local_lock=flock"); else seq_puts(m, ",local_lock=posix"); + + if (nfss->flags & NFS_MOUNT_WRITE_EAGER) { + if (nfss->flags & NFS_MOUNT_WRITE_WAIT) + seq_puts(m, ",write=wait"); + else + seq_puts(m, ",write=eager"); + } } /* diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 639c34fec04a..82bdcb982186 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -712,16 +712,23 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct nfs_pageio_descriptor pgio; - struct nfs_io_completion *ioc; + struct nfs_io_completion *ioc = NULL; + unsigned int mntflags = NFS_SERVER(inode)->flags; + int priority = 0; int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); - ioc = nfs_io_completion_alloc(GFP_KERNEL); - if (ioc) - nfs_io_completion_init(ioc, nfs_io_completion_commit, inode); + if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate || + wbc->for_background || wbc->for_sync || wbc->for_reclaim) { + ioc = nfs_io_completion_alloc(GFP_KERNEL); + if (ioc) + nfs_io_completion_init(ioc, nfs_io_completion_commit, + inode); + priority = wb_priority(wbc); + } - nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, + nfs_pageio_init_write(&pgio, inode, priority, false, &nfs_async_write_completion_ops); pgio.pg_io_completion = ioc; err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); @@ -1278,19 +1285,21 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode) * the PageUptodate() flag. In this case, we will need to turn off * write optimisations that depend on the page contents being correct. */ -static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) +static bool nfs_write_pageuptodate(struct page *page, struct inode *inode, + unsigned int pagelen) { struct nfs_inode *nfsi = NFS_I(inode); if (nfs_have_delegated_attributes(inode)) goto out; - if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + if (nfsi->cache_validity & + (NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_SIZE)) return false; smp_rmb(); - if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) + if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags) && pagelen != 0) return false; out: - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + if (nfsi->cache_validity & NFS_INO_INVALID_DATA && pagelen != 0) return false; return PageUptodate(page) != 0; } @@ -1310,7 +1319,8 @@ is_whole_file_wrlock(struct file_lock *fl) * If the file is opened for synchronous writes then we can just skip the rest * of the checks. */ -static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) +static int nfs_can_extend_write(struct file *file, struct page *page, + struct inode *inode, unsigned int pagelen) { int ret; struct file_lock_context *flctx = inode->i_flctx; @@ -1318,7 +1328,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino if (file->f_flags & O_DSYNC) return 0; - if (!nfs_write_pageuptodate(page, inode)) + if (!nfs_write_pageuptodate(page, inode, pagelen)) return 0; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return 1; @@ -1356,6 +1366,7 @@ int nfs_updatepage(struct file *file, struct page *page, struct nfs_open_context *ctx = nfs_file_open_context(file); struct address_space *mapping = page_file_mapping(page); struct inode *inode = mapping->host; + unsigned int pagelen = nfs_page_length(page); int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); @@ -1366,8 +1377,8 @@ int nfs_updatepage(struct file *file, struct page *page, if (!count) goto out; - if (nfs_can_extend_write(file, page, inode)) { - count = max(count + offset, nfs_page_length(page)); + if (nfs_can_extend_write(file, page, inode, pagelen)) { + count = max(count + offset, pagelen); offset = 0; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 8c6c4e32fc2f..eadaabd18dc7 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -388,6 +388,7 @@ extern int nfs_open(struct inode *, struct file *); extern int nfs_attribute_cache_expired(struct inode *inode); extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); +extern int nfs_clear_invalid_mapping(struct address_space *mapping); extern bool nfs_mapping_need_revalidate_inode(struct inode *inode); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); extern int nfs_revalidate_mapping_rcu(struct inode *inode); @@ -571,8 +572,6 @@ nfs_have_writebacks(struct inode *inode) extern int nfs_readpage(struct file *, struct page *); extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); -extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, - struct page *); /* * inline functions diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 38e60ec742df..6f76b32a0238 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -142,7 +142,7 @@ struct nfs_server { struct nlm_host *nlm_host; /* NLM client handle */ struct nfs_iostats __percpu *io_stats; /* I/O statistics */ atomic_long_t writeback; /* number of writeback pages */ - int flags; /* various flags */ + unsigned int flags; /* various flags */ /* The following are for internal use only. Also see uapi/linux/nfs_mount.h */ #define NFS_MOUNT_LOOKUP_CACHE_NONEG 0x10000 @@ -153,6 +153,8 @@ struct nfs_server { #define NFS_MOUNT_LOCAL_FCNTL 0x200000 #define NFS_MOUNT_SOFTERR 0x400000 #define NFS_MOUNT_SOFTREVAL 0x800000 +#define NFS_MOUNT_WRITE_EAGER 0x01000000 +#define NFS_MOUNT_WRITE_WAIT 0x02000000 unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 76e85e16854b..c838e7ac1c2d 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -60,6 +60,51 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class, ), \ TP_ARGS(wc, cid)) +DECLARE_EVENT_CLASS(rpcrdma_receive_completion_class, + TP_PROTO( + const struct ib_wc *wc, + const struct rpc_rdma_cid *cid + ), + + TP_ARGS(wc, cid), + + TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) + __field(u32, received) + __field(unsigned long, status) + __field(unsigned int, vendor_err) + ), + + TP_fast_assign( + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; + __entry->status = wc->status; + if (wc->status) { + __entry->received = 0; + __entry->vendor_err = wc->vendor_err; + } else { + __entry->received = wc->byte_len; + __entry->vendor_err = 0; + } + ), + + TP_printk("cq.id=%u cid=%d status=%s (%lu/0x%x) received=%u", + __entry->cq_id, __entry->completion_id, + rdma_show_wc_status(__entry->status), + __entry->status, __entry->vendor_err, + __entry->received + ) +); + +#define DEFINE_RECEIVE_COMPLETION_EVENT(name) \ + DEFINE_EVENT(rpcrdma_receive_completion_class, name, \ + TP_PROTO( \ + const struct ib_wc *wc, \ + const struct rpc_rdma_cid *cid \ + ), \ + TP_ARGS(wc, cid)) + DECLARE_EVENT_CLASS(xprtrdma_reply_class, TP_PROTO( const struct rpcrdma_rep *rep @@ -838,7 +883,8 @@ TRACE_EVENT(xprtrdma_post_linv_err, ** Completion events **/ -DEFINE_COMPLETION_EVENT(xprtrdma_wc_receive); +DEFINE_RECEIVE_COMPLETION_EVENT(xprtrdma_wc_receive); + DEFINE_COMPLETION_EVENT(xprtrdma_wc_send); DEFINE_COMPLETION_EVENT(xprtrdma_wc_fastreg); DEFINE_COMPLETION_EVENT(xprtrdma_wc_li); @@ -1790,7 +1836,7 @@ TRACE_EVENT(svcrdma_post_recv, ) ); -DEFINE_COMPLETION_EVENT(svcrdma_wc_receive); +DEFINE_RECEIVE_COMPLETION_EVENT(svcrdma_wc_receive); TRACE_EVENT(svcrdma_rq_post_err, TP_PROTO( diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 8241f5a4a01c..09c000d490a1 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -478,6 +478,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode) inode->i_fop = &simple_dir_operations; inode->i_op = &simple_dir_inode_operations; inc_nlink(inode); + break; default: break; } diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 946edf2db646..a249837d6a55 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2015-2020, Oracle and/or its affiliates. * - * Support for backward direction RPCs on RPC/RDMA. + * Support for reverse-direction RPCs on RPC/RDMA. */ #include @@ -208,7 +208,7 @@ create_req: } /** - * rpcrdma_bc_receive_call - Handle a backward direction call + * rpcrdma_bc_receive_call - Handle a reverse-direction Call * @r_xprt: transport receiving the call * @rep: receive buffer containing the call * diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index baca49fe83af..766a1048a48a 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -306,20 +306,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, if (nsegs > ep->re_max_fr_depth) nsegs = ep->re_max_fr_depth; for (i = 0; i < nsegs;) { - if (seg->mr_page) - sg_set_page(&mr->mr_sg[i], - seg->mr_page, - seg->mr_len, - offset_in_page(seg->mr_offset)); - else - sg_set_buf(&mr->mr_sg[i], seg->mr_offset, - seg->mr_len); + sg_set_page(&mr->mr_sg[i], seg->mr_page, + seg->mr_len, seg->mr_offset); ++seg; ++i; if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS) continue; - if ((i < nsegs && offset_in_page(seg->mr_offset)) || + if ((i < nsegs && seg->mr_offset) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 8f5d0cb68360..292f066d006e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -204,9 +204,7 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) return 0; } -/* Split @vec on page boundaries into SGEs. FMR registers pages, not - * a byte range. Other modes coalesce these SGEs into a single MR - * when they can. +/* Convert @vec to a single SGL element. * * Returns pointer to next available SGE, and bumps the total number * of SGEs consumed. @@ -215,22 +213,11 @@ static struct rpcrdma_mr_seg * rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, unsigned int *n) { - u32 remaining, page_offset; - char *base; - - base = vec->iov_base; - page_offset = offset_in_page(base); - remaining = vec->iov_len; - while (remaining) { - seg->mr_page = NULL; - seg->mr_offset = base; - seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); - remaining -= seg->mr_len; - base += seg->mr_len; - ++seg; - ++(*n); - page_offset = 0; - } + seg->mr_page = virt_to_page(vec->iov_base); + seg->mr_offset = offset_in_page(vec->iov_base); + seg->mr_len = vec->iov_len; + ++seg; + ++(*n); return seg; } @@ -259,7 +246,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, page_base = offset_in_page(xdrbuf->page_base); while (len) { seg->mr_page = *ppages; - seg->mr_offset = (char *)page_base; + seg->mr_offset = page_base; seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); len -= seg->mr_len; ++ppages; @@ -268,10 +255,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, page_base = 0; } - /* When encoding a Read chunk, the tail iovec contains an - * XDR pad and may be omitted. - */ - if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup) + if (type == rpcrdma_readch) goto out; /* When encoding a Write chunk, some servers need to see an @@ -283,7 +267,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, goto out; if (xdrbuf->tail[0].iov_len) - seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); + rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); out: if (unlikely(n > RPCRDMA_MAX_SEGS)) @@ -644,9 +628,8 @@ out_mapping_err: return false; } -/* The tail iovec may include an XDR pad for the page list, - * as well as additional content, and may not reside in the - * same page as the head iovec. +/* The tail iovec might not reside in the same page as the + * head iovec. */ static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req, struct xdr_buf *xdr, @@ -764,27 +747,19 @@ static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct xdr_buf *xdr) { + struct kvec *tail = &xdr->tail[0]; + if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) return false; - /* If there is a Read chunk, the page list is being handled + /* If there is a Read chunk, the page list is handled * via explicit RDMA, and thus is skipped here. */ - /* Do not include the tail if it is only an XDR pad */ - if (xdr->tail[0].iov_len > 3) { - unsigned int page_base, len; - - /* If the content in the page list is an odd length, - * xdr_write_pages() adds a pad at the beginning of - * the tail iovec. Force the tail's non-pad content to - * land at the next XDR position in the Send message. - */ - page_base = offset_in_page(xdr->tail[0].iov_base); - len = xdr->tail[0].iov_len; - page_base += len & 3; - len -= len & 3; - if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len)) + if (tail->iov_len) { + if (!rpcrdma_prepare_tail_iov(req, xdr, + offset_in_page(tail->iov_base), + tail->iov_len)) return false; kref_get(&req->rl_kref); } @@ -1164,14 +1139,10 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) */ p = xdr_inline_decode(xdr, 3 * sizeof(*p)); if (unlikely(!p)) - goto out_short; + return true; rpcrdma_bc_receive_call(r_xprt, rep); return true; - -out_short: - pr_warn("RPC/RDMA short backward direction call\n"); - return true; } #else /* CONFIG_SUNRPC_BACKCHANNEL */ { diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 63f8be974df2..4a1edbb4028e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2015-2018 Oracle. All rights reserved. * - * Support for backward direction RPCs on RPC/RDMA (server-side). + * Support for reverse-direction RPCs on RPC/RDMA (server-side). */ #include @@ -59,7 +59,7 @@ out_unlock: spin_unlock(&xprt->queue_lock); } -/* Send a backwards direction RPC call. +/* Send a reverse-direction RPC Call. * * Caller holds the connection's mutex and has already marshaled * the RPC/RDMA request. diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 94b28657aeeb..fe3be985e239 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -98,9 +98,9 @@ struct rpcrdma_ep { atomic_t re_completion_ids; }; -/* Pre-allocate extra Work Requests for handling backward receives - * and sends. This is a fixed value because the Work Queues are - * allocated when the forward channel is set up, long before the +/* Pre-allocate extra Work Requests for handling reverse-direction + * Receives and Sends. This is a fixed value because the Work Queues + * are allocated when the forward channel is set up, long before the * backchannel is provisioned. This value is two times * NFS4_DEF_CB_SLOT_TABLE_SIZE. */ @@ -283,10 +283,11 @@ enum { RPCRDMA_MAX_IOV_SEGS, }; -struct rpcrdma_mr_seg { /* chunk descriptors */ - u32 mr_len; /* length of chunk or segment */ - struct page *mr_page; /* owning page, if any */ - char *mr_offset; /* kva if no page, else offset */ +/* Arguments for DMA mapping and registration */ +struct rpcrdma_mr_seg { + u32 mr_len; /* length of segment */ + struct page *mr_page; /* underlying struct page */ + u64 mr_offset; /* IN: page offset, OUT: iova */ }; /* The Send SGE array is provisioned to send a maximum size diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c56a66cdf4ac..e35760f238a4 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -829,7 +829,7 @@ xs_stream_record_marker(struct xdr_buf *xdr) * EAGAIN: The socket was blocked, please call again later to * complete the request * ENOTCONN: Caller needs to invoke connect logic then call again - * other: Some other error occured, the request was not sent + * other: Some other error occurred, the request was not sent */ static int xs_local_send_request(struct rpc_rqst *req) { @@ -1665,7 +1665,7 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) * This ensures that we can continue to establish TCP * connections even when all local ephemeral ports are already * a part of some TCP connection. This makes no difference - * for UDP sockets, but also doens't harm them. + * for UDP sockets, but also doesn't harm them. * * If we're asking for any reserved port (i.e. port == 0 && * transport->xprt.resvport == 1) xs_get_srcport above will @@ -1875,6 +1875,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; xprt_set_connected(xprt); + break; case -ENOBUFS: break; case -ENOENT: @@ -2276,10 +2277,8 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EHOSTUNREACH: case -EADDRINUSE: case -ENOBUFS: - /* - * xs_tcp_force_close() wakes tasks with -EIO. - * We need to wake them first to ensure the - * correct error code. + /* xs_tcp_force_close() wakes tasks with a fixed error code. + * We need to wake them first to ensure the correct error code. */ xprt_wake_pending_tasks(xprt, status); xs_tcp_force_close(xprt); @@ -2380,7 +2379,7 @@ static void xs_error_handle(struct work_struct *work) } /** - * xs_local_print_stats - display AF_LOCAL socket-specifc stats + * xs_local_print_stats - display AF_LOCAL socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * @@ -2409,7 +2408,7 @@ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) } /** - * xs_udp_print_stats - display UDP socket-specifc stats + * xs_udp_print_stats - display UDP socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * @@ -2433,7 +2432,7 @@ static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) } /** - * xs_tcp_print_stats - display TCP socket-specifc stats + * xs_tcp_print_stats - display TCP socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file *