diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 23d05ec87fcc..e22e631cb115 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2417,10 +2417,10 @@ static int statx_to_caps(u32 want, umode_t mode) { int mask = 0; - if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME)) + if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_AUTH_SHARED; - if (want & (STATX_NLINK|STATX_CTIME)) { + if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) { /* * The link count for directories depends on inode->i_subdirs, * and that is only updated when Fs caps are held. @@ -2431,11 +2431,10 @@ static int statx_to_caps(u32 want, umode_t mode) mask |= CEPH_CAP_LINK_SHARED; } - if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| - STATX_BLOCKS)) + if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_FILE_SHARED; - if (want & (STATX_CTIME)) + if (want & (STATX_CTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_XATTR_SHARED; return mask; @@ -2478,6 +2477,11 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, valid_mask |= STATX_BTIME; } + if (request_mask & STATX_CHANGE_COOKIE) { + stat->change_cookie = inode_peek_iversion_raw(inode); + valid_mask |= STATX_CHANGE_COOKIE; + } + if (ceph_snap(inode) == CEPH_NOSNAP) stat->dev = sb->s_dev; else @@ -2519,6 +2523,8 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->nlink = 1 + 1 + ci->i_subdirs; } + stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; stat->result_mask = request_mask & valid_mask; return err; } diff --git a/fs/libfs.c b/fs/libfs.c index aada4e7c8713..17ecc47696e1 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1582,3 +1582,39 @@ bool inode_maybe_inc_iversion(struct inode *inode, bool force) return true; } EXPORT_SYMBOL(inode_maybe_inc_iversion); + +/** + * inode_query_iversion - read i_version for later use + * @inode: inode from which i_version should be read + * + * Read the inode i_version counter. This should be used by callers that wish + * to store the returned i_version for later comparison. This will guarantee + * that a later query of the i_version will result in a different value if + * anything has changed. + * + * In this implementation, we fetch the current value, set the QUERIED flag and + * then try to swap it into place with a cmpxchg, if it wasn't already set. If + * that fails, we try again with the newly fetched value from the cmpxchg. + */ +u64 inode_query_iversion(struct inode *inode) +{ + u64 cur, new; + + cur = inode_peek_iversion_raw(inode); + do { + /* If flag is already set, then no need to swap */ + if (cur & I_VERSION_QUERIED) { + /* + * This barrier (and the implicit barrier in the + * cmpxchg below) pairs with the barrier in + * inode_maybe_inc_iversion(). + */ + smp_mb(); + break; + } + + new = cur | I_VERSION_QUERIED; + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); + return cur >> I_VERSION_QUERIED_SHIFT; +} +EXPORT_SYMBOL(inode_query_iversion); diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 01596f2d0a1e..1a9d5aa51dfb 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -145,17 +145,10 @@ out: return parent; } -static u64 nfs_fetch_iversion(struct inode *inode) -{ - nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); - return inode_peek_iversion_raw(inode); -} - const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, - .fetch_iversion = nfs_fetch_iversion, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| EXPORT_OP_NOATOMIC_ATTR, diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e98ee7599eeb..756ad4fa6f2a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -825,6 +825,8 @@ static u32 nfs_get_valid_attrmask(struct inode *inode) reply_mask |= STATX_UID | STATX_GID; if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) reply_mask |= STATX_BLOCKS; + if (!(cache_validity & NFS_INO_INVALID_CHANGE)) + reply_mask |= STATX_CHANGE_COOKIE; return reply_mask; } @@ -843,7 +845,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID | STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME | - STATX_INO | STATX_SIZE | STATX_BLOCKS; + STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_BTIME | + STATX_CHANGE_COOKIE; if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { if (readdirplus_enabled) @@ -851,8 +854,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, goto out_no_revalidate; } - /* Flush out writes to the server in order to update c/mtime. */ - if ((request_mask & (STATX_CTIME | STATX_MTIME)) && + /* Flush out writes to the server in order to update c/mtime/version. */ + if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_CHANGE_COOKIE)) && S_ISREG(inode->i_mode)) filemap_write_and_wait(inode->i_mapping); @@ -872,7 +875,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, /* Is the user requesting attributes that might need revalidation? */ if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| STATX_MTIME|STATX_UID|STATX_GID| - STATX_SIZE|STATX_BLOCKS))) + STATX_SIZE|STATX_BLOCKS| + STATX_CHANGE_COOKIE))) goto out_no_revalidate; /* Check whether the cached attributes are stale */ @@ -910,6 +914,10 @@ out_no_revalidate: generic_fillattr(&init_user_ns, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + stat->change_cookie = inode_peek_iversion_raw(inode); + stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; + if (server->change_attr_type != NFS4_CHANGE_TYPE_IS_UNDEFINED) + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; out: diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 97edb32be77f..e12e5a4ad502 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2965,7 +2965,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, goto out; } - err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + err = vfs_getattr(&path, &stat, + STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE, + AT_STATX_SYNC_AS_STAT); if (err) goto out_nfserr; if (!(stat.result_mask & STATX_BTIME)) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 8c52b6c9d31a..76ea268dc420 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -628,6 +628,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) stat.mtime = inode->i_mtime; stat.ctime = inode->i_ctime; stat.size = inode->i_size; + if (v4 && IS_I_VERSION(inode)) { + stat.change_cookie = inode_query_iversion(inode); + stat.result_mask |= STATX_CHANGE_COOKIE; + } } if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); @@ -659,6 +663,10 @@ void fh_fill_post_attrs(struct svc_fh *fhp) if (err) { fhp->fh_post_saved = false; fhp->fh_post_attr.ctime = inode->i_ctime; + if (v4 && IS_I_VERSION(inode)) { + fhp->fh_post_attr.change_cookie = inode_query_iversion(inode); + fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE; + } } else fhp->fh_post_saved = true; if (v4) @@ -748,3 +756,37 @@ enum fsid_source fsid_source(const struct svc_fh *fhp) return FSIDSOURCE_UUID; return FSIDSOURCE_DEV; } + +/* + * We could use i_version alone as the change attribute. However, i_version + * can go backwards on a regular file after an unclean shutdown. On its own + * that doesn't necessarily cause a problem, but if i_version goes backwards + * and then is incremented again it could reuse a value that was previously + * used before boot, and a client who queried the two values might incorrectly + * assume nothing changed. + * + * By using both ctime and the i_version counter we guarantee that as long as + * time doesn't go backwards we never reuse an old value. If the filesystem + * advertises STATX_ATTR_CHANGE_MONOTONIC, then this mitigation is not + * needed. + * + * We only need to do this for regular files as well. For directories, we + * assume that the new change attr is always logged to stable storage in some + * fashion before the results can be seen. + */ +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) +{ + u64 chattr; + + if (stat->result_mask & STATX_CHANGE_COOKIE) { + chattr = stat->change_cookie; + if (S_ISREG(inode->i_mode) && + !(stat->attributes & STATX_ATTR_CHANGE_MONOTONIC)) { + chattr += (u64)stat->ctime.tv_sec << 30; + chattr += stat->ctime.tv_nsec; + } + } else { + chattr = time_to_chattr(&stat->ctime); + } + return chattr; +} diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 513e028b0bbe..4e0ecf0ae2cf 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -293,34 +293,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) fhp->fh_pre_saved = false; } -/* - * We could use i_version alone as the change attribute. However, - * i_version can go backwards after a reboot. On its own that doesn't - * necessarily cause a problem, but if i_version goes backwards and then - * is incremented again it could reuse a value that was previously used - * before boot, and a client who queried the two values might - * incorrectly assume nothing changed. - * - * By using both ctime and the i_version counter we guarantee that as - * long as time doesn't go backwards we never reuse an old value. - */ -static inline u64 nfsd4_change_attribute(struct kstat *stat, - struct inode *inode) -{ - if (inode->i_sb->s_export_op->fetch_iversion) - return inode->i_sb->s_export_op->fetch_iversion(inode); - else if (IS_I_VERSION(inode)) { - u64 chattr; - - chattr = stat->ctime.tv_sec; - chattr <<= 30; - chattr += stat->ctime.tv_nsec; - chattr += inode_query_iversion(inode); - return chattr; - } else - return time_to_chattr(&stat->ctime); -} - +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode); extern void fh_fill_pre_attrs(struct svc_fh *fhp); extern void fh_fill_post_attrs(struct svc_fh *fhp); extern void fh_fill_both_attrs(struct svc_fh *fhp); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index dbdfef7ae85b..43fb57a301d3 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -170,9 +170,14 @@ static inline void fh_drop_write(struct svc_fh *fh) static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) { + u32 request_mask = STATX_BASIC_STATS; struct path p = {.mnt = fh->fh_export->ex_path.mnt, .dentry = fh->fh_dentry}; - return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS, + + if (fh->fh_maxsize == NFS4_FHSIZE) + request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); + + return nfserrno(vfs_getattr(&p, stat, request_mask, AT_STATX_SYNC_AS_STAT)); } diff --git a/fs/stat.c b/fs/stat.c index d6cc74ca8486..f43afe0081fe 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -122,6 +123,11 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX); + if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { + stat->result_mask |= STATX_CHANGE_COOKIE; + stat->change_cookie = inode_query_iversion(inode); + } + mnt_userns = mnt_user_ns(path->mnt); if (inode->i_op->getattr) return inode->i_op->getattr(mnt_userns, path, stat, @@ -602,9 +608,11 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) memset(&tmp, 0, sizeof(tmp)); - tmp.stx_mask = stat->result_mask; + /* STATX_CHANGE_COOKIE is kernel-only for now */ + tmp.stx_mask = stat->result_mask & ~STATX_CHANGE_COOKIE; tmp.stx_blksize = stat->blksize; - tmp.stx_attributes = stat->attributes; + /* STATX_ATTR_CHANGE_MONOTONIC is kernel-only for now */ + tmp.stx_attributes = stat->attributes & ~STATX_ATTR_CHANGE_MONOTONIC; tmp.stx_nlink = stat->nlink; tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid); tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid); @@ -643,6 +651,11 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags, if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; + /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests + * from userland. + */ + mask &= ~STATX_CHANGE_COOKIE; + error = vfs_statx(dfd, filename, flags, &stat, mask); if (error) return error; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index fe848901fcc3..9f4d4bcbf251 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -213,7 +213,6 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); - u64 (*fetch_iversion)(struct inode *); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ diff --git a/include/linux/iversion.h b/include/linux/iversion.h index e27bd4f55d84..f174ff1b59ee 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -9,8 +9,26 @@ * --------------------------- * The change attribute (i_version) is mandated by NFSv4 and is mostly for * knfsd, but is also used for other purposes (e.g. IMA). The i_version must - * appear different to observers if there was a change to the inode's data or - * metadata since it was last queried. + * appear larger to observers if there was an explicit change to the inode's + * data or metadata since it was last queried. + * + * An explicit change is one that would ordinarily result in a change to the + * inode status change time (aka ctime). i_version must appear to change, even + * if the ctime does not (since the whole point is to avoid missing updates due + * to timestamp granularity). If POSIX or other relevant spec mandates that the + * ctime must change due to an operation, then the i_version counter must be + * incremented as well. + * + * Making the i_version update completely atomic with the operation itself would + * be prohibitively expensive. Traditionally the kernel has updated the times on + * directories after an operation that changes its contents. For regular files, + * the ctime is usually updated before the data is copied into the cache for a + * write. This means that there is a window of time when an observer can + * associate a new timestamp with old file contents. Since the purpose of the + * i_version is to allow for better cache coherency, the i_version must always + * be updated after the results of the operation are visible. Updating it before + * and after a change is also permitted. (Note that no filesystems currently do + * this. Fixing that is a work-in-progress). * * Observers see the i_version as a 64-bit number that never decreases. If it * remains the same since it was last checked, then nothing has changed in the @@ -234,42 +252,6 @@ inode_peek_iversion(const struct inode *inode) return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT; } -/** - * inode_query_iversion - read i_version for later use - * @inode: inode from which i_version should be read - * - * Read the inode i_version counter. This should be used by callers that wish - * to store the returned i_version for later comparison. This will guarantee - * that a later query of the i_version will result in a different value if - * anything has changed. - * - * In this implementation, we fetch the current value, set the QUERIED flag and - * then try to swap it into place with a cmpxchg, if it wasn't already set. If - * that fails, we try again with the newly fetched value from the cmpxchg. - */ -static inline u64 -inode_query_iversion(struct inode *inode) -{ - u64 cur, new; - - cur = inode_peek_iversion_raw(inode); - do { - /* If flag is already set, then no need to swap */ - if (cur & I_VERSION_QUERIED) { - /* - * This barrier (and the implicit barrier in the - * cmpxchg below) pairs with the barrier in - * inode_maybe_inc_iversion(). - */ - smp_mb(); - break; - } - - new = cur | I_VERSION_QUERIED; - } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); - return cur >> I_VERSION_QUERIED_SHIFT; -} - /* * For filesystems without any sort of change attribute, the best we can * do is fake one up from the ctime: @@ -283,6 +265,8 @@ static inline u64 time_to_chattr(struct timespec64 *t) return chattr; } +u64 inode_query_iversion(struct inode *inode); + /** * inode_eq_iversion_raw - check whether the raw i_version counter has changed * @inode: inode to check diff --git a/include/linux/stat.h b/include/linux/stat.h index ff277ced50e9..52150570d37a 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -52,6 +52,15 @@ struct kstat { u64 mnt_id; u32 dio_mem_align; u32 dio_offset_align; + u64 change_cookie; }; +/* These definitions are internal to the kernel for now. Mainly used by nfsd. */ + +/* mask values */ +#define STATX_CHANGE_COOKIE 0x40000000U /* Want/got stx_change_attr */ + +/* file attribute values */ +#define STATX_ATTR_CHANGE_MONOTONIC 0x8000000000000000ULL /* version monotonically increases */ + #endif