From a5b3a80b899bda0f456f1246c4c5a1191ea01519 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 1 Feb 2016 16:43:04 +0000 Subject: [PATCH 01/32] CacheFiles: Provide read-and-reset release counters for cachefilesd Provide read-and-reset objects- and blocks-released counters for cachefilesd to use to work out whether there's anything new that can be culled. One of the problems cachefilesd has is that if all the objects in the cache are pinned by inodes lying dormant in the kernel inode cache, there isn't anything for it to cull. In such a case, it just spins around walking the filesystem tree and scanning for something to cull. This eats up a lot of CPU time. By telling cachefilesd if there have been any releases, the daemon can sleep until there is the possibility of something to do. cachefilesd finds this information by the following means: (1) When the control fd is read, the kernel presents a list of values of interest. "freleased=N" and "breleased=N" are added to this list to indicate the number of files released and number of blocks released since the last read call. At this point the counters are reset. (2) POLLIN is signalled if the number of files released becomes greater than 0. Note that by 'released' it just means that the kernel has released its interest in those files for the moment, not necessarily that the files should be deleted from the cache. Signed-off-by: David Howells Reviewed-by: Steve Dickson Signed-off-by: Al Viro --- fs/cachefiles/daemon.c | 13 ++++++++++--- fs/cachefiles/interface.c | 11 ++--------- fs/cachefiles/internal.h | 4 ++++ fs/cachefiles/namei.c | 28 +++++++++++++++++++++++----- 4 files changed, 39 insertions(+), 17 deletions(-) diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 452e98dd7560..1ee54ffd3a24 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -162,6 +162,8 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, size_t buflen, loff_t *pos) { struct cachefiles_cache *cache = file->private_data; + unsigned long long b_released; + unsigned f_released; char buffer[256]; int n; @@ -174,6 +176,8 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, cachefiles_has_space(cache, 0, 0); /* summarise */ + f_released = atomic_xchg(&cache->f_released, 0); + b_released = atomic_long_xchg(&cache->b_released, 0); clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags); n = snprintf(buffer, sizeof(buffer), @@ -183,15 +187,18 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, " fstop=%llx" " brun=%llx" " bcull=%llx" - " bstop=%llx", + " bstop=%llx" + " freleased=%x" + " breleased=%llx", test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0', (unsigned long long) cache->frun, (unsigned long long) cache->fcull, (unsigned long long) cache->fstop, (unsigned long long) cache->brun, (unsigned long long) cache->bcull, - (unsigned long long) cache->bstop - ); + (unsigned long long) cache->bstop, + f_released, + b_released); if (n > buflen) return -EMSGSIZE; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 675a3332d72f..861d611b8c05 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -291,15 +291,8 @@ static void cachefiles_drop_object(struct fscache_object *_object) } /* note that the object is now inactive */ - if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { - write_lock(&cache->active_lock); - if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE, - &object->flags)) - BUG(); - rb_erase(&object->active_node, &cache->active_nodes); - wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); - write_unlock(&cache->active_lock); - } + if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) + cachefiles_mark_object_inactive(cache, object); dput(object->dentry); object->dentry = NULL; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 9c4b737a54df..2fcde1a34b7c 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -66,6 +66,8 @@ struct cachefiles_cache { struct rb_root active_nodes; /* active nodes (can't be culled) */ rwlock_t active_lock; /* lock for active_nodes */ atomic_t gravecounter; /* graveyard uniquifier */ + atomic_t f_released; /* number of objects released lately */ + atomic_long_t b_released; /* number of blocks released lately */ unsigned frun_percent; /* when to stop culling (% files) */ unsigned fcull_percent; /* when to start culling (% files) */ unsigned fstop_percent; /* when to stop allocating (% files) */ @@ -157,6 +159,8 @@ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); /* * namei.c */ +extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, + struct cachefiles_object *object); extern int cachefiles_delete_object(struct cachefiles_cache *cache, struct cachefiles_object *object); extern int cachefiles_walk_to_object(struct cachefiles_object *parent, diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 1c2334c163dd..4ae75006e73b 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -257,6 +257,28 @@ requeue: return -ETIMEDOUT; } +/* + * Mark an object as being inactive. + */ +void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, + struct cachefiles_object *object) +{ + write_lock(&cache->active_lock); + rb_erase(&object->active_node, &cache->active_nodes); + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); + write_unlock(&cache->active_lock); + + wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + + /* This object can now be culled, so we need to let the daemon know + * that there is something it can remove if it needs to. + */ + atomic_long_add(d_backing_inode(object->dentry)->i_blocks, + &cache->b_released); + if (atomic_inc_return(&cache->f_released)) + cachefiles_state_changed(cache); +} + /* * delete an object representation from the cache * - file backed objects are unlinked @@ -684,11 +706,7 @@ mark_active_timed_out: check_error: _debug("check error %d", ret); - write_lock(&cache->active_lock); - rb_erase(&object->active_node, &cache->active_nodes); - clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); - wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); - write_unlock(&cache->active_lock); + cachefiles_mark_object_inactive(cache, object); release_dentry: dput(object->dentry); object->dentry = NULL; From 6b719e53099f1bf1cd32dd1d6704c101a7506c46 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 22 Feb 2016 17:48:19 -0500 Subject: [PATCH 02/32] ecryptfs_lookup(): use lookup_one_len_unlocked() Signed-off-by: Al Viro --- fs/ecryptfs/inode.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 4e685ac1024d..34bbf5d93f57 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -397,11 +397,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, int rc = 0; lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - inode_lock(d_inode(lower_dir_dentry)); - lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, + lower_dentry = lookup_one_len_unlocked(ecryptfs_dentry->d_name.name, lower_dir_dentry, ecryptfs_dentry->d_name.len); - inode_unlock(d_inode(lower_dir_dentry)); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -426,11 +424,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out; } - inode_lock(d_inode(lower_dir_dentry)); - lower_dentry = lookup_one_len(encrypted_and_encoded_name, + lower_dentry = lookup_one_len_unlocked(encrypted_and_encoded_name, lower_dir_dentry, encrypted_and_encoded_name_size); - inode_unlock(d_inode(lower_dir_dentry)); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " From 97c31606075b30d0c5d70e477f54bb3f222e048d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 22 Feb 2016 18:14:25 -0500 Subject: [PATCH 03/32] ecryptfs_encrypt_and_encode_filename(): drop unused argument the last time it was getting something other than NULL as crypt_stat had been back in 2009... Signed-off-by: Al Viro --- fs/ecryptfs/crypto.c | 27 +++++++++------------------ fs/ecryptfs/ecryptfs_kernel.h | 1 - fs/ecryptfs/inode.c | 3 +-- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 80d6901493cf..87dbdd4881ab 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1499,16 +1499,14 @@ out: */ static int ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, - struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { int rc = 0; filename->encrypted_filename = NULL; filename->encrypted_filename_size = 0; - if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) - || (mount_crypt_stat && (mount_crypt_stat->flags - & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + if (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) { size_t packet_size; size_t remaining_bytes; @@ -1944,7 +1942,6 @@ out: int ecryptfs_encrypt_and_encode_filename( char **encoded_name, size_t *encoded_name_size, - struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, const char *name, size_t name_size) { @@ -1953,9 +1950,8 @@ int ecryptfs_encrypt_and_encode_filename( (*encoded_name) = NULL; (*encoded_name_size) = 0; - if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) - || (mount_crypt_stat && (mount_crypt_stat->flags - & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) { + if (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) { struct ecryptfs_filename *filename; filename = kzalloc(sizeof(*filename), GFP_KERNEL); @@ -1968,8 +1964,7 @@ int ecryptfs_encrypt_and_encode_filename( } filename->filename = (char *)name; filename->filename_size = name_size; - rc = ecryptfs_encrypt_filename(filename, crypt_stat, - mount_crypt_stat); + rc = ecryptfs_encrypt_filename(filename, mount_crypt_stat); if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt " "filename; rc = [%d]\n", __func__, rc); @@ -1980,11 +1975,9 @@ int ecryptfs_encrypt_and_encode_filename( NULL, &encoded_name_no_prefix_size, filename->encrypted_filename, filename->encrypted_filename_size); - if ((crypt_stat && (crypt_stat->flags - & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) - || (mount_crypt_stat + if (mount_crypt_stat && (mount_crypt_stat->flags - & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) (*encoded_name_size) = (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + encoded_name_no_prefix_size); @@ -2002,11 +1995,9 @@ int ecryptfs_encrypt_and_encode_filename( kfree(filename); goto out; } - if ((crypt_stat && (crypt_stat->flags - & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) - || (mount_crypt_stat + if (mount_crypt_stat && (mount_crypt_stat->flags - & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) { memcpy((*encoded_name), ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 7b39260c7bba..67e16128c572 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -569,7 +569,6 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length); int ecryptfs_encrypt_and_encode_filename( char **encoded_name, size_t *encoded_name_size, - struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, const char *name, size_t name_size); struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 34bbf5d93f57..26651636cd1d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -417,7 +417,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, dput(lower_dentry); rc = ecryptfs_encrypt_and_encode_filename( &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, - NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name, + mount_crypt_stat, ecryptfs_dentry->d_name.name, ecryptfs_dentry->d_name.len); if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt and encode " @@ -498,7 +498,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, dir->i_sb)->mount_crypt_stat; rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, &encoded_symlen, - NULL, mount_crypt_stat, symname, strlen(symname)); if (rc) From 793b80ef14af56d20c998265287648ad34239b6f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2016 16:03:58 +0100 Subject: [PATCH 04/32] vfs: pass a flags argument to vfs_readv/vfs_writev This way we can set kiocb flags also from the sync read/write path for the read_iter/write_iter operations. For now there is no way to pass flags to plain read/write operations as there is no real need for that, and all flags passed are explicitly rejected for these files. Signed-off-by: Milosz Tanski [hch: rebased on top of my kiocb changes] Signed-off-by: Christoph Hellwig Reviewed-by: Stephen Bates Tested-by: Stephen Bates Acked-by: Jeff Moyer Signed-off-by: Al Viro --- fs/nfsd/vfs.c | 4 ++-- fs/read_write.c | 44 ++++++++++++++++++++++++++------------------ fs/splice.c | 2 +- include/linux/fs.h | 4 ++-- 4 files changed, 31 insertions(+), 23 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 5d2a57e4c03a..d40010e4f1a9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -870,7 +870,7 @@ __be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); + host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset, 0); set_fs(oldfs); return nfsd_finish_read(file, count, host_err); } @@ -957,7 +957,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, /* Write the data. */ oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos); + host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, 0); set_fs(oldfs); if (host_err < 0) goto out_nfserr; diff --git a/fs/read_write.c b/fs/read_write.c index 324ec271cc4e..7d453c3e1cb6 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -692,11 +692,14 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) EXPORT_SYMBOL(iov_shorten); static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, iter_fn_t fn) + loff_t *ppos, iter_fn_t fn, int flags) { struct kiocb kiocb; ssize_t ret; + if (flags) + return -EOPNOTSUPP; + init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; @@ -708,10 +711,13 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, io_fn_t fn) + loff_t *ppos, io_fn_t fn, int flags) { ssize_t ret = 0; + if (flags) + return -EOPNOTSUPP; + while (iov_iter_count(iter)) { struct iovec iovec = iov_iter_iovec(iter); ssize_t nr; @@ -812,7 +818,8 @@ out: static ssize_t do_readv_writev(int type, struct file *file, const struct iovec __user * uvector, - unsigned long nr_segs, loff_t *pos) + unsigned long nr_segs, loff_t *pos, + int flags) { size_t tot_len; struct iovec iovstack[UIO_FASTIOV]; @@ -844,9 +851,9 @@ static ssize_t do_readv_writev(int type, struct file *file, } if (iter_fn) - ret = do_iter_readv_writev(file, &iter, pos, iter_fn); + ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags); else - ret = do_loop_readv_writev(file, &iter, pos, fn); + ret = do_loop_readv_writev(file, &iter, pos, fn, flags); if (type != READ) file_end_write(file); @@ -863,27 +870,27 @@ out: } ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos) + unsigned long vlen, loff_t *pos, int flags) { if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; - return do_readv_writev(READ, file, vec, vlen, pos); + return do_readv_writev(READ, file, vec, vlen, pos, flags); } EXPORT_SYMBOL(vfs_readv); ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos) + unsigned long vlen, loff_t *pos, int flags) { if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; - return do_readv_writev(WRITE, file, vec, vlen, pos); + return do_readv_writev(WRITE, file, vec, vlen, pos, flags); } EXPORT_SYMBOL(vfs_writev); @@ -896,7 +903,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, if (f.file) { loff_t pos = file_pos_read(f.file); - ret = vfs_readv(f.file, vec, vlen, &pos); + ret = vfs_readv(f.file, vec, vlen, &pos, 0); if (ret >= 0) file_pos_write(f.file, pos); fdput_pos(f); @@ -916,7 +923,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, if (f.file) { loff_t pos = file_pos_read(f.file); - ret = vfs_writev(f.file, vec, vlen, &pos); + ret = vfs_writev(f.file, vec, vlen, &pos, 0); if (ret >= 0) file_pos_write(f.file, pos); fdput_pos(f); @@ -948,7 +955,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) - ret = vfs_readv(f.file, vec, vlen, &pos); + ret = vfs_readv(f.file, vec, vlen, &pos, 0); fdput(f); } @@ -972,7 +979,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) - ret = vfs_writev(f.file, vec, vlen, &pos); + ret = vfs_writev(f.file, vec, vlen, &pos, 0); fdput(f); } @@ -986,7 +993,8 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, static ssize_t compat_do_readv_writev(int type, struct file *file, const struct compat_iovec __user *uvector, - unsigned long nr_segs, loff_t *pos) + unsigned long nr_segs, loff_t *pos, + int flags) { compat_ssize_t tot_len; struct iovec iovstack[UIO_FASTIOV]; @@ -1018,9 +1026,9 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, } if (iter_fn) - ret = do_iter_readv_writev(file, &iter, pos, iter_fn); + ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags); else - ret = do_loop_readv_writev(file, &iter, pos, fn); + ret = do_loop_readv_writev(file, &iter, pos, fn, flags); if (type != READ) file_end_write(file); @@ -1049,7 +1057,7 @@ static size_t compat_readv(struct file *file, if (!(file->f_mode & FMODE_CAN_READ)) goto out; - ret = compat_do_readv_writev(READ, file, vec, vlen, pos); + ret = compat_do_readv_writev(READ, file, vec, vlen, pos, 0); out: if (ret > 0) @@ -1126,7 +1134,7 @@ static size_t compat_writev(struct file *file, if (!(file->f_mode & FMODE_CAN_WRITE)) goto out; - ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); + ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, 0); out: if (ret > 0) diff --git a/fs/splice.c b/fs/splice.c index 82bc0d64fc38..3dc142637ab5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -577,7 +577,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec, old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ - res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); + res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0); set_fs(old_fs); return res; diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a2046275cdf..6ec87964644f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1712,9 +1712,9 @@ extern ssize_t __vfs_write(struct file *, const char __user *, size_t, loff_t *) extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_readv(struct file *, const struct iovec __user *, - unsigned long, loff_t *); + unsigned long, loff_t *, int); extern ssize_t vfs_writev(struct file *, const struct iovec __user *, - unsigned long, loff_t *); + unsigned long, loff_t *, int); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in, From f17d8b35452cab31a70d224964cd583fb2845449 Mon Sep 17 00:00:00 2001 From: Milosz Tanski Date: Thu, 3 Mar 2016 16:03:59 +0100 Subject: [PATCH 05/32] vfs: vfs: Define new syscalls preadv2,pwritev2 New syscalls that take an flag argument. No flags are added yet in this patch. Signed-off-by: Milosz Tanski [hch: rebased on top of my kiocb changes] Signed-off-by: Christoph Hellwig Reviewed-by: Stephen Bates Tested-by: Stephen Bates Acked-by: Jeff Moyer Signed-off-by: Al Viro --- fs/read_write.c | 161 ++++++++++++++++++++++++++++++--------- include/linux/compat.h | 6 ++ include/linux/syscalls.h | 6 ++ 3 files changed, 138 insertions(+), 35 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 7d453c3e1cb6..e9c9e2a667ce 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -895,15 +895,15 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, EXPORT_SYMBOL(vfs_writev); -SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen) +static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, int flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos = file_pos_read(f.file); - ret = vfs_readv(f.file, vec, vlen, &pos, 0); + ret = vfs_readv(f.file, vec, vlen, &pos, flags); if (ret >= 0) file_pos_write(f.file, pos); fdput_pos(f); @@ -915,15 +915,15 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, return ret; } -SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen) +static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, int flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos = file_pos_read(f.file); - ret = vfs_writev(f.file, vec, vlen, &pos, 0); + ret = vfs_writev(f.file, vec, vlen, &pos, flags); if (ret >= 0) file_pos_write(f.file, pos); fdput_pos(f); @@ -941,10 +941,9 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } -SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, loff_t pos, int flags) { - loff_t pos = pos_from_hilo(pos_h, pos_l); struct fd f; ssize_t ret = -EBADF; @@ -955,7 +954,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) - ret = vfs_readv(f.file, vec, vlen, &pos, 0); + ret = vfs_readv(f.file, vec, vlen, &pos, flags); fdput(f); } @@ -965,10 +964,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, return ret; } -SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, loff_t pos, int flags) { - loff_t pos = pos_from_hilo(pos_h, pos_l); struct fd f; ssize_t ret = -EBADF; @@ -979,7 +977,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) - ret = vfs_writev(f.file, vec, vlen, &pos, 0); + ret = vfs_writev(f.file, vec, vlen, &pos, flags); fdput(f); } @@ -989,6 +987,58 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, return ret; } +SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen) +{ + return do_readv(fd, vec, vlen, 0); +} + +SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen) +{ + return do_writev(fd, vec, vlen, 0); +} + +SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + + return do_preadv(fd, vec, vlen, pos, 0); +} + +SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, + int, flags) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + + if (pos == -1) + return do_readv(fd, vec, vlen, flags); + + return do_preadv(fd, vec, vlen, pos, flags); +} + +SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + + return do_pwritev(fd, vec, vlen, pos, 0); +} + +SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, + int, flags) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + + if (pos == -1) + return do_writev(fd, vec, vlen, flags); + + return do_pwritev(fd, vec, vlen, pos, flags); +} + #ifdef CONFIG_COMPAT static ssize_t compat_do_readv_writev(int type, struct file *file, @@ -1046,7 +1096,7 @@ out: static size_t compat_readv(struct file *file, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos) + unsigned long vlen, loff_t *pos, int flags) { ssize_t ret = -EBADF; @@ -1057,7 +1107,7 @@ static size_t compat_readv(struct file *file, if (!(file->f_mode & FMODE_CAN_READ)) goto out; - ret = compat_do_readv_writev(READ, file, vec, vlen, pos, 0); + ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags); out: if (ret > 0) @@ -1066,9 +1116,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, - compat_ulong_t, vlen) +static size_t do_compat_readv(compat_ulong_t fd, + const struct compat_iovec __user *vec, + compat_ulong_t vlen, int flags) { struct fd f = fdget_pos(fd); ssize_t ret; @@ -1077,16 +1127,24 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, if (!f.file) return -EBADF; pos = f.file->f_pos; - ret = compat_readv(f.file, vec, vlen, &pos); + ret = compat_readv(f.file, vec, vlen, &pos, flags); if (ret >= 0) f.file->f_pos = pos; fdput_pos(f); return ret; + } -static long __compat_sys_preadv64(unsigned long fd, +COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, + const struct compat_iovec __user *,vec, + compat_ulong_t, vlen) +{ + return do_compat_readv(fd, vec, vlen, 0); +} + +static long do_compat_preadv64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos) + unsigned long vlen, loff_t pos, int flags) { struct fd f; ssize_t ret; @@ -1098,7 +1156,7 @@ static long __compat_sys_preadv64(unsigned long fd, return -EBADF; ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) - ret = compat_readv(f.file, vec, vlen, &pos); + ret = compat_readv(f.file, vec, vlen, &pos, flags); fdput(f); return ret; } @@ -1108,7 +1166,7 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos) { - return __compat_sys_preadv64(fd, vec, vlen, pos); + return do_compat_preadv64(fd, vec, vlen, pos, 0); } #endif @@ -1118,12 +1176,25 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return __compat_sys_preadv64(fd, vec, vlen, pos); + return do_compat_preadv64(fd, vec, vlen, pos, 0); +} + +COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, + const struct compat_iovec __user *,vec, + compat_ulong_t, vlen, u32, pos_low, u32, pos_high, + int, flags) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + + if (pos == -1) + return do_compat_readv(fd, vec, vlen, flags); + + return do_compat_preadv64(fd, vec, vlen, pos, flags); } static size_t compat_writev(struct file *file, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos) + unsigned long vlen, loff_t *pos, int flags) { ssize_t ret = -EBADF; @@ -1143,9 +1214,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, - const struct compat_iovec __user *, vec, - compat_ulong_t, vlen) +static size_t do_compat_writev(compat_ulong_t fd, + const struct compat_iovec __user* vec, + compat_ulong_t vlen, int flags) { struct fd f = fdget_pos(fd); ssize_t ret; @@ -1154,16 +1225,23 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, if (!f.file) return -EBADF; pos = f.file->f_pos; - ret = compat_writev(f.file, vec, vlen, &pos); + ret = compat_writev(f.file, vec, vlen, &pos, flags); if (ret >= 0) f.file->f_pos = pos; fdput_pos(f); return ret; } -static long __compat_sys_pwritev64(unsigned long fd, +COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, + const struct compat_iovec __user *, vec, + compat_ulong_t, vlen) +{ + return do_compat_writev(fd, vec, vlen, 0); +} + +static long do_compat_pwritev64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos) + unsigned long vlen, loff_t pos, int flags) { struct fd f; ssize_t ret; @@ -1175,7 +1253,7 @@ static long __compat_sys_pwritev64(unsigned long fd, return -EBADF; ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) - ret = compat_writev(f.file, vec, vlen, &pos); + ret = compat_writev(f.file, vec, vlen, &pos, flags); fdput(f); return ret; } @@ -1185,7 +1263,7 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos) { - return __compat_sys_pwritev64(fd, vec, vlen, pos); + return do_compat_pwritev64(fd, vec, vlen, pos, 0); } #endif @@ -1195,8 +1273,21 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return __compat_sys_pwritev64(fd, vec, vlen, pos); + return do_compat_pwritev64(fd, vec, vlen, pos, 0); } + +COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, + const struct compat_iovec __user *,vec, + compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + + if (pos == -1) + return do_compat_writev(fd, vec, vlen, flags); + + return do_compat_pwritev64(fd, vec, vlen, pos, flags); +} + #endif static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, diff --git a/include/linux/compat.h b/include/linux/compat.h index a76c9172b2eb..fe4ccd0c748a 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -340,6 +340,12 @@ asmlinkage ssize_t compat_sys_preadv(compat_ulong_t fd, asmlinkage ssize_t compat_sys_pwritev(compat_ulong_t fd, const struct compat_iovec __user *vec, compat_ulong_t vlen, u32 pos_low, u32 pos_high); +asmlinkage ssize_t compat_sys_preadv2(compat_ulong_t fd, + const struct compat_iovec __user *vec, + compat_ulong_t vlen, u32 pos_low, u32 pos_high, int flags); +asmlinkage ssize_t compat_sys_pwritev2(compat_ulong_t fd, + const struct compat_iovec __user *vec, + compat_ulong_t vlen, u32 pos_low, u32 pos_high, int flags); #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 asmlinkage long compat_sys_preadv64(unsigned long fd, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 185815c96433..d795472c54d8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -575,8 +575,14 @@ asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos); asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, unsigned long pos_l, unsigned long pos_h); +asmlinkage long sys_preadv2(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, unsigned long pos_l, unsigned long pos_h, + int flags); asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, unsigned long pos_l, unsigned long pos_h); +asmlinkage long sys_pwritev2(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, unsigned long pos_l, unsigned long pos_h, + int flags); asmlinkage long sys_getcwd(char __user *buf, unsigned long size); asmlinkage long sys_mkdir(const char __user *pathname, umode_t mode); asmlinkage long sys_chdir(const char __user *filename); From 4babf2c5efb79be778d45ebe95d9c9b3c880c83b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2016 16:04:00 +0100 Subject: [PATCH 06/32] x86: wire up preadv2 and pwritev2 Signed-off-by: Milosz Tanski [hch: rebased due to newly added syscalls] Reviewed-by: Stephen Bates Tested-by: Stephen Bates Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- arch/x86/entry/syscalls/syscall_32.tbl | 2 ++ arch/x86/entry/syscalls/syscall_64.tbl | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index cb713df81180..b30dd8154cc2 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -384,3 +384,5 @@ 375 i386 membarrier sys_membarrier 376 i386 mlock2 sys_mlock2 377 i386 copy_file_range sys_copy_file_range +378 i386 preadv2 sys_preadv2 +379 i386 pwritev2 sys_pwritev2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index dc1040a50bdc..31cec929eb8d 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -333,6 +333,8 @@ 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 326 common copy_file_range sys_copy_file_range +327 64 preadv2 sys_preadv2 +328 64 pwritev2 sys_pwritev2 # # x32-specific system call numbers start at 512 to avoid cache impact From 97be7ebe53915af504fb491fb99f064c7cf3cb09 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2016 16:04:01 +0100 Subject: [PATCH 07/32] vfs: add the RWF_HIPRI flag for preadv2/pwritev2 This adds a flag that tells the file system that this is a high priority request for which it's worth to poll the hardware. The flag is purely advisory and can be ignored if not supported. Signed-off-by: Christoph Hellwig Reviewed-by: Stephen Bates Tested-by: Stephen Bates Acked-by: Jeff Moyer Signed-off-by: Al Viro --- fs/read_write.c | 6 ++++-- include/linux/fs.h | 1 + include/uapi/linux/fs.h | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index e9c9e2a667ce..07c53db04ec1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -697,10 +697,12 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, struct kiocb kiocb; ssize_t ret; - if (flags) + if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; init_sync_kiocb(&kiocb, filp); + if (flags & RWF_HIPRI) + kiocb.ki_flags |= IOCB_HIPRI; kiocb.ki_pos = *ppos; ret = fn(&kiocb, iter); @@ -715,7 +717,7 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, { ssize_t ret = 0; - if (flags) + if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 6ec87964644f..337de88ff50f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -320,6 +320,7 @@ struct writeback_control; #define IOCB_EVENTFD (1 << 0) #define IOCB_APPEND (1 << 1) #define IOCB_DIRECT (1 << 2) +#define IOCB_HIPRI (1 << 3) struct kiocb { struct file *ki_filp; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 41e0433b4a83..847c656729f8 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -305,4 +305,7 @@ struct fsxattr { #define SYNC_FILE_RANGE_WRITE 2 #define SYNC_FILE_RANGE_WAIT_AFTER 4 +/* flags for preadv2/pwritev2: */ +#define RWF_HIPRI 0x00000001 /* high priority request, poll if possible */ + #endif /* _UAPI_LINUX_FS_H */ From c43c83a294e8dc25072ca9e6fca4cdbc5564f3d4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2016 16:04:02 +0100 Subject: [PATCH 08/32] direct-io: only use block polling if explicitly requested Signed-off-by: Christoph Hellwig Reviewed-by: Stephen Bates Tested-by: Stephen Bates Acked-by: Jeff Moyer Signed-off-by: Al Viro --- fs/direct-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 1b2f7ffc8b84..85463171053b 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -445,7 +445,8 @@ static struct bio *dio_await_one(struct dio *dio) __set_current_state(TASK_UNINTERRUPTIBLE); dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); - if (!blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) + if (!(dio->iocb->ki_flags & IOCB_HIPRI) || + !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); From 8e0b60b96ba06d826a2b26e23b1986853a4e5291 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2016 16:04:03 +0100 Subject: [PATCH 09/32] blk-mq: enable polling support by default Now that applications need to explicitly ask for polling we can enable it by default in blk-mq drivers. Note that this will only have an affect on driver that supply a poll function, which currently only includes nvme. Signed-off-by: Christoph Hellwig Reviewed-by: Stephen Bates Tested-by: Stephen Bates Reviewed-by: Jeff Moyer Signed-off-by: Al Viro --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 29189aeace19..ff33a1ab58c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -499,7 +499,8 @@ struct request_queue { #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ - (1 << QUEUE_FLAG_SAME_COMP)) + (1 << QUEUE_FLAG_SAME_COMP) | \ + (1 << QUEUE_FLAG_POLL)) static inline void queue_lockdep_assert_held(struct request_queue *q) { From 6583fe22d1bce3631bc660e5927449690fadafe6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 5 Mar 2016 18:14:03 -0500 Subject: [PATCH 10/32] do_last(): reorder and simplify a bit bugger off on negatives a bit earlier, simplify the tests Signed-off-by: Al Viro --- fs/namei.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 9c590e0f66e9..5bd9e0786855 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3118,6 +3118,7 @@ static int do_last(struct nameidata *nd, return error; BUG_ON(nd->inode != dir->d_inode); + BUG_ON(nd->flags & LOOKUP_RCU); } else { /* create side of things */ /* @@ -3171,12 +3172,6 @@ retry_lookup: goto finish_open_created; } - /* - * create/update audit record if it already exists. - */ - if (d_is_positive(path.dentry)) - audit_inode(nd->name, path.dentry, 0); - /* * If atomic_open() acquired write access it is dropped now due to * possible mount and symlink following (this might be optimized away if @@ -3187,6 +3182,16 @@ retry_lookup: got_write = false; } + if (unlikely(d_is_negative(path.dentry))) { + path_to_nameidata(&path, nd); + return -ENOENT; + } + + /* + * create/update audit record if it already exists. + */ + audit_inode(nd->name, path.dentry, 0); + if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { path_to_nameidata(&path, nd); return -EEXIST; @@ -3196,12 +3201,7 @@ retry_lookup: if (unlikely(error < 0)) return error; - BUG_ON(nd->flags & LOOKUP_RCU); seq = 0; /* out of RCU mode, so the value doesn't matter */ - if (unlikely(d_is_negative(path.dentry))) { - path_to_nameidata(&path, nd); - return -ENOENT; - } inode = d_backing_inode(path.dentry); finish_lookup: if (nd->depth) From 6c51e513a3aa49db87f238f96714e6cbe2d9efd5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 5 Mar 2016 20:09:32 -0500 Subject: [PATCH 11/32] lookup_dcache(): lift d_alloc() into callers ... and kill need_lookup thing Signed-off-by: Al Viro --- fs/namei.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 5bd9e0786855..50020b168f15 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1448,12 +1448,11 @@ static int follow_dotdot(struct nameidata *nd) * dir->d_inode->i_mutex must be held */ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, - unsigned int flags, bool *need_lookup) + unsigned int flags) { struct dentry *dentry; int error; - *need_lookup = false; dentry = d_lookup(dir, name); if (dentry) { if (dentry->d_flags & DCACHE_OP_REVALIDATE) { @@ -1470,14 +1469,6 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, } } } - - if (!dentry) { - dentry = d_alloc(dir, name); - if (unlikely(!dentry)) - return ERR_PTR(-ENOMEM); - - *need_lookup = true; - } return dentry; } @@ -1509,13 +1500,15 @@ static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry, static struct dentry *__lookup_hash(struct qstr *name, struct dentry *base, unsigned int flags) { - bool need_lookup; - struct dentry *dentry; + struct dentry *dentry = lookup_dcache(name, base, flags); - dentry = lookup_dcache(name, base, flags, &need_lookup); - if (!need_lookup) + if (dentry) return dentry; + dentry = d_alloc(base, name); + if (unlikely(!dentry)) + return ERR_PTR(-ENOMEM); + return lookup_real(base->d_inode, dentry, flags); } @@ -3018,16 +3011,22 @@ static int lookup_open(struct nameidata *nd, struct path *path, struct inode *dir_inode = dir->d_inode; struct dentry *dentry; int error; - bool need_lookup; + bool need_lookup = false; *opened &= ~FILE_CREATED; - dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup); + dentry = lookup_dcache(&nd->last, dir, nd->flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); - /* Cached positive dentry: will open in f_op->open */ - if (!need_lookup && dentry->d_inode) + if (!dentry) { + dentry = d_alloc(dir, &nd->last); + if (unlikely(!dentry)) + return -ENOMEM; + need_lookup = true; + } else if (dentry->d_inode) { + /* Cached positive dentry: will open in f_op->open */ goto out_no_open; + } if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { return atomic_open(nd, dentry, path, file, op, got_write, From 5d0f49c136b520dd4da886e0e37458790f5a9617 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 5 Mar 2016 21:32:53 -0500 Subject: [PATCH 12/32] namei: untanlge lookup_fast() Signed-off-by: Al Viro --- fs/namei.c | 83 ++++++++++++++++++++++++------------------------------ 1 file changed, 37 insertions(+), 46 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 50020b168f15..edec6b87cae8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1512,32 +1512,29 @@ static struct dentry *__lookup_hash(struct qstr *name, return lookup_real(base->d_inode, dentry, flags); } -/* - * It's more convoluted than I'd like it to be, but... it's still fairly - * small and for now I'd prefer to have fast path as straight as possible. - * It _is_ time-critical. - */ static int lookup_fast(struct nameidata *nd, struct path *path, struct inode **inode, unsigned *seqp) { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; - int need_reval = 1; int status = 1; int err; /* * Rename seqlock is not required here because in the off chance - * of a false negative due to a concurrent rename, we're going to - * do the non-racy lookup, below. + * of a false negative due to a concurrent rename, the caller is + * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { unsigned seq; bool negative; dentry = __d_lookup_rcu(parent, &nd->last, &seq); - if (!dentry) - goto unlazy; + if (unlikely(!dentry)) { + if (unlazy_walk(nd, NULL, 0)) + return -ECHILD; + return 1; + } /* * This sequence count validates that the inode matches @@ -1545,7 +1542,7 @@ static int lookup_fast(struct nameidata *nd, */ *inode = d_backing_inode(dentry); negative = d_is_negative(dentry); - if (read_seqcount_retry(&dentry->d_seq, seq)) + if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) return -ECHILD; /* @@ -1555,63 +1552,57 @@ static int lookup_fast(struct nameidata *nd, * The memory barrier in read_seqcount_begin of child is * enough, we can use __read_seqcount_retry here. */ - if (__read_seqcount_retry(&parent->d_seq, nd->seq)) + if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq))) return -ECHILD; *seqp = seq; - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) status = d_revalidate(dentry, nd->flags); - if (unlikely(status <= 0)) { - if (status != -ECHILD) - need_reval = 0; - goto unlazy; - } + if (unlikely(status <= 0)) { + if (unlazy_walk(nd, dentry, seq)) + return -ECHILD; + if (status == -ECHILD) + status = d_revalidate(dentry, nd->flags); + } else { + /* + * Note: do negative dentry check after revalidation in + * case that drops it. + */ + if (unlikely(negative)) + return -ENOENT; + path->mnt = mnt; + path->dentry = dentry; + if (likely(__follow_mount_rcu(nd, path, inode, seqp))) + return 0; + if (unlazy_walk(nd, dentry, seq)) + return -ECHILD; } - /* - * Note: do negative dentry check after revalidation in - * case that drops it. - */ - if (negative) - return -ENOENT; - path->mnt = mnt; - path->dentry = dentry; - if (likely(__follow_mount_rcu(nd, path, inode, seqp))) - return 0; -unlazy: - if (unlazy_walk(nd, dentry, seq)) - return -ECHILD; } else { dentry = __d_lookup(parent, &nd->last); + if (unlikely(!dentry)) + return 1; + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) + status = d_revalidate(dentry, nd->flags); } - - if (unlikely(!dentry)) - goto need_lookup; - - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) - status = d_revalidate(dentry, nd->flags); if (unlikely(status <= 0)) { - if (status < 0) { - dput(dentry); - return status; + if (!status) { + d_invalidate(dentry); + status = 1; } - d_invalidate(dentry); dput(dentry); - goto need_lookup; + return status; } - if (unlikely(d_is_negative(dentry))) { dput(dentry); return -ENOENT; } + path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd); if (likely(!err)) *inode = d_backing_inode(path->dentry); return err; - -need_lookup: - return 1; } /* Fast lookup failed, do it the slow way */ From e9742b5332061bf885d8dd95d17e18a1b21371fa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 5 Mar 2016 22:04:59 -0500 Subject: [PATCH 13/32] namei: change calling conventions for lookup_{fast,slow} and follow_managed() Have lookup_fast() return 1 on success and 0 on "need to fall back"; lookup_slow() and follow_managed() return positive (1) on success. Signed-off-by: Al Viro --- fs/namei.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index edec6b87cae8..fc6c5458b5ae 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1220,8 +1220,8 @@ static int follow_managed(struct path *path, struct nameidata *nd) if (need_mntput && path->mnt == mnt) mntput(path->mnt); - if (ret == -EISDIR) - ret = 0; + if (ret == -EISDIR || !ret) + ret = 1; if (need_mntput) nd->flags |= LOOKUP_JUMPED; if (unlikely(ret < 0)) @@ -1533,7 +1533,7 @@ static int lookup_fast(struct nameidata *nd, if (unlikely(!dentry)) { if (unlazy_walk(nd, NULL, 0)) return -ECHILD; - return 1; + return 0; } /* @@ -1573,22 +1573,20 @@ static int lookup_fast(struct nameidata *nd, path->mnt = mnt; path->dentry = dentry; if (likely(__follow_mount_rcu(nd, path, inode, seqp))) - return 0; + return 1; if (unlazy_walk(nd, dentry, seq)) return -ECHILD; } } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) - return 1; + return 0; if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) status = d_revalidate(dentry, nd->flags); } if (unlikely(status <= 0)) { - if (!status) { + if (!status) d_invalidate(dentry); - status = 1; - } dput(dentry); return status; } @@ -1600,7 +1598,7 @@ static int lookup_fast(struct nameidata *nd, path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd); - if (likely(!err)) + if (likely(err > 0)) *inode = d_backing_inode(path->dentry); return err; } @@ -1724,7 +1722,7 @@ static int walk_component(struct nameidata *nd, int flags) return err; } err = lookup_fast(nd, &path, &inode, &seq); - if (unlikely(err)) { + if (unlikely(err <= 0)) { if (err < 0) return err; @@ -3101,7 +3099,7 @@ static int do_last(struct nameidata *nd, nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ error = lookup_fast(nd, &path, &inode, &seq); - if (likely(!error)) + if (likely(error > 0)) goto finish_lookup; if (error < 0) From 74ff0ffc7f0682cb4c00252807d76116b95ecb9e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 5 Mar 2016 22:37:46 -0500 Subject: [PATCH 14/32] namei: simplify invalidation logics in lookup_dcache() Signed-off-by: Al Viro --- fs/namei.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index fc6c5458b5ae..955e886c8e2a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1458,14 +1458,10 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, if (dentry->d_flags & DCACHE_OP_REVALIDATE) { error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { - if (error < 0) { - dput(dentry); - return ERR_PTR(error); - } else { + if (!error) d_invalidate(dentry); - dput(dentry); - dentry = NULL; - } + dput(dentry); + return ERR_PTR(error); } } } From d6d95ded914eb321b0a2c8c26cdf0a225a0d9917 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 5 Mar 2016 22:31:50 -0500 Subject: [PATCH 15/32] lookup_one_len_unlocked(): use lookup_dcache() No need to lock parent just because of ->d_revalidate() on child; contrary to the stale comment, lookup_dcache() *can* be used without locking the parent. Result can be moved as soon as we return, of course, but the same is true for lookup_one_len_unlocked() itself. Signed-off-by: Al Viro --- fs/namei.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 955e886c8e2a..ae673355c386 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1444,8 +1444,6 @@ static int follow_dotdot(struct nameidata *nd) * This looks up the name in dcache, possibly revalidates the old dentry and * allocates a new one if not found or not valid. In the need_lookup argument * returns whether i_op->lookup is necessary. - * - * dir->d_inode->i_mutex must be held */ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, unsigned int flags) @@ -2351,15 +2349,7 @@ struct dentry *lookup_one_len_unlocked(const char *name, if (err) return ERR_PTR(err); - /* - * __d_lookup() is used to try to get a quick answer and avoid the - * mutex. A false-negative does no harm. - */ - ret = __d_lookup(base, &this); - if (ret && unlikely(ret->d_flags & DCACHE_OP_REVALIDATE)) { - dput(ret); - ret = NULL; - } + ret = lookup_dcache(&this, base, 0); if (ret) return ret; From e3c13928086f1ed3620329b9fff678df2294d327 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Mar 2016 14:03:27 -0500 Subject: [PATCH 16/32] namei: massage lookup_slow() to be usable by lookup_one_len_unlocked() Return dentry and don't pass nameidata or path; lift crossing mountpoints into the caller. Signed-off-by: Al Viro --- fs/namei.c | 56 +++++++++++++++++++++++------------------------------- 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index ae673355c386..cb70a817d439 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1445,7 +1445,8 @@ static int follow_dotdot(struct nameidata *nd) * allocates a new one if not found or not valid. In the need_lookup argument * returns whether i_op->lookup is necessary. */ -static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, +static struct dentry *lookup_dcache(const struct qstr *name, + struct dentry *dir, unsigned int flags) { struct dentry *dentry; @@ -1491,7 +1492,7 @@ static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry, return dentry; } -static struct dentry *__lookup_hash(struct qstr *name, +static struct dentry *__lookup_hash(const struct qstr *name, struct dentry *base, unsigned int flags) { struct dentry *dentry = lookup_dcache(name, base, flags); @@ -1598,21 +1599,15 @@ static int lookup_fast(struct nameidata *nd, } /* Fast lookup failed, do it the slow way */ -static int lookup_slow(struct nameidata *nd, struct path *path) +static struct dentry *lookup_slow(const struct qstr *name, + struct dentry *dir, + unsigned int flags) { - struct dentry *dentry, *parent; - - parent = nd->path.dentry; - BUG_ON(nd->inode != parent->d_inode); - - inode_lock(parent->d_inode); - dentry = __lookup_hash(&nd->last, parent, nd->flags); - inode_unlock(parent->d_inode); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - path->mnt = nd->path.mnt; - path->dentry = dentry; - return follow_managed(path, nd); + struct dentry *dentry; + inode_lock(dir->d_inode); + dentry = __lookup_hash(name, dir, flags); + inode_unlock(dir->d_inode); + return dentry; } static inline int may_lookup(struct nameidata *nd) @@ -1719,15 +1714,20 @@ static int walk_component(struct nameidata *nd, int flags) if (unlikely(err <= 0)) { if (err < 0) return err; - - err = lookup_slow(nd, &path); - if (err < 0) + path.dentry = lookup_slow(&nd->last, nd->path.dentry, + nd->flags); + if (IS_ERR(path.dentry)) + return PTR_ERR(path.dentry); + if (unlikely(d_is_negative(path.dentry))) { + dput(path.dentry); + return -ENOENT; + } + path.mnt = nd->path.mnt; + err = follow_managed(&path, nd); + if (unlikely(err < 0)) return err; seq = 0; /* we are already out of RCU mode */ - err = -ENOENT; - if (d_is_negative(path.dentry)) - goto out_path_put; inode = d_backing_inode(path.dentry); } @@ -1740,10 +1740,6 @@ static int walk_component(struct nameidata *nd, int flags) nd->inode = inode; nd->seq = seq; return 0; - -out_path_put: - path_to_nameidata(&path, nd); - return err; } /* @@ -2350,12 +2346,8 @@ struct dentry *lookup_one_len_unlocked(const char *name, return ERR_PTR(err); ret = lookup_dcache(&this, base, 0); - if (ret) - return ret; - - inode_lock(base->d_inode); - ret = __lookup_hash(&this, base, 0); - inode_unlock(base->d_inode); + if (!ret) + ret = lookup_slow(&this, base, 0); return ret; } EXPORT_SYMBOL(lookup_one_len_unlocked); From 949a852e46dda07caaa0ff02e181f55d24e3ebf8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Mar 2016 14:20:52 -0500 Subject: [PATCH 17/32] namei: teach lookup_slow() to skip revalidate ... and make mountpoint_last() use it. That makes all candidates for lookup with parent locked shared go through lookup_slow(). Signed-off-by: Al Viro --- fs/namei.c | 58 ++++++++++++++++++++++++++----------------- include/linux/namei.h | 1 + 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index cb70a817d439..dbb8ec1a2006 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1605,7 +1605,29 @@ static struct dentry *lookup_slow(const struct qstr *name, { struct dentry *dentry; inode_lock(dir->d_inode); - dentry = __lookup_hash(name, dir, flags); + dentry = d_lookup(dir, name); + if (unlikely(dentry)) { + if ((dentry->d_flags & DCACHE_OP_REVALIDATE) && + !(flags & LOOKUP_NO_REVAL)) { + int error = d_revalidate(dentry, flags); + if (unlikely(error <= 0)) { + if (!error) + d_invalidate(dentry); + dput(dentry); + dentry = ERR_PTR(error); + } + } + if (dentry) { + inode_unlock(dir->d_inode); + return dentry; + } + } + dentry = d_alloc(dir, name); + if (unlikely(!dentry)) { + inode_unlock(dir->d_inode); + return ERR_PTR(-ENOMEM); + } + dentry = lookup_real(dir->d_inode, dentry, flags); inode_unlock(dir->d_inode); return dentry; } @@ -2425,31 +2447,21 @@ mountpoint_last(struct nameidata *nd, struct path *path) if (error) return error; dentry = dget(nd->path.dentry); - goto done; - } - - inode_lock(dir->d_inode); - dentry = d_lookup(dir, &nd->last); - if (!dentry) { - /* - * No cached dentry. Mounted dentries are pinned in the cache, - * so that means that this dentry is probably a symlink or the - * path doesn't actually point to a mounted dentry. - */ - dentry = d_alloc(dir, &nd->last); + } else { + dentry = d_lookup(dir, &nd->last); if (!dentry) { - inode_unlock(dir->d_inode); - return -ENOMEM; - } - dentry = lookup_real(dir->d_inode, dentry, nd->flags); - if (IS_ERR(dentry)) { - inode_unlock(dir->d_inode); - return PTR_ERR(dentry); + /* + * No cached dentry. Mounted dentries are pinned in the + * cache, so that means that this dentry is probably + * a symlink or the path doesn't actually point + * to a mounted dentry. + */ + dentry = lookup_slow(&nd->last, dir, + nd->flags | LOOKUP_NO_REVAL); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); } } - inode_unlock(dir->d_inode); - -done: if (d_is_negative(dentry)) { dput(dentry); return -ENOENT; diff --git a/include/linux/namei.h b/include/linux/namei.h index d0f25d81b46a..77d01700daf7 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -31,6 +31,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_PARENT 0x0010 #define LOOKUP_REVAL 0x0020 #define LOOKUP_RCU 0x0040 +#define LOOKUP_NO_REVAL 0x0080 /* * Intent data From f7380af04bac0ecee88c10dba037749b14d26e7d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Mar 2016 20:42:16 -0500 Subject: [PATCH 18/32] ceph: don't bother with d_rehash() in splice_dentry() d_splice_alias() guarantees that it'll be always hashed Signed-off-by: Al Viro --- fs/ceph/inode.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index fb4ba2e4e2a5..05dd66fe23a2 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -975,13 +975,8 @@ out_unlock: /* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. - * - * we will only rehash the resulting dentry if @prehash is - * true; @prehash will be set to false (for the benefit of - * the caller) if we fail. */ -static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, - bool *prehash) +static struct dentry *splice_dentry(struct dentry *dn, struct inode *in) { struct dentry *realdn; @@ -994,8 +989,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, if (IS_ERR(realdn)) { pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", PTR_ERR(realdn), dn, in, ceph_vinop(in)); - if (prehash) - *prehash = false; /* don't rehash on error */ dn = realdn; /* note realdn contains the error */ goto out; } else if (realdn) { @@ -1011,8 +1004,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, dout("dn %p attached to %p ino %llx.%llx\n", dn, d_inode(dn), ceph_vinop(d_inode(dn))); } - if ((!prehash || *prehash) && d_unhashed(dn)) - d_rehash(dn); out: return dn; } @@ -1260,7 +1251,7 @@ retry_lookup: if (d_really_is_negative(dn)) { ceph_dir_clear_ordered(dir); ihold(in); - dn = splice_dentry(dn, in, &have_lease); + dn = splice_dentry(dn, in); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1290,7 +1281,7 @@ retry_lookup: dout(" linking snapped dir %p to dn %p\n", in, dn); ceph_dir_clear_ordered(dir); ihold(in); - dn = splice_dentry(dn, in, NULL); + dn = splice_dentry(dn, in); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1501,7 +1492,7 @@ retry_lookup: } if (d_really_is_negative(dn)) { - struct dentry *realdn = splice_dentry(dn, in, NULL); + struct dentry *realdn = splice_dentry(dn, in); if (IS_ERR(realdn)) { err = PTR_ERR(realdn); d_drop(dn); From 5cf3b560af903c82e9fc12578fac2fbcb8ca1533 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Mar 2016 14:25:46 -0500 Subject: [PATCH 19/32] configfs: move d_rehash() into configfs_create() for regular files ... and turn it into d_add in there Signed-off-by: Al Viro --- fs/configfs/dir.c | 9 ++------- fs/configfs/inode.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index f419519ec41f..214ec14149d9 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -432,14 +432,9 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ? configfs_init_bin_file : configfs_init_file); - if (error) { + if (error) configfs_put(sd); - return error; - } - - d_rehash(dentry); - - return 0; + return error; } static struct dentry * configfs_lookup(struct inode *dir, diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index cee087d8f7e0..45811ea3fd87 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -199,9 +199,17 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in configfs_set_inode_lock_class(sd, inode); init(inode); - d_instantiate(dentry, inode); - if (S_ISDIR(mode) || S_ISLNK(mode)) + if (S_ISDIR(mode) || S_ISLNK(mode)) { + /* + * ->symlink(), ->mkdir(), configfs_register_subsystem() or + * create_default_group() - already hashed. + */ + d_instantiate(dentry, inode); dget(dentry); /* pin link and directory dentries in core */ + } else { + /* ->lookup() */ + d_add(dentry, inode); + } return error; } From de4acda16e11110bbba5195061789f93a6c0d29d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Mar 2016 15:10:45 -0500 Subject: [PATCH 20/32] autofs4: don't bother with d_instantiate(dentry, NULL) in ->lookup() Signed-off-by: Al Viro --- fs/autofs4/root.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index c6d7d3dbd52a..75dd739ac3e6 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -537,8 +537,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u ino->dentry = dentry; autofs4_add_active(dentry); - - d_instantiate(dentry, NULL); } return NULL; } From f8b31710e46c852b2af1635d1b6fab8e69bf7799 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Mar 2016 15:11:43 -0500 Subject: [PATCH 21/32] ceph_fill_trace(): don't bother with d_instantiate(dn, NULL) ... and use d_add(dn, NULL) in case we need to hash a negative unhashed rather than using d_rehash() directly. Signed-off-by: Al Viro --- fs/ceph/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 05dd66fe23a2..be2d87f33177 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1236,10 +1236,8 @@ retry_lookup: dout("d_delete %p\n", dn); d_delete(dn); } else { - dout("d_instantiate %p NULL\n", dn); - d_instantiate(dn, NULL); if (have_lease && d_unhashed(dn)) - d_rehash(dn); + d_add(dn, NULL); update_dentry_lease(dn, rinfo->dlease, session, req->r_request_started); From 9d95afd5971918b1aa8db1960ba24532c2d6ec89 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 1 Mar 2016 15:35:06 -0500 Subject: [PATCH 22/32] kill dentry_unhash() the last user is gone Signed-off-by: Al Viro --- fs/namei.c | 25 ------------------------- include/linux/fs.h | 5 ----- 2 files changed, 30 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index dbb8ec1a2006..794f81dce766 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3685,31 +3685,6 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) return sys_mkdirat(AT_FDCWD, pathname, mode); } -/* - * The dentry_unhash() helper will try to drop the dentry early: we - * should have a usage count of 1 if we're the only user of this - * dentry, and if that is true (possibly after pruning the dcache), - * then we drop the dentry now. - * - * A low-level filesystem can, if it choses, legally - * do a - * - * if (!d_unhashed(dentry)) - * return -EBUSY; - * - * if it cannot handle the case of removing a directory - * that is still in use by something else.. - */ -void dentry_unhash(struct dentry *dentry) -{ - shrink_dcache_parent(dentry); - spin_lock(&dentry->d_lock); - if (dentry->d_lockref.count == 1) - __d_drop(dentry); - spin_unlock(&dentry->d_lock); -} -EXPORT_SYMBOL(dentry_unhash); - int vfs_rmdir(struct inode *dir, struct dentry *dentry) { int error = may_delete(dir, dentry, 1); diff --git a/include/linux/fs.h b/include/linux/fs.h index ae681002100a..c577923cd400 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1539,11 +1539,6 @@ extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); extern int vfs_whiteout(struct inode *, struct dentry *); -/* - * VFS dentry helper functions. - */ -extern void dentry_unhash(struct dentry *dentry); - /* * VFS file helper functions. */ From 130f9ab75dc3afdb96d49334bd941f2e6faf39a1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Mar 2016 22:40:43 -0500 Subject: [PATCH 23/32] nfs_lookup: don't bother with d_instantiate(dentry, NULL) Signed-off-by: Al Viro --- fs/nfs/dir.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 9cce67043f92..4bfa7d8bcade 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1360,19 +1360,15 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry); nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); - res = ERR_PTR(-ENAMETOOLONG); - if (dentry->d_name.len > NFS_SERVER(dir)->namelen) - goto out; + if (unlikely(dentry->d_name.len > NFS_SERVER(dir)->namelen)) + return ERR_PTR(-ENAMETOOLONG); /* * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. */ - if (nfs_is_exclusive_create(dir, flags)) { - d_instantiate(dentry, NULL); - res = NULL; - goto out; - } + if (nfs_is_exclusive_create(dir, flags)) + return NULL; res = ERR_PTR(-ENOMEM); fhandle = nfs_alloc_fhandle(); From 85f40482bc175f14f63d78fe482ef1826cf3913d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Mar 2016 23:20:44 -0500 Subject: [PATCH 24/32] cifs_get_root(): use lookup_one_len_unlocked() Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c48ca13673e3..09b1db2cac31 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -642,9 +642,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) while (*s && *s != sep) s++; - inode_lock(dir); - child = lookup_one_len(p, dentry, s - p); - inode_unlock(dir); + child = lookup_one_len_unlocked(p, dentry, s - p); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); From e12a4e8a04a9ef84ba645379c56abad1a0ca9dbd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Mar 2016 23:27:22 -0500 Subject: [PATCH 25/32] quota: use lookup_one_len_unlocked() Signed-off-by: Al Viro --- fs/quota/dquot.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 3c3b81bb6dfe..04ca0cc6d065 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2430,9 +2430,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - inode_lock(d_inode(sb->s_root)); - dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); - inode_unlock(d_inode(sb->s_root)); + dentry = lookup_one_len_unlocked(qf_name, sb->s_root, strlen(qf_name)); if (IS_ERR(dentry)) return PTR_ERR(dentry); From 668d0cd56ef7bc71be6dd8c081007221e09d9a86 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 8 Mar 2016 12:44:17 -0500 Subject: [PATCH 26/32] replace d_add_unique() with saner primitive new primitive: d_exact_alias(dentry, inode). If there is an unhashed dentry with the same name/parent and given inode, rehash, grab and return it. Otherwise, return NULL. The only caller of d_add_unique() switched to d_exact_alias() + d_splice_alias(). Signed-off-by: Al Viro --- fs/dcache.c | 125 +++++++++++++++++------------------------ fs/nfs/nfs4proc.c | 13 +++-- include/linux/dcache.h | 18 +----- 3 files changed, 58 insertions(+), 98 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 2398f9f94337..4d20bf5c609b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1781,81 +1781,6 @@ void d_instantiate(struct dentry *entry, struct inode * inode) } EXPORT_SYMBOL(d_instantiate); -/** - * d_instantiate_unique - instantiate a non-aliased dentry - * @entry: dentry to instantiate - * @inode: inode to attach to this dentry - * - * Fill in inode information in the entry. On success, it returns NULL. - * If an unhashed alias of "entry" already exists, then we return the - * aliased dentry instead and drop one reference to inode. - * - * Note that in order to avoid conflicts with rename() etc, the caller - * had better be holding the parent directory semaphore. - * - * This also assumes that the inode count has been incremented - * (or otherwise set) by the caller to indicate that it is now - * in use by the dcache. - */ -static struct dentry *__d_instantiate_unique(struct dentry *entry, - struct inode *inode) -{ - struct dentry *alias; - int len = entry->d_name.len; - const char *name = entry->d_name.name; - unsigned int hash = entry->d_name.hash; - - if (!inode) { - __d_instantiate(entry, NULL); - return NULL; - } - - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { - /* - * Don't need alias->d_lock here, because aliases with - * d_parent == entry->d_parent are not subject to name or - * parent changes, because the parent inode i_mutex is held. - */ - if (alias->d_name.hash != hash) - continue; - if (alias->d_parent != entry->d_parent) - continue; - if (alias->d_name.len != len) - continue; - if (dentry_cmp(alias, name, len)) - continue; - __dget(alias); - return alias; - } - - __d_instantiate(entry, inode); - return NULL; -} - -struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) -{ - struct dentry *result; - - BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); - - if (inode) - spin_lock(&inode->i_lock); - result = __d_instantiate_unique(entry, inode); - if (inode) - spin_unlock(&inode->i_lock); - - if (!result) { - security_d_instantiate(entry, inode); - return NULL; - } - - BUG_ON(!d_unhashed(result)); - iput(inode); - return result; -} - -EXPORT_SYMBOL(d_instantiate_unique); - /** * d_instantiate_no_diralias - instantiate a non-aliased dentry * @entry: dentry to complete @@ -2436,6 +2361,56 @@ void d_rehash(struct dentry * entry) } EXPORT_SYMBOL(d_rehash); +/** + * d_exact_alias - find and hash an exact unhashed alias + * @entry: dentry to add + * @inode: The inode to go with this dentry + * + * If an unhashed dentry with the same name/parent and desired + * inode already exists, hash and return it. Otherwise, return + * NULL. + * + * Parent directory should be locked. + */ +struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) +{ + struct dentry *alias; + int len = entry->d_name.len; + const char *name = entry->d_name.name; + unsigned int hash = entry->d_name.hash; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + /* + * Don't need alias->d_lock here, because aliases with + * d_parent == entry->d_parent are not subject to name or + * parent changes, because the parent inode i_mutex is held. + */ + if (alias->d_name.hash != hash) + continue; + if (alias->d_parent != entry->d_parent) + continue; + if (alias->d_name.len != len) + continue; + if (dentry_cmp(alias, name, len)) + continue; + spin_lock(&alias->d_lock); + if (!d_unhashed(alias)) { + spin_unlock(&alias->d_lock); + alias = NULL; + } else { + __dget_dlock(alias); + _d_rehash(alias); + spin_unlock(&alias->d_lock); + } + spin_unlock(&inode->i_lock); + return alias; + } + spin_unlock(&inode->i_lock); + return NULL; +} +EXPORT_SYMBOL(d_exact_alias); + /** * dentry_update_name_case - update case insensitive dentry with a new name * @dentry: dentry to be updated diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4bfc33ad0563..400a70b3be7b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2461,14 +2461,15 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, dentry = opendata->dentry; if (d_really_is_negative(dentry)) { - /* FIXME: Is this d_drop() ever needed? */ + struct dentry *alias; d_drop(dentry); - dentry = d_add_unique(dentry, igrab(state->inode)); - if (dentry == NULL) { - dentry = opendata->dentry; - } else if (dentry != ctx->dentry) { + alias = d_exact_alias(dentry, state->inode); + if (!alias) + alias = d_splice_alias(igrab(state->inode), dentry); + /* d_splice_alias() can't fail here - it's a non-directory */ + if (alias) { dput(ctx->dentry); - ctx->dentry = dget(dentry); + ctx->dentry = dentry = alias; } nfs_set_verifier(dentry, nfs_save_change_attribute(d_inode(opendata->dir))); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c4b5f4b3f8f8..bda4ec53886b 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -246,6 +246,7 @@ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); +extern struct dentry * d_exact_alias(struct dentry *, struct inode *); extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); extern struct dentry * d_obtain_root(struct inode *); @@ -288,23 +289,6 @@ static inline void d_add(struct dentry *entry, struct inode *inode) d_rehash(entry); } -/** - * d_add_unique - add dentry to hash queues without aliasing - * @entry: dentry to add - * @inode: The inode to attach to this dentry - * - * This adds the entry to the hash queues and initializes @inode. - * The entry was actually filled in earlier during d_alloc(). - */ -static inline struct dentry *d_add_unique(struct dentry *entry, struct inode *inode) -{ - struct dentry *res; - - res = d_instantiate_unique(entry, inode); - d_rehash(res != NULL ? res : entry); - return res; -} - extern void dentry_update_name_case(struct dentry *, struct qstr *); /* used for rename() and baskets */ From 34d0d19dc0929ccc326448737f05a8fae3d47b8a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 8 Mar 2016 21:01:03 -0500 Subject: [PATCH 27/32] uninline d_add() Signed-off-by: Al Viro --- fs/dcache.c | 16 ++++++++++++++++ include/linux/dcache.h | 15 +-------------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 4d20bf5c609b..12280df07837 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2361,6 +2361,22 @@ void d_rehash(struct dentry * entry) } EXPORT_SYMBOL(d_rehash); +/** + * d_add - add dentry to hash queues + * @entry: dentry to add + * @inode: The inode to attach to this dentry + * + * This adds the entry to the hash queues and initializes @inode. + * The entry was actually filled in earlier during d_alloc(). + */ + +void d_add(struct dentry *entry, struct inode *inode) +{ + d_instantiate(entry, inode); + d_rehash(entry); +} +EXPORT_SYMBOL(d_add); + /** * d_exact_alias - find and hash an exact unhashed alias * @entry: dentry to add diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bda4ec53886b..1c51d2d84a32 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -273,21 +273,8 @@ extern int have_submounts(struct dentry *); * This adds the entry to the hash queues. */ extern void d_rehash(struct dentry *); - -/** - * d_add - add dentry to hash queues - * @entry: dentry to add - * @inode: The inode to attach to this dentry - * - * This adds the entry to the hash queues and initializes @inode. - * The entry was actually filled in earlier during d_alloc(). - */ -static inline void d_add(struct dentry *entry, struct inode *inode) -{ - d_instantiate(entry, inode); - d_rehash(entry); -} +extern void d_add(struct dentry *, struct inode *); extern void dentry_update_name_case(struct dentry *, struct qstr *); From 27f203f655a2e1dab66598a5b19afb637c587f0b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 9 Mar 2016 17:58:49 -0500 Subject: [PATCH 28/32] untangle fsnotify_d_instantiate() a bit First of all, don't bother calling it if inode is NULL - that makes inode argument unused. Moreover, do it *before* dropping ->d_lock, not right after that (and don't bother grabbing ->d_lock in it, of course). Signed-off-by: Al Viro --- fs/dcache.c | 3 ++- include/linux/fsnotify.h | 9 --------- include/linux/fsnotify_backend.h | 9 ++------- 3 files changed, 4 insertions(+), 17 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 12280df07837..244fd2487fe9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1750,8 +1750,9 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); + if (inode) + __fsnotify_d_instantiate(dentry); spin_unlock(&dentry->d_lock); - fsnotify_d_instantiate(dentry, inode); } /** diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 7ee1774edee5..0141f257d67b 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -16,15 +16,6 @@ #include #include -/* - * fsnotify_d_instantiate - instantiate a dentry for inode - */ -static inline void fsnotify_d_instantiate(struct dentry *dentry, - struct inode *inode) -{ - __fsnotify_d_instantiate(dentry, inode); -} - /* Notify this dentry's parent about a child's events. */ static inline int fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) { diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 6b7e89f45aa4..827b249259f7 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -293,14 +293,9 @@ static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) /* * fsnotify_d_instantiate - instantiate a dentry for inode */ -static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) +static inline void __fsnotify_d_instantiate(struct dentry *dentry) { - if (!inode) - return; - - spin_lock(&dentry->d_lock); __fsnotify_update_dcache_flags(dentry); - spin_unlock(&dentry->d_lock); } /* called from fsnotify listeners, such as fanotify or dnotify */ @@ -399,7 +394,7 @@ static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt) static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) {} -static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) +static inline void __fsnotify_d_instantiate(struct dentry *dentry) {} static inline u32 fsnotify_get_cookie(void) From de689f5e366373682e95059e9b89d981187e4544 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 9 Mar 2016 18:05:42 -0500 Subject: [PATCH 29/32] don't bother with __d_instantiate(dentry, NULL) it's a no-op - bumping ->d_seq is pointless there. Signed-off-by: Al Viro --- fs/dcache.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 244fd2487fe9..57da4127ea04 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1745,13 +1745,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); spin_lock(&dentry->d_lock); - if (inode) - hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); - if (inode) - __fsnotify_d_instantiate(dentry); + __fsnotify_d_instantiate(dentry); spin_unlock(&dentry->d_lock); } @@ -1773,11 +1771,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); - if (inode) + if (inode) { spin_lock(&inode->i_lock); - __d_instantiate(entry, inode); - if (inode) + __d_instantiate(entry, inode); spin_unlock(&inode->i_lock); + } security_d_instantiate(entry, inode); } EXPORT_SYMBOL(d_instantiate); @@ -2764,10 +2762,9 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) BUG_ON(!d_unhashed(dentry)); - if (!inode) { - __d_instantiate(dentry, NULL); + if (!inode) goto out; - } + spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); From ed782b5a70a016dbfe503089fd5c11dd74953cc4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 9 Mar 2016 19:52:39 -0500 Subject: [PATCH 30/32] dcache.c: new helper: __d_add() d_add() with inode->i_lock already held; common to d_add() and d_splice_alias(). All ->lookup() instances that end up hashing the dentry they are given will hash it here. This almost completes the preparations to parallel lookups proper - the only remaining bit is taking security_d_instantiate() past d_rehash() and doing rehashing without dropping ->d_lock. Signed-off-by: Al Viro --- fs/dcache.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 57da4127ea04..32ceae3e6112 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2360,6 +2360,19 @@ void d_rehash(struct dentry * entry) } EXPORT_SYMBOL(d_rehash); + +/* inode->i_lock held if inode is non-NULL */ + +static inline void __d_add(struct dentry *dentry, struct inode *inode) +{ + if (inode) { + __d_instantiate(dentry, inode); + spin_unlock(&inode->i_lock); + } + security_d_instantiate(dentry, inode); + d_rehash(dentry); +} + /** * d_add - add dentry to hash queues * @entry: dentry to add @@ -2371,8 +2384,9 @@ EXPORT_SYMBOL(d_rehash); void d_add(struct dentry *entry, struct inode *inode) { - d_instantiate(entry, inode); - d_rehash(entry); + if (inode) + spin_lock(&inode->i_lock); + __d_add(entry, inode); } EXPORT_SYMBOL(d_add); @@ -2798,12 +2812,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) return new; } } - /* already taking inode->i_lock, so d_add() by hand */ - __d_instantiate(dentry, inode); - spin_unlock(&inode->i_lock); out: - security_d_instantiate(dentry, inode); - d_rehash(dentry); + __d_add(dentry, inode); return NULL; } EXPORT_SYMBOL(d_splice_alias); From 5f8d498d4364f544fee17125787a47553db02afa Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 19 Mar 2015 11:10:54 +0000 Subject: [PATCH 31/32] vfs: show_vfsstat: do not ignore errors from show_devname method Explicitly check show_devname method return code and bail out in case of an error. This fixes regression introduced by commit 9d4d65748a5c. Cc: stable@vger.kernel.org Signed-off-by: Dmitry V. Levin Signed-off-by: Al Viro --- fs/proc_namespace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 2256e7e23e67..3f1190d18991 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -199,6 +199,8 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) if (sb->s_op->show_devname) { seq_puts(m, "device "); err = sb->s_op->show_devname(m, mnt_path.dentry); + if (err) + goto out; } else { if (r->mnt_devname) { seq_puts(m, "device "); From d6785d9152147596f60234157da2b02540c3e60f Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Thu, 10 Mar 2016 21:19:06 +0100 Subject: [PATCH 32/32] splice: handle zero nr_pages in splice_to_pipe() Running the following command: busybox cat /sys/kernel/debug/tracing/trace_pipe > /dev/null with any tracing enabled pretty very quickly leads to various NULL pointer dereferences and VM BUG_ON()s, such as these: BUG: unable to handle kernel NULL pointer dereference at 0000000000000020 IP: [] generic_pipe_buf_release+0xc/0x40 Call Trace: [] splice_direct_to_actor+0x143/0x1e0 [] ? generic_pipe_buf_nosteal+0x10/0x10 [] do_splice_direct+0x8f/0xb0 [] do_sendfile+0x199/0x380 [] SyS_sendfile64+0x90/0xa0 [] entry_SYSCALL_64_fastpath+0x12/0x6d page dumped because: VM_BUG_ON_PAGE(atomic_read(&page->_count) == 0) kernel BUG at include/linux/mm.h:367! invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC RIP: [] generic_pipe_buf_release+0x3c/0x40 Call Trace: [] splice_direct_to_actor+0x143/0x1e0 [] ? generic_pipe_buf_nosteal+0x10/0x10 [] do_splice_direct+0x8f/0xb0 [] do_sendfile+0x199/0x380 [] SyS_sendfile64+0x90/0xa0 [] tracesys_phase2+0x84/0x89 (busybox's cat uses sendfile(2), unlike the coreutils version) This is because tracing_splice_read_pipe() can call splice_to_pipe() with spd->nr_pages == 0. spd_pages underflows in splice_to_pipe() and we fill the page pointers and the other fields of the pipe_buffers with garbage. All other callers of splice_to_pipe() avoid calling it when nr_pages == 0, and we could make tracing_splice_read_pipe() do that too, but it seems reasonable to have splice_to_page() handle this condition gracefully. Cc: stable@vger.kernel.org Signed-off-by: Rabin Vincent Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/splice.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/splice.c b/fs/splice.c index 82bc0d64fc38..19e0b103d253 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, unsigned int spd_pages = spd->nr_pages; int ret, do_wakeup, page_nr; + if (!spd_pages) + return 0; + ret = 0; do_wakeup = 0; page_nr = 0;