diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 1f9607ed087c..4cd5f982b1e5 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -398,7 +398,12 @@ 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl 385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents 386 i386 rseq sys_rseq __ia32_sys_rseq -# don't use numbers 387 through 392, add new calls at the end +387 i386 open_tree sys_open_tree __ia32_sys_open_tree +388 i386 move_mount sys_move_mount __ia32_sys_move_mount +389 i386 fsopen sys_fsopen __ia32_sys_fsopen +390 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig +391 i386 fsmount sys_fsmount __ia32_sys_fsmount +392 i386 fspick sys_fspick __ia32_sys_fspick 393 i386 semget sys_semget __ia32_sys_semget 394 i386 semctl sys_semctl __ia32_compat_sys_semctl 395 i386 shmget sys_shmget __ia32_sys_shmget diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 92ee0b4378d4..64ca0d06259a 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -343,6 +343,12 @@ 332 common statx __x64_sys_statx 333 common io_pgetevents __x64_sys_io_pgetevents 334 common rseq __x64_sys_rseq +335 common open_tree __x64_sys_open_tree +336 common move_mount __x64_sys_move_mount +337 common fsopen __x64_sys_fsopen +338 common fsconfig __x64_sys_fsconfig +339 common fsmount __x64_sys_fsmount +340 common fspick __x64_sys_fspick # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal __x64_sys_pidfd_send_signal diff --git a/fs/Makefile b/fs/Makefile index 35945f8139e6..5a51bc2489ba 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -13,7 +13,7 @@ obj-y := open.o read_write.o file_table.o super.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ - fs_types.o fs_context.o fs_parser.o + fs_types.o fs_context.o fs_parser.o fsopen.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/file_table.c b/fs/file_table.c index 155d7514a094..3f9c1b452c1d 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -255,6 +255,7 @@ static void __fput(struct file *file) struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; struct inode *inode = file->f_inode; + fmode_t mode = file->f_mode; if (unlikely(!(file->f_mode & FMODE_OPENED))) goto out; @@ -277,18 +278,20 @@ static void __fput(struct file *file) if (file->f_op->release) file->f_op->release(inode, file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && - !(file->f_mode & FMODE_PATH))) { + !(mode & FMODE_PATH))) { cdev_put(inode->i_cdev); } fops_put(file->f_op); put_pid(file->f_owner.pid); - if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode); - if (file->f_mode & FMODE_WRITER) { + if (mode & FMODE_WRITER) { put_write_access(inode); __mnt_drop_write(mnt); } dput(dentry); + if (unlikely(mode & FMODE_NEED_UNMOUNT)) + dissolve_on_fput(mnt); mntput(mnt); out: file_free(file); diff --git a/fs/fs_context.c b/fs/fs_context.c index 87e3546b9a52..a47ccd5a4a78 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -11,6 +11,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -23,6 +24,7 @@ #include #include #include +#include #include "mount.h" #include "internal.h" @@ -271,6 +273,8 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, fc->cred = get_current_cred(); fc->net_ns = get_net(current->nsproxy->net_ns); + mutex_init(&fc->uapi_mutex); + switch (purpose) { case FS_CONTEXT_FOR_MOUNT: fc->user_ns = get_user_ns(fc->cred->user_ns); @@ -353,6 +357,8 @@ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) if (!fc) return ERR_PTR(-ENOMEM); + mutex_init(&fc->uapi_mutex); + fc->fs_private = NULL; fc->s_fs_info = NULL; fc->source = NULL; @@ -361,6 +367,8 @@ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) get_net(fc->net_ns); get_user_ns(fc->user_ns); get_cred(fc->cred); + if (fc->log) + refcount_inc(&fc->log->usage); /* Can't call put until we've called ->dup */ ret = fc->ops->dup(fc, src_fc); @@ -378,7 +386,6 @@ err_fc: } EXPORT_SYMBOL(vfs_dup_fs_context); -#ifdef CONFIG_PRINTK /** * logfc - Log a message to a filesystem context * @fc: The filesystem context to log to. @@ -386,27 +393,100 @@ EXPORT_SYMBOL(vfs_dup_fs_context); */ void logfc(struct fs_context *fc, const char *fmt, ...) { + static const char store_failure[] = "OOM: Can't store error string"; + struct fc_log *log = fc ? fc->log : NULL; + const char *p; va_list va; + char *q; + u8 freeable; va_start(va, fmt); - - switch (fmt[0]) { - case 'w': - vprintk_emit(0, LOGLEVEL_WARNING, NULL, 0, fmt, va); - break; - case 'e': - vprintk_emit(0, LOGLEVEL_ERR, NULL, 0, fmt, va); - break; - default: - vprintk_emit(0, LOGLEVEL_NOTICE, NULL, 0, fmt, va); - break; + if (!strchr(fmt, '%')) { + p = fmt; + goto unformatted_string; + } + if (strcmp(fmt, "%s") == 0) { + p = va_arg(va, const char *); + goto unformatted_string; } - pr_cont("\n"); + q = kvasprintf(GFP_KERNEL, fmt, va); +copied_string: + if (!q) + goto store_failure; + freeable = 1; + goto store_string; + +unformatted_string: + if ((unsigned long)p >= (unsigned long)__start_rodata && + (unsigned long)p < (unsigned long)__end_rodata) + goto const_string; + if (log && within_module_core((unsigned long)p, log->owner)) + goto const_string; + q = kstrdup(p, GFP_KERNEL); + goto copied_string; + +store_failure: + p = store_failure; +const_string: + q = (char *)p; + freeable = 0; +store_string: + if (!log) { + switch (fmt[0]) { + case 'w': + printk(KERN_WARNING "%s\n", q + 2); + break; + case 'e': + printk(KERN_ERR "%s\n", q + 2); + break; + default: + printk(KERN_NOTICE "%s\n", q + 2); + break; + } + if (freeable) + kfree(q); + } else { + unsigned int logsize = ARRAY_SIZE(log->buffer); + u8 index; + + index = log->head & (logsize - 1); + BUILD_BUG_ON(sizeof(log->head) != sizeof(u8) || + sizeof(log->tail) != sizeof(u8)); + if ((u8)(log->head - log->tail) == logsize) { + /* The buffer is full, discard the oldest message */ + if (log->need_free & (1 << index)) + kfree(log->buffer[index]); + log->tail++; + } + + log->buffer[index] = q; + log->need_free &= ~(1 << index); + log->need_free |= freeable << index; + log->head++; + } va_end(va); } EXPORT_SYMBOL(logfc); -#endif + +/* + * Free a logging structure. + */ +static void put_fc_log(struct fs_context *fc) +{ + struct fc_log *log = fc->log; + int i; + + if (log) { + if (refcount_dec_and_test(&log->usage)) { + fc->log = NULL; + for (i = 0; i <= 7; i++) + if (log->need_free & (1 << i)) + kfree(log->buffer[i]); + kfree(log); + } + } +} /** * put_fs_context - Dispose of a superblock configuration context. @@ -431,6 +511,7 @@ void put_fs_context(struct fs_context *fc) put_user_ns(fc->user_ns); put_cred(fc->cred); kfree(fc->subtype); + put_fc_log(fc); put_filesystem(fc->fs_type); kfree(fc->source); kfree(fc); @@ -640,3 +721,54 @@ int parse_monolithic_mount_data(struct fs_context *fc, void *data) return monolithic_mount_data(fc, data); } + +/* + * Clean up a context after performing an action on it and put it into a state + * from where it can be used to reconfigure a superblock. + * + * Note that here we do only the parts that can't fail; the rest is in + * finish_clean_context() below and in between those fs_context is marked + * FS_CONTEXT_AWAITING_RECONF. The reason for splitup is that after + * successful mount or remount we need to report success to userland. + * Trying to do full reinit (for the sake of possible subsequent remount) + * and failing to allocate memory would've put us into a nasty situation. + * So here we only discard the old state and reinitialization is left + * until we actually try to reconfigure. + */ +void vfs_clean_context(struct fs_context *fc) +{ + if (fc->need_free && fc->ops && fc->ops->free) + fc->ops->free(fc); + fc->need_free = false; + fc->fs_private = NULL; + fc->s_fs_info = NULL; + fc->sb_flags = 0; + security_free_mnt_opts(&fc->security); + kfree(fc->subtype); + fc->subtype = NULL; + kfree(fc->source); + fc->source = NULL; + + fc->purpose = FS_CONTEXT_FOR_RECONFIGURE; + fc->phase = FS_CONTEXT_AWAITING_RECONF; +} + +int finish_clean_context(struct fs_context *fc) +{ + int error; + + if (fc->phase != FS_CONTEXT_AWAITING_RECONF) + return 0; + + if (fc->fs_type->init_fs_context) + error = fc->fs_type->init_fs_context(fc); + else + error = legacy_init_fs_context(fc); + if (unlikely(error)) { + fc->phase = FS_CONTEXT_FAILED; + return error; + } + fc->need_free = true; + fc->phase = FS_CONTEXT_RECONF_PARAMS; + return 0; +} diff --git a/fs/fsopen.c b/fs/fsopen.c new file mode 100644 index 000000000000..3bb9c0c8cbcc --- /dev/null +++ b/fs/fsopen.c @@ -0,0 +1,477 @@ +/* Filesystem access-by-fd. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include "mount.h" + +/* + * Allow the user to read back any error, warning or informational messages. + */ +static ssize_t fscontext_read(struct file *file, + char __user *_buf, size_t len, loff_t *pos) +{ + struct fs_context *fc = file->private_data; + struct fc_log *log = fc->log; + unsigned int logsize = ARRAY_SIZE(log->buffer); + ssize_t ret; + char *p; + bool need_free; + int index, n; + + ret = mutex_lock_interruptible(&fc->uapi_mutex); + if (ret < 0) + return ret; + + if (log->head == log->tail) { + mutex_unlock(&fc->uapi_mutex); + return -ENODATA; + } + + index = log->tail & (logsize - 1); + p = log->buffer[index]; + need_free = log->need_free & (1 << index); + log->buffer[index] = NULL; + log->need_free &= ~(1 << index); + log->tail++; + mutex_unlock(&fc->uapi_mutex); + + ret = -EMSGSIZE; + n = strlen(p); + if (n > len) + goto err_free; + ret = -EFAULT; + if (copy_to_user(_buf, p, n) != 0) + goto err_free; + ret = n; + +err_free: + if (need_free) + kfree(p); + return ret; +} + +static int fscontext_release(struct inode *inode, struct file *file) +{ + struct fs_context *fc = file->private_data; + + if (fc) { + file->private_data = NULL; + put_fs_context(fc); + } + return 0; +} + +const struct file_operations fscontext_fops = { + .read = fscontext_read, + .release = fscontext_release, + .llseek = no_llseek, +}; + +/* + * Attach a filesystem context to a file and an fd. + */ +static int fscontext_create_fd(struct fs_context *fc, unsigned int o_flags) +{ + int fd; + + fd = anon_inode_getfd("fscontext", &fscontext_fops, fc, + O_RDWR | o_flags); + if (fd < 0) + put_fs_context(fc); + return fd; +} + +static int fscontext_alloc_log(struct fs_context *fc) +{ + fc->log = kzalloc(sizeof(*fc->log), GFP_KERNEL); + if (!fc->log) + return -ENOMEM; + refcount_set(&fc->log->usage, 1); + fc->log->owner = fc->fs_type->owner; + return 0; +} + +/* + * Open a filesystem by name so that it can be configured for mounting. + * + * We are allowed to specify a container in which the filesystem will be + * opened, thereby indicating which namespaces will be used (notably, which + * network namespace will be used for network filesystems). + */ +SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags) +{ + struct file_system_type *fs_type; + struct fs_context *fc; + const char *fs_name; + int ret; + + if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + if (flags & ~FSOPEN_CLOEXEC) + return -EINVAL; + + fs_name = strndup_user(_fs_name, PAGE_SIZE); + if (IS_ERR(fs_name)) + return PTR_ERR(fs_name); + + fs_type = get_fs_type(fs_name); + kfree(fs_name); + if (!fs_type) + return -ENODEV; + + fc = fs_context_for_mount(fs_type, 0); + put_filesystem(fs_type); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + fc->phase = FS_CONTEXT_CREATE_PARAMS; + + ret = fscontext_alloc_log(fc); + if (ret < 0) + goto err_fc; + + return fscontext_create_fd(fc, flags & FSOPEN_CLOEXEC ? O_CLOEXEC : 0); + +err_fc: + put_fs_context(fc); + return ret; +} + +/* + * Pick a superblock into a context for reconfiguration. + */ +SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags) +{ + struct fs_context *fc; + struct path target; + unsigned int lookup_flags; + int ret; + + if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + if ((flags & ~(FSPICK_CLOEXEC | + FSPICK_SYMLINK_NOFOLLOW | + FSPICK_NO_AUTOMOUNT | + FSPICK_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; + if (flags & FSPICK_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & FSPICK_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & FSPICK_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + ret = user_path_at(dfd, path, lookup_flags, &target); + if (ret < 0) + goto err; + + ret = -EINVAL; + if (target.mnt->mnt_root != target.dentry) + goto err_path; + + fc = fs_context_for_reconfigure(target.dentry, 0, 0); + if (IS_ERR(fc)) { + ret = PTR_ERR(fc); + goto err_path; + } + + fc->phase = FS_CONTEXT_RECONF_PARAMS; + + ret = fscontext_alloc_log(fc); + if (ret < 0) + goto err_fc; + + path_put(&target); + return fscontext_create_fd(fc, flags & FSPICK_CLOEXEC ? O_CLOEXEC : 0); + +err_fc: + put_fs_context(fc); +err_path: + path_put(&target); +err: + return ret; +} + +/* + * Check the state and apply the configuration. Note that this function is + * allowed to 'steal' the value by setting param->xxx to NULL before returning. + */ +static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, + struct fs_parameter *param) +{ + struct super_block *sb; + int ret; + + ret = finish_clean_context(fc); + if (ret) + return ret; + switch (cmd) { + case FSCONFIG_CMD_CREATE: + if (fc->phase != FS_CONTEXT_CREATE_PARAMS) + return -EBUSY; + fc->phase = FS_CONTEXT_CREATING; + ret = vfs_get_tree(fc); + if (ret) + break; + sb = fc->root->d_sb; + ret = security_sb_kern_mount(sb); + if (unlikely(ret)) { + fc_drop_locked(fc); + break; + } + up_write(&sb->s_umount); + fc->phase = FS_CONTEXT_AWAITING_MOUNT; + return 0; + case FSCONFIG_CMD_RECONFIGURE: + if (fc->phase != FS_CONTEXT_RECONF_PARAMS) + return -EBUSY; + fc->phase = FS_CONTEXT_RECONFIGURING; + sb = fc->root->d_sb; + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { + ret = -EPERM; + break; + } + down_write(&sb->s_umount); + ret = reconfigure_super(fc); + up_write(&sb->s_umount); + if (ret) + break; + vfs_clean_context(fc); + return 0; + default: + if (fc->phase != FS_CONTEXT_CREATE_PARAMS && + fc->phase != FS_CONTEXT_RECONF_PARAMS) + return -EBUSY; + + return vfs_parse_fs_param(fc, param); + } + fc->phase = FS_CONTEXT_FAILED; + return ret; +} + +/** + * sys_fsconfig - Set parameters and trigger actions on a context + * @fd: The filesystem context to act upon + * @cmd: The action to take + * @_key: Where appropriate, the parameter key to set + * @_value: Where appropriate, the parameter value to set + * @aux: Additional information for the value + * + * This system call is used to set parameters on a context, including + * superblock settings, data source and security labelling. + * + * Actions include triggering the creation of a superblock and the + * reconfiguration of the superblock attached to the specified context. + * + * When setting a parameter, @cmd indicates the type of value being proposed + * and @_key indicates the parameter to be altered. + * + * @_value and @aux are used to specify the value, should a value be required: + * + * (*) fsconfig_set_flag: No value is specified. The parameter must be boolean + * in nature. The key may be prefixed with "no" to invert the + * setting. @_value must be NULL and @aux must be 0. + * + * (*) fsconfig_set_string: A string value is specified. The parameter can be + * expecting boolean, integer, string or take a path. A conversion to an + * appropriate type will be attempted (which may include looking up as a + * path). @_value points to a NUL-terminated string and @aux must be 0. + * + * (*) fsconfig_set_binary: A binary blob is specified. @_value points to the + * blob and @aux indicates its size. The parameter must be expecting a + * blob. + * + * (*) fsconfig_set_path: A non-empty path is specified. The parameter must be + * expecting a path object. @_value points to a NUL-terminated string that + * is the path and @aux is a file descriptor at which to start a relative + * lookup or AT_FDCWD. + * + * (*) fsconfig_set_path_empty: As fsconfig_set_path, but with AT_EMPTY_PATH + * implied. + * + * (*) fsconfig_set_fd: An open file descriptor is specified. @_value must be + * NULL and @aux indicates the file descriptor. + */ +SYSCALL_DEFINE5(fsconfig, + int, fd, + unsigned int, cmd, + const char __user *, _key, + const void __user *, _value, + int, aux) +{ + struct fs_context *fc; + struct fd f; + int ret; + + struct fs_parameter param = { + .type = fs_value_is_undefined, + }; + + if (fd < 0) + return -EINVAL; + + switch (cmd) { + case FSCONFIG_SET_FLAG: + if (!_key || _value || aux) + return -EINVAL; + break; + case FSCONFIG_SET_STRING: + if (!_key || !_value || aux) + return -EINVAL; + break; + case FSCONFIG_SET_BINARY: + if (!_key || !_value || aux <= 0 || aux > 1024 * 1024) + return -EINVAL; + break; + case FSCONFIG_SET_PATH: + case FSCONFIG_SET_PATH_EMPTY: + if (!_key || !_value || (aux != AT_FDCWD && aux < 0)) + return -EINVAL; + break; + case FSCONFIG_SET_FD: + if (!_key || _value || aux < 0) + return -EINVAL; + break; + case FSCONFIG_CMD_CREATE: + case FSCONFIG_CMD_RECONFIGURE: + if (_key || _value || aux) + return -EINVAL; + break; + default: + return -EOPNOTSUPP; + } + + f = fdget(fd); + if (!f.file) + return -EBADF; + ret = -EINVAL; + if (f.file->f_op != &fscontext_fops) + goto out_f; + + fc = f.file->private_data; + if (fc->ops == &legacy_fs_context_ops) { + switch (cmd) { + case FSCONFIG_SET_BINARY: + case FSCONFIG_SET_PATH: + case FSCONFIG_SET_PATH_EMPTY: + case FSCONFIG_SET_FD: + ret = -EOPNOTSUPP; + goto out_f; + } + } + + if (_key) { + param.key = strndup_user(_key, 256); + if (IS_ERR(param.key)) { + ret = PTR_ERR(param.key); + goto out_f; + } + } + + switch (cmd) { + case FSCONFIG_SET_FLAG: + param.type = fs_value_is_flag; + break; + case FSCONFIG_SET_STRING: + param.type = fs_value_is_string; + param.string = strndup_user(_value, 256); + if (IS_ERR(param.string)) { + ret = PTR_ERR(param.string); + goto out_key; + } + param.size = strlen(param.string); + break; + case FSCONFIG_SET_BINARY: + param.type = fs_value_is_blob; + param.size = aux; + param.blob = memdup_user_nul(_value, aux); + if (IS_ERR(param.blob)) { + ret = PTR_ERR(param.blob); + goto out_key; + } + break; + case FSCONFIG_SET_PATH: + param.type = fs_value_is_filename; + param.name = getname_flags(_value, 0, NULL); + if (IS_ERR(param.name)) { + ret = PTR_ERR(param.name); + goto out_key; + } + param.dirfd = aux; + param.size = strlen(param.name->name); + break; + case FSCONFIG_SET_PATH_EMPTY: + param.type = fs_value_is_filename_empty; + param.name = getname_flags(_value, LOOKUP_EMPTY, NULL); + if (IS_ERR(param.name)) { + ret = PTR_ERR(param.name); + goto out_key; + } + param.dirfd = aux; + param.size = strlen(param.name->name); + break; + case FSCONFIG_SET_FD: + param.type = fs_value_is_file; + ret = -EBADF; + param.file = fget(aux); + if (!param.file) + goto out_key; + break; + default: + break; + } + + ret = mutex_lock_interruptible(&fc->uapi_mutex); + if (ret == 0) { + ret = vfs_fsconfig_locked(fc, cmd, ¶m); + mutex_unlock(&fc->uapi_mutex); + } + + /* Clean up the our record of any value that we obtained from + * userspace. Note that the value may have been stolen by the LSM or + * filesystem, in which case the value pointer will have been cleared. + */ + switch (cmd) { + case FSCONFIG_SET_STRING: + case FSCONFIG_SET_BINARY: + kfree(param.string); + break; + case FSCONFIG_SET_PATH: + case FSCONFIG_SET_PATH_EMPTY: + if (param.name) + putname(param.name); + break; + case FSCONFIG_SET_FD: + if (param.file) + fput(param.file); + break; + default: + break; + } +out_key: + kfree(param.key); +out_f: + fdput(f); + return ret; +} diff --git a/fs/internal.h b/fs/internal.h index 17a8ae967493..0010889f2e85 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -55,8 +55,11 @@ extern void __init chrdev_init(void); /* * fs_context.c */ +extern const struct fs_context_operations legacy_fs_context_ops; extern int parse_monolithic_mount_data(struct fs_context *, void *); extern void fc_drop_locked(struct fs_context *); +extern void vfs_clean_context(struct fs_context *fc); +extern int finish_clean_context(struct fs_context *fc); /* * namei.c @@ -92,6 +95,7 @@ extern void __init mnt_init(void); extern int __mnt_want_write_file(struct file *); extern void __mnt_drop_write_file(struct file *); +extern void dissolve_on_fput(struct vfsmount *); /* * fs_struct.c */ diff --git a/fs/namespace.c b/fs/namespace.c index c9cab307fa77..3357c3d65475 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -20,6 +20,7 @@ #include /* init_rootfs */ #include /* get_fs_root et.al. */ #include /* fsnotify_vfsmount_delete */ +#include #include #include #include @@ -1832,6 +1833,27 @@ struct vfsmount *collect_mounts(const struct path *path) return &tree->mnt; } +static void free_mnt_ns(struct mnt_namespace *); +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); + +void dissolve_on_fput(struct vfsmount *mnt) +{ + struct mnt_namespace *ns; + namespace_lock(); + lock_mount_hash(); + ns = real_mount(mnt)->mnt_ns; + if (ns) { + if (is_anon_ns(ns)) + umount_tree(real_mount(mnt), UMOUNT_CONNECTED); + else + ns = NULL; + } + unlock_mount_hash(); + namespace_unlock(); + if (ns) + free_mnt_ns(ns); +} + void drop_collected_mounts(struct vfsmount *mnt) { namespace_lock(); @@ -2065,6 +2087,10 @@ static int attach_recursive_mnt(struct mount *source_mnt, attach_mnt(source_mnt, dest_mnt, dest_mp); touch_mnt_namespace(source_mnt->mnt_ns); } else { + if (source_mnt->mnt_ns) { + /* move from anon - the caller will destroy */ + list_del_init(&source_mnt->mnt_ns->list); + } mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); commit_tree(source_mnt); } @@ -2222,6 +2248,30 @@ static bool has_locked_children(struct mount *mnt, struct dentry *dentry) return false; } +static struct mount *__do_loopback(struct path *old_path, int recurse) +{ + struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); + + if (IS_MNT_UNBINDABLE(old)) + return mnt; + + if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations) + return mnt; + + if (!recurse && has_locked_children(old, old_path->dentry)) + return mnt; + + if (recurse) + mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); + else + mnt = clone_mnt(old, old_path->dentry, 0); + + if (!IS_ERR(mnt)) + mnt->mnt.mnt_flags &= ~MNT_LOCKED; + + return mnt; +} + /* * do loopback mount. */ @@ -2229,7 +2279,7 @@ static int do_loopback(struct path *path, const char *old_name, int recurse) { struct path old_path; - struct mount *mnt = NULL, *old, *parent; + struct mount *mnt = NULL, *parent; struct mountpoint *mp; int err; if (!old_name || !*old_name) @@ -2243,38 +2293,21 @@ static int do_loopback(struct path *path, const char *old_name, goto out; mp = lock_mount(path); - err = PTR_ERR(mp); - if (IS_ERR(mp)) + if (IS_ERR(mp)) { + err = PTR_ERR(mp); goto out; + } - old = real_mount(old_path.mnt); parent = real_mount(path->mnt); - - err = -EINVAL; - if (IS_MNT_UNBINDABLE(old)) - goto out2; - if (!check_mnt(parent)) goto out2; - if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations) - goto out2; - - if (!recurse && has_locked_children(old, old_path.dentry)) - goto out2; - - if (recurse) - mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); - else - mnt = clone_mnt(old, old_path.dentry, 0); - + mnt = __do_loopback(&old_path, recurse); if (IS_ERR(mnt)) { err = PTR_ERR(mnt); goto out2; } - mnt->mnt.mnt_flags &= ~MNT_LOCKED; - err = graft_tree(mnt, parent, mp); if (err) { lock_mount_hash(); @@ -2288,6 +2321,96 @@ out: return err; } +static struct file *open_detached_copy(struct path *path, bool recursive) +{ + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true); + struct mount *mnt, *p; + struct file *file; + + if (IS_ERR(ns)) + return ERR_CAST(ns); + + namespace_lock(); + mnt = __do_loopback(path, recursive); + if (IS_ERR(mnt)) { + namespace_unlock(); + free_mnt_ns(ns); + return ERR_CAST(mnt); + } + + lock_mount_hash(); + for (p = mnt; p; p = next_mnt(p, mnt)) { + p->mnt_ns = ns; + ns->mounts++; + } + ns->root = mnt; + list_add_tail(&ns->list, &mnt->mnt_list); + mntget(&mnt->mnt); + unlock_mount_hash(); + namespace_unlock(); + + mntput(path->mnt); + path->mnt = &mnt->mnt; + file = dentry_open(path, O_PATH, current_cred()); + if (IS_ERR(file)) + dissolve_on_fput(path->mnt); + else + file->f_mode |= FMODE_NEED_UNMOUNT; + return file; +} + +SYSCALL_DEFINE3(open_tree, int, dfd, const char *, filename, unsigned, flags) +{ + struct file *file; + struct path path; + int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + bool detached = flags & OPEN_TREE_CLONE; + int error; + int fd; + + BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); + + if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | + AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | + OPEN_TREE_CLOEXEC)) + return -EINVAL; + + if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) + return -EINVAL; + + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + if (detached && !may_mount()) + return -EPERM; + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + error = user_path_at(dfd, filename, lookup_flags, &path); + if (unlikely(error)) { + file = ERR_PTR(error); + } else { + if (detached) + file = open_detached_copy(&path, flags & AT_RECURSIVE); + else + file = dentry_open(&path, O_PATH, current_cred()); + path_put(&path); + } + if (IS_ERR(file)) { + put_unused_fd(fd); + return PTR_ERR(file); + } + fd_install(fd, file); + return fd; +} + /* * Don't allow locked mount flags to be cleared. * @@ -2426,72 +2549,117 @@ static inline int tree_contains_unbindable(struct mount *mnt) return 0; } -static int do_move_mount(struct path *path, const char *old_name) +/* + * Check that there aren't references to earlier/same mount namespaces in the + * specified subtree. Such references can act as pins for mount namespaces + * that aren't checked by the mount-cycle checking code, thereby allowing + * cycles to be made. + */ +static bool check_for_nsfs_mounts(struct mount *subtree) { - struct path old_path, parent_path; + struct mount *p; + bool ret = false; + + lock_mount_hash(); + for (p = subtree; p; p = next_mnt(p, subtree)) + if (mnt_ns_loop(p->mnt.mnt_root)) + goto out; + + ret = true; +out: + unlock_mount_hash(); + return ret; +} + +static int do_move_mount(struct path *old_path, struct path *new_path) +{ + struct path parent_path = {.mnt = NULL, .dentry = NULL}; + struct mnt_namespace *ns; struct mount *p; struct mount *old; struct mountpoint *mp; int err; - if (!old_name || !*old_name) - return -EINVAL; - err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); - if (err) - return err; + bool attached; - mp = lock_mount(path); - err = PTR_ERR(mp); + mp = lock_mount(new_path); if (IS_ERR(mp)) + return PTR_ERR(mp); + + old = real_mount(old_path->mnt); + p = real_mount(new_path->mnt); + attached = mnt_has_parent(old); + ns = old->mnt_ns; + + err = -EINVAL; + /* The mountpoint must be in our namespace. */ + if (!check_mnt(p)) goto out; - old = real_mount(old_path.mnt); - p = real_mount(path->mnt); + /* The thing moved should be either ours or completely unattached. */ + if (attached && !check_mnt(old)) + goto out; - err = -EINVAL; - if (!check_mnt(p) || !check_mnt(old)) - goto out1; + if (!attached && !is_anon_ns(ns)) + goto out; if (old->mnt.mnt_flags & MNT_LOCKED) - goto out1; + goto out; - err = -EINVAL; - if (old_path.dentry != old_path.mnt->mnt_root) - goto out1; + if (old_path->dentry != old_path->mnt->mnt_root) + goto out; - if (!mnt_has_parent(old)) - goto out1; - - if (d_is_dir(path->dentry) != - d_is_dir(old_path.dentry)) - goto out1; + if (d_is_dir(new_path->dentry) != + d_is_dir(old_path->dentry)) + goto out; /* * Don't move a mount residing in a shared parent. */ - if (IS_MNT_SHARED(old->mnt_parent)) - goto out1; + if (attached && IS_MNT_SHARED(old->mnt_parent)) + goto out; /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. */ if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) - goto out1; + goto out; err = -ELOOP; + if (!check_for_nsfs_mounts(old)) + goto out; for (; mnt_has_parent(p); p = p->mnt_parent) if (p == old) - goto out1; + goto out; - err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path); + err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, + attached ? &parent_path : NULL); if (err) - goto out1; + goto out; /* if the mount is moved, it should no longer be expire * automatically */ list_del_init(&old->mnt_expire); -out1: - unlock_mount(mp); out: - if (!err) + unlock_mount(mp); + if (!err) { path_put(&parent_path); + if (!attached) + free_mnt_ns(ns); + } + return err; +} + +static int do_move_mount_old(struct path *path, const char *old_name) +{ + struct path old_path; + int err; + + if (!old_name || !*old_name) + return -EINVAL; + + err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); + if (err) + return err; + + err = do_move_mount(&old_path, path); path_put(&old_path); return err; } @@ -2937,7 +3105,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&path, flags); else if (flags & MS_MOVE) - retval = do_move_mount(&path, dev_name); + retval = do_move_mount_old(&path, dev_name); else retval = do_new_mount(&path, type_page, sb_flags, mnt_flags, dev_name, data_page); @@ -3165,6 +3333,203 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, return ksys_mount(dev_name, dir_name, type, flags, data); } +/* + * Create a kernel mount representation for a new, prepared superblock + * (specified by fs_fd) and attach to an open_tree-like file descriptor. + */ +SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, + unsigned int, attr_flags) +{ + struct mnt_namespace *ns; + struct fs_context *fc; + struct file *file; + struct path newmount; + struct mount *mnt; + struct fd f; + unsigned int mnt_flags = 0; + long ret; + + if (!may_mount()) + return -EPERM; + + if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) + return -EINVAL; + + if (attr_flags & ~(MOUNT_ATTR_RDONLY | + MOUNT_ATTR_NOSUID | + MOUNT_ATTR_NODEV | + MOUNT_ATTR_NOEXEC | + MOUNT_ATTR__ATIME | + MOUNT_ATTR_NODIRATIME)) + return -EINVAL; + + if (attr_flags & MOUNT_ATTR_RDONLY) + mnt_flags |= MNT_READONLY; + if (attr_flags & MOUNT_ATTR_NOSUID) + mnt_flags |= MNT_NOSUID; + if (attr_flags & MOUNT_ATTR_NODEV) + mnt_flags |= MNT_NODEV; + if (attr_flags & MOUNT_ATTR_NOEXEC) + mnt_flags |= MNT_NOEXEC; + if (attr_flags & MOUNT_ATTR_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + + switch (attr_flags & MOUNT_ATTR__ATIME) { + case MOUNT_ATTR_STRICTATIME: + break; + case MOUNT_ATTR_NOATIME: + mnt_flags |= MNT_NOATIME; + break; + case MOUNT_ATTR_RELATIME: + mnt_flags |= MNT_RELATIME; + break; + default: + return -EINVAL; + } + + f = fdget(fs_fd); + if (!f.file) + return -EBADF; + + ret = -EINVAL; + if (f.file->f_op != &fscontext_fops) + goto err_fsfd; + + fc = f.file->private_data; + + ret = mutex_lock_interruptible(&fc->uapi_mutex); + if (ret < 0) + goto err_fsfd; + + /* There must be a valid superblock or we can't mount it */ + ret = -EINVAL; + if (!fc->root) + goto err_unlock; + + ret = -EPERM; + if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { + pr_warn("VFS: Mount too revealing\n"); + goto err_unlock; + } + + ret = -EBUSY; + if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) + goto err_unlock; + + ret = -EPERM; + if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock()) + goto err_unlock; + + newmount.mnt = vfs_create_mount(fc); + if (IS_ERR(newmount.mnt)) { + ret = PTR_ERR(newmount.mnt); + goto err_unlock; + } + newmount.dentry = dget(fc->root); + newmount.mnt->mnt_flags = mnt_flags; + + /* We've done the mount bit - now move the file context into more or + * less the same state as if we'd done an fspick(). We don't want to + * do any memory allocation or anything like that at this point as we + * don't want to have to handle any errors incurred. + */ + vfs_clean_context(fc); + + ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); + if (IS_ERR(ns)) { + ret = PTR_ERR(ns); + goto err_path; + } + mnt = real_mount(newmount.mnt); + mnt->mnt_ns = ns; + ns->root = mnt; + ns->mounts = 1; + list_add(&mnt->mnt_list, &ns->list); + + /* Attach to an apparent O_PATH fd with a note that we need to unmount + * it, not just simply put it. + */ + file = dentry_open(&newmount, O_PATH, fc->cred); + if (IS_ERR(file)) { + dissolve_on_fput(newmount.mnt); + ret = PTR_ERR(file); + goto err_path; + } + file->f_mode |= FMODE_NEED_UNMOUNT; + + ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0); + if (ret >= 0) + fd_install(ret, file); + else + fput(file); + +err_path: + path_put(&newmount); +err_unlock: + mutex_unlock(&fc->uapi_mutex); +err_fsfd: + fdput(f); + return ret; +} + +/* + * Move a mount from one place to another. In combination with + * fsopen()/fsmount() this is used to install a new mount and in combination + * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy + * a mount subtree. + * + * Note the flags value is a combination of MOVE_MOUNT_* flags. + */ +SYSCALL_DEFINE5(move_mount, + int, from_dfd, const char *, from_pathname, + int, to_dfd, const char *, to_pathname, + unsigned int, flags) +{ + struct path from_path, to_path; + unsigned int lflags; + int ret = 0; + + if (!may_mount()) + return -EPERM; + + if (flags & ~MOVE_MOUNT__MASK) + return -EINVAL; + + /* If someone gives a pathname, they aren't permitted to move + * from an fd that requires unmount as we can't get at the flag + * to clear it afterwards. + */ + lflags = 0; + if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; + if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; + if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY; + + ret = user_path_at(from_dfd, from_pathname, lflags, &from_path); + if (ret < 0) + return ret; + + lflags = 0; + if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; + if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; + if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY; + + ret = user_path_at(to_dfd, to_pathname, lflags, &to_path); + if (ret < 0) + goto out_from; + + ret = security_move_mount(&from_path, &to_path); + if (ret < 0) + goto out_to; + + ret = do_move_mount(&from_path, &to_path); + +out_to: + path_put(&to_path); +out_from: + path_put(&from_path); + return ret; +} + /* * Return true if path is reachable from root * diff --git a/include/linux/fs.h b/include/linux/fs.h index 5174405e40d5..ec07f4c5630d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -165,10 +165,13 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) /* File is capable of returning -EAGAIN if I/O will block */ -#define FMODE_NOWAIT ((__force fmode_t)0x8000000) +#define FMODE_NOWAIT ((__force fmode_t)0x8000000) + +/* File represents mount that needs unmounting */ +#define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000) /* File does not contribute to nr_files count */ -#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) +#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index eaca452088fa..1f966670c8dc 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -13,8 +13,10 @@ #define _LINUX_FS_CONTEXT_H #include +#include #include #include +#include struct cred; struct dentry; @@ -34,6 +36,19 @@ enum fs_context_purpose { FS_CONTEXT_FOR_RECONFIGURE, /* Superblock reconfiguration (remount) */ }; +/* + * Userspace usage phase for fsopen/fspick. + */ +enum fs_context_phase { + FS_CONTEXT_CREATE_PARAMS, /* Loading params for sb creation */ + FS_CONTEXT_CREATING, /* A superblock is being created */ + FS_CONTEXT_AWAITING_MOUNT, /* Superblock created, awaiting fsmount() */ + FS_CONTEXT_AWAITING_RECONF, /* Awaiting initialisation for reconfiguration */ + FS_CONTEXT_RECONF_PARAMS, /* Loading params for reconfiguration */ + FS_CONTEXT_RECONFIGURING, /* Reconfiguring the superblock */ + FS_CONTEXT_FAILED, /* Failed to correctly transition a context */ +}; + /* * Type of parameter value. */ @@ -74,12 +89,14 @@ struct fs_parameter { */ struct fs_context { const struct fs_context_operations *ops; + struct mutex uapi_mutex; /* Userspace access mutex */ struct file_system_type *fs_type; void *fs_private; /* The filesystem's context */ struct dentry *root; /* The root and superblock */ struct user_namespace *user_ns; /* The user namespace for this mount */ struct net *net_ns; /* The network namespace for this mount */ const struct cred *cred; /* The mounter's credentials */ + struct fc_log *log; /* Logging buffer */ const char *source; /* The source name (eg. dev path) */ const char *subtype; /* The subtype to set on the superblock */ void *security; /* Linux S&M options */ @@ -88,6 +105,7 @@ struct fs_context { unsigned int sb_flags_mask; /* Superblock flags that were changed */ unsigned int lsm_flags; /* Information flags from the fs to the LSM */ enum fs_context_purpose purpose:8; + enum fs_context_phase phase:8; /* The phase the context is in */ bool need_free:1; /* Need to call ops->free() */ bool global:1; /* Goes into &init_user_ns */ }; @@ -135,15 +153,21 @@ extern int vfs_get_super(struct fs_context *fc, extern const struct file_operations fscontext_fops; -#ifdef CONFIG_PRINTK +/* + * Mount error, warning and informational message logging. This structure is + * shareable between a mount and a subordinate mount. + */ +struct fc_log { + refcount_t usage; + u8 head; /* Insertion index in buffer[] */ + u8 tail; /* Removal index in buffer[] */ + u8 need_free; /* Mask of kfree'able items in buffer[] */ + struct module *owner; /* Owner module for strings that don't then need freeing */ + char *buffer[8]; +}; + extern __attribute__((format(printf, 2, 3))) void logfc(struct fs_context *fc, const char *fmt, ...); -#else -static inline __attribute__((format(printf, 2, 3))) -void logfc(struct fs_context *fc, const char *fmt, ...) -{ -} -#endif /** * infof - Store supplementary informational message diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index f7e55d0d2672..47f58cfb6a19 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -159,6 +159,10 @@ * Parse a string of security data filling in the opts structure * @options string containing all mount options known by the LSM * @opts binary data structure usable by the LSM + * @move_mount: + * Check permission before a mount is moved. + * @from_path indicates the mount that is going to be moved. + * @to_path indicates the mountpoint that will be mounted upon. * @dentry_init_security: * Compute a context for a dentry as the inode is not yet available * since NFSv4 has no label backed by an EA anyway. @@ -1502,6 +1506,7 @@ union security_list_options { unsigned long *set_kern_flags); int (*sb_add_mnt_opt)(const char *option, const char *val, int len, void **mnt_opts); + int (*move_mount)(const struct path *from_path, const struct path *to_path); int (*dentry_init_security)(struct dentry *dentry, int mode, const struct qstr *name, void **ctx, u32 *ctxlen); @@ -1839,6 +1844,7 @@ struct security_hook_heads { struct hlist_head sb_set_mnt_opts; struct hlist_head sb_clone_mnt_opts; struct hlist_head sb_add_mnt_opt; + struct hlist_head move_mount; struct hlist_head dentry_init_security; struct hlist_head dentry_create_files_as; #ifdef CONFIG_SECURITY_PATH diff --git a/include/linux/module.h b/include/linux/module.h index 5bf5dcd91009..7dc4dc79b634 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -709,6 +709,12 @@ static inline bool is_module_text_address(unsigned long addr) return false; } +static inline bool within_module_core(unsigned long addr, + const struct module *mod) +{ + return false; +} + /* Get/put a kernel symbol (calls should be symmetric) */ #define symbol_get(x) ({ extern typeof(x) x __attribute__((weak)); &(x); }) #define symbol_put(x) do { } while (0) diff --git a/include/linux/security.h b/include/linux/security.h index d543293216b9..659071c2e57c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -251,6 +251,7 @@ int security_sb_clone_mnt_opts(const struct super_block *oldsb, unsigned long *set_kern_flags); int security_add_mnt_opt(const char *option, const char *val, int len, void **mnt_opts); +int security_move_mount(const struct path *from_path, const struct path *to_path); int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, void **ctx, u32 *ctxlen); @@ -614,6 +615,12 @@ static inline int security_add_mnt_opt(const char *option, const char *val, return 0; } +static inline int security_move_mount(const struct path *from_path, + const struct path *to_path) +{ + return 0; +} + static inline int security_inode_alloc(struct inode *inode) { return 0; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e446806a561f..e2870fe1be5b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -985,6 +985,15 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx __user *buffer); asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); +asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); +asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, + int to_dfd, const char __user *to_path, + unsigned int ms_flags); +asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags); +asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key, + const void __user *value, int aux); +asmlinkage long sys_fsmount(int fs_fd, unsigned int flags, unsigned int ms_flags); +asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags); asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index a2f8658f1c55..1d338357df8a 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -91,5 +91,7 @@ #define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */ #define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */ +#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ + #endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 3f9ec42510b0..96a0240f23fe 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -55,4 +55,66 @@ #define MS_MGC_VAL 0xC0ED0000 #define MS_MGC_MSK 0xffff0000 +/* + * open_tree() flags. + */ +#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ +#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ + +/* + * move_mount() flags. + */ +#define MOVE_MOUNT_F_SYMLINKS 0x00000001 /* Follow symlinks on from path */ +#define MOVE_MOUNT_F_AUTOMOUNTS 0x00000002 /* Follow automounts on from path */ +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ +#define MOVE_MOUNT_T_SYMLINKS 0x00000010 /* Follow symlinks on to path */ +#define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ +#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ +#define MOVE_MOUNT__MASK 0x00000077 + +/* + * fsopen() flags. + */ +#define FSOPEN_CLOEXEC 0x00000001 + +/* + * fspick() flags. + */ +#define FSPICK_CLOEXEC 0x00000001 +#define FSPICK_SYMLINK_NOFOLLOW 0x00000002 +#define FSPICK_NO_AUTOMOUNT 0x00000004 +#define FSPICK_EMPTY_PATH 0x00000008 + +/* + * The type of fsconfig() call made. + */ +enum fsconfig_command { + FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ + FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ + FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ + FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ + FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ + FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ + FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ + FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ +}; + +/* + * fsmount() flags. + */ +#define FSMOUNT_CLOEXEC 0x00000001 + +/* + * Mount attributes. + */ +#define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */ +#define MOUNT_ATTR_NOSUID 0x00000002 /* Ignore suid and sgid bits */ +#define MOUNT_ATTR_NODEV 0x00000004 /* Disallow access to device special files */ +#define MOUNT_ATTR_NOEXEC 0x00000008 /* Disallow program execution */ +#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */ +#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. */ +#define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */ +#define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */ +#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */ + #endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/samples/Kconfig b/samples/Kconfig index d19754ccad08..30a89425009c 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -154,10 +154,11 @@ config SAMPLE_ANDROID_BINDERFS Builds a sample program to illustrate the use of the Android binderfs filesystem. -config SAMPLE_STATX - bool "Build example extended-stat using code" - depends on BROKEN +config SAMPLE_VFS + bool "Build example programs that use new VFS system calls" help - Build example userspace program to use the new extended-stat syscall. + Build example userspace programs that use new VFS system calls such + as mount API and statx(). Note that this is restricted to the x86 + arch whilst it accesses system calls that aren't yet in all arches. endif # SAMPLES diff --git a/samples/Makefile b/samples/Makefile index fadadb1c3b05..2484cc262d3e 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -3,4 +3,4 @@ obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ trace_events/ livepatch/ \ hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/ \ configfs/ connector/ v4l/ trace_printk/ \ - vfio-mdev/ statx/ qmi/ binderfs/ pidfd/ + vfio-mdev/ vfs/ qmi/ binderfs/ pidfd/ diff --git a/samples/statx/Makefile b/samples/vfs/Makefile similarity index 55% rename from samples/statx/Makefile rename to samples/vfs/Makefile index 59df7c25a9d1..4ac9690fb3c4 100644 --- a/samples/statx/Makefile +++ b/samples/vfs/Makefile @@ -1,7 +1,10 @@ # List of programs to build -hostprogs-$(CONFIG_SAMPLE_STATX) := test-statx +hostprogs-$(CONFIG_SAMPLE_VFS) := \ + test-fsmount \ + test-statx # Tell kbuild to always build the programs always := $(hostprogs-y) +HOSTCFLAGS_test-fsmount.o += -I$(objtree)/usr/include HOSTCFLAGS_test-statx.o += -I$(objtree)/usr/include diff --git a/samples/vfs/test-fsmount.c b/samples/vfs/test-fsmount.c new file mode 100644 index 000000000000..266d72b3dce4 --- /dev/null +++ b/samples/vfs/test-fsmount.c @@ -0,0 +1,133 @@ +/* fd-based mount test. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define E(x) do { if ((x) == -1) { perror(#x); exit(1); } } while(0) + +static void check_messages(int fd) +{ + char buf[4096]; + int err, n; + + err = errno; + + for (;;) { + n = read(fd, buf, sizeof(buf)); + if (n < 0) + break; + n -= 2; + + switch (buf[0]) { + case 'e': + fprintf(stderr, "Error: %*.*s\n", n, n, buf + 2); + break; + case 'w': + fprintf(stderr, "Warning: %*.*s\n", n, n, buf + 2); + break; + case 'i': + fprintf(stderr, "Info: %*.*s\n", n, n, buf + 2); + break; + } + } + + errno = err; +} + +static __attribute__((noreturn)) +void mount_error(int fd, const char *s) +{ + check_messages(fd); + fprintf(stderr, "%s: %m\n", s); + exit(1); +} + +/* Hope -1 isn't a syscall */ +#ifndef __NR_fsopen +#define __NR_fsopen -1 +#endif +#ifndef __NR_fsmount +#define __NR_fsmount -1 +#endif +#ifndef __NR_fsconfig +#define __NR_fsconfig -1 +#endif +#ifndef __NR_move_mount +#define __NR_move_mount -1 +#endif + + +static inline int fsopen(const char *fs_name, unsigned int flags) +{ + return syscall(__NR_fsopen, fs_name, flags); +} + +static inline int fsmount(int fsfd, unsigned int flags, unsigned int ms_flags) +{ + return syscall(__NR_fsmount, fsfd, flags, ms_flags); +} + +static inline int fsconfig(int fsfd, unsigned int cmd, + const char *key, const void *val, int aux) +{ + return syscall(__NR_fsconfig, fsfd, cmd, key, val, aux); +} + +static inline int move_mount(int from_dfd, const char *from_pathname, + int to_dfd, const char *to_pathname, + unsigned int flags) +{ + return syscall(__NR_move_mount, + from_dfd, from_pathname, + to_dfd, to_pathname, flags); +} + +#define E_fsconfig(fd, cmd, key, val, aux) \ + do { \ + if (fsconfig(fd, cmd, key, val, aux) == -1) \ + mount_error(fd, key ?: "create"); \ + } while (0) + +int main(int argc, char *argv[]) +{ + int fsfd, mfd; + + /* Mount a publically available AFS filesystem */ + fsfd = fsopen("afs", 0); + if (fsfd == -1) { + perror("fsopen"); + exit(1); + } + + E_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "#grand.central.org:root.cell.", 0); + E_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); + + mfd = fsmount(fsfd, 0, MOUNT_ATTR_RDONLY); + if (mfd < 0) + mount_error(fsfd, "fsmount"); + E(close(fsfd)); + + if (move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH) < 0) { + perror("move_mount"); + exit(1); + } + + E(close(mfd)); + exit(0); +} diff --git a/samples/statx/test-statx.c b/samples/vfs/test-statx.c similarity index 96% rename from samples/statx/test-statx.c rename to samples/vfs/test-statx.c index d4d77b09412c..e91f918e84c4 100644 --- a/samples/statx/test-statx.c +++ b/samples/vfs/test-statx.c @@ -25,13 +25,21 @@ #include #include #include +#define statx foo +#define statx_timestamp foo_timestamp #include +#undef statx +#undef statx_timestamp #define AT_STATX_SYNC_TYPE 0x6000 #define AT_STATX_SYNC_AS_STAT 0x0000 #define AT_STATX_FORCE_SYNC 0x2000 #define AT_STATX_DONT_SYNC 0x4000 +#ifndef __NR_statx +#define __NR_statx -1 +#endif + static __attribute__((unused)) ssize_t statx(int dfd, const char *filename, unsigned flags, unsigned int mask, struct statx *buffer) @@ -157,7 +165,8 @@ static void dump_statx(struct statx *stx) "?dai?c??" /* 7- 0 0x00000000-000000ff */ ; - printf("Attributes: %016llx (", stx->stx_attributes); + printf("Attributes: %016llx (", + (unsigned long long)stx->stx_attributes); for (byte = 64 - 8; byte >= 0; byte -= 8) { bits = stx->stx_attributes >> byte; mbits = stx->stx_attributes_mask >> byte; diff --git a/security/security.c b/security/security.c index 8d6ef9da94eb..613a5c00e602 100644 --- a/security/security.c +++ b/security/security.c @@ -866,6 +866,11 @@ int security_add_mnt_opt(const char *option, const char *val, int len, } EXPORT_SYMBOL(security_add_mnt_opt); +int security_move_mount(const struct path *from_path, const struct path *to_path) +{ + return call_int_hook(move_mount, 0, from_path, to_path); +} + int security_inode_alloc(struct inode *inode) { int rc = lsm_inode_alloc(inode);