vfs: syscall: Add open_tree(2) to reference or clone a mount
open_tree(dfd, pathname, flags) Returns an O_PATH-opened file descriptor or an error. dfd and pathname specify the location to open, in the usual fashion (see e.g. fstatat(2)). flags should be an OR of some of the following: * AT_EMPTY_PATH, AT_NO_AUTOMOUNT, AT_SYMLINK_NOFOLLOW - same meanings as usual * OPEN_TREE_CLOEXEC - make the resulting descriptor close-on-exec * OPEN_TREE_CLONE or OPEN_TREE_CLONE | AT_RECURSIVE - instead of opening the location in question, create a detached mount tree matching the subtree rooted at the location specified by dfd/pathname. With AT_RECURSIVE the entire subtree is cloned, without it - only the part within the mount containing the location in question. In other words, the same as mount --rbind or mount --bind would've taken. The detached tree will be dissolved on the final close of the obtained file. Creation of such detached trees requires the same capabilities as doing mount --bind. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: David Howells <dhowells@redhat.com> cc: linux-api@vger.kernel.org Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
This commit is contained in:
parent
9e98c678c2
commit
a07b200047
|
@ -398,7 +398,8 @@
|
|||
384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
|
||||
385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents
|
||||
386 i386 rseq sys_rseq __ia32_sys_rseq
|
||||
# don't use numbers 387 through 392, add new calls at the end
|
||||
387 i386 open_tree sys_open_tree __ia32_sys_open_tree
|
||||
# don't use numbers 388 through 392, add new calls at the end
|
||||
393 i386 semget sys_semget __ia32_sys_semget
|
||||
394 i386 semctl sys_semctl __ia32_compat_sys_semctl
|
||||
395 i386 shmget sys_shmget __ia32_sys_shmget
|
||||
|
|
|
@ -343,6 +343,7 @@
|
|||
332 common statx __x64_sys_statx
|
||||
333 common io_pgetevents __x64_sys_io_pgetevents
|
||||
334 common rseq __x64_sys_rseq
|
||||
335 common open_tree __x64_sys_open_tree
|
||||
# don't use numbers 387 through 423, add new calls after the last
|
||||
# 'common' entry
|
||||
424 common pidfd_send_signal __x64_sys_pidfd_send_signal
|
||||
|
|
|
@ -255,6 +255,7 @@ static void __fput(struct file *file)
|
|||
struct dentry *dentry = file->f_path.dentry;
|
||||
struct vfsmount *mnt = file->f_path.mnt;
|
||||
struct inode *inode = file->f_inode;
|
||||
fmode_t mode = file->f_mode;
|
||||
|
||||
if (unlikely(!(file->f_mode & FMODE_OPENED)))
|
||||
goto out;
|
||||
|
@ -277,18 +278,20 @@ static void __fput(struct file *file)
|
|||
if (file->f_op->release)
|
||||
file->f_op->release(inode, file);
|
||||
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
|
||||
!(file->f_mode & FMODE_PATH))) {
|
||||
!(mode & FMODE_PATH))) {
|
||||
cdev_put(inode->i_cdev);
|
||||
}
|
||||
fops_put(file->f_op);
|
||||
put_pid(file->f_owner.pid);
|
||||
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
|
||||
if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
|
||||
i_readcount_dec(inode);
|
||||
if (file->f_mode & FMODE_WRITER) {
|
||||
if (mode & FMODE_WRITER) {
|
||||
put_write_access(inode);
|
||||
__mnt_drop_write(mnt);
|
||||
}
|
||||
dput(dentry);
|
||||
if (unlikely(mode & FMODE_NEED_UNMOUNT))
|
||||
dissolve_on_fput(mnt);
|
||||
mntput(mnt);
|
||||
out:
|
||||
file_free(file);
|
||||
|
|
|
@ -94,6 +94,7 @@ extern int __mnt_want_write_file(struct file *);
|
|||
extern void __mnt_drop_write(struct vfsmount *);
|
||||
extern void __mnt_drop_write_file(struct file *);
|
||||
|
||||
extern void dissolve_on_fput(struct vfsmount *);
|
||||
/*
|
||||
* fs_struct.c
|
||||
*/
|
||||
|
|
157
fs/namespace.c
157
fs/namespace.c
|
@ -20,6 +20,7 @@
|
|||
#include <linux/init.h> /* init_rootfs */
|
||||
#include <linux/fs_struct.h> /* get_fs_root et.al. */
|
||||
#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
|
||||
#include <linux/file.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/proc_ns.h>
|
||||
#include <linux/magic.h>
|
||||
|
@ -1832,6 +1833,21 @@ struct vfsmount *collect_mounts(const struct path *path)
|
|||
return &tree->mnt;
|
||||
}
|
||||
|
||||
static void free_mnt_ns(struct mnt_namespace *);
|
||||
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
|
||||
|
||||
/*
 * Tear down the mount tree rooted at @mnt and free the mount namespace it
 * belongs to.
 *
 * Called from __fput() for files marked FMODE_NEED_UNMOUNT, and from
 * open_detached_copy() when opening the freshly cloned tree fails.
 *
 * The tree is unmounted with UMOUNT_CONNECTED while both the namespace lock
 * and the mount hash lock are held; the namespace itself is freed only after
 * both locks have been released.
 */
void dissolve_on_fput(struct vfsmount *mnt)
|
||||
{
|
||||
struct mnt_namespace *ns;
|
||||
namespace_lock();
|
||||
lock_mount_hash();
|
||||
/*
 * Fetch the namespace pointer before the tree is unmounted.
 * NOTE(review): presumably umount_tree() resets mnt_ns - confirm.
 */
ns = real_mount(mnt)->mnt_ns;
|
||||
umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
|
||||
unlock_mount_hash();
|
||||
namespace_unlock();
|
||||
free_mnt_ns(ns);
|
||||
}
|
||||
|
||||
void drop_collected_mounts(struct vfsmount *mnt)
|
||||
{
|
||||
namespace_lock();
|
||||
|
@ -2222,6 +2238,30 @@ static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
|
|||
return false;
|
||||
}
|
||||
|
||||
/*
 * Common helper for bind-style cloning: validate the source at @old_path and
 * produce a copy of its mount.
 *
 * @old_path: location whose mount is to be copied
 * @recurse:  non-zero to copy the whole tree with copy_tree(), zero to copy
 *            only the single mount with clone_mnt()
 *
 * Returns the new mount with MNT_LOCKED cleared, or an ERR_PTR():
 * -EINVAL if the source mount is unbindable, if it fails check_mnt() and its
 * dentry does not use ns_dentry_operations, or if a non-recursive copy is
 * requested and has_locked_children() reports locked children; otherwise
 * whatever error copy_tree()/clone_mnt() returned.
 *
 * Takes no locks itself; both callers in this file invoke it with the
 * namespace lock held (do_loopback() via lock_mount() - TODO confirm).
 */
static struct mount *__do_loopback(struct path *old_path, int recurse)
|
||||
{
|
||||
/* mnt starts out as the -EINVAL result shared by all early returns */
struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
|
||||
|
||||
if (IS_MNT_UNBINDABLE(old))
|
||||
return mnt;
|
||||
|
||||
if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
|
||||
return mnt;
|
||||
|
||||
if (!recurse && has_locked_children(old, old_path->dentry))
|
||||
return mnt;
|
||||
|
||||
if (recurse)
|
||||
mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
|
||||
else
|
||||
mnt = clone_mnt(old, old_path->dentry, 0);
|
||||
|
||||
/* only touch the flags when the copy actually succeeded */
if (!IS_ERR(mnt))
|
||||
mnt->mnt.mnt_flags &= ~MNT_LOCKED;
|
||||
|
||||
return mnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* do loopback mount.
|
||||
*/
|
||||
|
@ -2229,7 +2269,7 @@ static int do_loopback(struct path *path, const char *old_name,
|
|||
int recurse)
|
||||
{
|
||||
struct path old_path;
|
||||
struct mount *mnt = NULL, *old, *parent;
|
||||
struct mount *mnt = NULL, *parent;
|
||||
struct mountpoint *mp;
|
||||
int err;
|
||||
if (!old_name || !*old_name)
|
||||
|
@ -2243,38 +2283,21 @@ static int do_loopback(struct path *path, const char *old_name,
|
|||
goto out;
|
||||
|
||||
mp = lock_mount(path);
|
||||
err = PTR_ERR(mp);
|
||||
if (IS_ERR(mp))
|
||||
if (IS_ERR(mp)) {
|
||||
err = PTR_ERR(mp);
|
||||
goto out;
|
||||
}
|
||||
|
||||
old = real_mount(old_path.mnt);
|
||||
parent = real_mount(path->mnt);
|
||||
|
||||
err = -EINVAL;
|
||||
if (IS_MNT_UNBINDABLE(old))
|
||||
goto out2;
|
||||
|
||||
if (!check_mnt(parent))
|
||||
goto out2;
|
||||
|
||||
if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
|
||||
goto out2;
|
||||
|
||||
if (!recurse && has_locked_children(old, old_path.dentry))
|
||||
goto out2;
|
||||
|
||||
if (recurse)
|
||||
mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
|
||||
else
|
||||
mnt = clone_mnt(old, old_path.dentry, 0);
|
||||
|
||||
mnt = __do_loopback(&old_path, recurse);
|
||||
if (IS_ERR(mnt)) {
|
||||
err = PTR_ERR(mnt);
|
||||
goto out2;
|
||||
}
|
||||
|
||||
mnt->mnt.mnt_flags &= ~MNT_LOCKED;
|
||||
|
||||
err = graft_tree(mnt, parent, mp);
|
||||
if (err) {
|
||||
lock_mount_hash();
|
||||
|
@ -2288,6 +2311,96 @@ out:
|
|||
return err;
|
||||
}
|
||||
|
||||
/*
 * Clone the mount (tree) at @path into a newly allocated mount namespace and
 * return an O_PATH file opened on the clone.
 *
 * @path:      location to clone; on success of the clone step, path->mnt is
 *             replaced by the new mount and the reference on the original
 *             mount is dropped with mntput()
 * @recursive: passed through to __do_loopback() to select a whole-tree or
 *             single-mount copy
 *
 * The returned file is flagged FMODE_NEED_UNMOUNT, which makes __fput() call
 * dissolve_on_fput() on the final close.  If dentry_open() fails, the clone
 * is dissolved here instead.
 *
 * Returns the file, or an ERR_PTR() from alloc_mnt_ns(), __do_loopback() or
 * dentry_open().
 */
static struct file *open_detached_copy(struct path *path, bool recursive)
|
||||
{
|
||||
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
|
||||
/* NOTE(review): meaning of the 'true' argument is not visible here - confirm */
struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
|
||||
struct mount *mnt, *p;
|
||||
struct file *file;
|
||||
|
||||
if (IS_ERR(ns))
|
||||
return ERR_CAST(ns);
|
||||
|
||||
namespace_lock();
|
||||
mnt = __do_loopback(path, recursive);
|
||||
if (IS_ERR(mnt)) {
|
||||
namespace_unlock();
|
||||
free_mnt_ns(ns);
|
||||
return ERR_CAST(mnt);
|
||||
}
|
||||
|
||||
lock_mount_hash();
|
||||
/* hand every mount of the copy over to the new namespace and count it */
for (p = mnt; p; p = next_mnt(p, mnt)) {
|
||||
p->mnt_ns = ns;
|
||||
ns->mounts++;
|
||||
}
|
||||
ns->root = mnt;
|
||||
list_add_tail(&ns->list, &mnt->mnt_list);
|
||||
mntget(&mnt->mnt);
|
||||
unlock_mount_hash();
|
||||
namespace_unlock();
|
||||
|
||||
/* swap the caller's path over from the original mount to the clone */
mntput(path->mnt);
|
||||
path->mnt = &mnt->mnt;
|
||||
file = dentry_open(path, O_PATH, current_cred());
|
||||
if (IS_ERR(file))
|
||||
dissolve_on_fput(path->mnt);
|
||||
else
|
||||
file->f_mode |= FMODE_NEED_UNMOUNT;
|
||||
return file;
|
||||
}
|
||||
|
||||
/*
 * open_tree(2): return an O_PATH file descriptor for the location named by
 * dfd/filename or, with OPEN_TREE_CLONE, for a detached copy of the mount
 * (sub)tree at that location.
 *
 * Returns the new descriptor on success.  Fails with -EINVAL if @flags
 * contains anything outside AT_EMPTY_PATH, AT_NO_AUTOMOUNT, AT_RECURSIVE,
 * AT_SYMLINK_NOFOLLOW, OPEN_TREE_CLONE and OPEN_TREE_CLOEXEC, or if
 * AT_RECURSIVE is given without OPEN_TREE_CLONE; with -EPERM if a clone is
 * requested and may_mount() refuses.  Other errors are passed through from
 * get_unused_fd_flags(), user_path_at(), dentry_open() or
 * open_detached_copy().
 *
 * NOTE(review): the sys_open_tree() prototype in the syscalls header declares
 * the path argument as 'const char __user *'; the type used here lacks the
 * __user annotation.
 */
SYSCALL_DEFINE3(open_tree, int, dfd, const char *, filename, unsigned, flags)
|
||||
{
|
||||
struct file *file;
|
||||
struct path path;
|
||||
/* defaults: follow automounts and trailing symlinks unless flags say otherwise */
int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
|
||||
bool detached = flags & OPEN_TREE_CLONE;
|
||||
int error;
|
||||
int fd;
|
||||
|
||||
/* 'flags & O_CLOEXEC' is handed to get_unused_fd_flags() below */
BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
|
||||
|
||||
if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
|
||||
AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
|
||||
OPEN_TREE_CLOEXEC))
|
||||
return -EINVAL;
|
||||
|
||||
/* AT_RECURSIVE is only meaningful together with OPEN_TREE_CLONE */
if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
|
||||
return -EINVAL;
|
||||
|
||||
if (flags & AT_NO_AUTOMOUNT)
|
||||
lookup_flags &= ~LOOKUP_AUTOMOUNT;
|
||||
if (flags & AT_SYMLINK_NOFOLLOW)
|
||||
lookup_flags &= ~LOOKUP_FOLLOW;
|
||||
if (flags & AT_EMPTY_PATH)
|
||||
lookup_flags |= LOOKUP_EMPTY;
|
||||
|
||||
if (detached && !may_mount())
|
||||
return -EPERM;
|
||||
|
||||
/* reserve the descriptor up front; it is released again on any failure below */
fd = get_unused_fd_flags(flags & O_CLOEXEC);
|
||||
if (fd < 0)
|
||||
return fd;
|
||||
|
||||
error = user_path_at(dfd, filename, lookup_flags, &path);
|
||||
if (unlikely(error)) {
|
||||
file = ERR_PTR(error);
|
||||
} else {
|
||||
if (detached)
|
||||
file = open_detached_copy(&path, flags & AT_RECURSIVE);
|
||||
else
|
||||
file = dentry_open(&path, O_PATH, current_cred());
|
||||
/*
 * Drop the path reference; open_detached_copy() may have replaced
 * path.mnt with the cloned mount by this point.
 */
path_put(&path);
|
||||
}
|
||||
if (IS_ERR(file)) {
|
||||
put_unused_fd(fd);
|
||||
return PTR_ERR(file);
|
||||
}
|
||||
fd_install(fd, file);
|
||||
return fd;
|
||||
}
|
||||
|
||||
/*
|
||||
* Don't allow locked mount flags to be cleared.
|
||||
*
|
||||
|
|
|
@ -162,10 +162,13 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
|
|||
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
|
||||
|
||||
/* File is capable of returning -EAGAIN if I/O will block */
|
||||
#define FMODE_NOWAIT ((__force fmode_t)0x8000000)
|
||||
#define FMODE_NOWAIT ((__force fmode_t)0x8000000)
|
||||
|
||||
/* File represents mount that needs unmounting */
|
||||
#define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000)
|
||||
|
||||
/* File does not contribute to nr_files count */
|
||||
#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000)
|
||||
#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000)
|
||||
|
||||
/*
|
||||
* Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
|
||||
|
|
|
@ -985,6 +985,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
|
|||
unsigned mask, struct statx __user *buffer);
|
||||
asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
|
||||
int flags, uint32_t sig);
|
||||
asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
|
||||
asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
|
||||
siginfo_t __user *info,
|
||||
unsigned int flags);
|
||||
|
|
|
@ -91,5 +91,7 @@
|
|||
#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */
|
||||
#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */
|
||||
|
||||
#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
|
||||
|
||||
|
||||
#endif /* _UAPI_LINUX_FCNTL_H */
|
||||
|
|
|
@ -55,4 +55,10 @@
|
|||
#define MS_MGC_VAL 0xC0ED0000
|
||||
#define MS_MGC_MSK 0xffff0000
|
||||
|
||||
/*
|
||||
* open_tree() flags.
|
||||
*/
|
||||
#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */
|
||||
#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
|
||||
|
||||
#endif /* _UAPI_LINUX_MOUNT_H */
|
||||
|
|
Loading…
Reference in New Issue