2019-05-21 01:08:01 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2018-11-04 16:19:03 +08:00
|
|
|
/* Filesystem superblock creation and reconfiguration context.
|
|
|
|
*
|
|
|
|
* Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
|
|
|
|
* Written by David Howells (dhowells@redhat.com)
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _LINUX_FS_CONTEXT_H
|
|
|
|
#define _LINUX_FS_CONTEXT_H
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
vfs: Implement logging through fs_context
Implement the ability for filesystems to log error, warning and
informational messages through the fs_context. These can be extracted by
userspace by reading from an fd created by fsopen().
Error messages are prefixed with "e ", warnings with "w " and informational
messages with "i ".
Inside the kernel, formatted messages are malloc'd but unformatted messages
are not copied if they're either in the core .rodata section or in the
.rodata section of the filesystem module pinned by fs_context::fs_type.
The messages are only good till the fs_type is released.
Note that the logging object is shared between duplicated fs_context
structures. This is so that filesystems such as NFS, which do a mount
within a mount, can get at least some of the errors from the inner mount.
Five logging functions are provided for this:
(1) void logfc(struct fs_context *fc, const char *fmt, ...);
This logs a message into the context. If the buffer is full, the
earliest message is discarded.
(2) void errorf(fc, fmt, ...);
This wraps logfc() to log an error.
(3) void invalf(fc, fmt, ...);
This wraps errorf() and returns -EINVAL for convenience.
(4) void warnf(fc, fmt, ...);
This wraps logfc() to log a warning.
(5) void infof(fc, fmt, ...);
This wraps logfc() to log an informational message.
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:34:29 +08:00
|
|
|
#include <linux/refcount.h>
|
2018-11-04 16:19:03 +08:00
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/security.h>
|
vfs: syscall: Add fsopen() to prepare for superblock creation
Provide an fsopen() system call that starts the process of preparing to
create a superblock that will then be mountable, using an fd as a context
handle. fsopen() is given the name of the filesystem that will be used:
int mfd = fsopen(const char *fsname, unsigned int flags);
where flags can be 0 or FSOPEN_CLOEXEC.
For example:
sfd = fsopen("ext4", FSOPEN_CLOEXEC);
fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD);
fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0);
fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
fsinfo(sfd, NULL, ...); // query new superblock attributes
mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
sfd = fsopen("afs", -1);
fsconfig(sfd, FSCONFIG_SET_STRING, "source",
"#grand.central.org:root.cell", 0);
fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
mfd = fsmount(sfd, 0, MS_NODEV);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
If an error is reported at any step, an error message may be available to be
read() back (ENODATA will be reported if there isn't an error available) in
the form:
"e <subsys>:<problem>"
"e SELinux:Mount on mountpoint not permitted"
Once fsmount() has been called, further fsconfig() calls will incur EBUSY,
even if the fsmount() fails. read() is still possible to retrieve error
information.
The fsopen() syscall creates a mount context and hangs it off the fd that it
returns.
Netlink is not used because it is optional and would make the core VFS
dependent on the networking layer and also potentially add network
namespace issues.
Note that, for the moment, the caller must have CAP_SYS_ADMIN to use
fsopen().
Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:33:31 +08:00
|
|
|
#include <linux/mutex.h>
|
2018-11-04 16:19:03 +08:00
|
|
|
|
|
|
|
/* Forward declarations of types that are defined elsewhere. */
struct cred;
struct dentry;
struct file_operations;
struct file_system_type;
struct mnt_namespace;
struct net;
struct pid_namespace;
struct super_block;
struct user_namespace;
struct vfsmount;
struct path;
|
2018-11-04 16:19:03 +08:00
|
|
|
|
|
|
|
/*
 * The purpose for which an fs_context is being used; stored in
 * fs_context::purpose.
 */
enum fs_context_purpose {
	FS_CONTEXT_FOR_MOUNT,		/* New superblock for explicit mount */
	FS_CONTEXT_FOR_SUBMOUNT,	/* New superblock for automatic submount */
	FS_CONTEXT_FOR_RECONFIGURE,	/* Superblock reconfiguration (remount) */
};
|
|
|
|
|
vfs: syscall: Add fsopen() to prepare for superblock creation
Provide an fsopen() system call that starts the process of preparing to
create a superblock that will then be mountable, using an fd as a context
handle. fsopen() is given the name of the filesystem that will be used:
int mfd = fsopen(const char *fsname, unsigned int flags);
where flags can be 0 or FSOPEN_CLOEXEC.
For example:
sfd = fsopen("ext4", FSOPEN_CLOEXEC);
fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD);
fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0);
fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
fsinfo(sfd, NULL, ...); // query new superblock attributes
mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
sfd = fsopen("afs", -1);
fsconfig(sfd, FSCONFIG_SET_STRING, "source",
"#grand.central.org:root.cell", 0);
fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
mfd = fsmount(sfd, 0, MS_NODEV);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
If an error is reported at any step, an error message may be available to be
read() back (ENODATA will be reported if there isn't an error available) in
the form:
"e <subsys>:<problem>"
"e SELinux:Mount on mountpoint not permitted"
Once fsmount() has been called, further fsconfig() calls will incur EBUSY,
even if the fsmount() fails. read() is still possible to retrieve error
information.
The fsopen() syscall creates a mount context and hangs it off the fd that it
returns.
Netlink is not used because it is optional and would make the core VFS
dependent on the networking layer and also potentially add network
namespace issues.
Note that, for the moment, the caller must have CAP_SYS_ADMIN to use
fsopen().
Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:33:31 +08:00
|
|
|
/*
 * Userspace usage phase for fsopen/fspick.
 *
 * Stored in fs_context::phase.
 */
enum fs_context_phase {
	FS_CONTEXT_CREATE_PARAMS,	/* Loading params for sb creation */
	FS_CONTEXT_CREATING,		/* A superblock is being created */
	FS_CONTEXT_AWAITING_MOUNT,	/* Superblock created, awaiting fsmount() */
	FS_CONTEXT_AWAITING_RECONF,	/* Awaiting initialisation for reconfiguration */
	FS_CONTEXT_RECONF_PARAMS,	/* Loading params for reconfiguration */
	FS_CONTEXT_RECONFIGURING,	/* Reconfiguring the superblock */
	FS_CONTEXT_FAILED,		/* Failed to correctly transition a context */
};
|
|
|
|
|
vfs: Add configuration parser helpers
Because the new API passes in key,value parameters, match_token() cannot be
used with it. Instead, provide three new helpers to aid with parsing:
(1) fs_parse(). This takes a parameter and a simple static description of
all the parameters and maps the key name to an ID. It returns 1 on a
match, 0 on no match if unknowns should be ignored and some other
negative error code on a parse error.
The parameter description includes a list of key names to IDs, desired
parameter types and a list of enumeration name -> ID mappings.
[!] Note that for the moment I've required that the key->ID mapping
array is expected to be sorted and unterminated. The size of the
array is noted in the fsconfig_parser struct. This allows me to use
bsearch(), but I'm not sure any performance gain is worth the hassle
of requiring people to keep the array sorted.
The parameter type array is sized according to the number of parameter
IDs and is indexed directly. The optional enum mapping array is an
unterminated, unsorted list and the size goes into the fsconfig_parser
struct.
The function can do some additional things:
(a) If it's not ambiguous and no value is given, the prefix "no" on
a key name is permitted to indicate that the parameter should
be considered negatory.
(b) If the desired type is a single simple integer, it will perform
an appropriate conversion and store the result in a union in
the parse result.
(c) If the desired type is an enumeration, {key ID, name} will be
looked up in the enumeration list and the matching value will
be stored in the parse result union.
(d) Optionally generate an error if the key is unrecognised.
This is called something like:
enum rdt_param {
Opt_cdp,
Opt_cdpl2,
Opt_mba_mpbs,
nr__rdt_params
};
const struct fs_parameter_spec rdt_param_specs[nr__rdt_params] = {
[Opt_cdp] = { fs_param_is_bool },
[Opt_cdpl2] = { fs_param_is_bool },
[Opt_mba_mpbs] = { fs_param_is_bool },
};
const char *const rdt_param_keys[nr__rdt_params] = {
[Opt_cdp] = "cdp",
[Opt_cdpl2] = "cdpl2",
[Opt_mba_mpbs] = "mba_mbps",
};
const struct fs_parameter_description rdt_parser = {
.name = "rdt",
.nr_params = nr__rdt_params,
.keys = rdt_param_keys,
.specs = rdt_param_specs,
.no_source = true,
};
int rdt_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
struct fs_parse_result parse;
struct rdt_fs_context *ctx = rdt_fc2context(fc);
int ret;
ret = fs_parse(fc, &rdt_parser, param, &parse);
if (ret < 0)
return ret;
switch (parse.key) {
case Opt_cdp:
ctx->enable_cdpl3 = true;
return 0;
case Opt_cdpl2:
ctx->enable_cdpl2 = true;
return 0;
case Opt_mba_mpbs:
ctx->enable_mba_mbps = true;
return 0;
}
return -EINVAL;
}
(2) fs_lookup_param(). This takes a { dirfd, path, LOOKUP_EMPTY? } or
string value and performs an appropriate path lookup to convert it
into a path object, which it will then return.
If the desired type was a blockdev, the type of the looked up inode
will be checked to make sure it is one.
This can be used like:
enum foo_param {
Opt_source,
nr__foo_params
};
const struct fs_parameter_spec foo_param_specs[nr__foo_params] = {
[Opt_source] = { fs_param_is_blockdev },
};
const char *const foo_param_keys[nr__foo_params] = {
[Opt_source] = "source",
};
const struct constant_table foo_param_alt_keys[] = {
{ "device", Opt_source },
};
const struct fs_parameter_description foo_parser = {
.name = "foo",
.nr_params = nr__foo_params,
.nr_alt_keys = ARRAY_SIZE(foo_param_alt_keys),
.keys = foo_param_keys,
.alt_keys = foo_param_alt_keys,
.specs = foo_param_specs,
};
int foo_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
struct fs_parse_result parse;
struct foo_fs_context *ctx = foo_fc2context(fc);
int ret;
ret = fs_parse(fc, &foo_parser, param, &parse);
if (ret < 0)
return ret;
switch (parse.key) {
case Opt_source:
return fs_lookup_param(fc, &foo_parser, param,
&parse, &ctx->source);
default:
return -EINVAL;
}
}
(3) lookup_constant(). This takes a table of named constants and looks up
the given name within it. The table is expected to be sorted such
that bsearch() can be used upon it.
Possibly I should require the table be terminated and just use a
for-loop to scan it instead of using bsearch() to reduce hassle.
Tables look something like:
static const struct constant_table bool_names[] = {
{ "0", false },
{ "1", true },
{ "false", false },
{ "no", false },
{ "true", true },
{ "yes", true },
};
and a lookup is done with something like:
b = lookup_constant(bool_names, param->string, -1);
Additionally, optional validation routines for the parameter description
are provided that can be enabled at compile time. A later patch will
invoke these when a filesystem is registered.
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:07:24 +08:00
|
|
|
/*
 * Type of parameter value.
 *
 * Stored in fs_parameter::type to say which kind of value the parameter
 * carries.
 */
enum fs_value_type {
	fs_value_is_undefined,
	fs_value_is_flag,		/* Value not given a value */
	fs_value_is_string,		/* Value is a string */
	fs_value_is_blob,		/* Value is a binary blob */
	fs_value_is_filename,		/* Value is a filename* + dirfd */
	fs_value_is_file,		/* Value is a file* */
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Configuration parameter.
|
|
|
|
*/
|
|
|
|
struct fs_parameter {
|
|
|
|
const char *key; /* Parameter name */
|
|
|
|
enum fs_value_type type:8; /* The type of value here */
|
|
|
|
union {
|
|
|
|
char *string;
|
|
|
|
void *blob;
|
|
|
|
struct filename *name;
|
|
|
|
struct file *file;
|
|
|
|
};
|
|
|
|
size_t size;
|
|
|
|
int dirfd;
|
|
|
|
};
|
|
|
|
|
2019-12-21 12:43:32 +08:00
|
|
|
/*
 * Logging handle: a prefix string paired with a pointer to an fc_log.
 * Embedded in struct fs_context as its ->log member.
 */
struct p_log {
	const char	*prefix;	/* NOTE(review): presumably prepended to messages - confirm */
	struct fc_log	*log;
};
|
|
|
|
|
2018-11-04 16:19:03 +08:00
|
|
|
/*
|
|
|
|
* Filesystem context for holding the parameters used in the creation or
|
|
|
|
* reconfiguration of a superblock.
|
|
|
|
*
|
|
|
|
* Superblock creation fills in ->root whereas reconfiguration begins with this
|
|
|
|
* already set.
|
|
|
|
*
|
2020-04-28 05:17:09 +08:00
|
|
|
* See Documentation/filesystems/mount_api.rst
|
2018-11-04 16:19:03 +08:00
|
|
|
*/
|
|
|
|
struct fs_context {
|
2018-12-24 07:55:56 +08:00
|
|
|
const struct fs_context_operations *ops;
|
vfs: syscall: Add fsopen() to prepare for superblock creation
Provide an fsopen() system call that starts the process of preparing to
create a superblock that will then be mountable, using an fd as a context
handle. fsopen() is given the name of the filesystem that will be used:
int mfd = fsopen(const char *fsname, unsigned int flags);
where flags can be 0 or FSOPEN_CLOEXEC.
For example:
sfd = fsopen("ext4", FSOPEN_CLOEXEC);
fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD);
fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0);
fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
fsinfo(sfd, NULL, ...); // query new superblock attributes
mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
sfd = fsopen("afs", -1);
fsconfig(fd, FSCONFIG_SET_STRING, "source",
"#grand.central.org:root.cell", 0);
fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
mfd = fsmount(sfd, 0, MS_NODEV);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
If an error is reported at any step, an error message may be available to be
read() back (ENODATA will be reported if there isn't an error available) in
the form:
"e <subsys>:<problem>"
"e SELinux:Mount on mountpoint not permitted"
Once fsmount() has been called, further fsconfig() calls will incur EBUSY,
even if the fsmount() fails. read() is still possible to retrieve error
information.
The fsopen() syscall creates a mount context and hangs it of the fd that it
returns.
Netlink is not used because it is optional and would make the core VFS
dependent on the networking layer and also potentially add network
namespace issues.
Note that, for the moment, the caller must have SYS_CAP_ADMIN to use
fsopen().
Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:33:31 +08:00
|
|
|
struct mutex uapi_mutex; /* Userspace access mutex */
|
2018-11-04 16:19:03 +08:00
|
|
|
struct file_system_type *fs_type;
|
|
|
|
void *fs_private; /* The filesystem's context */
|
2019-03-27 22:15:16 +08:00
|
|
|
void *sget_key;
|
2018-11-04 16:19:03 +08:00
|
|
|
struct dentry *root; /* The root and superblock */
|
|
|
|
struct user_namespace *user_ns; /* The user namespace for this mount */
|
|
|
|
struct net *net_ns; /* The network namespace for this mount */
|
|
|
|
const struct cred *cred; /* The mounter's credentials */
|
2019-12-21 13:16:49 +08:00
|
|
|
struct p_log log; /* Logging buffer */
|
2018-11-04 16:19:03 +08:00
|
|
|
const char *source; /* The source name (eg. dev path) */
|
2022-10-18 02:08:09 +08:00
|
|
|
void *security; /* LSM options */
|
2018-12-24 06:25:47 +08:00
|
|
|
void *s_fs_info; /* Proposed s_fs_info */
|
2018-11-04 16:19:03 +08:00
|
|
|
unsigned int sb_flags; /* Proposed superblock flags (SB_*) */
|
|
|
|
unsigned int sb_flags_mask; /* Superblock flags that were changed */
|
2019-03-26 00:38:23 +08:00
|
|
|
unsigned int s_iflags; /* OR'd with sb->s_iflags */
|
2018-11-04 16:19:03 +08:00
|
|
|
enum fs_context_purpose purpose:8;
|
vfs: syscall: Add fsopen() to prepare for superblock creation
Provide an fsopen() system call that starts the process of preparing to
create a superblock that will then be mountable, using an fd as a context
handle. fsopen() is given the name of the filesystem that will be used:
int mfd = fsopen(const char *fsname, unsigned int flags);
where flags can be 0 or FSOPEN_CLOEXEC.
For example:
sfd = fsopen("ext4", FSOPEN_CLOEXEC);
fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD);
fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0);
fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
fsinfo(sfd, NULL, ...); // query new superblock attributes
mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
sfd = fsopen("afs", -1);
fsconfig(fd, FSCONFIG_SET_STRING, "source",
"#grand.central.org:root.cell", 0);
fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
mfd = fsmount(sfd, 0, MS_NODEV);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
If an error is reported at any step, an error message may be available to be
read() back (ENODATA will be reported if there isn't an error available) in
the form:
"e <subsys>:<problem>"
"e SELinux:Mount on mountpoint not permitted"
Once fsmount() has been called, further fsconfig() calls will incur EBUSY,
even if the fsmount() fails. read() is still possible to retrieve error
information.
The fsopen() syscall creates a mount context and hangs it of the fd that it
returns.
Netlink is not used because it is optional and would make the core VFS
dependent on the networking layer and also potentially add network
namespace issues.
Note that, for the moment, the caller must have SYS_CAP_ADMIN to use
fsopen().
Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:33:31 +08:00
|
|
|
enum fs_context_phase phase:8; /* The phase the context is in */
|
2018-11-04 16:19:03 +08:00
|
|
|
bool need_free:1; /* Need to call ops->free() */
|
2018-12-24 06:25:47 +08:00
|
|
|
bool global:1; /* Goes into &init_user_ns */
|
2020-07-14 20:45:41 +08:00
|
|
|
bool oldapi:1; /* Coming from mount(2) */
|
fs: add FSCONFIG_CMD_CREATE_EXCL
Summary
=======
This introduces FSCONFIG_CMD_CREATE_EXCL which will allows userspace to
implement something like mount -t ext4 --exclusive /dev/sda /B which
fails if a superblock for the requested filesystem does already exist:
Before this patch
-----------------
$ sudo ./move-mount -f xfs -o source=/dev/sda4 /A
Requesting filesystem type xfs
Mount options requested: source=/dev/sda4
Attaching mount at /A
Moving single attached mount
Setting key(source) with val(/dev/sda4)
$ sudo ./move-mount -f xfs -o source=/dev/sda4 /B
Requesting filesystem type xfs
Mount options requested: source=/dev/sda4
Attaching mount at /B
Moving single attached mount
Setting key(source) with val(/dev/sda4)
After this patch with --exclusive as a switch for FSCONFIG_CMD_CREATE_EXCL
--------------------------------------------------------------------------
$ sudo ./move-mount -f xfs --exclusive -o source=/dev/sda4 /A
Requesting filesystem type xfs
Request exclusive superblock creation
Mount options requested: source=/dev/sda4
Attaching mount at /A
Moving single attached mount
Setting key(source) with val(/dev/sda4)
$ sudo ./move-mount -f xfs --exclusive -o source=/dev/sda4 /B
Requesting filesystem type xfs
Request exclusive superblock creation
Mount options requested: source=/dev/sda4
Attaching mount at /B
Moving single attached mount
Setting key(source) with val(/dev/sda4)
Device or resource busy | move-mount.c: 300: do_fsconfig: i xfs: reusing existing filesystem not allowed
Details
=======
As mentioned on the list (cf. [1]-[3]) mount requests like
mount -t ext4 /dev/sda /A are ambigous for userspace. Either a new
superblock has been created and mounted or an existing superblock has
been reused and a bind-mount has been created.
This becomes clear in the following example where two processes create
the same mount for the same block device:
P1 P2
fd_fs = fsopen("ext4"); fd_fs = fsopen("ext4");
fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda"); fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda");
fsconfig(fd_fs, FSCONFIG_SET_STRING, "dax", "always"); fsconfig(fd_fs, FSCONFIG_SET_STRING, "resuid", "1000");
// wins and creates superblock
fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)
// finds compatible superblock of P1
// spins until P1 sets SB_BORN and grabs a reference
fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)
fd_mnt1 = fsmount(fd_fs); fd_mnt2 = fsmount(fd_fs);
move_mount(fd_mnt1, "/A") move_mount(fd_mnt2, "/B")
Not just does P2 get a bind-mount but the mount options that P2
requestes are silently ignored. The VFS itself doesn't, can't and
shouldn't enforce filesystem specific mount option compatibility. It
only enforces incompatibility for read-only <-> read-write transitions:
mount -t ext4 /dev/sda /A
mount -t ext4 -o ro /dev/sda /B
The read-only request will fail with EBUSY as the VFS can't just
silently transition a superblock from read-write to read-only or vica
versa without risking security issues.
To userspace this silent superblock reuse can become a security issue in
because there is currently no straightforward way for userspace to know
that they did indeed manage to create a new superblock and didn't just
reuse an existing one.
This adds a new FSCONFIG_CMD_CREATE_EXCL command to fsconfig() that
returns EBUSY if an existing superblock would be reused. Userspace that
needs to be sure that it did create a new superblock with the requested
mount options can request superblock creation using this command. If the
command succeeds they can be sure that they did create a new superblock
with the requested mount options.
This requires the new mount api. With the old mount api it would be
necessary to plumb this through every legacy filesystem's
file_system_type->mount() method. If they want this feature they are
most welcome to switch to the new mount api.
Following is an analysis of the effect of FSCONFIG_CMD_CREATE_EXCL on
each high-level superblock creation helper:
(1) get_tree_nodev()
Always allocate new superblock. Hence, FSCONFIG_CMD_CREATE and
FSCONFIG_CMD_CREATE_EXCL are equivalent.
The binderfs or overlayfs filesystems are examples.
(4) get_tree_keyed()
Finds an existing superblock based on sb->s_fs_info. Hence,
FSCONFIG_CMD_CREATE would reuse an existing superblock whereas
FSCONFIG_CMD_CREATE_EXCL would reject it with EBUSY.
The mqueue or nfsd filesystems are examples.
(2) get_tree_bdev()
This effectively works like get_tree_keyed().
The ext4 or xfs filesystems are examples.
(3) get_tree_single()
Only one superblock of this filesystem type can ever exist.
Hence, FSCONFIG_CMD_CREATE would reuse an existing superblock
whereas FSCONFIG_CMD_CREATE_EXCL would reject it with EBUSY.
The securityfs or configfs filesystems are examples.
Note that some single-instance filesystems never destroy the
superblock once it has been created during the first mount. For
example, if securityfs has been mounted at least onces then the
created superblock will never be destroyed again as long as there is
still an LSM making use it. Consequently, even if securityfs is
unmounted and the superblock seemingly destroyed it really isn't
which means that FSCONFIG_CMD_CREATE_EXCL will continue rejecting
reusing an existing superblock.
This is acceptable thugh since special purpose filesystems such as
this shouldn't have a need to use FSCONFIG_CMD_CREATE_EXCL anyway
and if they do it's probably to make sure that mount options aren't
ignored.
Following is an analysis of the effect of FSCONFIG_CMD_CREATE_EXCL on
filesystems that make use of the low-level sget_fc() helper directly.
They're all effectively variants on get_tree_keyed(), get_tree_bdev(),
or get_tree_nodev():
(5) mtd_get_sb()
Similar logic to get_tree_keyed().
(6) afs_get_tree()
Similar logic to get_tree_keyed().
(7) ceph_get_tree()
Similar logic to get_tree_keyed().
Already explicitly allows forcing the allocation of a new superblock
via CEPH_OPT_NOSHARE. This turns it into get_tree_nodev().
(8) fuse_get_tree_submount()
Similar logic to get_tree_nodev().
(9) fuse_get_tree()
Forces reuse of existing FUSE superblock.
Forces reuse of existing superblock if passed in file refers to an
existing FUSE connection.
If FSCONFIG_CMD_CREATE_EXCL is specified together with an fd
referring to an existing FUSE connections this would cause the
superblock reusal to fail. If reusing is the intent then
FSCONFIG_CMD_CREATE_EXCL shouldn't be specified.
(10) fuse_get_tree()
-> get_tree_nodev()
Same logic as in get_tree_nodev().
(11) fuse_get_tree()
-> get_tree_bdev()
Same logic as in get_tree_bdev().
(12) virtio_fs_get_tree()
Same logic as get_tree_keyed().
(13) gfs2_meta_get_tree()
Forces reuse of existing gfs2 superblock.
Mounting gfs2meta enforces that a gf2s superblock must already
exist. If not, it will error out. Consequently, mounting gfs2meta
with FSCONFIG_CMD_CREATE_EXCL would always fail. If reusing is the
intent then FSCONFIG_CMD_CREATE_EXCL shouldn't be specified.
(14) kernfs_get_tree()
Similar logic to get_tree_keyed().
(15) nfs_get_tree_common()
Similar logic to get_tree_keyed().
Already explicitly allows forcing the allocation of a new superblock
via NFS_MOUNT_UNSHARED. This effectively turns it into
get_tree_nodev().
Link: [1] https://lore.kernel.org/linux-block/20230704-fasching-wertarbeit-7c6ffb01c83d@brauner
Link: [2] https://lore.kernel.org/linux-block/20230705-pumpwerk-vielversprechend-a4b1fd947b65@brauner
Link: [3] https://lore.kernel.org/linux-fsdevel/20230725-einnahmen-warnschilder-17779aec0a97@brauner
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Aleksa Sarai <cyphar@cyphar.com>
Message-Id: <20230802-vfs-super-exclusive-v2-4-95dc4e41b870@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-08-02 19:57:06 +08:00
|
|
|
bool exclusive:1; /* create new superblock, reject existing one */
|
2018-11-04 16:19:03 +08:00
|
|
|
};
|
|
|
|
|
2018-12-24 07:55:56 +08:00
|
|
|
/*
 * Operations table for a filesystem context; reached through
 * fs_context::ops.
 */
struct fs_context_operations {
	/* Called when fs_context::need_free is set ("Need to call ops->free()") */
	void (*free)(struct fs_context *fc);
	int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
	int (*parse_param)(struct fs_context *fc, struct fs_parameter *param);
	int (*parse_monolithic)(struct fs_context *fc, void *data);
	int (*get_tree)(struct fs_context *fc);
	int (*reconfigure)(struct fs_context *fc);
};
|
|
|
|
|
2018-11-04 16:19:03 +08:00
|
|
|
/*
 * fs_context manipulation functions.
 */
extern struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
						unsigned int sb_flags);
extern struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
						unsigned int sb_flags,
						unsigned int sb_flags_mask);
extern struct fs_context *fs_context_for_submount(struct file_system_type *fs_type,
						struct dentry *reference);

extern struct fs_context *vfs_dup_fs_context(struct fs_context *fc);
extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param);
extern int vfs_parse_fs_string(struct fs_context *fc, const char *key,
			       const char *value, size_t v_size);
int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
			     char *(*sep)(char **));
extern int generic_parse_monolithic(struct fs_context *fc, void *data);
extern int vfs_get_tree(struct fs_context *fc);
extern void put_fs_context(struct fs_context *fc);
extern int vfs_parse_fs_param_source(struct fs_context *fc,
				     struct fs_parameter *param);
|
cgroup1: fix leaked context root causing sporadic NULL deref in LTP
Richard reported sporadic (roughly one in 10 or so) null dereferences and
other strange behaviour for a set of automated LTP tests. Things like:
BUG: kernel NULL pointer dereference, address: 0000000000000008
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP PTI
CPU: 0 PID: 1516 Comm: umount Not tainted 5.10.0-yocto-standard #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-48-gd9c812dda519-prebuilt.qemu.org 04/01/2014
RIP: 0010:kernfs_sop_show_path+0x1b/0x60
...or these others:
RIP: 0010:do_mkdirat+0x6a/0xf0
RIP: 0010:d_alloc_parallel+0x98/0x510
RIP: 0010:do_readlinkat+0x86/0x120
There were other less common instances of some kind of a general scribble
but the common theme was mount and cgroup and a dubious dentry triggering
the NULL dereference. I was only able to reproduce it under qemu by
replicating Richard's setup as closely as possible - I never did get it
to happen on bare metal, even while keeping everything else the same.
In commit 71d883c37e8d ("cgroup_do_mount(): massage calling conventions")
we see this as a part of the overall change:
--------------
struct cgroup_subsys *ss;
- struct dentry *dentry;
[...]
- dentry = cgroup_do_mount(&cgroup_fs_type, fc->sb_flags, root,
- CGROUP_SUPER_MAGIC, ns);
[...]
- if (percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
- struct super_block *sb = dentry->d_sb;
- dput(dentry);
+ ret = cgroup_do_mount(fc, CGROUP_SUPER_MAGIC, ns);
+ if (!ret && percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+ struct super_block *sb = fc->root->d_sb;
+ dput(fc->root);
deactivate_locked_super(sb);
msleep(10);
return restart_syscall();
}
--------------
In changing from the local "*dentry" variable to using fc->root, we now
export/leave that dentry pointer in the file context after doing the dput()
in the unlikely "is_dying" case. With LTP doing a crazy amount of back to
back mount/unmount [testcases/bin/cgroup_regression_5_1.sh] the unlikely
becomes slightly likely and then bad things happen.
A fix would be to not leave the stale reference in fc->root as follows:
--------------
dput(fc->root);
+ fc->root = NULL;
deactivate_locked_super(sb);
--------------
...but then we are just open-coding a duplicate of fc_drop_locked() so we
simply use that instead.
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: stable@vger.kernel.org # v5.1+
Reported-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Fixes: 71d883c37e8d ("cgroup_do_mount(): massage calling conventions")
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2021-06-16 20:51:57 +08:00
|
|
|
/*
 * Equivalent of the open-coded sequence
 *	dput(fc->root); fc->root = NULL; deactivate_locked_super(sb);
 * so that no stale dentry pointer is left behind in fc->root.
 */
extern void fc_drop_locked(struct fs_context *fc);
int reconfigure_single(struct super_block *s,
		       int flags, void *data);

/*
 * Superblock acquisition helpers.  Each takes the context and a
 * fill_super() callback; get_tree_keyed() additionally takes a key.
 */
extern int get_tree_nodev(struct fs_context *fc,
			 int (*fill_super)(struct super_block *sb,
					   struct fs_context *fc));
extern int get_tree_single(struct fs_context *fc,
			 int (*fill_super)(struct super_block *sb,
					   struct fs_context *fc));
extern int get_tree_keyed(struct fs_context *fc,
			 int (*fill_super)(struct super_block *sb,
					   struct fs_context *fc),
			 void *key);

int setup_bdev_super(struct super_block *sb, int sb_flags,
		     struct fs_context *fc);
extern int get_tree_bdev(struct fs_context *fc,
			       int (*fill_super)(struct super_block *sb,
						 struct fs_context *fc));
|
|
|
|
|
vfs: Implement logging through fs_context
Implement the ability for filesystems to log error, warning and
informational messages through the fs_context. In the future, these will
be extractable by userspace by reading from an fd created by the fsopen()
syscall.
Error messages are prefixed with "e ", warnings with "w " and informational
messages with "i ".
In the future, inside the kernel, formatted messages will be malloc'd but
unformatted messages will not be copied if they're either in the core .rodata
section or in the .rodata section of the filesystem module pinned by
fs_context::fs_type. The messages will only be good till the fs_type is
released.
Note that the logging object will be shared between duplicated fs_context
structures. This is so that filesystems such as NFS, which do a mount within a
mount, can get at least some of the errors from the inner mount.
Five logging functions are provided for this:
(1) void logfc(struct fs_context *fc, const char *fmt, ...);
This logs a message into the context. If the buffer is full, the
earliest message is discarded.
(2) void errorf(fc, fmt, ...);
This wraps logfc() to log an error.
(3) void invalf(fc, fmt, ...);
This wraps errorf() and returns -EINVAL for convenience.
(4) void warnf(fc, fmt, ...);
This wraps logfc() to log a warning.
(5) void infof(fc, fmt, ...);
This wraps logfc() to log an informational message.
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:07:26 +08:00
|
|
|
extern const struct file_operations fscontext_fops;
|
|
|
|
|
vfs: Implement logging through fs_context
Implement the ability for filesystems to log error, warning and
informational messages through the fs_context. These can be extracted by
userspace by reading from an fd created by fsopen().
Error messages are prefixed with "e ", warnings with "w " and informational
messages with "i ".
Inside the kernel, formatted messages are malloc'd but unformatted messages
are not copied if they're either in the core .rodata section or in the
.rodata section of the filesystem module pinned by fs_context::fs_type.
The messages are only good till the fs_type is released.
Note that the logging object is shared between duplicated fs_context
structures. This is so that filesystems such as NFS, which do a mount within a
mount, can get at least some of the errors from the inner mount.
Five logging functions are provided for this:
(1) void logfc(struct fs_context *fc, const char *fmt, ...);
This logs a message into the context. If the buffer is full, the
earliest message is discarded.
(2) void errorf(fc, fmt, ...);
This wraps logfc() to log an error.
(3) void invalf(fc, fmt, ...);
This wraps errorf() and returns -EINVAL for convenience.
(4) void warnf(fc, fmt, ...);
This wraps logfc() to log a warning.
(5) void infof(fc, fmt, ...);
This wraps logfc() to log an informational message.
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:34:29 +08:00
|
|
|
/*
|
|
|
|
* Mount error, warning and informational message logging. This structure is
|
|
|
|
* shareable between a mount and a subordinate mount.
|
|
|
|
*/
|
|
|
|
struct fc_log {
|
|
|
|
refcount_t usage; /* Reference count; the log may be shared between mounts */
|
|
|
|
u8 head; /* Insertion index in buffer[] */
|
|
|
|
u8 tail; /* Removal index in buffer[] */
|
|
|
|
u8 need_free; /* Mask of kfree'able items in buffer[] */
|
|
|
|
struct module *owner; /* Owner module for strings that don't then need freeing */
|
|
|
|
char *buffer[8]; /* Message slots, indexed by head (insert) and tail (remove) */
|
|
|
|
};
|
|
|
|
|
2019-12-21 11:10:36 +08:00
|
|
|
/*
 * logfc - Log a printf-style message to a mount message log.
 * @log:    The fc_log to log into.
 * @prefix: Prefix string for the message; the __logfc() wrapper in this
 *          header passes NULL, __plog() passes the logger's own prefix.
 * @level:  Single-character message level; the wrappers in this header
 *          pass 'i', 'w' or 'e'.
 * @fmt:    printf-style format string (4th parameter; the variadic
 *          arguments start at the 5th, as declared by the format attribute).
 */
extern __attribute__((format(printf, 4, 5)))
|
|
|
|
void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...);
|
2018-11-02 07:07:23 +08:00
|
|
|
|
2019-12-21 13:16:49 +08:00
|
|
|
/*
 * __logfc - Log at level @l through a filesystem context, with no prefix.
 * Forwards to logfc() using (fc)->log.log as the log and NULL as the prefix.
 */
#define __logfc(fc, l, fmt, ...) logfc((fc)->log.log, NULL, \
|
2019-12-21 12:52:55 +08:00
|
|
|
l, fmt, ## __VA_ARGS__)
|
2019-12-21 12:43:32 +08:00
|
|
|
/*
 * __plog - Log at level @l through a logger object @p.
 * Forwards to logfc() using (p)->log as the log and (p)->prefix as the prefix.
 */
#define __plog(p, l, fmt, ...) logfc((p)->log, (p)->prefix, \
|
|
|
|
l, fmt, ## __VA_ARGS__)
|
2018-11-02 07:07:23 +08:00
|
|
|
/**
|
|
|
|
* infof - Store supplementary informational message
|
|
|
|
* @fc: The context in which to log the informational message
|
|
|
|
* @fmt: The format string
|
|
|
|
*
|
|
|
|
* Store the supplementary informational message for the process if the process
|
|
|
|
* has enabled the facility.
|
|
|
|
*/
|
2019-12-21 11:10:36 +08:00
|
|
|
#define infof(fc, fmt, ...) __logfc(fc, 'i', fmt, ## __VA_ARGS__) /* level 'i', NULL prefix */
|
2019-12-21 12:43:32 +08:00
|
|
|
/* Log an informational ('i') message through logger @p, using (p)->log and (p)->prefix. */
#define info_plog(p, fmt, ...) __plog(p, 'i', fmt, ## __VA_ARGS__)
|
2019-12-22 10:30:50 +08:00
|
|
|
/*
 * Log an informational ('i') message through the context's own logger:
 * passes &(fc)->log to __plog(), so that logger's prefix is applied.
 * The first parameter must be named fc because the expansion refers to (fc).
 */
#define infofc(fc, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__)
|
2018-11-02 07:07:23 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* warnf - Store supplementary warning message
|
|
|
|
* @fc: The context in which to log the warning message
|
|
|
|
* @fmt: The format string
|
|
|
|
*
|
|
|
|
* Store the supplementary warning message for the process if the process has
|
|
|
|
* enabled the facility.
|
|
|
|
*/
|
2019-12-21 11:10:36 +08:00
|
|
|
#define warnf(fc, fmt, ...) __logfc(fc, 'w', fmt, ## __VA_ARGS__) /* level 'w', NULL prefix */
|
2019-12-21 12:43:32 +08:00
|
|
|
/* Log a warning ('w') message through logger @p, using (p)->log and (p)->prefix. */
#define warn_plog(p, fmt, ...) __plog(p, 'w', fmt, ## __VA_ARGS__)
|
2019-12-22 10:30:50 +08:00
|
|
|
/* Log a warning ('w') message through the context's own logger, &(fc)->log, including its prefix. */
#define warnfc(fc, fmt, ...) __plog((&(fc)->log), 'w', fmt, ## __VA_ARGS__)
|
2018-11-02 07:07:23 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* errorf - Store supplementary error message
|
|
|
|
* @fc: The context in which to log the error message
|
|
|
|
* @fmt: The format string
|
|
|
|
*
|
|
|
|
* Store the supplementary error message for the process if the process has
|
|
|
|
* enabled the facility.
|
|
|
|
*/
|
2019-12-21 11:10:36 +08:00
|
|
|
#define errorf(fc, fmt, ...) __logfc(fc, 'e', fmt, ## __VA_ARGS__) /* level 'e', NULL prefix */
|
2019-12-21 12:43:32 +08:00
|
|
|
/* Log an error ('e') message through logger @p, using (p)->log and (p)->prefix. */
#define error_plog(p, fmt, ...) __plog(p, 'e', fmt, ## __VA_ARGS__)
|
2019-12-22 10:30:50 +08:00
|
|
|
/* Log an error ('e') message through the context's own logger, &(fc)->log, including its prefix. */
#define errorfc(fc, fmt, ...) __plog((&(fc)->log), 'e', fmt, ## __VA_ARGS__)
|
2018-11-02 07:07:23 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* invalf - Store supplementary invalid argument error message
|
|
|
|
* @fc: The context in which to log the error message
|
|
|
|
* @fmt: The format string
|
|
|
|
*
|
|
|
|
* Store the supplementary error message for the process if the process has
|
|
|
|
* enabled the facility and return -EINVAL.
|
|
|
|
*/
|
2019-12-21 11:10:36 +08:00
|
|
|
#define invalf(fc, fmt, ...) (errorf(fc, fmt, ## __VA_ARGS__), -EINVAL) /* comma expression: value is -EINVAL */
|
2019-12-21 12:43:32 +08:00
|
|
|
/* Log an error through logger @p via error_plog(); the whole expression evaluates to -EINVAL. */
#define inval_plog(p, fmt, ...) (error_plog(p, fmt, ## __VA_ARGS__), -EINVAL)
|
2019-12-22 10:30:50 +08:00
|
|
|
/* Log an error through the context's own logger via errorfc(); the whole expression evaluates to -EINVAL. */
#define invalfc(fc, fmt, ...) (errorfc(fc, fmt, ## __VA_ARGS__), -EINVAL)
|
2018-11-02 07:07:23 +08:00
|
|
|
|
2018-11-04 16:19:03 +08:00
|
|
|
#endif /* _LINUX_FS_CONTEXT_H */
|