2019-05-21 01:08:01 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2018-11-04 16:19:03 +08:00
|
|
|
/* Filesystem superblock creation and reconfiguration context.
|
|
|
|
*
|
|
|
|
* Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
|
|
|
|
* Written by David Howells (dhowells@redhat.com)
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _LINUX_FS_CONTEXT_H
|
|
|
|
#define _LINUX_FS_CONTEXT_H
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
vfs: Implement logging through fs_context
Implement the ability for filesystems to log error, warning and
informational messages through the fs_context. These can be extracted by
userspace by reading from an fd created by fsopen().
Error messages are prefixed with "e ", warnings with "w " and informational
messages with "i ".
Inside the kernel, formatted messages are malloc'd but unformatted messages
are not copied if they're either in the core .rodata section or in the
.rodata section of the filesystem module pinned by fs_context::fs_type.
The messages are only good till the fs_type is released.
Note that the logging object is shared between duplicated fs_context
structures. This is so that filesystems such as NFS which do a mount within a mount
can get at least some of the errors from the inner mount.
Five logging functions are provided for this:
(1) void logfc(struct fs_context *fc, const char *fmt, ...);
This logs a message into the context. If the buffer is full, the
earliest message is discarded.
(2) void errorf(fc, fmt, ...);
This wraps logfc() to log an error.
(3) void invalf(fc, fmt, ...);
This wraps errorf() and returns -EINVAL for convenience.
(4) void warnf(fc, fmt, ...);
This wraps logfc() to log a warning.
(5) void infof(fc, fmt, ...);
This wraps logfc() to log an informational message.
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:34:29 +08:00
|
|
|
#include <linux/refcount.h>
|
2018-11-04 16:19:03 +08:00
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/security.h>
|
vfs: syscall: Add fsopen() to prepare for superblock creation
Provide an fsopen() system call that starts the process of preparing to
create a superblock that will then be mountable, using an fd as a context
handle. fsopen() is given the name of the filesystem that will be used:
int mfd = fsopen(const char *fsname, unsigned int flags);
where flags can be 0 or FSOPEN_CLOEXEC.
For example:
sfd = fsopen("ext4", FSOPEN_CLOEXEC);
fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD);
fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0);
fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
fsinfo(sfd, NULL, ...); // query new superblock attributes
mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
sfd = fsopen("afs", -1);
fsconfig(fd, FSCONFIG_SET_STRING, "source",
"#grand.central.org:root.cell", 0);
fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
mfd = fsmount(sfd, 0, MS_NODEV);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
If an error is reported at any step, an error message may be available to be
read() back (ENODATA will be reported if there isn't an error available) in
the form:
"e <subsys>:<problem>"
"e SELinux:Mount on mountpoint not permitted"
Once fsmount() has been called, further fsconfig() calls will incur EBUSY,
even if the fsmount() fails. read() is still possible to retrieve error
information.
The fsopen() syscall creates a mount context and hangs it off the fd that it
returns.
Netlink is not used because it is optional and would make the core VFS
dependent on the networking layer and also potentially add network
namespace issues.
Note that, for the moment, the caller must have CAP_SYS_ADMIN to use
fsopen().
Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:33:31 +08:00
|
|
|
#include <linux/mutex.h>
|
2018-11-04 16:19:03 +08:00
|
|
|
|
|
|
|
/*
 * Forward declarations of types whose definitions are not part of this
 * header.
 */
struct cred;
struct dentry;
struct file_operations;
struct file_system_type;
struct mnt_namespace;
struct net;
struct pid_namespace;
struct super_block;
struct user_namespace;
struct vfsmount;
struct path;
|
2018-11-04 16:19:03 +08:00
|
|
|
|
|
|
|
/*
 * The purpose a filesystem context is being used for; stored in
 * fs_context::purpose.
 */
enum fs_context_purpose {
	FS_CONTEXT_FOR_MOUNT,		/* New superblock for explicit mount */
	FS_CONTEXT_FOR_SUBMOUNT,	/* New superblock for automatic submount */
	FS_CONTEXT_FOR_RECONFIGURE,	/* Superblock reconfiguration (remount) */
};
|
|
|
|
|
vfs: syscall: Add fsopen() to prepare for superblock creation
Provide an fsopen() system call that starts the process of preparing to
create a superblock that will then be mountable, using an fd as a context
handle. fsopen() is given the name of the filesystem that will be used:
int mfd = fsopen(const char *fsname, unsigned int flags);
where flags can be 0 or FSOPEN_CLOEXEC.
For example:
sfd = fsopen("ext4", FSOPEN_CLOEXEC);
fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD);
fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0);
fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
fsinfo(sfd, NULL, ...); // query new superblock attributes
mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
sfd = fsopen("afs", -1);
fsconfig(fd, FSCONFIG_SET_STRING, "source",
"#grand.central.org:root.cell", 0);
fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
mfd = fsmount(sfd, 0, MS_NODEV);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
If an error is reported at any step, an error message may be available to be
read() back (ENODATA will be reported if there isn't an error available) in
the form:
"e <subsys>:<problem>"
"e SELinux:Mount on mountpoint not permitted"
Once fsmount() has been called, further fsconfig() calls will incur EBUSY,
even if the fsmount() fails. read() is still possible to retrieve error
information.
The fsopen() syscall creates a mount context and hangs it off the fd that it
returns.
Netlink is not used because it is optional and would make the core VFS
dependent on the networking layer and also potentially add network
namespace issues.
Note that, for the moment, the caller must have CAP_SYS_ADMIN to use
fsopen().
Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:33:31 +08:00
|
|
|
/*
 * Userspace usage phase for fsopen/fspick; stored in fs_context::phase.
 */
enum fs_context_phase {
	FS_CONTEXT_CREATE_PARAMS,	/* Loading params for sb creation */
	FS_CONTEXT_CREATING,		/* A superblock is being created */
	FS_CONTEXT_AWAITING_MOUNT,	/* Superblock created, awaiting fsmount() */
	FS_CONTEXT_AWAITING_RECONF,	/* Awaiting initialisation for reconfiguration */
	FS_CONTEXT_RECONF_PARAMS,	/* Loading params for reconfiguration */
	FS_CONTEXT_RECONFIGURING,	/* Reconfiguring the superblock */
	FS_CONTEXT_FAILED,		/* Failed to correctly transition a context */
};
|
|
|
|
|
vfs: Add configuration parser helpers
Because the new API passes in key,value parameters, match_token() cannot be
used with it. Instead, provide three new helpers to aid with parsing:
(1) fs_parse(). This takes a parameter and a simple static description of
all the parameters and maps the key name to an ID. It returns 1 on a
match, 0 on no match if unknowns should be ignored and some other
negative error code on a parse error.
The parameter description includes a list of key names to IDs, desired
parameter types and a list of enumeration name -> ID mappings.
[!] Note that for the moment I've required that the key->ID mapping
array is expected to be sorted and unterminated. The size of the
array is noted in the fsconfig_parser struct. This allows me to use
bsearch(), but I'm not sure any performance gain is worth the hassle
of requiring people to keep the array sorted.
The parameter type array is sized according to the number of parameter
IDs and is indexed directly. The optional enum mapping array is an
unterminated, unsorted list and the size goes into the fsconfig_parser
struct.
The function can do some additional things:
(a) If it's not ambiguous and no value is given, the prefix "no" on
a key name is permitted to indicate that the parameter should
be considered negatory.
(b) If the desired type is a single simple integer, it will perform
an appropriate conversion and store the result in a union in
the parse result.
(c) If the desired type is an enumeration, {key ID, name} will be
looked up in the enumeration list and the matching value will
be stored in the parse result union.
(d) Optionally generate an error if the key is unrecognised.
This is called something like:
enum rdt_param {
Opt_cdp,
Opt_cdpl2,
Opt_mba_mpbs,
nr__rdt_params
};
const struct fs_parameter_spec rdt_param_specs[nr__rdt_params] = {
[Opt_cdp] = { fs_param_is_bool },
[Opt_cdpl2] = { fs_param_is_bool },
[Opt_mba_mpbs] = { fs_param_is_bool },
};
const char *const rdt_param_keys[nr__rdt_params] = {
[Opt_cdp] = "cdp",
[Opt_cdpl2] = "cdpl2",
[Opt_mba_mpbs] = "mba_mbps",
};
const struct fs_parameter_description rdt_parser = {
.name = "rdt",
.nr_params = nr__rdt_params,
.keys = rdt_param_keys,
.specs = rdt_param_specs,
.no_source = true,
};
int rdt_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
struct fs_parse_result parse;
struct rdt_fs_context *ctx = rdt_fc2context(fc);
int ret;
ret = fs_parse(fc, &rdt_parser, param, &parse);
if (ret < 0)
return ret;
switch (parse.key) {
case Opt_cdp:
ctx->enable_cdpl3 = true;
return 0;
case Opt_cdpl2:
ctx->enable_cdpl2 = true;
return 0;
case Opt_mba_mpbs:
ctx->enable_mba_mbps = true;
return 0;
}
return -EINVAL;
}
(2) fs_lookup_param(). This takes a { dirfd, path, LOOKUP_EMPTY? } or
string value and performs an appropriate path lookup to convert it
into a path object, which it will then return.
If the desired type was a blockdev, the type of the looked up inode
will be checked to make sure it is one.
This can be used like:
enum foo_param {
Opt_source,
nr__foo_params
};
const struct fs_parameter_spec foo_param_specs[nr__foo_params] = {
[Opt_source] = { fs_param_is_blockdev },
};
const char *const foo_param_keys[nr__foo_params] = {
[Opt_source] = "source",
};
const struct constant_table foo_param_alt_keys[] = {
{ "device", Opt_source },
};
const struct fs_parameter_description foo_parser = {
.name = "foo",
.nr_params = nr__foo_params,
.nr_alt_keys = ARRAY_SIZE(foo_param_alt_keys),
.keys = foo_param_keys,
.alt_keys = foo_param_alt_keys,
.specs = foo_param_specs,
};
int foo_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
struct fs_parse_result parse;
struct foo_fs_context *ctx = foo_fc2context(fc);
int ret;
ret = fs_parse(fc, &foo_parser, param, &parse);
if (ret < 0)
return ret;
switch (parse.key) {
case Opt_source:
return fs_lookup_param(fc, &foo_parser, param,
&parse, &ctx->source);
default:
return -EINVAL;
}
}
(3) lookup_constant(). This takes a table of named constants and looks up
the given name within it. The table is expected to be sorted such
that bsearch() can be used upon it.
Possibly I should require the table be terminated and just use a
for-loop to scan it instead of using bsearch() to reduce hassle.
Tables look something like:
static const struct constant_table bool_names[] = {
{ "0", false },
{ "1", true },
{ "false", false },
{ "no", false },
{ "true", true },
{ "yes", true },
};
and a lookup is done with something like:
b = lookup_constant(bool_names, param->string, -1);
Additionally, optional validation routines for the parameter description
are provided that can be enabled at compile time. A later patch will
invoke these when a filesystem is registered.
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:07:24 +08:00
|
|
|
/*
 * Type of parameter value; stored in fs_parameter::type.
 */
enum fs_value_type {
	fs_value_is_undefined,
	fs_value_is_flag,		/* Value not given a value */
	fs_value_is_string,		/* Value is a string */
	fs_value_is_blob,		/* Value is a binary blob */
	fs_value_is_filename,		/* Value is a filename* + dirfd */
	fs_value_is_file,		/* Value is a file* */
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Configuration parameter.
|
|
|
|
*/
|
|
|
|
struct fs_parameter {
|
|
|
|
const char *key; /* Parameter name */
|
|
|
|
enum fs_value_type type:8; /* The type of value here */
|
|
|
|
union {
|
|
|
|
char *string;
|
|
|
|
void *blob;
|
|
|
|
struct filename *name;
|
|
|
|
struct file *file;
|
|
|
|
};
|
|
|
|
size_t size;
|
|
|
|
int dirfd;
|
|
|
|
};
|
|
|
|
|
2019-12-21 12:43:32 +08:00
|
|
|
/*
 * A message log paired with a prefix.  __plog() hands both ->log and
 * ->prefix to logfc() for every message logged through this object.
 */
struct p_log {
	const char	*prefix;	/* Passed to logfc() as its prefix argument */
	struct fc_log	*log;		/* The log that receives the messages */
};
|
|
|
|
|
2018-11-04 16:19:03 +08:00
|
|
|
/*
|
|
|
|
* Filesystem context for holding the parameters used in the creation or
|
|
|
|
* reconfiguration of a superblock.
|
|
|
|
*
|
|
|
|
* Superblock creation fills in ->root whereas reconfiguration begins with this
|
|
|
|
* already set.
|
|
|
|
*
|
2020-04-28 05:17:09 +08:00
|
|
|
* See Documentation/filesystems/mount_api.rst
|
2018-11-04 16:19:03 +08:00
|
|
|
*/
|
|
|
|
struct fs_context {
|
2018-12-24 07:55:56 +08:00
|
|
|
const struct fs_context_operations *ops;
|
vfs: syscall: Add fsopen() to prepare for superblock creation
Provide an fsopen() system call that starts the process of preparing to
create a superblock that will then be mountable, using an fd as a context
handle. fsopen() is given the name of the filesystem that will be used:
int mfd = fsopen(const char *fsname, unsigned int flags);
where flags can be 0 or FSOPEN_CLOEXEC.
For example:
sfd = fsopen("ext4", FSOPEN_CLOEXEC);
fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD);
fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0);
fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
fsinfo(sfd, NULL, ...); // query new superblock attributes
mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
sfd = fsopen("afs", -1);
fsconfig(fd, FSCONFIG_SET_STRING, "source",
"#grand.central.org:root.cell", 0);
fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
mfd = fsmount(sfd, 0, MS_NODEV);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
If an error is reported at any step, an error message may be available to be
read() back (ENODATA will be reported if there isn't an error available) in
the form:
"e <subsys>:<problem>"
"e SELinux:Mount on mountpoint not permitted"
Once fsmount() has been called, further fsconfig() calls will incur EBUSY,
even if the fsmount() fails. read() is still possible to retrieve error
information.
The fsopen() syscall creates a mount context and hangs it of the fd that it
returns.
Netlink is not used because it is optional and would make the core VFS
dependent on the networking layer and also potentially add network
namespace issues.
Note that, for the moment, the caller must have SYS_CAP_ADMIN to use
fsopen().
Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:33:31 +08:00
|
|
|
struct mutex uapi_mutex; /* Userspace access mutex */
|
2018-11-04 16:19:03 +08:00
|
|
|
struct file_system_type *fs_type;
|
|
|
|
void *fs_private; /* The filesystem's context */
|
2019-03-27 22:15:16 +08:00
|
|
|
void *sget_key;
|
2018-11-04 16:19:03 +08:00
|
|
|
struct dentry *root; /* The root and superblock */
|
|
|
|
struct user_namespace *user_ns; /* The user namespace for this mount */
|
|
|
|
struct net *net_ns; /* The network namespace for this mount */
|
|
|
|
const struct cred *cred; /* The mounter's credentials */
|
2019-12-21 13:16:49 +08:00
|
|
|
struct p_log log; /* Logging buffer */
|
2018-11-04 16:19:03 +08:00
|
|
|
const char *source; /* The source name (eg. dev path) */
|
|
|
|
void *security; /* Linux S&M options */
|
2018-12-24 06:25:47 +08:00
|
|
|
void *s_fs_info; /* Proposed s_fs_info */
|
2018-11-04 16:19:03 +08:00
|
|
|
unsigned int sb_flags; /* Proposed superblock flags (SB_*) */
|
|
|
|
unsigned int sb_flags_mask; /* Superblock flags that were changed */
|
2019-03-26 00:38:23 +08:00
|
|
|
unsigned int s_iflags; /* OR'd with sb->s_iflags */
|
2018-11-02 07:07:24 +08:00
|
|
|
unsigned int lsm_flags; /* Information flags from the fs to the LSM */
|
2018-11-04 16:19:03 +08:00
|
|
|
enum fs_context_purpose purpose:8;
|
vfs: syscall: Add fsopen() to prepare for superblock creation
Provide an fsopen() system call that starts the process of preparing to
create a superblock that will then be mountable, using an fd as a context
handle. fsopen() is given the name of the filesystem that will be used:
int mfd = fsopen(const char *fsname, unsigned int flags);
where flags can be 0 or FSOPEN_CLOEXEC.
For example:
sfd = fsopen("ext4", FSOPEN_CLOEXEC);
fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD);
fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0);
fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0);
fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
fsinfo(sfd, NULL, ...); // query new superblock attributes
mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
sfd = fsopen("afs", -1);
fsconfig(fd, FSCONFIG_SET_STRING, "source",
"#grand.central.org:root.cell", 0);
fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
mfd = fsmount(sfd, 0, MS_NODEV);
move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
If an error is reported at any step, an error message may be available to be
read() back (ENODATA will be reported if there isn't an error available) in
the form:
"e <subsys>:<problem>"
"e SELinux:Mount on mountpoint not permitted"
Once fsmount() has been called, further fsconfig() calls will incur EBUSY,
even if the fsmount() fails. read() is still possible to retrieve error
information.
The fsopen() syscall creates a mount context and hangs it of the fd that it
returns.
Netlink is not used because it is optional and would make the core VFS
dependent on the networking layer and also potentially add network
namespace issues.
Note that, for the moment, the caller must have SYS_CAP_ADMIN to use
fsopen().
Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:33:31 +08:00
|
|
|
enum fs_context_phase phase:8; /* The phase the context is in */
|
2018-11-04 16:19:03 +08:00
|
|
|
bool need_free:1; /* Need to call ops->free() */
|
2018-12-24 06:25:47 +08:00
|
|
|
bool global:1; /* Goes into &init_user_ns */
|
2020-07-14 20:45:41 +08:00
|
|
|
bool oldapi:1; /* Coming from mount(2) */
|
2018-11-04 16:19:03 +08:00
|
|
|
};
|
|
|
|
|
2018-12-24 07:55:56 +08:00
|
|
|
/*
 * Operation table pointed to by fs_context::ops.
 */
struct fs_context_operations {
	/* Called when fs_context::need_free is set */
	void (*free)(struct fs_context *fc);
	int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
	/* Handle one key/value parameter */
	int (*parse_param)(struct fs_context *fc, struct fs_parameter *param);
	/* Handle an opaque, undivided option blob */
	int (*parse_monolithic)(struct fs_context *fc, void *data);
	/* The get_tree_*() helpers are intended to be called from here */
	int (*get_tree)(struct fs_context *fc);
	int (*reconfigure)(struct fs_context *fc);
};
|
|
|
|
|
2018-11-04 16:19:03 +08:00
|
|
|
/*
 * fs_context manipulation functions.
 */

/* Context constructors, one per enum fs_context_purpose value. */
extern struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
						unsigned int sb_flags);
extern struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
						unsigned int sb_flags,
						unsigned int sb_flags_mask);
extern struct fs_context *fs_context_for_submount(struct file_system_type *fs_type,
						struct dentry *reference);

extern struct fs_context *vfs_dup_fs_context(struct fs_context *fc);
extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param);
extern int vfs_parse_fs_string(struct fs_context *fc, const char *key,
			       const char *value, size_t v_size);
extern int generic_parse_monolithic(struct fs_context *fc, void *data);
extern int vfs_get_tree(struct fs_context *fc);
extern void put_fs_context(struct fs_context *fc);
extern int vfs_parse_fs_param_source(struct fs_context *fc,
				     struct fs_parameter *param);

/*
 * Equivalent to dput(fc->root), clearing fc->root, then
 * deactivate_locked_super() on the root's superblock.  Use this rather than
 * open-coding the sequence, so that no stale dentry pointer is left in
 * fc->root.
 */
extern void fc_drop_locked(struct fs_context *fc);

int reconfigure_single(struct super_block *s,
		       int flags, void *data);
|
2018-11-04 16:19:03 +08:00
|
|
|
|
2018-12-24 06:25:47 +08:00
|
|
|
/*
 * sget() wrappers to be called from the ->get_tree() op.
 */
enum vfs_get_super_keying {
	vfs_get_single_super,		/* Only one such superblock may exist */
	vfs_get_single_reconf_super,	/* As above, but reconfigure if it exists */
	vfs_get_keyed_super,		/* Superblocks with different s_fs_info keys may exist */
	vfs_get_independent_super,	/* Multiple independent superblocks may exist */
};
|
|
|
|
/*
 * Superblock acquisition helpers.  Each takes the context and a fill_super()
 * callback that receives the superblock together with the same context.
 */
extern int vfs_get_super(struct fs_context *fc,
			 enum vfs_get_super_keying keying,
			 int (*fill_super)(struct super_block *sb,
					   struct fs_context *fc));

extern int get_tree_nodev(struct fs_context *fc,
			  int (*fill_super)(struct super_block *sb,
					    struct fs_context *fc));
extern int get_tree_single(struct fs_context *fc,
			   int (*fill_super)(struct super_block *sb,
					     struct fs_context *fc));
extern int get_tree_single_reconf(struct fs_context *fc,
				  int (*fill_super)(struct super_block *sb,
						    struct fs_context *fc));
/* As above, with an additional caller-supplied key. */
extern int get_tree_keyed(struct fs_context *fc,
			  int (*fill_super)(struct super_block *sb,
					    struct fs_context *fc),
			  void *key);

extern int get_tree_bdev(struct fs_context *fc,
			 int (*fill_super)(struct super_block *sb,
					   struct fs_context *fc));
|
|
|
|
|
vfs: Implement logging through fs_context
Implement the ability for filesystems to log error, warning and
informational messages through the fs_context. In the future, these will
be extractable by userspace by reading from an fd created by the fsopen()
syscall.
Error messages are prefixed with "e ", warnings with "w " and informational
messages with "i ".
In the future, inside the kernel, formatted messages will be malloc'd but
unformatted messages will not be copied if they're either in the core .rodata
section or in the .rodata section of the filesystem module pinned by
fs_context::fs_type. The messages will only be good till the fs_type is
released.
Note that the logging object will be shared between duplicated fs_context
structures. This is so that filesystems such as NFS which do a mount within a mount
can get at least some of the errors from the inner mount.
Five logging functions are provided for this:
(1) void logfc(struct fs_context *fc, const char *fmt, ...);
This logs a message into the context. If the buffer is full, the
earliest message is discarded.
(2) void errorf(fc, fmt, ...);
This wraps logfc() to log an error.
(3) void invalf(fc, fmt, ...);
This wraps errorf() and returns -EINVAL for convenience.
(4) void warnf(fc, fmt, ...);
This wraps logfc() to log a warning.
(5) void infof(fc, fmt, ...);
This wraps logfc() to log an informational message.
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:07:26 +08:00
|
|
|
/* NOTE(review): presumably the file operations behind the fd created by fsopen() - confirm. */
extern const struct file_operations fscontext_fops;
|
|
|
|
|
vfs: Implement logging through fs_context
Implement the ability for filesystems to log error, warning and
informational messages through the fs_context. These can be extracted by
userspace by reading from an fd created by fsopen().
Error messages are prefixed with "e ", warnings with "w " and informational
messages with "i ".
Inside the kernel, formatted messages are malloc'd but unformatted messages
are not copied if they're either in the core .rodata section or in the
.rodata section of the filesystem module pinned by fs_context::fs_type.
The messages are only good till the fs_type is released.
Note that the logging object is shared between duplicated fs_context
structures. This is so that filesystems such as NFS which do a mount within a mount
can get at least some of the errors from the inner mount.
Five logging functions are provided for this:
(1) void logfc(struct fs_context *fc, const char *fmt, ...);
This logs a message into the context. If the buffer is full, the
earliest message is discarded.
(2) void errorf(fc, fmt, ...);
This wraps logfc() to log an error.
(3) void invalf(fc, fmt, ...);
This wraps errorf() and returns -EINVAL for convenience.
(4) void warnf(fc, fmt, ...);
This wraps logfc() to log a warning.
(5) void infof(fc, fmt, ...);
This wraps logfc() to log an informational message.
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-02 07:34:29 +08:00
|
|
|
/*
|
|
|
|
* Mount error, warning and informational message logging. This structure is
|
|
|
|
* shareable between a mount and a subordinate mount.
|
|
|
|
*/
|
|
|
|
struct fc_log {
|
|
|
|
refcount_t usage;
|
|
|
|
u8 head; /* Insertion index in buffer[] */
|
|
|
|
u8 tail; /* Removal index in buffer[] */
|
|
|
|
u8 need_free; /* Mask of kfree'able items in buffer[] */
|
|
|
|
struct module *owner; /* Owner module for strings that don't then need freeing */
|
|
|
|
char *buffer[8];
|
|
|
|
};
|
|
|
|
|
2019-12-21 11:10:36 +08:00
|
|
|
/*
 * logfc - Log a printf-style message.
 * @log:    The log to add the message to
 * @prefix: Prefix associated with the message; the __logfc() wrapper passes NULL
 * @level:  One-character severity tag; the wrapper macros pass 'i', 'w' or 'e'
 * @fmt:    printf-style format string, checked against the trailing arguments
 */
extern __attribute__((format(printf, 4, 5)))
void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...);
|
2018-11-02 07:07:23 +08:00
|
|
|
|
2019-12-21 13:16:49 +08:00
|
|
|
/*
 * Internal helpers behind the logging macros.
 *
 * __logfc() logs to the fs_context's log with no prefix; __plog() logs
 * through a struct p_log and passes its prefix along.
 */
#define __logfc(fc, l, fmt, ...) logfc((fc)->log.log, NULL, \
					l, fmt, ## __VA_ARGS__)
#define __plog(p, l, fmt, ...) logfc((p)->log, (p)->prefix, \
					l, fmt, ## __VA_ARGS__)
|
2018-11-02 07:07:23 +08:00
|
|
|
/**
 * infof - Store supplementary informational message
 * @fc: The context in which to log the informational message
 * @fmt: The format string
 *
 * Store the supplementary informational message for the process if the process
 * has enabled the facility.
 *
 * info_plog() is the same but takes a struct p_log pointer, and infofc() logs
 * through @fc's embedded struct p_log so that its prefix is passed along.
 */
#define infof(fc, fmt, ...) __logfc(fc, 'i', fmt, ## __VA_ARGS__)
#define info_plog(p, fmt, ...) __plog(p, 'i', fmt, ## __VA_ARGS__)
#define infofc(fc, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__)
|
2018-11-02 07:07:23 +08:00
|
|
|
|
|
|
|
/**
 * warnf - Store supplementary warning message
 * @fc: The context in which to log the error message
 * @fmt: The format string
 *
 * Store the supplementary warning message for the process if the process has
 * enabled the facility.
 *
 * warn_plog() is the same but takes a struct p_log pointer, and warnfc() logs
 * through @fc's embedded struct p_log so that its prefix is passed along.
 */
#define warnf(fc, fmt, ...) __logfc(fc, 'w', fmt, ## __VA_ARGS__)
#define warn_plog(p, fmt, ...) __plog(p, 'w', fmt, ## __VA_ARGS__)
#define warnfc(fc, fmt, ...) __plog((&(fc)->log), 'w', fmt, ## __VA_ARGS__)
|
2018-11-02 07:07:23 +08:00
|
|
|
|
|
|
|
/**
 * errorf - Store supplementary error message
 * @fc: The context in which to log the error message
 * @fmt: The format string
 *
 * Store the supplementary error message for the process if the process has
 * enabled the facility.
 *
 * error_plog() is the same but takes a struct p_log pointer, and errorfc()
 * logs through @fc's embedded struct p_log so that its prefix is passed along.
 */
#define errorf(fc, fmt, ...) __logfc(fc, 'e', fmt, ## __VA_ARGS__)
#define error_plog(p, fmt, ...) __plog(p, 'e', fmt, ## __VA_ARGS__)
#define errorfc(fc, fmt, ...) __plog((&(fc)->log), 'e', fmt, ## __VA_ARGS__)
|
2018-11-02 07:07:23 +08:00
|
|
|
|
|
|
|
/**
 * invalf - Store supplementary invalid argument error message
 * @fc: The context in which to log the error message
 * @fmt: The format string
 *
 * Store the supplementary error message for the process if the process has
 * enabled the facility and return -EINVAL.
 *
 * inval_plog() and invalfc() wrap error_plog() and errorfc() respectively in
 * the same way; each expression evaluates to -EINVAL.
 */
#define invalf(fc, fmt, ...) (errorf(fc, fmt, ## __VA_ARGS__), -EINVAL)
#define inval_plog(p, fmt, ...) (error_plog(p, fmt, ## __VA_ARGS__), -EINVAL)
#define invalfc(fc, fmt, ...) (errorfc(fc, fmt, ## __VA_ARGS__), -EINVAL)
|
2018-11-02 07:07:23 +08:00
|
|
|
|
2018-11-04 16:19:03 +08:00
|
|
|
#endif /* _LINUX_FS_CONTEXT_H */
|