memfd: replace ratcheting feature from vm.memfd_noexec with hierarchy

This sysctl has the very unusual behaviour of not allowing any user (even
CAP_SYS_ADMIN) to reduce the restriction setting, meaning that if you were
to set this sysctl to a more restrictive option in the host pidns you
would need to reboot your machine in order to reset it.

The justification given in [1] is that this is a security feature and thus
it should not be possible to disable.  Aside from the fact that we have
plenty of security-related sysctls that can be disabled after being
enabled (fs.protected_symlinks for instance), the protection provided by
the sysctl is to stop users from being able to create a binary and then
execute it.  A user with CAP_SYS_ADMIN can trivially do this without
memfd_create(2):

  % cat mount-memfd.c
  #include <fcntl.h>
  #include <string.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
  #include <linux/mount.h>

  #define SHELLCODE "#!/bin/echo this file was executed from this totally private tmpfs:"

  int main(void)
  {
  	int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC);
  	assert(fsfd >= 0);
  	assert(!fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 2));

  	int dfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
  	assert(dfd >= 0);

  	int execfd = openat(dfd, "exe", O_CREAT | O_RDWR | O_CLOEXEC, 0782);
  	assert(execfd >= 0);
  	assert(write(execfd, SHELLCODE, strlen(SHELLCODE)) == strlen(SHELLCODE));
  	assert(!close(execfd));

  	char *execpath = NULL;
  	char *argv[] = { "bad-exe", NULL }, *envp[] = { NULL };
  	execfd = openat(dfd, "exe", O_PATH | O_CLOEXEC);
  	assert(execfd >= 0);
  	assert(asprintf(&execpath, "/proc/self/fd/%d", execfd) > 0);
  	assert(!execve(execpath, argv, envp));
  }
  % ./mount-memfd
  this file was executed from this totally private tmpfs: /proc/self/fd/5
  %

Given that it is possible for CAP_SYS_ADMIN users to create executable
binaries without memfd_create(2) and without touching the host filesystem
(not to mention the many other things a CAP_SYS_ADMIN process would be
able to do that would be equivalent or worse), it seems strange to cause a
fair amount of headache to admins when there doesn't appear to be an
actual security benefit to blocking this.  There appear to be concerns
about confused-deputy-esque attacks[2] but a confused deputy that can
write to arbitrary sysctls is a bigger security issue than executable
memfds.

/* New API */

The primary requirement from the original author appears to be more based
on the need to be able to restrict an entire system in a hierarchical
manner[3], such that child namespaces cannot re-enable executable memfds.

So, implement that behaviour explicitly -- the vm.memfd_noexec scope is
evaluated up the pidns tree to &init_pid_ns and you have the most
restrictive value applied to you.  The new lower limit you can set
vm.memfd_noexec is whatever limit applies to your parent.

Note that a pidns will inherit a copy of the parent pidns's effective
vm.memfd_noexec setting at unshare() time.  This matches the existing
behaviour, and it also ensures that a pidns will never have its
vm.memfd_noexec setting *lowered* behind its back (but it will be raised
if the parent raises theirs).

/* Backwards Compatibility */

As the previous version of the sysctl didn't allow you to lower the
setting at all, there are no backwards compatibility issues with this
aspect of the change.

However it should be noted that now that the setting is completely
hierarchical.  Previously, a cloned pidns would just copy the current
pidns setting, meaning that if the parent's vm.memfd_noexec was changed it
wouldn't propoagate to existing pid namespaces.  Now, the restriction
applies recursively.  This is a uAPI change, however:

 * The sysctl is very new, having been merged in 6.3.
 * Several aspects of the sysctl were broken up until this patchset and
   the other patchset by Jeff Xu last month.

And thus it seems incredibly unlikely that any real users would run into
this issue. In the worst case, if this causes userspace isues we could
make it so that modifying the setting follows the hierarchical rules but
the restriction checking uses the cached copy.

[1]: https://lore.kernel.org/CABi2SkWnAgHK1i6iqSqPMYuNEhtHBkO8jUuCvmG3RmUB5TKHJw@mail.gmail.com/
[2]: https://lore.kernel.org/CALmYWFs_dNCzw_pW1yRAo4bGCPEtykroEQaowNULp7svwMLjOg@mail.gmail.com/
[3]: https://lore.kernel.org/CALmYWFuahdUF7cT4cm7_TGLqPanuHXJ-hVSfZt7vpTnc18DPrw@mail.gmail.com/

Link: https://lkml.kernel.org/r/20230814-memfd-vm-noexec-uapi-fixes-v2-4-7ff9e3e10ba6@cyphar.com
Fixes: 105ff5339f ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Cc: Dominique Martinet <asmadeus@codewreck.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Verkamp <dverkamp@chromium.org>
Cc: Jeff Xu <jeffxu@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Aleksa Sarai 2023-08-14 18:41:00 +10:00 committed by Andrew Morton
parent 434ed3350f
commit 9876cfe8ec
5 changed files with 42 additions and 21 deletions

View File

@ -39,7 +39,6 @@ struct pid_namespace {
int reboot; /* group exit code if this pidns was rebooted */ int reboot; /* group exit code if this pidns was rebooted */
struct ns_common ns; struct ns_common ns;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
/* sysctl for vm.memfd_noexec */
int memfd_noexec_scope; int memfd_noexec_scope;
#endif #endif
} __randomize_layout; } __randomize_layout;
@ -56,6 +55,23 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
return ns; return ns;
} }
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
int scope = MEMFD_NOEXEC_SCOPE_EXEC;
for (; ns; ns = ns->parent)
scope = max(scope, READ_ONCE(ns->memfd_noexec_scope));
return scope;
}
#else
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
return 0;
}
#endif
extern struct pid_namespace *copy_pid_ns(unsigned long flags, extern struct pid_namespace *copy_pid_ns(unsigned long flags,
struct user_namespace *user_ns, struct pid_namespace *ns); struct user_namespace *user_ns, struct pid_namespace *ns);
extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
@ -70,6 +86,11 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
return ns; return ns;
} }
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
return 0;
}
static inline struct pid_namespace *copy_pid_ns(unsigned long flags, static inline struct pid_namespace *copy_pid_ns(unsigned long flags,
struct user_namespace *user_ns, struct pid_namespace *ns) struct user_namespace *user_ns, struct pid_namespace *ns)
{ {

View File

@ -83,6 +83,9 @@ struct pid_namespace init_pid_ns = {
#ifdef CONFIG_PID_NS #ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations, .ns.ops = &pidns_operations,
#endif #endif
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
}; };
EXPORT_SYMBOL_GPL(init_pid_ns); EXPORT_SYMBOL_GPL(init_pid_ns);

View File

@ -110,9 +110,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->user_ns = get_user_ns(user_ns); ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts; ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING; ns->pid_allocated = PIDNS_ADDING;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
initialize_memfd_noexec_scope(ns); ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
return ns; return ns;
out_free_idr: out_free_idr:

View File

@ -5,33 +5,30 @@
#include <linux/pid_namespace.h> #include <linux/pid_namespace.h>
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns)
{
ns->memfd_noexec_scope =
task_active_pid_ns(current)->memfd_noexec_scope;
}
static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table, static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table,
int write, void *buf, size_t *lenp, loff_t *ppos) int write, void *buf, size_t *lenp, loff_t *ppos)
{ {
struct pid_namespace *ns = task_active_pid_ns(current); struct pid_namespace *ns = task_active_pid_ns(current);
struct ctl_table table_copy; struct ctl_table table_copy;
int err, scope, parent_scope;
if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN)) if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM; return -EPERM;
table_copy = *table; table_copy = *table;
if (ns != &init_pid_ns)
table_copy.data = &ns->memfd_noexec_scope;
/* /* You cannot set a lower enforcement value than your parent. */
* set minimum to current value, the effect is only bigger parent_scope = pidns_memfd_noexec_scope(ns->parent);
* value is accepted. /* Equivalent to pidns_memfd_noexec_scope(ns). */
*/ scope = max(READ_ONCE(ns->memfd_noexec_scope), parent_scope);
if (*(int *)table_copy.data > *(int *)table_copy.extra1)
table_copy.extra1 = table_copy.data;
return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos); table_copy.data = &scope;
table_copy.extra1 = &parent_scope;
err = proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
if (!err && write)
WRITE_ONCE(ns->memfd_noexec_scope, scope);
return err;
} }
static struct ctl_table pid_ns_ctl_table_vm[] = { static struct ctl_table pid_ns_ctl_table_vm[] = {
@ -51,7 +48,6 @@ static inline void register_pid_ns_sysctl_table_vm(void)
register_sysctl("vm", pid_ns_ctl_table_vm); register_sysctl("vm", pid_ns_ctl_table_vm);
} }
#else #else
static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {}
static inline void register_pid_ns_sysctl_table_vm(void) {} static inline void register_pid_ns_sysctl_table_vm(void) {}
#endif #endif

View File

@ -271,7 +271,8 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
static int check_sysctl_memfd_noexec(unsigned int *flags) static int check_sysctl_memfd_noexec(unsigned int *flags)
{ {
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
int sysctl = task_active_pid_ns(current)->memfd_noexec_scope; struct pid_namespace *ns = task_active_pid_ns(current);
int sysctl = pidns_memfd_noexec_scope(ns);
if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL)