Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull namespace updates from Eric Biederman:
 "There is a lot here. A lot of these changes result in subtle user
  visible differences in kernel behavior. I don't expect anything will
  care but I will revert/fix things immediately if any regressions show
  up.

  From Seth Forshee there is a continuation of the work to make the vfs
  ready for unprivileged mounts. We had thought the previous changes
  prevented the creation of files outside of the s_user_ns of a
  filesystem, but it turns out we missed the O_CREAT path. Oops.

  Pavel Tikhomirov and Oleg Nesterov worked together to fix a
  long-standing bug in the implementation of PR_SET_CHILD_SUBREAPER,
  where only children forked after the prctl were considered, not
  children forked before it. The only known user of this prctl,
  systemd, forks all children after the prctl, so no userspace
  regressions will occur. Holding earlier-forked children to the same
  rules as later-forked children creates a semantic that is sane enough
  to allow checkpointing of processes that use this feature.
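
  As a minimal userspace sketch (mine, not part of the series) of the
  semantic in question: after prctl(PR_SET_CHILD_SUBREAPER, 1), orphaned
  descendants are reparented to the caller instead of init, and with
  this fix that now also holds for descendants forked before the prctl:

      #include <stdio.h>
      #include <sys/prctl.h>
      #include <sys/types.h>
      #include <sys/wait.h>
      #include <unistd.h>

      int main(void)
      {
              if (prctl(PR_SET_CHILD_SUBREAPER, 1) != 0) {
                      perror("prctl");
                      return 1;
              }
              pid_t pid = fork();
              if (pid == 0) {                 /* child */
                      if (fork() == 0) {      /* grandchild */
                              sleep(1);       /* outlive the middle process */
                              /* reparented to the subreaper, not to init */
                              printf("new parent: %d\n", (int)getppid());
                              _exit(0);
                      }
                      _exit(0);               /* orphan the grandchild */
              }
              waitpid(pid, NULL, 0);          /* reap the child */
              waitpid(-1, NULL, 0);           /* reap the reparented grandchild */
              return 0;
      }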

  There is a long-delayed change by Nikolay Borisov to limit the number
  of inotify instances and watches inside a user namespace.
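
  A hedged userspace sketch of the new behavior (the limits move from
  per-user atomics to per-user, per-user-namespace ucounts; per the
  diff below the defaults remain 128 instances and 8192 watches, and
  the new sysctls are named max_inotify_instances and
  max_inotify_watches):

      #include <errno.h>
      #include <stdio.h>
      #include <sys/inotify.h>

      int main(void)
      {
              int n = 0;

              /* Create instances until the per-namespace limit
               * (UCOUNT_INOTIFY_INSTANCES, 128 by default) is hit;
               * the kernel then fails with EMFILE.  Note EMFILE can
               * also mean the fd table filled up first. */
              while (inotify_init() >= 0)
                      n++;
              if (errno == EMFILE)
                      printf("instance limit hit after %d instances\n", n);
              else
                      perror("inotify_init");
              return 0;
      }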

  Michael Kerrisk extends the API for files used to manipulate
  namespaces with two new trivial ioctls to allow discovery of the
  hierarchy and properties of namespaces.
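
  A small userspace sketch of the two new ioctls, NS_GET_NSTYPE and
  NS_GET_OWNER_UID (values taken from the include/uapi/linux/nsfs.h
  hunk below; it assumes headers from a kernel carrying this series):

      #define _GNU_SOURCE
      #include <fcntl.h>
      #include <sched.h>
      #include <stdio.h>
      #include <sys/ioctl.h>
      #include <linux/nsfs.h>
      #include <unistd.h>

      int main(void)
      {
              int fd = open("/proc/self/ns/user", O_RDONLY);
              uid_t owner;

              if (fd < 0) {
                      perror("open");
                      return 1;
              }
              /* NS_GET_NSTYPE returns the CLONE_NEW* constant */
              printf("type: %#x (CLONE_NEWUSER is %#x)\n",
                     ioctl(fd, NS_GET_NSTYPE), CLONE_NEWUSER);
              /* NS_GET_OWNER_UID stores the owning uid, mapped into
               * the caller's user namespace */
              if (ioctl(fd, NS_GET_OWNER_UID, &owner) == 0)
                      printf("owner uid: %d\n", (int)owner);
              close(fd);
              return 0;
      }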

  Konstantin Khlebnikov, with the help of Al Viro, adds code that purges
  a network namespace's sysctl entries from the dcache when the
  namespace exits, as in some circumstances those entries could pin a
  lot of memory.

  Vivek Goyal fixed a bug with stacked filesystems where the permissions
  on the wrong inode were being checked.

  I continue previous work on ptracing across exec: a file can now be
  setuid across exec while being ptraced, provided the tracer has
  enough credentials in the user namespace and the process has
  CAP_SETUID in its own namespace. Proc files for setuid or otherwise
  non-dumpable executables are now owned by the root of the user
  namespace of their mm, which allows debugging of setuid applications
  in containers to work better.

  A bug I introduced with permission checking and automount is now
  fixed. The big change is to mark the mounts that the kernel initiates
  as a result of an automount; this allows the permission checks in
  sget to be safely suppressed for this kind of mount, since the
  permission check already happened when the original filesystem was
  mounted.
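
  As a kernel-side sketch (mine, not from the series, modeled on the
  trace_automount conversion in the diff below): an automount callback
  now hands the triggering dentry to vfs_submount(), so the resulting
  mount carries the internal MS_SUBMOUNT flag:

      static struct vfsmount *example_automount(struct path *path)
      {
              struct file_system_type *type = get_fs_type("tracefs");
              struct vfsmount *mnt;

              if (!type)
                      return NULL;
              /* MS_SUBMOUNT lets sget() skip the permission check that
               * was already performed for the parent mount */
              mnt = vfs_submount(path->dentry, type, "tracefs", NULL);
              put_filesystem(type);
              return IS_ERR(mnt) ? NULL : mnt;
      }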

  Finally, a special case in the mount namespace code is removed,
  preventing unbounded chains in the mount hash table and making the
  semantics simpler, which benefits CRIU.

  The vfs fix along with related work in ima and evm I believe makes us
  ready to finish developing and merge fully unprivileged mounts of the
  fuse filesystem. The mount namespace cleanups make it possible to
  discuss how to fix the worst-case complexity of umount. The stacked
  filesystem fixes pave the way for adding multiple mappings for
  filesystem uids so that efficient and safer containers can be
  implemented"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
  proc/sysctl: Don't grab i_lock under sysctl_lock.
  vfs: Use upper filesystem inode in bprm_fill_uid()
  proc/sysctl: prune stale dentries during unregistering
  mnt: Tuck mounts under others instead of creating shadow/side mounts.
  prctl: propagate has_child_subreaper flag to every descendant
  introduce the walk_process_tree() helper
  nsfs: Add an ioctl() to return owner UID of a userns
  fs: Better permission checking for submounts
  exit: fix the setns() && PR_SET_CHILD_SUBREAPER interaction
  vfs: open() with O_CREAT should not create inodes with unknown ids
  nsfs: Add an ioctl() to return the namespace type
  proc: Better ownership of files for non-dumpable tasks in user namespaces
  exec: Remove LSM_UNSAFE_PTRACE_CAP
  exec: Test the ptracer's saved cred to see if the tracee can gain caps
  exec: Don't reset euid and egid when the tracee has CAP_SETUID
  inotify: Convert to using per-namespace limits
Merged by Linus Torvalds on 2017-02-23 20:33:51 -08:00 (commit f1ef09fde1).
40 changed files with 431 additions and 226 deletions.


@@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	/* try and do the mount */
 	_debug("--- attempting mount %s -o %s ---", devname, options);
-	mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
+	mnt = vfs_submount(mntpt, &afs_fs_type, devname, options);
 	_debug("--- mount result %p ---", mnt);
 	free_page((unsigned long) devname);


@@ -436,8 +436,8 @@ int autofs4_wait(struct autofs_sb_info *sbi,
 	memcpy(&wq->name, &qstr, sizeof(struct qstr));
 	wq->dev = autofs4_get_dev(sbi);
 	wq->ino = autofs4_get_ino(sbi);
-	wq->uid = current_real_cred()->uid;
-	wq->gid = current_real_cred()->gid;
+	wq->uid = current_cred()->uid;
+	wq->gid = current_cred()->gid;
 	wq->pid = pid;
 	wq->tgid = tgid;
 	wq->status = -EINTR; /* Status return if interrupted */


@@ -245,7 +245,8 @@ compose_mount_options_err:
  * @fullpath: full path in UNC format
  * @ref: server's referral
  */
-static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
+static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
+		struct cifs_sb_info *cifs_sb,
 		const char *fullpath, const struct dfs_info3_param *ref)
 {
 	struct vfsmount *mnt;
@@ -259,7 +260,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
 	if (IS_ERR(mountdata))
 		return (struct vfsmount *)mountdata;
-	mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata);
+	mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata);
 	kfree(mountdata);
 	kfree(devname);
 	return mnt;
@@ -334,7 +335,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 			mnt = ERR_PTR(-EINVAL);
 			break;
 		}
-		mnt = cifs_dfs_do_refmount(cifs_sb,
+		mnt = cifs_dfs_do_refmount(mntpt, cifs_sb,
 				full_path, referrals + i);
 		cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
 			 __func__, referrals[i].node_name, mnt);


@@ -187,9 +187,9 @@ static const struct super_operations debugfs_super_operations = {
 static struct vfsmount *debugfs_automount(struct path *path)
 {
-	struct vfsmount *(*f)(void *);
-	f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
-	return f(d_inode(path->dentry)->i_private);
+	debugfs_automount_t f;
+	f = (debugfs_automount_t)path->dentry->d_fsdata;
+	return f(path->dentry, d_inode(path->dentry)->i_private);
 }
 static const struct dentry_operations debugfs_dops = {
@@ -540,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
  */
 struct dentry *debugfs_create_automount(const char *name,
 					struct dentry *parent,
-					struct vfsmount *(*f)(void *),
+					debugfs_automount_t f,
 					void *data)
 {
 	struct dentry *dentry = start_creating(name, parent);


@@ -1426,12 +1426,8 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	struct task_struct *p = current, *t;
 	unsigned n_fs;
-	if (p->ptrace) {
-		if (ptracer_capable(p, current_user_ns()))
-			bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
-		else
-			bprm->unsafe |= LSM_UNSAFE_PTRACE;
-	}
+	if (p->ptrace)
+		bprm->unsafe |= LSM_UNSAFE_PTRACE;
 	/*
 	 * This isn't strictly necessary, but it makes it harder for LSMs to
@@ -1479,7 +1475,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
 	if (task_no_new_privs(current))
 		return;
-	inode = file_inode(bprm->file);
+	inode = bprm->file->f_path.dentry->d_inode;
 	mode = READ_ONCE(inode->i_mode);
 	if (!(mode & (S_ISUID|S_ISGID)))
 		return;


@@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt)
 }
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
-extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
 extern int __legitimize_mnt(struct vfsmount *, unsigned);
 extern bool legitimize_mnt(struct vfsmount *, unsigned);


@@ -1100,7 +1100,6 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 			    bool *need_mntput)
 {
 	struct vfsmount *mnt;
-	const struct cred *old_cred;
 	int err;
 	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
@@ -1129,9 +1128,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 	if (nd->total_link_count >= 40)
 		return -ELOOP;
-	old_cred = override_creds(&init_cred);
 	mnt = path->dentry->d_op->d_automount(path);
-	revert_creds(old_cred);
 	if (IS_ERR(mnt)) {
 		/*
 		 * The filesystem is allowed to return -EISDIR here to indicate
@@ -2941,10 +2938,16 @@ static inline int open_to_namei_flags(int flag)
 static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
 {
+	struct user_namespace *s_user_ns;
 	int error = security_path_mknod(dir, dentry, mode, 0);
 	if (error)
 		return error;
+	s_user_ns = dir->dentry->d_sb->s_user_ns;
+	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+	    !kgid_has_mapping(s_user_ns, current_fsgid()))
+		return -EOVERFLOW;
 	error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;


@@ -636,28 +636,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 	return NULL;
 }
-/*
- * find the last mount at @dentry on vfsmount @mnt.
- * mount_lock must be held.
- */
-struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
-{
-	struct mount *p, *res = NULL;
-	p = __lookup_mnt(mnt, dentry);
-	if (!p)
-		goto out;
-	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-		res = p;
-	hlist_for_each_entry_continue(p, mnt_hash) {
-		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
-			break;
-		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-			res = p;
-	}
-out:
-	return res;
-}
 /*
  * lookup_mnt - Return the first child mount mounted at path
  *
@@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
 	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
 }
+static void __attach_mnt(struct mount *mnt, struct mount *parent)
+{
+	hlist_add_head_rcu(&mnt->mnt_hash,
+			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
+	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+}
 /*
  * vfsmount lock must be held for write
  */
@@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
 			struct mountpoint *mp)
 {
 	mnt_set_mountpoint(parent, mp, mnt);
-	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
-	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	__attach_mnt(mnt, parent);
 }
-static void attach_shadowed(struct mount *mnt,
-			struct mount *parent,
-			struct mount *shadows)
+void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
 {
-	if (shadows) {
-		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
-		list_add(&mnt->mnt_child, &shadows->mnt_child);
-	} else {
-		hlist_add_head_rcu(&mnt->mnt_hash,
-				m_hash(&parent->mnt, mnt->mnt_mountpoint));
-		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
-	}
+	struct mountpoint *old_mp = mnt->mnt_mp;
+	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
+	struct mount *old_parent = mnt->mnt_parent;
+
+	list_del_init(&mnt->mnt_child);
+	hlist_del_init(&mnt->mnt_mp_list);
+	hlist_del_init_rcu(&mnt->mnt_hash);
+
+	attach_mnt(mnt, parent, mp);
+
+	put_mountpoint(old_mp);
+
+	/*
+	 * Safely avoid even the suggestion this code might sleep or
+	 * lock the mount hash by taking advantage of the knowledge that
+	 * mnt_change_mountpoint will not release the final reference
+	 * to a mountpoint.
+	 *
+	 * During mounting, the mount passed in as the parent mount will
+	 * continue to use the old mountpoint and during unmounting, the
+	 * old mountpoint will continue to exist until namespace_unlock,
+	 * which happens well after mnt_change_mountpoint.
+	 */
+	spin_lock(&old_mountpoint->d_lock);
+	old_mountpoint->d_lockref.count--;
+	spin_unlock(&old_mountpoint->d_lock);
+
+	mnt_add_count(old_parent, -1);
 }
 /*
  * vfsmount lock must be held for write
  */
-static void commit_tree(struct mount *mnt, struct mount *shadows)
+static void commit_tree(struct mount *mnt)
 {
 	struct mount *parent = mnt->mnt_parent;
 	struct mount *m;
@@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
 	n->mounts += n->pending_mounts;
 	n->pending_mounts = 0;
-	attach_shadowed(mnt, parent, shadows);
+	__attach_mnt(mnt, parent);
 	touch_mnt_namespace(n);
 }
@@ -989,6 +991,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
+struct vfsmount *
+vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
+	     const char *name, void *data)
+{
+	/* Until it is worked out how to pass the user namespace
+	 * through from the parent mount to the submount don't support
+	 * unprivileged mounts with submounts.
+	 */
+	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
+		return ERR_PTR(-EPERM);
+
+	return vfs_kern_mount(type, MS_SUBMOUNT, name, data);
+}
+EXPORT_SYMBOL_GPL(vfs_submount);
 static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 					int flag)
 {
@@ -1764,7 +1781,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 			continue;
 		for (s = r; s; s = next_mnt(s, r)) {
-			struct mount *t = NULL;
 			if (!(flag & CL_COPY_UNBINDABLE) &&
 			    IS_MNT_UNBINDABLE(s)) {
 				s = skip_mnt_tree(s);
@@ -1786,14 +1802,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 				goto out;
 			lock_mount_hash();
 			list_add_tail(&q->mnt_list, &res->mnt_list);
-			mnt_set_mountpoint(parent, p->mnt_mp, q);
-			if (!list_empty(&parent->mnt_mounts)) {
-				t = list_last_entry(&parent->mnt_mounts,
-							struct mount, mnt_child);
-				if (t->mnt_mp != p->mnt_mp)
-					t = NULL;
-			}
-			attach_shadowed(q, parent, t);
+			attach_mnt(q, parent, p->mnt_mp);
 			unlock_mount_hash();
 		}
 	}
@@ -1992,10 +2001,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 {
 	HLIST_HEAD(tree_list);
 	struct mnt_namespace *ns = dest_mnt->mnt_ns;
+	struct mountpoint *smp;
 	struct mount *child, *p;
 	struct hlist_node *n;
 	int err;
+	/* Preallocate a mountpoint in case the new mounts need
+	 * to be tucked under other mounts.
+	 */
+	smp = get_mountpoint(source_mnt->mnt.mnt_root);
+	if (IS_ERR(smp))
+		return PTR_ERR(smp);
 	/* Is there space to add these mounts to the mount namespace? */
 	if (!parent_path) {
 		err = count_mounts(ns, source_mnt);
@@ -2022,16 +2039,19 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
-		commit_tree(source_mnt, NULL);
+		commit_tree(source_mnt);
 	}
 	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
 		struct mount *q;
 		hlist_del_init(&child->mnt_hash);
-		q = __lookup_mnt_last(&child->mnt_parent->mnt,
-				      child->mnt_mountpoint);
-		commit_tree(child, q);
+		q = __lookup_mnt(&child->mnt_parent->mnt,
+				 child->mnt_mountpoint);
+		if (q)
+			mnt_change_mountpoint(child, smp, q);
+		commit_tree(child);
 	}
+	put_mountpoint(smp);
 	unlock_mount_hash();
 	return 0;
@@ -2046,6 +2066,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	cleanup_group_ids(source_mnt, NULL);
 out:
 	ns->pending_mounts = 0;
+
+	read_seqlock_excl(&mount_lock);
+	put_mountpoint(smp);
+	read_sequnlock_excl(&mount_lock);
+
 	return err;
 }
@@ -2794,7 +2819,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
-		   MS_STRICTATIME | MS_NOREMOTELOCK);
+		   MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
 	if (flags & MS_REMOUNT)
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,


@@ -226,7 +226,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
 					   const char *devname,
 					   struct nfs_clone_mount *mountdata)
 {
-	return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
+	return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata);
 }
 /**


@@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 				mountdata->hostname,
 				mountdata->mnt_path);
-		mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata);
+		mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata);
 		if (!IS_ERR(mnt))
 			break;
 	}


@@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group,
 				const unsigned char *file_name, u32 cookie);
 extern const struct fsnotify_ops inotify_fsnotify_ops;
+
+#ifdef CONFIG_INOTIFY_USER
+static inline void dec_inotify_instances(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES);
+}
+
+static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts)
+{
+	return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES);
+}
+
+static inline void dec_inotify_watches(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES);
+}
+#endif


@@ -165,10 +165,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
 	/* ideally the idr is empty and we won't hit the BUG in the callback */
 	idr_for_each(&group->inotify_data.idr, idr_callback, group);
 	idr_destroy(&group->inotify_data.idr);
-	if (group->inotify_data.user) {
-		atomic_dec(&group->inotify_data.user->inotify_devs);
-		free_uid(group->inotify_data.user);
-	}
+	if (group->inotify_data.ucounts)
+		dec_inotify_instances(group->inotify_data.ucounts);
 }
 static void inotify_free_event(struct fsnotify_event *fsn_event)


@@ -44,10 +44,8 @@
 #include <asm/ioctls.h>
-/* these are configurable via /proc/sys/fs/inotify/ */
-static int inotify_max_user_instances __read_mostly;
+/* configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_queued_events __read_mostly;
-static int inotify_max_user_watches __read_mostly;
 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
@@ -60,7 +58,7 @@ static int zero;
 struct ctl_table inotify_table[] = {
 	{
 		.procname	= "max_user_instances",
-		.data		= &inotify_max_user_instances,
+		.data		= &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -68,7 +66,7 @@ struct ctl_table inotify_table[] = {
 	},
 	{
 		.procname	= "max_user_watches",
-		.data		= &inotify_max_user_watches,
+		.data		= &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES],
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -500,7 +498,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 	/* remove this mark from the idr */
 	inotify_remove_from_idr(group, i_mark);
-	atomic_dec(&group->inotify_data.user->inotify_watches);
+	dec_inotify_watches(group->inotify_data.ucounts);
 }
 /* ding dong the mark is dead */
@@ -584,14 +582,17 @@ static int inotify_new_watch(struct fsnotify_group *group,
 	tmp_i_mark->fsn_mark.mask = mask;
 	tmp_i_mark->wd = -1;
-	ret = -ENOSPC;
-	if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
-		goto out_err;
 	ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
 	if (ret)
 		goto out_err;
+	/* increment the number of watches the user has */
+	if (!inc_inotify_watches(group->inotify_data.ucounts)) {
+		inotify_remove_from_idr(group, tmp_i_mark);
+		ret = -ENOSPC;
+		goto out_err;
+	}
 	/* we are on the idr, now get on the inode */
 	ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
 				       NULL, 0);
@@ -601,8 +602,6 @@ static int inotify_new_watch(struct fsnotify_group *group,
 		goto out_err;
 	}
-	/* increment the number of watches the user has */
-	atomic_inc(&group->inotify_data.user->inotify_watches);
 	/* return the watch descriptor for this new mark */
 	ret = tmp_i_mark->wd;
@@ -653,10 +652,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
 	spin_lock_init(&group->inotify_data.idr_lock);
 	idr_init(&group->inotify_data.idr);
-	group->inotify_data.user = get_current_user();
-	if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
-	    inotify_max_user_instances) {
+	group->inotify_data.ucounts = inc_ucount(current_user_ns(),
+						 current_euid(),
+						 UCOUNT_INOTIFY_INSTANCES);
+	if (!group->inotify_data.ucounts) {
 		fsnotify_destroy_group(group);
 		return ERR_PTR(-EMFILE);
 	}
@@ -819,8 +819,8 @@ static int __init inotify_user_setup(void)
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
 	inotify_max_queued_events = 16384;
-	inotify_max_user_instances = 128;
-	inotify_max_user_watches = 8192;
+	init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
+	init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192;
 	return 0;
 }


@@ -7,6 +7,7 @@
 #include <linux/seq_file.h>
 #include <linux/user_namespace.h>
 #include <linux/nsfs.h>
+#include <linux/uaccess.h>
 static struct vfsmount *nsfs_mnt;
@@ -163,7 +164,10 @@ int open_related_ns(struct ns_common *ns,
 static long ns_ioctl(struct file *filp, unsigned int ioctl,
 		     unsigned long arg)
 {
+	struct user_namespace *user_ns;
 	struct ns_common *ns = get_proc_ns(file_inode(filp));
+	uid_t __user *argp;
+	uid_t uid;
 	switch (ioctl) {
 	case NS_GET_USERNS:
@@ -172,6 +176,15 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
 		if (!ns->ops->get_parent)
 			return -EINVAL;
 		return open_related_ns(ns, ns->ops->get_parent);
+	case NS_GET_NSTYPE:
+		return ns->ops->type;
+	case NS_GET_OWNER_UID:
+		if (ns->ops->type != CLONE_NEWUSER)
+			return -EINVAL;
+		user_ns = container_of(ns, struct user_namespace, ns);
+		argp = (uid_t __user *) arg;
+		uid = from_kuid_munged(current_user_ns(), user_ns->owner);
+		return put_user(uid, argp);
 	default:
 		return -ENOTTY;
 	}


@@ -322,6 +322,21 @@ out:
 	return ret;
 }
+static struct mount *find_topper(struct mount *mnt)
+{
+	/* If there is exactly one mount covering mnt completely return it. */
+	struct mount *child;
+
+	if (!list_is_singular(&mnt->mnt_mounts))
+		return NULL;
+
+	child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
+	if (child->mnt_mountpoint != mnt->mnt.mnt_root)
+		return NULL;
+
+	return child;
+}
+
 /*
  * return true if the refcount is greater than count
 */
@@ -342,9 +357,8 @@ static inline int do_refcount_check(struct mount *mnt, int count)
 */
 int propagate_mount_busy(struct mount *mnt, int refcnt)
 {
-	struct mount *m, *child;
+	struct mount *m, *child, *topper;
 	struct mount *parent = mnt->mnt_parent;
-	int ret = 0;
 	if (mnt == parent)
 		return do_refcount_check(mnt, refcnt);
@@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
 	for (m = propagation_next(parent, parent); m;
 	     m = propagation_next(m, parent)) {
-		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
-		if (child && list_empty(&child->mnt_mounts) &&
-		    (ret = do_refcount_check(child, 1)))
-			break;
+		int count = 1;
+		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
+		if (!child)
+			continue;
+
+		/* Is there exactly one mount on the child that covers
+		 * it completely whose reference should be ignored?
+		 */
+		topper = find_topper(child);
+		if (topper)
+			count += 1;
+		else if (!list_empty(&child->mnt_mounts))
+			continue;
+
+		if (do_refcount_check(child, count))
+			return 1;
 	}
-	return ret;
+	return 0;
 }
@@ -381,7 +407,7 @@ void propagate_mount_unlock(struct mount *mnt)
 	for (m = propagation_next(parent, parent); m;
 	     m = propagation_next(m, parent)) {
-		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
 		if (child)
 			child->mnt.mnt_flags &= ~MNT_LOCKED;
 	}
@@ -399,9 +425,11 @@ static void mark_umount_candidates(struct mount *mnt)
 	for (m = propagation_next(parent, parent); m;
 	     m = propagation_next(m, parent)) {
-		struct mount *child = __lookup_mnt_last(&m->mnt,
-						mnt->mnt_mountpoint);
-		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
+		struct mount *child = __lookup_mnt(&m->mnt,
+						   mnt->mnt_mountpoint);
+		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
+			continue;
+		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
 			SET_MNT_MARK(child);
 		}
 	}
@@ -420,8 +448,8 @@ static void __propagate_umount(struct mount *mnt)
 	for (m = propagation_next(parent, parent); m;
 	     m = propagation_next(m, parent)) {
-		struct mount *child = __lookup_mnt_last(&m->mnt,
+		struct mount *topper;
+		struct mount *child = __lookup_mnt(&m->mnt,
 						mnt->mnt_mountpoint);
 		/*
 		 * umount the child only if the child has no children
@@ -430,6 +458,15 @@ static void __propagate_umount(struct mount *mnt)
 		if (!child || !IS_MNT_MARKED(child))
 			continue;
 		CLEAR_MNT_MARK(child);
+
+		/* If there is exactly one mount covering all of child
+		 * replace child with that mount.
+		 */
+		topper = find_topper(child);
+		if (topper)
+			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
+					      topper);
+
 		if (list_empty(&child->mnt_mounts)) {
 			list_del_init(&child->mnt_child);
 			child->mnt.mnt_flags |= MNT_UMOUNT;


@@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
 unsigned int mnt_get_count(struct mount *mnt);
 void mnt_set_mountpoint(struct mount *, struct mountpoint *,
 			struct mount *);
+void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
+			   struct mount *mnt);
 struct mount *copy_tree(struct mount *, struct dentry *, int);
 bool is_path_reachable(struct mount *, struct dentry *,
 			 const struct path *root);


@@ -1667,12 +1667,63 @@ const struct inode_operations proc_pid_link_inode_operations = {
 /* building an inode */
+void task_dump_owner(struct task_struct *task, mode_t mode,
+		     kuid_t *ruid, kgid_t *rgid)
+{
+	/* Depending on the state of dumpable compute who should own a
+	 * proc file for a task.
+	 */
+	const struct cred *cred;
+	kuid_t uid;
+	kgid_t gid;
+
+	/* Default to the tasks effective ownership */
+	rcu_read_lock();
+	cred = __task_cred(task);
+	uid = cred->euid;
+	gid = cred->egid;
+	rcu_read_unlock();
+
+	/*
+	 * Before the /proc/pid/status file was created the only way to read
+	 * the effective uid of a /process was to stat /proc/pid.  Reading
+	 * /proc/pid/status is slow enough that procps and other packages
+	 * kept stating /proc/pid.  To keep the rules in /proc simple I have
+	 * made this apply to all per process world readable and executable
+	 * directories.
+	 */
+	if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
+		struct mm_struct *mm;
+		task_lock(task);
+		mm = task->mm;
+		/* Make non-dumpable tasks owned by some root */
+		if (mm) {
+			if (get_dumpable(mm) != SUID_DUMP_USER) {
+				struct user_namespace *user_ns = mm->user_ns;
+
+				uid = make_kuid(user_ns, 0);
+				if (!uid_valid(uid))
+					uid = GLOBAL_ROOT_UID;
+
+				gid = make_kgid(user_ns, 0);
+				if (!gid_valid(gid))
+					gid = GLOBAL_ROOT_GID;
+			}
+		} else {
+			uid = GLOBAL_ROOT_UID;
+			gid = GLOBAL_ROOT_GID;
+		}
+		task_unlock(task);
+	}
+	*ruid = uid;
+	*rgid = gid;
+}
+
 struct inode *proc_pid_make_inode(struct super_block * sb,
 				  struct task_struct *task, umode_t mode)
 {
 	struct inode * inode;
 	struct proc_inode *ei;
-	const struct cred *cred;
 	/* We need a new inode */
@@ -1694,13 +1745,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
 	if (!ei->pid)
 		goto out_unlock;
-	if (task_dumpable(task)) {
-		rcu_read_lock();
-		cred = __task_cred(task);
-		inode->i_uid = cred->euid;
-		inode->i_gid = cred->egid;
-		rcu_read_unlock();
-	}
+	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 	security_task_to_inode(task, inode);
 out:
@@ -1715,7 +1760,6 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = d_inode(dentry);
 	struct task_struct *task;
-	const struct cred *cred;
 	struct pid_namespace *pid = dentry->d_sb->s_fs_info;
 	generic_fillattr(inode, stat);
@@ -1733,12 +1777,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 			 */
 			return -ENOENT;
 		}
-		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
-		    task_dumpable(task)) {
-			cred = __task_cred(task);
-			stat->uid = cred->euid;
-			stat->gid = cred->egid;
-		}
+		task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
 	}
 	rcu_read_unlock();
 	return 0;
@@ -1754,18 +1793,11 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 * Rewrite the inode's ownerships here because the owning task may have
 * performed a setuid(), etc.
 *
- * Before the /proc/pid/status file was created the only way to read
- * the effective uid of a /process was to stat /proc/pid.  Reading
- * /proc/pid/status is slow enough that procps and other packages
- * kept stating /proc/pid.  To keep the rules in /proc simple I have
- * made this apply to all per process world readable and executable
- * directories.
 */
 int pid_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct inode *inode;
 	struct task_struct *task;
-	const struct cred *cred;
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
@@ -1774,17 +1806,8 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
 	task = get_proc_task(inode);
 	if (task) {
-		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
-		    task_dumpable(task)) {
-			rcu_read_lock();
-			cred = __task_cred(task);
-			inode->i_uid = cred->euid;
-			inode->i_gid = cred->egid;
-			rcu_read_unlock();
-		} else {
-			inode->i_uid = GLOBAL_ROOT_UID;
-			inode->i_gid = GLOBAL_ROOT_GID;
-		}
+		task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
 		inode->i_mode &= ~(S_ISUID | S_ISGID);
 		security_task_to_inode(task, inode);
 		put_task_struct(task);
@@ -1881,7 +1904,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
 	bool exact_vma_exists = false;
 	struct mm_struct *mm = NULL;
 	struct task_struct *task;
-	const struct cred *cred;
 	struct inode *inode;
 	int status = 0;
@@ -1906,16 +1928,8 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
 		mmput(mm);
 	if (exact_vma_exists) {
-		if (task_dumpable(task)) {
-			rcu_read_lock();
-			cred = __task_cred(task);
-			inode->i_uid = cred->euid;
-			inode->i_gid = cred->egid;
-			rcu_read_unlock();
-		} else {
-			inode->i_uid = GLOBAL_ROOT_UID;
-			inode->i_gid = GLOBAL_ROOT_GID;
-		}
+		task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 		security_task_to_inode(task, inode);
 		status = 1;
 	}


@@ -84,7 +84,6 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct files_struct *files;
 	struct task_struct *task;
-	const struct cred *cred;
 	struct inode *inode;
 	unsigned int fd;
@@ -108,16 +107,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 		rcu_read_unlock();
 		put_files_struct(files);
-		if (task_dumpable(task)) {
-			rcu_read_lock();
-			cred = __task_cred(task);
-			inode->i_uid = cred->euid;
-			inode->i_gid = cred->egid;
-			rcu_read_unlock();
-		} else {
-			inode->i_uid = GLOBAL_ROOT_UID;
-			inode->i_gid = GLOBAL_ROOT_GID;
-		}
+		task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 		if (S_ISLNK(inode->i_mode)) {
 			unsigned i_mode = S_IFLNK;


@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
 	de = PDE(inode);
 	if (de)
 		pde_put(de);
+
 	head = PROC_I(inode)->sysctl;
 	if (head) {
 		RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
-		sysctl_head_put(head);
+		proc_sys_evict_inode(inode, head);
 	}
 }


@@ -65,6 +65,7 @@ struct proc_inode {
 	struct proc_dir_entry *pde;
 	struct ctl_table_header *sysctl;
 	struct ctl_table *sysctl_entry;
+	struct list_head sysctl_inodes;
 	const struct proc_ns_operations *ns_ops;
 	struct inode vfs_inode;
 };
@@ -97,20 +98,8 @@ static inline struct task_struct *get_proc_task(struct inode *inode)
 	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
 }
-static inline int task_dumpable(struct task_struct *task)
-{
-	int dumpable = 0;
-	struct mm_struct *mm;
-
-	task_lock(task);
-	mm = task->mm;
-	if (mm)
-		dumpable = get_dumpable(mm);
-	task_unlock(task);
-	if (dumpable == SUID_DUMP_USER)
-		return 1;
-	return 0;
-}
+void task_dump_owner(struct task_struct *task, mode_t mode,
+		     kuid_t *ruid, kgid_t *rgid);
 static inline unsigned name_to_int(const struct qstr *qstr)
 {
@@ -249,10 +238,12 @@ extern void proc_thread_self_init(void);
 */
 #ifdef CONFIG_PROC_SYSCTL
 extern int proc_sys_init(void);
-extern void sysctl_head_put(struct ctl_table_header *);
+extern void proc_sys_evict_inode(struct inode *inode,
+				 struct ctl_table_header *head);
 #else
 static inline void proc_sys_init(void) { }
-static inline void sysctl_head_put(struct ctl_table_header *head) { }
+static inline void proc_sys_evict_inode(struct inode *inode,
+					struct ctl_table_header *head) { }
 #endif
 /*


@@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
 	head->set = set;
 	head->parent = NULL;
 	head->node = node;
+	INIT_LIST_HEAD(&head->inodes);
 	if (node) {
 		struct ctl_table *entry;
 		for (entry = table; entry->procname; entry++, node++)
@@ -259,6 +260,27 @@ static void unuse_table(struct ctl_table_header *p)
 			complete(p->unregistering);
 }
+/* called under sysctl_lock */
+static void proc_sys_prune_dcache(struct ctl_table_header *head)
+{
+	struct inode *inode, *prev = NULL;
+	struct proc_inode *ei;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) {
+		inode = igrab(&ei->vfs_inode);
+		if (inode) {
+			rcu_read_unlock();
+			iput(prev);
+			prev = inode;
+			d_prune_aliases(inode);
+			rcu_read_lock();
+		}
+	}
+	rcu_read_unlock();
+	iput(prev);
+}
+
 /* called under sysctl_lock, will reacquire if has to wait */
 static void start_unregistering(struct ctl_table_header *p)
 {
@@ -272,33 +294,24 @@ static void start_unregistering(struct ctl_table_header *p)
 		p->unregistering = &wait;
 		spin_unlock(&sysctl_lock);
 		wait_for_completion(&wait);
-		spin_lock(&sysctl_lock);
 	} else {
 		/* anything non-NULL; we'll never dereference it */
 		p->unregistering = ERR_PTR(-EINVAL);
+		spin_unlock(&sysctl_lock);
 	}
+	/*
+	 * Prune dentries for unregistered sysctls: namespaced sysctls
+	 * can have duplicate names and contaminate dcache very badly.
+	 */
+	proc_sys_prune_dcache(p);
 	/*
 	 * do not remove from the list until nobody holds it; walking the
 	 * list in do_sysctl() relies on that.
 	 */
+	spin_lock(&sysctl_lock);
 	erase_header(p);
 }
-static void sysctl_head_get(struct ctl_table_header *head)
-{
-	spin_lock(&sysctl_lock);
-	head->count++;
-	spin_unlock(&sysctl_lock);
-}
-
-void sysctl_head_put(struct ctl_table_header *head)
-{
-	spin_lock(&sysctl_lock);
-	if (!--head->count)
-		kfree_rcu(head, rcu);
-	spin_unlock(&sysctl_lock);
-}
 static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
 {
 	BUG_ON(!head);
@@ -440,10 +453,20 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	inode->i_ino = get_next_ino();
-	sysctl_head_get(head);
 	ei = PROC_I(inode);
+
+	spin_lock(&sysctl_lock);
+	if (unlikely(head->unregistering)) {
+		spin_unlock(&sysctl_lock);
+		iput(inode);
+		inode = NULL;
+		goto out;
+	}
 	ei->sysctl = head;
 	ei->sysctl_entry = table;
+	list_add_rcu(&ei->sysctl_inodes, &head->inodes);
+	head->count++;
+	spin_unlock(&sysctl_lock);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 	inode->i_mode = table->mode;
@@ -466,6 +489,15 @@ out:
 	return inode;
 }
+void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
+{
+	spin_lock(&sysctl_lock);
+	list_del_rcu(&PROC_I(inode)->sysctl_inodes);
+	if (!--head->count)
+		kfree_rcu(head, rcu);
+	spin_unlock(&sysctl_lock);
+}
+
 static struct ctl_table_header *grab_header(struct inode *inode)
 {
 	struct ctl_table_header *head = PROC_I(inode)->sysctl;


@@ -469,7 +469,7 @@ struct super_block *sget_userns(struct file_system_type *type,
 	struct super_block *old;
 	int err;
-	if (!(flags & MS_KERNMOUNT) &&
+	if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) &&
 	    !(type->fs_flags & FS_USERNS_MOUNT) &&
 	    !capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -499,7 +499,7 @@ retry:
 	}
 	if (!s) {
 		spin_unlock(&sb_lock);
-		s = alloc_super(type, flags, user_ns);
+		s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns);
 		if (!s)
 			return ERR_PTR(-ENOMEM);
 		goto retry;
@@ -540,8 +540,15 @@ struct super_block *sget(struct file_system_type *type,
 {
 	struct user_namespace *user_ns = current_user_ns();
+	/* We don't yet pass the user namespace of the parent
+	 * mount through to here so always use &init_user_ns
+	 * until that changes.
+	 */
+	if (flags & MS_SUBMOUNT)
+		user_ns = &init_user_ns;
+
 	/* Ensure the requestor has permissions over the target filesystem */
-	if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+	if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 	return sget_userns(type, test, set, flags, user_ns, data);


@@ -98,9 +98,10 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
 struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 				      const char *dest);
+typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *);
 struct dentry *debugfs_create_automount(const char *name,
 					struct dentry *parent,
-					struct vfsmount *(*f)(void *),
+					debugfs_automount_t f,
 					void *data);
 void debugfs_remove(struct dentry *dentry);


@@ -16,6 +16,7 @@
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <linux/atomic.h>
+#include <linux/user_namespace.h>
 /*
 * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
@@ -170,7 +171,7 @@ struct fsnotify_group {
 		struct inotify_group_private_data {
 			spinlock_t	idr_lock;
 			struct idr	idr;
-			struct user_struct	*user;
+			struct ucounts	*ucounts;
 		} inotify_data;
 #endif
 #ifdef CONFIG_FANOTIFY


@@ -90,6 +90,9 @@ struct file_system_type;
 extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 				       int flags, const char *name,
 				       void *data);
+extern struct vfsmount *vfs_submount(const struct dentry *mountpoint,
+				     struct file_system_type *type,
+				     const char *name, void *data);
 extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
 extern void mark_mounts_for_expiry(struct list_head *mounts);


@@ -846,10 +846,6 @@ struct user_struct {
 	atomic_t __count;	/* reference count */
 	atomic_t processes;	/* How many processes does this user have? */
 	atomic_t sigpending;	/* How many pending signals does this user have? */
-#ifdef CONFIG_INOTIFY_USER
-	atomic_t inotify_watches; /* How many inotify watches does this user have? */
-	atomic_t inotify_devs;	/* How many inotify devs does this user have opened? */
-#endif
 #ifdef CONFIG_FANOTIFY
 	atomic_t fanotify_listeners;
 #endif
@@ -3051,6 +3047,9 @@ extern bool current_is_single_threaded(void);
 #define for_each_process_thread(p, t)	\
 	for_each_process(p) for_each_thread(p, t)
+typedef int (*proc_visitor)(struct task_struct *p, void *data);
+void walk_process_tree(struct task_struct *top, proc_visitor, void *);
+
 static inline int get_nr_threads(struct task_struct *tsk)
 {
 	return tsk->signal->nr_threads;


@@ -140,8 +140,7 @@ struct request_sock;
 /* bprm->unsafe reasons */
 #define LSM_UNSAFE_SHARE	1
 #define LSM_UNSAFE_PTRACE	2
-#define LSM_UNSAFE_PTRACE_CAP	4
-#define LSM_UNSAFE_NO_NEW_PRIVS 8
+#define LSM_UNSAFE_NO_NEW_PRIVS 4
 #ifdef CONFIG_MMU
 extern int mmap_min_addr_handler(struct ctl_table *table, int write,


@@ -143,6 +143,7 @@ struct ctl_table_header
 	struct ctl_table_set *set;
 	struct ctl_dir *parent;
 	struct ctl_node *node;
+	struct list_head inodes; /* head for proc_inode->sysctl_inodes */
 };
 struct ctl_dir {


@@ -32,6 +32,10 @@ enum ucount_type {
 	UCOUNT_NET_NAMESPACES,
 	UCOUNT_MNT_NAMESPACES,
 	UCOUNT_CGROUP_NAMESPACES,
+#ifdef CONFIG_INOTIFY_USER
+	UCOUNT_INOTIFY_INSTANCES,
+	UCOUNT_INOTIFY_WATCHES,
+#endif
 	UCOUNT_COUNTS,
 };


@@ -132,6 +132,7 @@ struct inodes_stat_t {
 #define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
 /* These sb flags are internal to the kernel */
+#define MS_SUBMOUNT     (1<<26)
 #define MS_NOREMOTELOCK	(1<<27)
 #define MS_NOSEC	(1<<28)
 #define MS_BORN		(1<<29)


@@ -6,8 +6,13 @@
 #define NSIO	0xb7
 /* Returns a file descriptor that refers to an owning user namespace */
 #define NS_GET_USERNS	_IO(NSIO, 0x1)
 /* Returns a file descriptor that refers to a parent namespace */
 #define NS_GET_PARENT	_IO(NSIO, 0x2)
+/* Returns the type of namespace (CLONE_NEW* value) referred to by
+   file descriptor */
+#define NS_GET_NSTYPE	_IO(NSIO, 0x3)
+/* Get owner UID (in the caller's user namespace) for a user namespace */
+#define NS_GET_OWNER_UID _IO(NSIO, 0x4)
 #endif /* __LINUX_NSFS_H */


@@ -607,15 +607,18 @@ static struct task_struct *find_new_reaper(struct task_struct *father,
 		return thread;
 	if (father->signal->has_child_subreaper) {
+		unsigned int ns_level = task_pid(father)->level;
 		/*
 		 * Find the first ->is_child_subreaper ancestor in our pid_ns.
-		 * We start from father to ensure we can not look into another
-		 * namespace, this is safe because all its threads are dead.
+		 * We can't check reaper != child_reaper to ensure we do not
+		 * cross the namespaces, the exiting parent could be injected
+		 * by setns() + fork().
+		 * We check pid->level, this is slightly more efficient than
+		 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
 		 */
-		for (reaper = father;
-		     !same_thread_group(reaper, child_reaper);
+		for (reaper = father->real_parent;
+		     task_pid(reaper)->level == ns_level;
 		     reaper = reaper->real_parent) {
+			/* call_usermodehelper() descendants need this check */
 			if (reaper == &init_task)
 				break;
 			if (!reaper->signal->is_child_subreaper)


@@ -1377,9 +1377,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
-	sig->has_child_subreaper = current->signal->has_child_subreaper ||
-				   current->signal->is_child_subreaper;
-
 	mutex_init(&sig->cred_guard_mutex);
 	return 0;
@@ -1814,6 +1811,13 @@ static __latent_entropy struct task_struct *copy_process(
 			p->signal->leader_pid = pid;
 			p->signal->tty = tty_kref_get(current->signal->tty);
+			/*
+			 * Inherit has_child_subreaper flag under the same
+			 * tasklist_lock with adding child to the process tree
+			 * for propagate_has_child_subreaper optimization.
+			 */
+			p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
+							 p->real_parent->signal->is_child_subreaper;
 			list_add_tail(&p->sibling, &p->real_parent->children);
 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
 			attach_pid(p, PIDTYPE_PGID);
@@ -2067,6 +2071,38 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 }
 #endif
+void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
+{
+	struct task_struct *leader, *parent, *child;
+	int res;
+
+	read_lock(&tasklist_lock);
+	leader = top = top->group_leader;
+down:
+	for_each_thread(leader, parent) {
+		list_for_each_entry(child, &parent->children, sibling) {
+			res = visitor(child, data);
+			if (res) {
+				if (res < 0)
+					goto out;
+				leader = child;
+				goto down;
+			}
+up:
+			;
+		}
+	}
+
+	if (leader != top) {
+		child = leader;
+		parent = child->real_parent;
+		leader = parent->group_leader;
+		goto up;
+	}
+out:
+	read_unlock(&tasklist_lock);
+}
+
 #ifndef ARCH_MIN_MMSTRUCT_ALIGN
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif


@@ -2063,6 +2063,24 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
 }
 #endif
+static int propagate_has_child_subreaper(struct task_struct *p, void *data)
+{
+	/*
+	 * If task has has_child_subreaper - all its decendants
+	 * already have these flag too and new decendants will
+	 * inherit it on fork, skip them.
+	 *
+	 * If we've found child_reaper - skip descendants in
+	 * it's subtree as they will never get out pidns.
+	 */
+	if (p->signal->has_child_subreaper ||
+	    is_child_reaper(task_pid(p)))
+		return 0;
+
+	p->signal->has_child_subreaper = 1;
+	return 1;
+}
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -2214,6 +2232,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		break;
 	case PR_SET_CHILD_SUBREAPER:
 		me->signal->is_child_subreaper = !!arg2;
+		if (!arg2)
+			break;
+
+		walk_process_tree(me, propagate_has_child_subreaper, NULL);
 		break;
 	case PR_GET_CHILD_SUBREAPER:
 		error = put_user(me->signal->is_child_subreaper,


@@ -7503,7 +7503,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	ftrace_init_tracefs(tr, d_tracer);
 }
-static struct vfsmount *trace_automount(void *ingore)
+static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
 {
 	struct vfsmount *mnt;
 	struct file_system_type *type;
@@ -7516,7 +7516,7 @@ static struct vfsmount *trace_automount(void *ingore)
 	type = get_fs_type("tracefs");
 	if (!type)
 		return NULL;
-	mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
+	mnt = vfs_submount(mntpt, type, "tracefs", NULL);
 	put_filesystem(type);
 	if (IS_ERR(mnt))
 		return NULL;


@@ -57,7 +57,7 @@ static struct ctl_table_root set_root = {
 static int zero = 0;
 static int int_max = INT_MAX;
-#define UCOUNT_ENTRY(name) \
+#define UCOUNT_ENTRY(name)				\
 	{						\
 		.procname	= name,			\
 		.maxlen		= sizeof(int),		\
@@ -74,6 +74,10 @@ static struct ctl_table user_table[] = {
 	UCOUNT_ENTRY("max_net_namespaces"),
 	UCOUNT_ENTRY("max_mnt_namespaces"),
 	UCOUNT_ENTRY("max_cgroup_namespaces"),
+#ifdef CONFIG_INOTIFY_USER
+	UCOUNT_ENTRY("max_inotify_instances"),
+	UCOUNT_ENTRY("max_inotify_watches"),
+#endif
 	{ }
 };
 #endif /* CONFIG_SYSCTL */


@@ -471,7 +471,7 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
 			;
 	}
-	if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
+	if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
 		error = may_change_ptraced_domain(new_profile);
 		if (error)
 			goto audit;


@@ -548,9 +548,10 @@ skip:
 	if ((is_setid ||
 	     !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
-	    bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
+	    ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
+	     !ptracer_capable(current, new->user_ns))) {
 		/* downgrade; they get no more than they had, and maybe less */
-		if (!capable(CAP_SETUID) ||
+		if (!ns_capable(new->user_ns, CAP_SETUID) ||
 		    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
 			new->euid = new->uid;
 			new->egid = new->gid;


@@ -2399,8 +2399,7 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
 		/* Make sure that anyone attempting to ptrace over a task that
 		 * changes its SID has the appropriate permit */
-		if (bprm->unsafe &
-		    (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
+		if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
 			u32 ptsid = ptrace_parent_sid();
 			if (ptsid != 0) {
 				rc = avc_has_perm(ptsid, new_tsec->sid,


@@ -931,7 +931,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm)
 	    isp->smk_task != sbsp->smk_root)
 		return 0;
-	if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
+	if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
 		struct task_struct *tracer;
 		rc = 0;