diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 81dd075356b9..d4fb0afc0097 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) /* try and do the mount */ _debug("--- attempting mount %s -o %s ---", devname, options); - mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); + mnt = vfs_submount(mntpt, &afs_fs_type, devname, options); _debug("--- mount result %p ---", mnt); free_page((unsigned long) devname); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 1278335ce366..79fbd85db4ba 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -436,8 +436,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); - wq->uid = current_real_cred()->uid; - wq->gid = current_real_cred()->gid; + wq->uid = current_cred()->uid; + wq->gid = current_cred()->gid; wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ec9dbbcca3b9..9156be545b0f 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -245,7 +245,8 @@ compose_mount_options_err: * @fullpath: full path in UNC format * @ref: server's referral */ -static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, +static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, + struct cifs_sb_info *cifs_sb, const char *fullpath, const struct dfs_info3_param *ref) { struct vfsmount *mnt; @@ -259,7 +260,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, if (IS_ERR(mountdata)) return (struct vfsmount *)mountdata; - mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata); + mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata); kfree(mountdata); kfree(devname); return mnt; @@ -334,7 +335,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) mnt = ERR_PTR(-EINVAL); break; } - mnt = cifs_dfs_do_refmount(cifs_sb, + mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, referrals + i); cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__, referrals[i].node_name, mnt); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 7fb1732a3630..7fd4ec4bb214 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -187,9 +187,9 @@ static const struct super_operations debugfs_super_operations = { static struct vfsmount *debugfs_automount(struct path *path) { - struct vfsmount *(*f)(void *); - f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; - return f(d_inode(path->dentry)->i_private); + debugfs_automount_t f; + f = (debugfs_automount_t)path->dentry->d_fsdata; + return f(path->dentry, d_inode(path->dentry)->i_private); } static const struct dentry_operations debugfs_dops = { @@ -540,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); */ struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, - struct vfsmount *(*f)(void *), + debugfs_automount_t f, void *data) { struct dentry *dentry = start_creating(name, parent); diff --git a/fs/exec.c b/fs/exec.c index e57946610733..698a86094f76 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1426,12 +1426,8 @@ static void check_unsafe_exec(struct linux_binprm *bprm) struct task_struct *p = current, *t; unsigned n_fs; - if (p->ptrace) { - if (ptracer_capable(p, current_user_ns())) - bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP; - else - bprm->unsafe |= LSM_UNSAFE_PTRACE; - } + if (p->ptrace) + bprm->unsafe |= LSM_UNSAFE_PTRACE; /* * This isn't strictly necessary, but it makes it harder for LSMs to @@ -1479,7 +1475,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm) if (task_no_new_privs(current)) return; - inode = file_inode(bprm->file); + inode = bprm->file->f_path.dentry->d_inode; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; diff --git a/fs/mount.h b/fs/mount.h index 2c856fc47ae3..2826543a131d 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt) } extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); -extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); extern bool legitimize_mnt(struct vfsmount *, unsigned); diff --git a/fs/namei.c b/fs/namei.c index ad74877e1442..da689c9c005e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1100,7 +1100,6 @@ static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; - const struct cred *old_cred; int err; if (!path->dentry->d_op || !path->dentry->d_op->d_automount) @@ -1129,9 +1128,7 @@ static int follow_automount(struct path *path, struct nameidata *nd, if (nd->total_link_count >= 40) return -ELOOP; - old_cred = override_creds(&init_cred); mnt = path->dentry->d_op->d_automount(path); - revert_creds(old_cred); if (IS_ERR(mnt)) { /* * The filesystem is allowed to return -EISDIR here to indicate @@ -2941,10 +2938,16 @@ static inline int open_to_namei_flags(int flag) static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode) { + struct user_namespace *s_user_ns; int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; + s_user_ns = dir->dentry->d_sb->s_user_ns; + if (!kuid_has_mapping(s_user_ns, current_fsuid()) || + !kgid_has_mapping(s_user_ns, current_fsgid())) + return -EOVERFLOW; + error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; diff --git a/fs/namespace.c b/fs/namespace.c index 487ba30bb5c6..8bfad42c1ccf 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -636,28 +636,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) return NULL; } -/* - * find the last mount at @dentry on vfsmount @mnt. - * mount_lock must be held. - */ -struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) -{ - struct mount *p, *res = NULL; - p = __lookup_mnt(mnt, dentry); - if (!p) - goto out; - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) - res = p; - hlist_for_each_entry_continue(p, mnt_hash) { - if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) - break; - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) - res = p; - } -out: - return res; -} - /* * lookup_mnt - Return the first child mount mounted at path * @@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt, hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } +static void __attach_mnt(struct mount *mnt, struct mount *parent) +{ + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); +} + /* * vfsmount lock must be held for write */ @@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); - hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + __attach_mnt(mnt, parent); } -static void attach_shadowed(struct mount *mnt, - struct mount *parent, - struct mount *shadows) +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { - if (shadows) { - hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); - list_add(&mnt->mnt_child, &shadows->mnt_child); - } else { - hlist_add_head_rcu(&mnt->mnt_hash, - m_hash(&parent->mnt, mnt->mnt_mountpoint)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); - } + struct mountpoint *old_mp = mnt->mnt_mp; + struct dentry *old_mountpoint = mnt->mnt_mountpoint; + struct mount *old_parent = mnt->mnt_parent; + + list_del_init(&mnt->mnt_child); + hlist_del_init(&mnt->mnt_mp_list); + hlist_del_init_rcu(&mnt->mnt_hash); + + attach_mnt(mnt, parent, mp); + + put_mountpoint(old_mp); + + /* + * Safely avoid even the suggestion this code might sleep or + * lock the mount hash by taking advantage of the knowledge that + * mnt_change_mountpoint will not release the final reference + * to a mountpoint. + * + * During mounting, the mount passed in as the parent mount will + * continue to use the old mountpoint and during unmounting, the + * old mountpoint will continue to exist until namespace_unlock, + * which happens well after mnt_change_mountpoint. + */ + spin_lock(&old_mountpoint->d_lock); + old_mountpoint->d_lockref.count--; + spin_unlock(&old_mountpoint->d_lock); + + mnt_add_count(old_parent, -1); } /* * vfsmount lock must be held for write */ -static void commit_tree(struct mount *mnt, struct mount *shadows) +static void commit_tree(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; struct mount *m; @@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) n->mounts += n->pending_mounts; n->pending_mounts = 0; - attach_shadowed(mnt, parent, shadows); + __attach_mnt(mnt, parent); touch_mnt_namespace(n); } @@ -989,6 +991,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void } EXPORT_SYMBOL_GPL(vfs_kern_mount); +struct vfsmount * +vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, + const char *name, void *data) +{ + /* Until it is worked out how to pass the user namespace + * through from the parent mount to the submount don't support + * unprivileged mounts with submounts. + */ + if (mountpoint->d_sb->s_user_ns != &init_user_ns) + return ERR_PTR(-EPERM); + + return vfs_kern_mount(type, MS_SUBMOUNT, name, data); +} +EXPORT_SYMBOL_GPL(vfs_submount); + static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { @@ -1764,7 +1781,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, continue; for (s = r; s; s = next_mnt(s, r)) { - struct mount *t = NULL; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { s = skip_mnt_tree(s); @@ -1786,14 +1802,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - mnt_set_mountpoint(parent, p->mnt_mp, q); - if (!list_empty(&parent->mnt_mounts)) { - t = list_last_entry(&parent->mnt_mounts, - struct mount, mnt_child); - if (t->mnt_mp != p->mnt_mp) - t = NULL; - } - attach_shadowed(q, parent, t); + attach_mnt(q, parent, p->mnt_mp); unlock_mount_hash(); } } @@ -1992,10 +2001,18 @@ static int attach_recursive_mnt(struct mount *source_mnt, { HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; + struct mountpoint *smp; struct mount *child, *p; struct hlist_node *n; int err; + /* Preallocate a mountpoint in case the new mounts need + * to be tucked under other mounts. + */ + smp = get_mountpoint(source_mnt->mnt.mnt_root); + if (IS_ERR(smp)) + return PTR_ERR(smp); + /* Is there space to add these mounts to the mount namespace? */ if (!parent_path) { err = count_mounts(ns, source_mnt); @@ -2022,16 +2039,19 @@ static int attach_recursive_mnt(struct mount *source_mnt, touch_mnt_namespace(source_mnt->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); - commit_tree(source_mnt, NULL); + commit_tree(source_mnt); } hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); - q = __lookup_mnt_last(&child->mnt_parent->mnt, - child->mnt_mountpoint); - commit_tree(child, q); + q = __lookup_mnt(&child->mnt_parent->mnt, + child->mnt_mountpoint); + if (q) + mnt_change_mountpoint(child, smp, q); + commit_tree(child); } + put_mountpoint(smp); unlock_mount_hash(); return 0; @@ -2046,6 +2066,11 @@ static int attach_recursive_mnt(struct mount *source_mnt, cleanup_group_ids(source_mnt, NULL); out: ns->pending_mounts = 0; + + read_seqlock_excl(&mount_lock); + put_mountpoint(smp); + read_sequnlock_excl(&mount_lock); + return err; } @@ -2794,7 +2819,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | - MS_STRICTATIME | MS_NOREMOTELOCK); + MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 5551e8ef67fd..e49d831c4e85 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -226,7 +226,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, const char *devname, struct nfs_clone_mount *mountdata) { - return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); + return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata); } /** diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index d21104912676..d8b040bd9814 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mountdata->hostname, mountdata->mnt_path); - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata); if (!IS_ERR(mnt)) break; } diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index a6f5907a3fee..7c461fd49c4c 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group, const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; + +#ifdef CONFIG_INOTIFY_USER +static inline void dec_inotify_instances(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES); +} + +static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts) +{ + return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES); +} + +static inline void dec_inotify_watches(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES); +} +#endif diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 19e7ec109a75..f36c29398de3 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -165,10 +165,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group) /* ideally the idr is empty and we won't hit the BUG in the callback */ idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_destroy(&group->inotify_data.idr); - if (group->inotify_data.user) { - atomic_dec(&group->inotify_data.user->inotify_devs); - free_uid(group->inotify_data.user); - } + if (group->inotify_data.ucounts) + dec_inotify_instances(group->inotify_data.ucounts); } static void inotify_free_event(struct fsnotify_event *fsn_event) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 69d1ea3d292a..1cf41c623be1 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -44,10 +44,8 @@ #include -/* these are configurable via /proc/sys/fs/inotify/ */ -static int inotify_max_user_instances __read_mostly; +/* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; -static int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; @@ -60,7 +58,7 @@ static int zero; struct ctl_table inotify_table[] = { { .procname = "max_user_instances", - .data = &inotify_max_user_instances, + .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -68,7 +66,7 @@ struct ctl_table inotify_table[] = { }, { .procname = "max_user_watches", - .data = &inotify_max_user_watches, + .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -500,7 +498,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); - atomic_dec(&group->inotify_data.user->inotify_watches); + dec_inotify_watches(group->inotify_data.ucounts); } /* ding dong the mark is dead */ @@ -584,14 +582,17 @@ static int inotify_new_watch(struct fsnotify_group *group, tmp_i_mark->fsn_mark.mask = mask; tmp_i_mark->wd = -1; - ret = -ENOSPC; - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) - goto out_err; - ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); if (ret) goto out_err; + /* increment the number of watches the user has */ + if (!inc_inotify_watches(group->inotify_data.ucounts)) { + inotify_remove_from_idr(group, tmp_i_mark); + ret = -ENOSPC; + goto out_err; + } + /* we are on the idr, now get on the inode */ ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, NULL, 0); @@ -601,8 +602,6 @@ static int inotify_new_watch(struct fsnotify_group *group, goto out_err; } - /* increment the number of watches the user has */ - atomic_inc(&group->inotify_data.user->inotify_watches); /* return the watch descriptor for this new mark */ ret = tmp_i_mark->wd; @@ -653,10 +652,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.user = get_current_user(); + group->inotify_data.ucounts = inc_ucount(current_user_ns(), + current_euid(), + UCOUNT_INOTIFY_INSTANCES); - if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > - inotify_max_user_instances) { + if (!group->inotify_data.ucounts) { fsnotify_destroy_group(group); return ERR_PTR(-EMFILE); } @@ -819,8 +819,8 @@ static int __init inotify_user_setup(void) inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); inotify_max_queued_events = 16384; - inotify_max_user_instances = 128; - inotify_max_user_watches = 8192; + init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; + init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192; return 0; } diff --git a/fs/nsfs.c b/fs/nsfs.c index 8c9fb29c6673..1656843e87d2 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -7,6 +7,7 @@ #include #include #include +#include static struct vfsmount *nsfs_mnt; @@ -163,7 +164,10 @@ int open_related_ns(struct ns_common *ns, static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { + struct user_namespace *user_ns; struct ns_common *ns = get_proc_ns(file_inode(filp)); + uid_t __user *argp; + uid_t uid; switch (ioctl) { case NS_GET_USERNS: @@ -172,6 +176,15 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, if (!ns->ops->get_parent) return -EINVAL; return open_related_ns(ns, ns->ops->get_parent); + case NS_GET_NSTYPE: + return ns->ops->type; + case NS_GET_OWNER_UID: + if (ns->ops->type != CLONE_NEWUSER) + return -EINVAL; + user_ns = container_of(ns, struct user_namespace, ns); + argp = (uid_t __user *) arg; + uid = from_kuid_munged(current_user_ns(), user_ns->owner); + return put_user(uid, argp); default: return -ENOTTY; } diff --git a/fs/pnode.c b/fs/pnode.c index 06a793f4ae38..5bc7896d122a 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -322,6 +322,21 @@ out: return ret; } +static struct mount *find_topper(struct mount *mnt) +{ + /* If there is exactly one mount covering mnt completely return it. */ + struct mount *child; + + if (!list_is_singular(&mnt->mnt_mounts)) + return NULL; + + child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child); + if (child->mnt_mountpoint != mnt->mnt.mnt_root) + return NULL; + + return child; +} + /* * return true if the refcount is greater than count */ @@ -342,9 +357,8 @@ static inline int do_refcount_check(struct mount *mnt, int count) */ int propagate_mount_busy(struct mount *mnt, int refcnt) { - struct mount *m, *child; + struct mount *m, *child, *topper; struct mount *parent = mnt->mnt_parent; - int ret = 0; if (mnt == parent) return do_refcount_check(mnt, refcnt); @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); - if (child && list_empty(&child->mnt_mounts) && - (ret = do_refcount_check(child, 1))) - break; + int count = 1; + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); + if (!child) + continue; + + /* Is there exactly one mount on the child that covers + * it completely whose reference should be ignored? + */ + topper = find_topper(child); + if (topper) + count += 1; + else if (!list_empty(&child->mnt_mounts)) + continue; + + if (do_refcount_check(child, count)) + return 1; } - return ret; + return 0; } /* @@ -381,7 +407,7 @@ void propagate_mount_unlock(struct mount *mnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (child) child->mnt.mnt_flags &= ~MNT_LOCKED; } @@ -399,9 +425,11 @@ static void mark_umount_candidates(struct mount *mnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - struct mount *child = __lookup_mnt_last(&m->mnt, + struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); - if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { + if (!child || (child->mnt.mnt_flags & MNT_UMOUNT)) + continue; + if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) { SET_MNT_MARK(child); } } @@ -420,8 +448,8 @@ static void __propagate_umount(struct mount *mnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - - struct mount *child = __lookup_mnt_last(&m->mnt, + struct mount *topper; + struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); /* * umount the child only if the child has no children @@ -430,6 +458,15 @@ static void __propagate_umount(struct mount *mnt) if (!child || !IS_MNT_MARKED(child)) continue; CLEAR_MNT_MARK(child); + + /* If there is exactly one mount covering all of child + * replace child with that mount. + */ + topper = find_topper(child); + if (topper) + mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, + topper); + if (list_empty(&child->mnt_mounts)) { list_del_init(&child->mnt_child); child->mnt.mnt_flags |= MNT_UMOUNT; diff --git a/fs/pnode.h b/fs/pnode.h index 550f5a8b4fcf..dc87e65becd2 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root); unsigned int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, + struct mount *mnt); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); diff --git a/fs/proc/base.c b/fs/proc/base.c index 3d773eb9e144..b73b4de8fb36 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1667,12 +1667,63 @@ const struct inode_operations proc_pid_link_inode_operations = { /* building an inode */ +void task_dump_owner(struct task_struct *task, mode_t mode, + kuid_t *ruid, kgid_t *rgid) +{ + /* Depending on the state of dumpable compute who should own a + * proc file for a task. + */ + const struct cred *cred; + kuid_t uid; + kgid_t gid; + + /* Default to the tasks effective ownership */ + rcu_read_lock(); + cred = __task_cred(task); + uid = cred->euid; + gid = cred->egid; + rcu_read_unlock(); + + /* + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. + */ + if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) { + struct mm_struct *mm; + task_lock(task); + mm = task->mm; + /* Make non-dumpable tasks owned by some root */ + if (mm) { + if (get_dumpable(mm) != SUID_DUMP_USER) { + struct user_namespace *user_ns = mm->user_ns; + + uid = make_kuid(user_ns, 0); + if (!uid_valid(uid)) + uid = GLOBAL_ROOT_UID; + + gid = make_kgid(user_ns, 0); + if (!gid_valid(gid)) + gid = GLOBAL_ROOT_GID; + } + } else { + uid = GLOBAL_ROOT_UID; + gid = GLOBAL_ROOT_GID; + } + task_unlock(task); + } + *ruid = uid; + *rgid = gid; +} + struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; - const struct cred *cred; /* We need a new inode */ @@ -1694,13 +1745,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb, if (!ei->pid) goto out_unlock; - if (task_dumpable(task)) { - rcu_read_lock(); - cred = __task_cred(task); - inode->i_uid = cred->euid; - inode->i_gid = cred->egid; - rcu_read_unlock(); - } + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); out: @@ -1715,7 +1760,6 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = d_inode(dentry); struct task_struct *task; - const struct cred *cred; struct pid_namespace *pid = dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); @@ -1733,12 +1777,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) */ return -ENOENT; } - if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || - task_dumpable(task)) { - cred = __task_cred(task); - stat->uid = cred->euid; - stat->gid = cred->egid; - } + task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid); } rcu_read_unlock(); return 0; @@ -1754,18 +1793,11 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. * - * Before the /proc/pid/status file was created the only way to read - * the effective uid of a /process was to stat /proc/pid. Reading - * /proc/pid/status is slow enough that procps and other packages - * kept stating /proc/pid. To keep the rules in /proc simple I have - * made this apply to all per process world readable and executable - * directories. */ int pid_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; - const struct cred *cred; if (flags & LOOKUP_RCU) return -ECHILD; @@ -1774,17 +1806,8 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) task = get_proc_task(inode); if (task) { - if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || - task_dumpable(task)) { - rcu_read_lock(); - cred = __task_cred(task); - inode->i_uid = cred->euid; - inode->i_gid = cred->egid; - rcu_read_unlock(); - } else { - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - } + task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); + inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); put_task_struct(task); @@ -1881,7 +1904,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) bool exact_vma_exists = false; struct mm_struct *mm = NULL; struct task_struct *task; - const struct cred *cred; struct inode *inode; int status = 0; @@ -1906,16 +1928,8 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) mmput(mm); if (exact_vma_exists) { - if (task_dumpable(task)) { - rcu_read_lock(); - cred = __task_cred(task); - inode->i_uid = cred->euid; - inode->i_gid = cred->egid; - rcu_read_unlock(); - } else { - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - } + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); + security_task_to_inode(task, inode); status = 1; } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 4274f83bf100..00ce1531b2f5 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -84,7 +84,6 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) { struct files_struct *files; struct task_struct *task; - const struct cred *cred; struct inode *inode; unsigned int fd; @@ -108,16 +107,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) rcu_read_unlock(); put_files_struct(files); - if (task_dumpable(task)) { - rcu_read_lock(); - cred = __task_cred(task); - inode->i_uid = cred->euid; - inode->i_gid = cred->egid; - rcu_read_unlock(); - } else { - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - } + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); if (S_ISLNK(inode->i_mode)) { unsigned i_mode = S_IFLNK; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 842a5ff5b85c..7ad9ed7958af 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode) de = PDE(inode); if (de) pde_put(de); + head = PROC_I(inode)->sysctl; if (head) { RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); - sysctl_head_put(head); + proc_sys_evict_inode(inode, head); } } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 2de5194ba378..5d6960f5f1c0 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -65,6 +65,7 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; + struct list_head sysctl_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; }; @@ -97,20 +98,8 @@ static inline struct task_struct *get_proc_task(struct inode *inode) return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -static inline int task_dumpable(struct task_struct *task) -{ - int dumpable = 0; - struct mm_struct *mm; - - task_lock(task); - mm = task->mm; - if (mm) - dumpable = get_dumpable(mm); - task_unlock(task); - if (dumpable == SUID_DUMP_USER) - return 1; - return 0; -} +void task_dump_owner(struct task_struct *task, mode_t mode, + kuid_t *ruid, kgid_t *rgid); static inline unsigned name_to_int(const struct qstr *qstr) { @@ -249,10 +238,12 @@ extern void proc_thread_self_init(void); */ #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); -extern void sysctl_head_put(struct ctl_table_header *); +extern void proc_sys_evict_inode(struct inode *inode, + struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } -static inline void sysctl_head_put(struct ctl_table_header *head) { } +static inline void proc_sys_evict_inode(struct inode *inode, + struct ctl_table_header *head) { } #endif /* diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d4e37acd4821..3e64c6502dc8 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head, head->set = set; head->parent = NULL; head->node = node; + INIT_LIST_HEAD(&head->inodes); if (node) { struct ctl_table *entry; for (entry = table; entry->procname; entry++, node++) @@ -259,6 +260,27 @@ static void unuse_table(struct ctl_table_header *p) complete(p->unregistering); } +/* called under sysctl_lock */ +static void proc_sys_prune_dcache(struct ctl_table_header *head) +{ + struct inode *inode, *prev = NULL; + struct proc_inode *ei; + + rcu_read_lock(); + list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) { + inode = igrab(&ei->vfs_inode); + if (inode) { + rcu_read_unlock(); + iput(prev); + prev = inode; + d_prune_aliases(inode); + rcu_read_lock(); + } + } + rcu_read_unlock(); + iput(prev); +} + /* called under sysctl_lock, will reacquire if has to wait */ static void start_unregistering(struct ctl_table_header *p) { @@ -272,33 +294,24 @@ static void start_unregistering(struct ctl_table_header *p) p->unregistering = &wait; spin_unlock(&sysctl_lock); wait_for_completion(&wait); - spin_lock(&sysctl_lock); } else { /* anything non-NULL; we'll never dereference it */ p->unregistering = ERR_PTR(-EINVAL); + spin_unlock(&sysctl_lock); } + /* + * Prune dentries for unregistered sysctls: namespaced sysctls + * can have duplicate names and contaminate dcache very badly. + */ + proc_sys_prune_dcache(p); /* * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. */ + spin_lock(&sysctl_lock); erase_header(p); } -static void sysctl_head_get(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - head->count++; - spin_unlock(&sysctl_lock); -} - -void sysctl_head_put(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - if (!--head->count) - kfree_rcu(head, rcu); - spin_unlock(&sysctl_lock); -} - static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) { BUG_ON(!head); @@ -440,10 +453,20 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_ino = get_next_ino(); - sysctl_head_get(head); ei = PROC_I(inode); + + spin_lock(&sysctl_lock); + if (unlikely(head->unregistering)) { + spin_unlock(&sysctl_lock); + iput(inode); + inode = NULL; + goto out; + } ei->sysctl = head; ei->sysctl_entry = table; + list_add_rcu(&ei->sysctl_inodes, &head->inodes); + head->count++; + spin_unlock(&sysctl_lock); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_mode = table->mode; @@ -466,6 +489,15 @@ out: return inode; } +void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + list_del_rcu(&PROC_I(inode)->sysctl_inodes); + if (!--head->count) + kfree_rcu(head, rcu); + spin_unlock(&sysctl_lock); +} + static struct ctl_table_header *grab_header(struct inode *inode) { struct ctl_table_header *head = PROC_I(inode)->sysctl; diff --git a/fs/super.c b/fs/super.c index ea662b0e5e78..b8b6a086c03b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -469,7 +469,7 @@ struct super_block *sget_userns(struct file_system_type *type, struct super_block *old; int err; - if (!(flags & MS_KERNMOUNT) && + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !(type->fs_flags & FS_USERNS_MOUNT) && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -499,7 +499,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, flags, user_ns); + s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -540,8 +540,15 @@ struct super_block *sget(struct file_system_type *type, { struct user_namespace *user_ns = current_user_ns(); + /* We don't yet pass the user namespace of the parent + * mount through to here so always use &init_user_ns + * until that changes. + */ + if (flags & MS_SUBMOUNT) + user_ns = &init_user_ns; + /* Ensure the requestor has permissions over the target filesystem */ - if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); return sget_userns(type, test, set, flags, user_ns, data); diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 9d571acd3a48..7dff776e6d16 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -98,9 +98,10 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent); struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *dest); +typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *); struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, - struct vfsmount *(*f)(void *), + debugfs_automount_t f, void *data); void debugfs_remove(struct dentry *dentry); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 487246546ebe..e6e689b5569e 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -16,6 +16,7 @@ #include #include #include +#include /* * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily @@ -170,7 +171,7 @@ struct fsnotify_group { struct inotify_group_private_data { spinlock_t idr_lock; struct idr idr; - struct user_struct *user; + struct ucounts *ucounts; } inotify_data; #endif #ifdef CONFIG_FANOTIFY diff --git a/include/linux/mount.h b/include/linux/mount.h index c6f55158d5e5..8e0352af06b7 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -90,6 +90,9 @@ struct file_system_type; extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); +extern struct vfsmount *vfs_submount(const struct dentry *mountpoint, + struct file_system_type *type, + const char *name, void *data); extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); diff --git a/include/linux/sched.h b/include/linux/sched.h index c8e519d0b4a3..451e241f32c5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -846,10 +846,6 @@ struct user_struct { atomic_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ -#ifdef CONFIG_INOTIFY_USER - atomic_t inotify_watches; /* How many inotify watches does this user have? */ - atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ -#endif #ifdef CONFIG_FANOTIFY atomic_t fanotify_listeners; #endif @@ -3051,6 +3047,9 @@ extern bool current_is_single_threaded(void); #define for_each_process_thread(p, t) \ for_each_process(p) for_each_thread(p, t) +typedef int (*proc_visitor)(struct task_struct *p, void *data); +void walk_process_tree(struct task_struct *top, proc_visitor, void *); + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; diff --git a/include/linux/security.h b/include/linux/security.h index d3868f2ebada..96899fad7016 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -140,8 +140,7 @@ struct request_sock; /* bprm->unsafe reasons */ #define LSM_UNSAFE_SHARE 1 #define LSM_UNSAFE_PTRACE 2 -#define LSM_UNSAFE_PTRACE_CAP 4 -#define LSM_UNSAFE_NO_NEW_PRIVS 8 +#define LSM_UNSAFE_NO_NEW_PRIVS 4 #ifdef CONFIG_MMU extern int mmap_min_addr_handler(struct ctl_table *table, int write, diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index adf4e51cf597..b7e82049fec7 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -143,6 +143,7 @@ struct ctl_table_header struct ctl_table_set *set; struct ctl_dir *parent; struct ctl_node *node; + struct list_head inodes; /* head for proc_inode->sysctl_inodes */ }; struct ctl_dir { diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index eb209d4523f5..363e0e8082a9 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -32,6 +32,10 @@ enum ucount_type { UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, +#ifdef CONFIG_INOTIFY_USER + UCOUNT_INOTIFY_INSTANCES, + UCOUNT_INOTIFY_WATCHES, +#endif UCOUNT_COUNTS, }; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 36da93fbf188..048a85e9f017 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -132,6 +132,7 @@ struct inodes_stat_t { #define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ +#define MS_SUBMOUNT (1<<26) #define MS_NOREMOTELOCK (1<<27) #define MS_NOSEC (1<<28) #define MS_BORN (1<<29) diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 3af617230d1b..1a3ca79f466b 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -6,8 +6,13 @@ #define NSIO 0xb7 /* Returns a file descriptor that refers to an owning user namespace */ -#define NS_GET_USERNS _IO(NSIO, 0x1) +#define NS_GET_USERNS _IO(NSIO, 0x1) /* Returns a file descriptor that refers to a parent namespace */ -#define NS_GET_PARENT _IO(NSIO, 0x2) +#define NS_GET_PARENT _IO(NSIO, 0x2) +/* Returns the type of namespace (CLONE_NEW* value) referred to by + file descriptor */ +#define NS_GET_NSTYPE _IO(NSIO, 0x3) +/* Get owner UID (in the caller's user namespace) for a user namespace */ +#define NS_GET_OWNER_UID _IO(NSIO, 0x4) #endif /* __LINUX_NSFS_H */ diff --git a/kernel/exit.c b/kernel/exit.c index 580da79e38ee..9960accbf2ab 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -607,15 +607,18 @@ static struct task_struct *find_new_reaper(struct task_struct *father, return thread; if (father->signal->has_child_subreaper) { + unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. - * We start from father to ensure we can not look into another - * namespace, this is safe because all its threads are dead. + * We can't check reaper != child_reaper to ensure we do not + * cross the namespaces, the exiting parent could be injected + * by setns() + fork(). + * We check pid->level, this is slightly more efficient than + * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ - for (reaper = father; - !same_thread_group(reaper, child_reaper); + for (reaper = father->real_parent; + task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { - /* call_usermodehelper() descendants need this check */ if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) diff --git a/kernel/fork.c b/kernel/fork.c index d12fcc4db8a3..348fe73155bc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1377,9 +1377,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; - sig->has_child_subreaper = current->signal->has_child_subreaper || - current->signal->is_child_subreaper; - mutex_init(&sig->cred_guard_mutex); return 0; @@ -1814,6 +1811,13 @@ static __latent_entropy struct task_struct *copy_process( p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); + /* + * Inherit has_child_subreaper flag under the same + * tasklist_lock with adding child to the process tree + * for propagate_has_child_subreaper optimization. + */ + p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || + p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_PGID); @@ -2067,6 +2071,38 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, } #endif +void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data) +{ + struct task_struct *leader, *parent, *child; + int res; + + read_lock(&tasklist_lock); + leader = top = top->group_leader; +down: + for_each_thread(leader, parent) { + list_for_each_entry(child, &parent->children, sibling) { + res = visitor(child, data); + if (res) { + if (res < 0) + goto out; + leader = child; + goto down; + } +up: + ; + } + } + + if (leader != top) { + child = leader; + parent = child->real_parent; + leader = parent->group_leader; + goto up; + } +out: + read_unlock(&tasklist_lock); +} + #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif diff --git a/kernel/sys.c b/kernel/sys.c index 7d4a9a6df956..b07adca97ea3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2063,6 +2063,24 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) } #endif +static int propagate_has_child_subreaper(struct task_struct *p, void *data) +{ + /* + * If task has has_child_subreaper - all its decendants + * already have these flag too and new decendants will + * inherit it on fork, skip them. + * + * If we've found child_reaper - skip descendants in + * it's subtree as they will never get out pidns. + */ + if (p->signal->has_child_subreaper || + is_child_reaper(task_pid(p))) + return 0; + + p->signal->has_child_subreaper = 1; + return 1; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2214,6 +2232,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; + if (!arg2) + break; + + walk_process_tree(me, propagate_has_child_subreaper, NULL); break; case PR_GET_CHILD_SUBREAPER: error = put_user(me->signal->is_child_subreaper, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7449783987a..310f0ea0d1a2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7503,7 +7503,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) ftrace_init_tracefs(tr, d_tracer); } -static struct vfsmount *trace_automount(void *ingore) +static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; @@ -7516,7 +7516,7 @@ static struct vfsmount *trace_automount(void *ingore) type = get_fs_type("tracefs"); if (!type) return NULL; - mnt = vfs_kern_mount(type, 0, "tracefs", NULL); + mnt = vfs_submount(mntpt, type, "tracefs", NULL); put_filesystem(type); if (IS_ERR(mnt)) return NULL; diff --git a/kernel/ucount.c b/kernel/ucount.c index 95c6336fc2b3..8a11fc0cb459 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -57,7 +57,7 @@ static struct ctl_table_root set_root = { static int zero = 0; static int int_max = INT_MAX; -#define UCOUNT_ENTRY(name) \ +#define UCOUNT_ENTRY(name) \ { \ .procname = name, \ .maxlen = sizeof(int), \ @@ -74,6 +74,10 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_net_namespaces"), UCOUNT_ENTRY("max_mnt_namespaces"), UCOUNT_ENTRY("max_cgroup_namespaces"), +#ifdef CONFIG_INOTIFY_USER + UCOUNT_ENTRY("max_inotify_instances"), + UCOUNT_ENTRY("max_inotify_watches"), +#endif { } }; #endif /* CONFIG_SYSCTL */ diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index ef4beef06e9d..001e133a3c8c 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -471,7 +471,7 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm) ; } - if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { + if (bprm->unsafe & LSM_UNSAFE_PTRACE) { error = may_change_ptraced_domain(new_profile); if (error) goto audit; diff --git a/security/commoncap.c b/security/commoncap.c index 6d4d586b9356..78b37838a2d3 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -548,9 +548,10 @@ skip: if ((is_setid || !cap_issubset(new->cap_permitted, old->cap_permitted)) && - bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { + ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) || + !ptracer_capable(current, new->user_ns))) { /* downgrade; they get no more than they had, and maybe less */ - if (!capable(CAP_SETUID) || + if (!ns_capable(new->user_ns, CAP_SETUID) || (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) { new->euid = new->uid; new->egid = new->gid; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index e6b1b7410321..9a8f12f8d5b7 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2399,8 +2399,7 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm) /* Make sure that anyone attempting to ptrace over a task that * changes its SID has the appropriate permit */ - if (bprm->unsafe & - (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { + if (bprm->unsafe & LSM_UNSAFE_PTRACE) { u32 ptsid = ptrace_parent_sid(); if (ptsid != 0) { rc = avc_has_perm(ptsid, new_tsec->sid, diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 60b4217b9b68..fc8fb31fc24f 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -931,7 +931,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm) isp->smk_task != sbsp->smk_root) return 0; - if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { + if (bprm->unsafe & LSM_UNSAFE_PTRACE) { struct task_struct *tracer; rc = 0;