pidns: Support unsharing the pid namespace.

Unsharing of the pid namespace, unlike unsharing of other namespaces,
does not take effect immediately.  Instead it affects the children
created with fork and clone.  The first of these children becomes the init
process of the new pid namespace; the rest become oddball children
of pid 0.  From the point of view of the new pid namespace the process
that created it is pid 0, as its pid does not map.

A couple of different semantics were considered but this one was
settled on because it is easy to implement and it is usable from
pam modules, which are the core reason for the existence of unshare.

I took a survey of the callers of pam modules and the following
appears to be a representative sample of their logic.
{
	setup stuff include pam
	child = fork();
	if (!child) {
		setuid()
                exec /bin/bash
        }
        waitpid(child);

        pam and other cleanup
}

As you can see there is a fork to create the unprivileged user
space process, which means that the unprivileged user space
process will appear as pid 1 in the new pid namespace.  Further,
most login processes do not cope with extraneous children, which
means shifting the duty of reaping extraneous child processes to
the creator of those extraneous children makes the system more
comprehensible.

The practical reason for this set of pid namespace semantics is
that it is simple to implement and to verify that they work correctly,
whereas an implementation that requires changing the struct
pid on a process comes with a lot more races and pain, not
the least of which is that glibc caches getpid().

These semantics are implemented by having two notions
of the pid namespace of a process.  There is task_active_pid_ns,
which is the pid namespace the process was created with
and the pid namespace in which all pids are presented to
that process.  The task_active_pid_ns is stored
in the struct pid of the task.

Then there is the pid namespace that will be used for children;
that pid namespace is stored in task->nsproxy->pid_ns.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
This commit is contained in:
Eric W. Biederman 2010-03-02 15:41:50 -08:00
parent 1c4042c29b
commit 50804fe373
3 changed files with 26 additions and 10 deletions

View File

@ -1565,9 +1565,11 @@ long do_fork(unsigned long clone_flags,
* Do some preliminary argument and permissions checking before we * Do some preliminary argument and permissions checking before we
* actually start allocating stuff * actually start allocating stuff
*/ */
if (clone_flags & CLONE_NEWUSER) { if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
if (clone_flags & CLONE_THREAD) if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
return -EINVAL; return -EINVAL;
}
if (clone_flags & CLONE_NEWUSER) {
/* hopefully this check will go away when userns support is /* hopefully this check will go away when userns support is
* complete * complete
*/ */
@ -1692,7 +1694,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
{ {
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
CLONE_NEWPID))
return -EINVAL; return -EINVAL;
/* /*
* Not implemented, but pretend it works if there is nothing to * Not implemented, but pretend it works if there is nothing to
@ -1763,15 +1766,30 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
int do_sysvsem = 0; int do_sysvsem = 0;
int err; int err;
err = check_unshare_flags(unshare_flags); /*
if (err) * If unsharing a pid namespace must also unshare the thread.
goto bad_unshare_out; */
if (unshare_flags & CLONE_NEWPID)
unshare_flags |= CLONE_THREAD;
/*
* If unsharing a thread from a thread group, must also unshare vm.
*/
if (unshare_flags & CLONE_THREAD)
unshare_flags |= CLONE_VM;
/*
* If unsharing vm, must also unshare signal handlers.
*/
if (unshare_flags & CLONE_VM)
unshare_flags |= CLONE_SIGHAND;
/* /*
* If unsharing namespace, must also unshare filesystem information. * If unsharing namespace, must also unshare filesystem information.
*/ */
if (unshare_flags & CLONE_NEWNS) if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS; unshare_flags |= CLONE_FS;
err = check_unshare_flags(unshare_flags);
if (err)
goto bad_unshare_out;
/* /*
* CLONE_NEWIPC must also detach from the undolist: after switching * CLONE_NEWIPC must also detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old * to a new ipc namespace, the semaphore arrays from the old

View File

@ -188,7 +188,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0; int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWNET))) CLONE_NEWNET | CLONE_NEWPID)))
return 0; return 0;
if (!capable(CAP_SYS_ADMIN)) if (!capable(CAP_SYS_ADMIN))

View File

@ -144,8 +144,6 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
{ {
if (!(flags & CLONE_NEWPID)) if (!(flags & CLONE_NEWPID))
return get_pid_ns(old_ns); return get_pid_ns(old_ns);
if (flags & (CLONE_THREAD|CLONE_PARENT))
return ERR_PTR(-EINVAL);
if (task_active_pid_ns(current) != old_ns) if (task_active_pid_ns(current) != old_ns)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
return create_pid_namespace(user_ns, old_ns); return create_pid_namespace(user_ns, old_ns);