From f376cf84b9becafbcb27604f1ae232b0564326e6 Mon Sep 17 00:00:00 2001 From: "Daniel, Dao Quang Minh" Date: Mon, 14 Sep 2015 00:31:33 +0000 Subject: [PATCH 1/8] Check if a namespace is supported This adds `configs.IsNamespaceSupported(nsType)` to check if the host supports a namespace type. Signed-off-by: Daniel, Dao Quang Minh --- libcontainer/configs/namespaces_unix.go | 72 +++++++++++++++++-------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/libcontainer/configs/namespaces_unix.go b/libcontainer/configs/namespaces_unix.go index 7bc90854..60707e36 100644 --- a/libcontainer/configs/namespaces_unix.go +++ b/libcontainer/configs/namespaces_unix.go @@ -2,7 +2,11 @@ package configs -import "fmt" +import ( + "fmt" + "os" + "sync" +) const ( NEWNET NamespaceType = "NEWNET" @@ -13,6 +17,51 @@ const ( NEWUSER NamespaceType = "NEWUSER" ) +var ( + nsLock sync.Mutex + supportedNamespaces = make(map[NamespaceType]bool) +) + +// nsToFile converts the namespace type to its filename +func nsToFile(ns NamespaceType) string { + switch ns { + case NEWNET: + return "net" + case NEWNS: + return "mnt" + case NEWPID: + return "pid" + case NEWIPC: + return "ipc" + case NEWUSER: + return "user" + case NEWUTS: + return "uts" + } + return "" +} + +// IsNamespaceSupported returns whether a namespace is available or +// not +func IsNamespaceSupported(ns NamespaceType) bool { + nsLock.Lock() + defer nsLock.Unlock() + supported, ok := supportedNamespaces[ns] + if ok { + return supported + } + nsFile := nsToFile(ns) + // if the namespace type is unknown, just return false + if nsFile == "" { + return false + } + _, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile)) + // a namespace is supported if it exists and we have permissions to read it + supported = err == nil + supportedNamespaces[ns] = supported + return supported +} + func NamespaceTypes() []NamespaceType { return []NamespaceType{ NEWNET, @@ -35,26 +84,7 @@ func (n *Namespace) GetPath(pid int) string { if n.Path 
!= "" { return n.Path } - return fmt.Sprintf("/proc/%d/ns/%s", pid, n.file()) -} - -func (n *Namespace) file() string { - file := "" - switch n.Type { - case NEWNET: - file = "net" - case NEWNS: - file = "mnt" - case NEWPID: - file = "pid" - case NEWIPC: - file = "ipc" - case NEWUSER: - file = "user" - case NEWUTS: - file = "uts" - } - return file + return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type)) } func (n *Namespaces) Remove(t NamespaceType) bool { From 4217b9c1218fa8ad4d6c0f93317d8520f1de50c9 Mon Sep 17 00:00:00 2001 From: "Daniel, Dao Quang Minh" Date: Mon, 14 Sep 2015 00:33:17 +0000 Subject: [PATCH 2/8] Do not override the specified userns path Signed-off-by: Daniel, Dao Quang Minh --- libcontainer/configs/namespaces_unix.go | 8 ++++++++ spec.go | 5 ++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/libcontainer/configs/namespaces_unix.go b/libcontainer/configs/namespaces_unix.go index 60707e36..b9c820d0 100644 --- a/libcontainer/configs/namespaces_unix.go +++ b/libcontainer/configs/namespaces_unix.go @@ -117,3 +117,11 @@ func (n *Namespaces) index(t NamespaceType) int { func (n *Namespaces) Contains(t NamespaceType) bool { return n.index(t) != -1 } + +func (n *Namespaces) PathOf(t NamespaceType) string { + i := n.index(t) + if i == -1 { + return "" + } + return (*n)[i].Path +} diff --git a/spec.go b/spec.go index d7fef6fc..2c870a64 100644 --- a/spec.go +++ b/spec.go @@ -590,7 +590,10 @@ func setupUserNamespace(spec *specs.LinuxSpec, config *configs.Config) error { if len(spec.Linux.UIDMappings) == 0 { return nil } - config.Namespaces.Add(configs.NEWUSER, "") + // do not override the specified user namespace path + if config.Namespaces.PathOf(configs.NEWUSER) == "" { + config.Namespaces.Add(configs.NEWUSER, "") + } create := func(m specs.IDMapping) configs.IDMap { return configs.IDMap{ HostID: int(m.HostID), From 2d3221062024cd154fadd1ca6fbfb858ddfa9125 Mon Sep 17 00:00:00 2001 From: "Daniel, Dao Quang Minh" Date: Mon, 14 Sep 2015 
00:35:22 +0000 Subject: [PATCH 3/8] Integration tests for joining namespaces Signed-off-by: Daniel, Dao Quang Minh --- libcontainer/integration/exec_test.go | 195 +++++++++++++++++++++++- libcontainer/integration/execin_test.go | 61 ++++++++ libcontainer/integration/utils_test.go | 8 +- 3 files changed, 260 insertions(+), 4 deletions(-) diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index d7f07e95..22a9afbf 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -2,10 +2,12 @@ package integration import ( "bytes" + "fmt" "io/ioutil" "os" "os/exec" "path/filepath" + "reflect" "strconv" "strings" "syscall" @@ -1209,7 +1211,6 @@ func TestRootfsPropagationSlaveMount(t *testing.T) { defer stdinW2.Close() ok(t, err) - // Wait for process stdinW2.Close() waitProcess(pconfig2, t) stdinW.Close() @@ -1375,3 +1376,195 @@ func TestPIDHost(t *testing.T) { t.Fatalf("ipc link not equal to host link %q %q", actual, l) } } + +func TestInitJoinPID(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + // Execute a long-running container + container1, err := newContainer(newTemplateConfig(rootfs)) + ok(t, err) + defer container1.Destroy() + + stdinR1, stdinW1, err := os.Pipe() + ok(t, err) + init1 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR1, + } + err = container1.Start(init1) + stdinR1.Close() + defer stdinW1.Close() + ok(t, err) + + // get the state of the first container + state1, err := container1.State() + ok(t, err) + pidns1 := state1.NamespacePaths[configs.NEWPID] + + // Start a container inside the existing pidns but with different cgroups + config2 := newTemplateConfig(rootfs) + config2.Namespaces.Add(configs.NEWPID, pidns1) + config2.Cgroups.Path = "integration/test2" + container2, err := newContainerWithName("testCT2", config2) + ok(t, err) + defer 
container2.Destroy() + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + init2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR2, + } + err = container2.Start(init2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + // get the state of the second container + state2, err := container2.State() + ok(t, err) + + ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state1.InitProcessPid)) + ok(t, err) + ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state2.InitProcessPid)) + ok(t, err) + if ns1 != ns2 { + t.Errorf("pidns(%s), wanted %s", ns2, ns1) + } + + // check that namespaces are not the same + if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) { + t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths, + state1.NamespacePaths) + } + // check that pidns is joined correctly. The initial container process list + // should contain the second container's init process + buffers := newStdBuffers() + ps := &libcontainer.Process{ + Cwd: "/", + Args: []string{"ps"}, + Env: standardEnvironment, + Stdout: buffers.Stdout, + } + err = container1.Start(ps) + ok(t, err) + waitProcess(ps, t) + + // Stop init processes one by one. Stop the second container should + // not stop the first. 
+ stdinW2.Close() + waitProcess(init2, t) + stdinW1.Close() + waitProcess(init1, t) + + out := strings.TrimSpace(buffers.Stdout.String()) + // output of ps inside the initial PID namespace should have + // 1 line of header, + // 2 lines of init processes, + // 1 line of ps process + if len(strings.Split(out, "\n")) != 4 { + t.Errorf("unexpected running process, output %q", out) + } +} + +func TestInitJoinNetworkAndUser(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + // Execute a long-running container + config1 := newTemplateConfig(rootfs) + config1.UidMappings = []configs.IDMap{{0, 0, 1000}} + config1.GidMappings = []configs.IDMap{{0, 0, 1000}} + config1.Namespaces = append(config1.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + container1, err := newContainer(config1) + ok(t, err) + defer container1.Destroy() + + stdinR1, stdinW1, err := os.Pipe() + ok(t, err) + init1 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR1, + } + err = container1.Start(init1) + stdinR1.Close() + defer stdinW1.Close() + ok(t, err) + + // get the state of the first container + state1, err := container1.State() + ok(t, err) + netns1 := state1.NamespacePaths[configs.NEWNET] + userns1 := state1.NamespacePaths[configs.NEWUSER] + + // Start a container inside the existing pidns but with different cgroups + rootfs2, err := newRootfs() + ok(t, err) + defer remove(rootfs2) + + config2 := newTemplateConfig(rootfs2) + config2.UidMappings = []configs.IDMap{{0, 0, 1000}} + config2.GidMappings = []configs.IDMap{{0, 0, 1000}} + config2.Namespaces.Add(configs.NEWNET, netns1) + config2.Namespaces.Add(configs.NEWUSER, userns1) + config2.Cgroups.Path = "integration/test2" + container2, err := newContainerWithName("testCT2", config2) + ok(t, err) + defer 
container2.Destroy() + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + init2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR2, + } + err = container2.Start(init2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + // get the state of the second container + state2, err := container2.State() + ok(t, err) + + for _, ns := range []string{"net", "user"} { + ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state1.InitProcessPid, ns)) + ok(t, err) + ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state2.InitProcessPid, ns)) + ok(t, err) + if ns1 != ns2 { + t.Errorf("%s(%s), wanted %s", ns, ns2, ns1) + } + } + + // check that namespaces are not the same + if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) { + t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths, + state1.NamespacePaths) + } + // Stop init processes one by one. Stop the second container should + // not stop the first. + stdinW2.Close() + waitProcess(init2, t) + stdinW1.Close() + waitProcess(init1, t) +} diff --git a/libcontainer/integration/execin_test.go b/libcontainer/integration/execin_test.go index 8327362f..98dd2ac0 100644 --- a/libcontainer/integration/execin_test.go +++ b/libcontainer/integration/execin_test.go @@ -2,6 +2,7 @@ package integration import ( "bytes" + "fmt" "io" "os" "os/exec" @@ -12,6 +13,7 @@ import ( "time" "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/configs" ) func TestExecIn(t *testing.T) { @@ -404,3 +406,62 @@ func TestExecInOomScoreAdj(t *testing.T) { t.Fatalf("expected oomScoreAdj to be %d, got %s", config.OomScoreAdj, oomScoreAdj) } } + +func TestExecInUserns(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + 
config.UidMappings = []configs.IDMap{{0, 0, 1000}} + config.GidMappings = []configs.IDMap{{0, 0, 1000}} + config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + } + err = container.Start(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + initPID, err := process.Pid() + ok(t, err) + initUserns, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/user", initPID)) + ok(t, err) + + buffers := newStdBuffers() + process2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"readlink", "/proc/self/ns/user"}, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + }, + Stdout: buffers.Stdout, + Stderr: os.Stderr, + } + err = container.Start(process2) + ok(t, err) + waitProcess(process2, t) + stdinW.Close() + waitProcess(process, t) + + if out := strings.TrimSpace(buffers.Stdout.String()); out != initUserns { + t.Errorf("execin userns(%s), wanted %s", out, initUserns) + } +} diff --git a/libcontainer/integration/utils_test.go b/libcontainer/integration/utils_test.go index 3dcd0bb1..e2ca10e0 100644 --- a/libcontainer/integration/utils_test.go +++ b/libcontainer/integration/utils_test.go @@ -92,13 +92,15 @@ func copyBusybox(dest string) error { } func newContainer(config *configs.Config) (libcontainer.Container, error) { - f := factory + return newContainerWithName("testCT", config) +} +func newContainerWithName(name string, config *configs.Config) (libcontainer.Container, error) { + f := factory if config.Cgroups != nil && config.Cgroups.Parent == "system.slice" { f = systemdFactory } - - return f.Create("testCT", config) + return f.Create(name, config) } // runContainer runs the container with the 
specific config and arguments From d6bf4049f8013033a1c7424c4e2abdc21ada4a25 Mon Sep 17 00:00:00 2001 From: "Daniel, Dao Quang Minh" Date: Mon, 14 Sep 2015 00:37:56 +0000 Subject: [PATCH 4/8] OrderNamespacePaths gets correct order of ns This adds orderNamespacePaths to get correct order of namespaces for the bootstrap program to join. Signed-off-by: Daniel, Dao Quang Minh --- libcontainer/container_linux.go | 36 +++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 253ef00a..8756877d 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -1090,3 +1090,39 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath } return bytes.NewReader(r.Serialize()), nil } + +// orderNamespacePaths sorts namespace paths into a list of paths that we +// can setns in order. +func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { + paths := []string{} + nsTypes := []configs.NamespaceType{ + configs.NEWIPC, + configs.NEWUTS, + configs.NEWNET, + configs.NEWPID, + configs.NEWNS, + } + // join userns if the init process explicitly requires NEWUSER + if c.config.Namespaces.Contains(configs.NEWUSER) { + nsTypes = append(nsTypes, configs.NEWUSER) + } + for _, nsType := range nsTypes { + if p, ok := namespaces[nsType]; ok && p != "" { + // check if the requested namespace is supported + if !configs.IsNamespaceSupported(nsType) { + return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType)) + } + // only set to join this namespace if it exists + if _, err := os.Lstat(p); err != nil { + return nil, newSystemError(err) + } + // do not allow namespace path with comma as we use it to separate + // the namespace paths + if strings.ContainsRune(p, ',') { + return nil, newSystemError(fmt.Errorf("invalid path %s", p)) + } + paths = append(paths, p) + } + } + return paths, nil 
+} From 42d5d0480107a83300a59683a73b11d17538f3dc Mon Sep 17 00:00:00 2001 From: "Daniel, Dao Quang Minh" Date: Mon, 14 Sep 2015 00:40:43 +0000 Subject: [PATCH 5/8] Sets custom namespaces for init processes An init process can join other namespaces (pidns, ipc etc.). This leverages C code defined in nsenter package to spawn a process with correct namespaces and clone if necessary. This moves all setns and cloneflags related code to nsenter layer, which mean that we dont use Go os/exec to create process with cloneflags and set uid/gid_map or setgroups anymore. The necessary data is passed from Go to C using a netlink binary-encoding format. With this change, setns and init processes are almost the same, which brings some opportunity for refactoring. Signed-off-by: Daniel, Dao Quang Minh [mickael.laventure@docker.com: adapted to apply on master @ d97d5e] Signed-off-by: Kenfe-Mickael Laventure --- libcontainer/container_linux.go | 157 ++++++++---- libcontainer/init_linux.go | 19 -- libcontainer/message_linux.go | 28 +- libcontainer/nsenter/nsenter_test.go | 83 +++--- libcontainer/nsenter/nsexec.c | 369 +++++++++++++++++++-------- libcontainer/process_linux.go | 58 ++++- libcontainer/standard_init_linux.go | 7 - 7 files changed, 481 insertions(+), 240 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 8756877d..d31a3435 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -23,6 +23,7 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/criurpc" "github.com/opencontainers/runc/libcontainer/utils" + "github.com/syndtr/gocapability/capability" "github.com/vishvananda/netlink/nl" ) @@ -268,37 +269,40 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. 
} func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { - t := "_LIBCONTAINER_INITTYPE=" + string(initStandard) - cloneFlags := c.config.Namespaces.CloneFlags() - if cloneFlags&syscall.CLONE_NEWUSER != 0 { - if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil { - // user mappings are not supported - return nil, err - } - enableSetgroups(cmd.SysProcAttr) - // Default to root user when user namespaces are enabled. - if cmd.SysProcAttr.Credential == nil { - cmd.SysProcAttr.Credential = &syscall.Credential{} + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) + nsMaps := make(map[configs.NamespaceType]string) + for _, ns := range c.config.Namespaces { + if ns.Path != "" { + nsMaps[ns.Type] = ns.Path } } - cmd.Env = append(cmd.Env, t) - cmd.SysProcAttr.Cloneflags = cloneFlags + _, sharePidns := nsMaps[configs.NEWPID] + data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "") + if err != nil { + return nil, err + } return &initProcess{ - cmd: cmd, - childPipe: childPipe, - parentPipe: parentPipe, - manager: c.cgroupManager, - config: c.newInitConfig(p), - container: c, - process: p, + cmd: cmd, + childPipe: childPipe, + parentPipe: parentPipe, + manager: c.cgroupManager, + config: c.newInitConfig(p), + container: c, + process: p, + bootstrapData: data, + sharePidns: sharePidns, }, nil } func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) + state, err := c.currentState() + if err != nil { + return nil, newSystemError(err) + } // for setns process, we dont have to set cloneflags as the process namespaces // will only be set via setns syscall - data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath) + data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath) if err != nil { return nil, err } @@ 
-1069,28 +1073,6 @@ func (c *linuxContainer) currentState() (*State, error) { return state, nil } -// bootstrapData encodes the necessary data in netlink binary format as a io.Reader. -// Consumer can write the data to a bootstrap program such as one that uses -// nsenter package to bootstrap the container's init process correctly, i.e. with -// correct namespaces, uid/gid mapping etc. -func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) { - // create the netlink message - r := nl.NewNetlinkRequest(int(InitMsg), 0) - // write pid - r.AddData(&Int32msg{ - Type: PidAttr, - Value: uint32(pid), - }) - // write console path - if consolePath != "" { - r.AddData(&Bytemsg{ - Type: ConsolePathAttr, - Value: []byte(consolePath), - }) - } - return bytes.NewReader(r.Serialize()), nil -} - // orderNamespacePaths sorts namespace paths into a list of paths that we // can setns in order. func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { @@ -1126,3 +1108,92 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp } return paths, nil } + +func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { + data := bytes.NewBuffer(nil) + for _, im := range idMap { + line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) + if _, err := data.WriteString(line); err != nil { + return nil, err + } + } + return data.Bytes(), nil +} + +// bootstrapData encodes the necessary data in netlink binary format +// as a io.Reader. +// Consumer can write the data to a bootstrap program +// such as one that uses nsenter package to bootstrap the container's +// init process correctly, i.e. with correct namespaces, uid/gid +// mapping etc. 
+func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) { + // create the netlink message + r := nl.NewNetlinkRequest(int(InitMsg), 0) + + // write cloneFlags + r.AddData(&Int32msg{ + Type: CloneFlagsAttr, + Value: uint32(cloneFlags), + }) + + // write console path + if consolePath != "" { + r.AddData(&Bytemsg{ + Type: ConsolePathAttr, + Value: []byte(consolePath), + }) + } + + // write custom namespace paths + if len(nsMaps) > 0 { + nsPaths, err := c.orderNamespacePaths(nsMaps) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: NsPathsAttr, + Value: []byte(strings.Join(nsPaths, ",")), + }) + } + + // write uid/gid mappings only when we are not joining an existing user ns + _, joinExistingUser := nsMaps[configs.NEWUSER] + if !joinExistingUser { + // write uid mappings + if len(c.config.UidMappings) > 0 { + b, err := encodeIDMapping(c.config.UidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: UidmapAttr, + Value: b, + }) + } + + // write gid mappings + if len(c.config.GidMappings) > 0 { + b, err := encodeIDMapping(c.config.GidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: GidmapAttr, + Value: b, + }) + // check if we have CAP_SETGID to setgroup properly + pid, err := capability.NewPid(os.Getpid()) + if err != nil { + return nil, err + } + if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { + r.AddData(&Boolmsg{ + Type: SetgroupAttr, + Value: true, + }) + } + } + } + + return bytes.NewReader(r.Serialize()), nil +} diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 8abe9191..dd641e87 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -185,25 +185,6 @@ func syncParentHooks(pipe io.ReadWriter) error { return nil } -// joinExistingNamespaces gets all the namespace paths specified for the container and -// does a setns on the namespace fd so that the
current process joins the namespace. -func joinExistingNamespaces(namespaces []configs.Namespace) error { - for _, ns := range namespaces { - if ns.Path != "" { - f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0) - if err != nil { - return err - } - err = system.Setns(f.Fd(), uintptr(ns.Syscall())) - f.Close() - if err != nil { - return err - } - } - } - return nil -} - // setupUser changes the groups, gid, and uid for the user inside the container func setupUser(config *initConfig) error { // Set up defaults. diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 0c3301f2..16630133 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -12,8 +12,12 @@ import ( // The number is randomly chosen to not conflict with known netlink types const ( InitMsg uint16 = 62000 - PidAttr uint16 = 27281 + CloneFlagsAttr uint16 = 27281 ConsolePathAttr uint16 = 27282 + NsPathsAttr uint16 = 27283 + UidmapAttr uint16 = 27284 + GidmapAttr uint16 = 27285 + SetgroupAttr uint16 = 27286 // When syscall.NLA_HDRLEN is in gccgo, take this out. 
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1) ) @@ -60,3 +64,25 @@ func (msg *Bytemsg) Serialize() []byte { func (msg *Bytemsg) Len() int { return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated } + +type Boolmsg struct { + Type uint16 + Value bool +} + +func (msg *Boolmsg) Serialize() []byte { + buf := make([]byte, msg.Len()) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(msg.Len())) + native.PutUint16(buf[2:4], msg.Type) + if msg.Value { + buf[4] = 1 + } else { + buf[4] = 0 + } + return buf +} + +func (msg *Boolmsg) Len() int { + return syscall_NLA_HDRLEN + 1 +} diff --git a/libcontainer/nsenter/nsenter_test.go b/libcontainer/nsenter/nsenter_test.go index 976ae6bb..f7b12be9 100644 --- a/libcontainer/nsenter/nsenter_test.go +++ b/libcontainer/nsenter/nsenter_test.go @@ -3,7 +3,9 @@ package nsenter import ( "bytes" "encoding/json" + "fmt" "io" + "io/ioutil" "os" "os/exec" "strings" @@ -18,35 +20,51 @@ type pid struct { Pid int `json:"Pid"` } -func TestNsenterAlivePid(t *testing.T) { +func TestNsenterValidPaths(t *testing.T) { args := []string{"nsenter-exec"} parent, child, err := newPipe() if err != nil { t.Fatalf("failed to create pipe %v", err) } + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("/proc/%d/ns/pid", os.Getpid()), + } cmd := &exec.Cmd{ Path: os.Args[0], Args: args, ExtraFiles: []*os.File{child}, - Env: []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + Stdout: os.Stdout, + Stderr: os.Stderr, } if err := cmd.Start(); err != nil { t.Fatalf("nsenter failed to start %v", err) } + // write cloneFlags r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) r.AddData(&libcontainer.Int32msg{ - Type: libcontainer.PidAttr, - Value: uint32(os.Getpid()), + Type: libcontainer.CloneFlagsAttr, + Value: uint32(syscall.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: 
libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), }) if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { t.Fatal(err) } + decoder := json.NewDecoder(parent) var pid *pid if err := decoder.Decode(&pid); err != nil { + dir, _ := ioutil.ReadDir(fmt.Sprintf("/proc/%d/ns", os.Getpid())) + for _, d := range dir { + t.Log(d.Name()) + } t.Fatalf("%v", err) } @@ -60,70 +78,43 @@ func TestNsenterAlivePid(t *testing.T) { p.Wait() } -func TestNsenterInvalidPid(t *testing.T) { +func TestNsenterInvalidPaths(t *testing.T) { args := []string{"nsenter-exec"} parent, child, err := newPipe() if err != nil { t.Fatalf("failed to create pipe %v", err) } + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("/proc/%d/ns/pid", -1), + } cmd := &exec.Cmd{ Path: os.Args[0], Args: args, ExtraFiles: []*os.File{child}, - Env: []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, } if err := cmd.Start(); err != nil { - t.Fatal("nsenter exits with a zero exit status") + t.Fatal(err) } + // write cloneFlags r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) r.AddData(&libcontainer.Int32msg{ - Type: libcontainer.PidAttr, - Value: 0, + Type: libcontainer.CloneFlagsAttr, + Value: uint32(syscall.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), }) if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { t.Fatal(err) } if err := cmd.Wait(); err == nil { - t.Fatal("nsenter exits with a zero exit status") - } -} - -func TestNsenterDeadPid(t *testing.T) { - deadCmd := exec.Command("true") - if err := deadCmd.Run(); err != nil { - t.Fatal(err) - } - args := []string{"nsenter-exec"} - parent, child, err := newPipe() - if err != nil { - t.Fatalf("failed to create pipe %v", err) - } - - cmd := &exec.Cmd{ - Path: os.Args[0], - Args: args, - ExtraFiles: []*os.File{child}, 
- Env: []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"}, - } - - if err := cmd.Start(); err != nil { - t.Fatal("nsenter exits with a zero exit status") - } - - r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) - r.AddData(&libcontainer.Int32msg{ - Type: libcontainer.PidAttr, - Value: uint32(deadCmd.Process.Pid), - }) - if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { - t.Fatal(err) - } - - if err := cmd.Wait(); err == nil { - t.Fatal("nsenter exits with a zero exit status") + t.Fatalf("nsenter exits with a zero exit status") } } diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 6634afc4..286c653c 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -4,7 +4,6 @@ #include #include #include - #include #include #include @@ -16,6 +15,14 @@ #include #include #include +#include +#include +#include + +// netlink related +#include +#include +#include #include #include @@ -57,166 +64,246 @@ int setns(int fd, int nstype) #endif #endif -static int clone_parent(jmp_buf * env) __attribute__ ((noinline)); -static int clone_parent(jmp_buf * env) +static int clone_parent(jmp_buf * env, int flags) __attribute__ ((noinline)); +static int clone_parent(jmp_buf * env, int flags) { struct clone_arg ca; int child; ca.env = env; - child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); - + child = + clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, + &ca); return child; } +// get init pipe from the parent. It's used to read bootstrap data, and to +// write pid to after nsexec finishes setting up the environment. 
+static int get_init_pipe() +{ + char buf[PATH_MAX], *initpipe; + int pipenum = -1; + + initpipe = getenv("_LIBCONTAINER_INITPIPE"); + if (initpipe == NULL) { + return -1; + } + + pipenum = atoi(initpipe); + snprintf(buf, sizeof(buf), "%d", pipenum); + if (strcmp(initpipe, buf)) { + pr_perror("Unable to parse _LIBCONTAINER_INITPIPE"); + exit(1); + } + + return pipenum; +} + +// num_namespaces returns the number of additional namespaces to setns. The +// argument is a comma-separated string of namespace paths. +static int num_namespaces(char *nspaths) +{ + int size = 0, i = 0; + + for (i = 0; nspaths[i]; i++) { + if (nspaths[i] == ',') { + size += 1; + } + } + + return size + 1; +} + static uint32_t readint32(char *buf) { return *(uint32_t *) buf; } +static uint8_t readint8(char *buf) +{ + return *(uint8_t *) buf; +} + +static void writedata(int fd, char *data, int start, int len) +{ + int written = 0; + while (written < len) { + size_t nbyte, i; + if ((len - written) < 1024) { + nbyte = len - written; + } else { + nbyte = 1024; + } + i = write(fd, data + start + written, nbyte); + if (i == -1) { + pr_perror("failed to write data to %d", fd); + exit(1); + } + written += i; + } +} + // list of known message types we want to send to bootstrap program // These are defined in libcontainer/message_linux.go -#define INIT_MSG 62000 -#define PID_ATTR 27281 -#define CONSOLE_PATH_ATTR 27282 +#define INIT_MSG 62000 +#define CLONE_FLAGS_ATTR 27281 +#define CONSOLE_PATH_ATTR 27282 +#define NS_PATHS_ATTR 27283 +#define UIDMAP_ATTR 27284 +#define GIDMAP_ATTR 27285 +#define SETGROUP_ATTR 27286 void nsexec() { - char *namespaces[] = { "ipc", "uts", "net", "pid", "mnt", "user" }; - const int num = sizeof(namespaces) / sizeof(char *); jmp_buf env; - char buf[PATH_MAX], *val; - int i, tfd, self_tfd, child, n, len, pipenum, consolefd = -1; - pid_t pid = 0; + int pipenum; - // if we dont have INITTYPE or this is the init process, skip the bootstrap process - val = 
getenv("_LIBCONTAINER_INITTYPE"); - if (val == NULL || strcmp(val, "standard") == 0) { + // if we dont have init pipe, then just return to the parent + pipenum = get_init_pipe(); + if (pipenum == -1) { return; } - if (strcmp(val, "setns") != 0) { - pr_perror("Invalid inittype %s", val); + // Retrieve the netlink header + struct nlmsghdr nl_msg_hdr; + int len; + + if ((len = read(pipenum, &nl_msg_hdr, NLMSG_HDRLEN)) != NLMSG_HDRLEN) { + pr_perror("Failed to read netlink header, got %d instead of %d", + len, NLMSG_HDRLEN); exit(1); } - val = getenv("_LIBCONTAINER_INITPIPE"); - if (val == NULL) { - pr_perror("Child pipe not found"); + if (nl_msg_hdr.nlmsg_type == NLMSG_ERROR) { + pr_perror("failed to read netlink message"); exit(1); } - pipenum = atoi(val); - snprintf(buf, sizeof(buf), "%d", pipenum); - if (strcmp(val, buf)) { - pr_perror("Unable to parse _LIBCONTAINER_INITPIPE"); + if (nl_msg_hdr.nlmsg_type != INIT_MSG) { + pr_perror("unexpected msg type %d", nl_msg_hdr.nlmsg_type); exit(1); } + // Retrieve data + int nl_total_size = NLMSG_PAYLOAD(&nl_msg_hdr, 0); + char data[nl_total_size]; - char nlbuf[NLMSG_HDRLEN]; - struct nlmsghdr *nh; - if ((n = read(pipenum, nlbuf, NLMSG_HDRLEN)) != NLMSG_HDRLEN) { - pr_perror("Failed to read netlink header, got %d", n); + if ((len = read(pipenum, data, nl_total_size)) != nl_total_size) { + pr_perror + ("Failed to read netlink payload, got %d instead of %d", + len, nl_total_size); exit(1); } - - nh = (struct nlmsghdr *)nlbuf; - if (nh->nlmsg_type == NLMSG_ERROR) { - pr_perror("Invalid netlink header message"); - exit(1); - } - if (nh->nlmsg_type != INIT_MSG) { - pr_perror("Unexpected netlink message type %d", nh->nlmsg_type); - exit(1); - } - // read the netlink payload - len = NLMSG_PAYLOAD(nh, 0); - char data[len]; - if ((n = read(pipenum, data, len)) != len) { - pr_perror("Failed to read netlink payload, got %d", n); - exit(1); - } - + // Process the passed attributes int start = 0; - struct nlattr *attr; - while (start < 
len) { - int payload_len; - attr = (struct nlattr *)((void *)data + start); + uint32_t cloneflags = -1; + uint8_t is_setgroup = 0; + int consolefd = -1; + int uidmap_start = -1, uidmap_len = -1; + int gidmap_start = -1, gidmap_len = -1; + int payload_len; + struct nlattr *nlattr; + + while (start < nl_total_size) { + nlattr = (struct nlattr *)(data + start); start += NLA_HDRLEN; - payload_len = attr->nla_len - NLA_HDRLEN; - switch (attr->nla_type) { - case PID_ATTR: - pid = (pid_t) readint32(data + start); - break; - case CONSOLE_PATH_ATTR: - consolefd = open((char *)data + start, O_RDWR); + payload_len = nlattr->nla_len - NLA_HDRLEN; + + if (nlattr->nla_type == CLONE_FLAGS_ATTR) { + cloneflags = readint32(data + start); + } else if (nlattr->nla_type == CONSOLE_PATH_ATTR) { + // get the console path before setns because it may change mnt namespace + consolefd = open(data + start, O_RDWR); if (consolefd < 0) { - pr_perror("Failed to open console %s", (char *)data + start); + pr_perror("Failed to open console %s", + data + start); exit(1); } - break; + } else if (nlattr->nla_type == NS_PATHS_ATTR) { + char nspaths[payload_len + 1]; + + strncpy(nspaths, data + start, payload_len); + nspaths[payload_len] = '\0'; + + // if custom namespaces are required, open all descriptors and perform + // setns on them + int nslen = num_namespaces(nspaths); + int fds[nslen]; + char *nslist[nslen]; + int i; + char *ns, *saveptr; + + for (i = 0; i < nslen; i++) { + char *str = NULL; + + if (i == 0) { + str = nspaths; + } + ns = strtok_r(str, ",", &saveptr); + if (ns == NULL) { + break; + } + fds[i] = open(ns, O_RDONLY); + if (fds[i] == -1) { + pr_perror("Failed to open %s", ns); + exit(1); + } + nslist[i] = ns; + } + + for (i = 0; i < nslen; i++) { + if (setns(fds[i], 0) != 0) { + pr_perror("Failed to setns to %s", + nslist[i]); + exit(1); + } + close(fds[i]); + } + } else if (nlattr->nla_type == UIDMAP_ATTR) { + uidmap_len = payload_len; + uidmap_start = start; + } else if 
(nlattr->nla_type == GIDMAP_ATTR) { + gidmap_len = payload_len; + gidmap_start = start; + } else if (nlattr->nla_type == SETGROUP_ATTR) { + is_setgroup = readint8(data + start); + } else { + pr_perror("unknown netlink message type %d", + nlattr->nla_type); + exit(1); } + start += NLA_ALIGN(payload_len); } - // required pid to be passed - if (pid == 0) { - pr_perror("missing pid"); + // required clone_flags to be passed + if (cloneflags == -1) { + pr_perror("missing clone_flags"); exit(1); } - - /* Check that the specified process exists */ - snprintf(buf, PATH_MAX - 1, "/proc/%d/ns", pid); - tfd = open(buf, O_DIRECTORY | O_RDONLY); - if (tfd == -1) { - pr_perror("Failed to open \"%s\"", buf); + // prepare sync pipe between parent and child. We need this to let the child + // know that the parent has finished setting up + int syncpipe[2] = { -1, -1 }; + if (pipe(syncpipe) != 0) { + pr_perror("failed to setup sync pipe between parent and child"); exit(1); - } - - self_tfd = open("/proc/self/ns", O_DIRECTORY | O_RDONLY); - if (self_tfd == -1) { - pr_perror("Failed to open /proc/self/ns"); - exit(1); - } - - for (i = 0; i < num; i++) { - struct stat st; - struct stat self_st; - int fd; - - /* Symlinks on all namespaces exist for dead processes, but they can't be opened */ - if (fstatat(tfd, namespaces[i], &st, 0) == -1) { - // Ignore nonexistent namespaces. - if (errno == ENOENT) - continue; - } - - /* Skip namespaces we're already part of */ - if (fstatat(self_tfd, namespaces[i], &self_st, 0) != -1 && st.st_ino == self_st.st_ino) { - continue; - } - - fd = openat(tfd, namespaces[i], O_RDONLY); - if (fd == -1) { - pr_perror("Failed to open ns file %s for ns %s", buf, namespaces[i]); - exit(1); - } - // Set the namespace. 
- if (setns(fd, 0) == -1) { - pr_perror("Failed to setns for %s", namespaces[i]); - exit(1); - } - close(fd); - } - - close(self_tfd); - close(tfd); + }; if (setjmp(env) == 1) { // Child + uint8_t s; + + // close the writing side of pipe + close(syncpipe[1]); + + // sync with parent + if (read(syncpipe[0], &s, 1) != 1 || s != 1) { + pr_perror("failed to read sync byte from parent"); + exit(1); + }; if (setsid() == -1) { pr_perror("setsid failed"); exit(1); } + if (consolefd != -1) { if (ioctl(consolefd, TIOCSCTTY, 0) == -1) { pr_perror("ioctl TIOCSCTTY failed"); @@ -243,19 +330,75 @@ void nsexec() // We must fork to actually enter the PID namespace, use CLONE_PARENT // so the child can have the right parent, and we don't need to forward // the child's exit code or resend its death signal. - child = clone_parent(&env); + int child = clone_parent(&env, cloneflags); if (child < 0) { pr_perror("Unable to fork"); exit(1); } + // if uid_map and gid_map were specified, writes the data to /proc files + if (uidmap_start > 0 && uidmap_len > 0) { + char buf[PATH_MAX]; + if (snprintf(buf, sizeof(buf), "/proc/%d/uid_map", child) < 0) { + pr_perror("failed to construct uid_map file for %d", + child); + exit(1); + } - len = snprintf(buf, sizeof(buf), "{ \"pid\" : %d }\n", child); + int fd = open(buf, O_RDWR); + writedata(fd, data, uidmap_start, uidmap_len); + } - if (write(pipenum, buf, len) != len) { + if (gidmap_start > 0 && gidmap_len > 0) { + if (is_setgroup == 1) { + char buf[PATH_MAX]; + if (snprintf + (buf, sizeof(buf), "/proc/%d/setgroups", + child) < 0) { + pr_perror + ("failed to construct setgroups file for %d", + child); + exit(1); + } + + int fd = open(buf, O_RDWR); + if (write(fd, "allow", 5) != 5) { + // If the kernel is too old to support /proc/PID/setgroups, + // write will return ENOENT; this is OK. 
+ if (errno != ENOENT) { + pr_perror("failed to write allow to %s", + buf); + exit(1); + } + } + } + // write gid mappings + char buf[PATH_MAX]; + if (snprintf(buf, sizeof(buf), "/proc/%d/gid_map", child) < 0) { + pr_perror("failed to construct gid_map file for %d", + child); + exit(1); + } + + int fd = open(buf, O_RDWR); + writedata(fd, data, gidmap_start, gidmap_len); + } + // Send the sync signal to the child + close(syncpipe[0]); + uint8_t s = 1; + if (write(syncpipe[1], &s, 1) != 1) { + pr_perror("failed to write sync byte to child"); + exit(1); + }; + + // parent to finish the bootstrap process + char child_data[PATH_MAX]; + len = + snprintf(child_data, sizeof(child_data), "{ \"pid\" : %d }\n", + child); + if (write(pipenum, child_data, len) != len) { pr_perror("Unable to send a child pid"); kill(child, SIGKILL); exit(1); } - exit(0); } diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index de2d5f00..1a4d4b04 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -167,14 +167,16 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) { } type initProcess struct { - cmd *exec.Cmd - parentPipe *os.File - childPipe *os.File - config *initConfig - manager cgroups.Manager - container *linuxContainer - fds []string - process *Process + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + config *initConfig + manager cgroups.Manager + container *linuxContainer + fds []string + process *Process + bootstrapData io.Reader + sharePidns bool } func (p *initProcess) pid() int { @@ -185,15 +187,49 @@ func (p *initProcess) externalDescriptors() []string { return p.fds } -func (p *initProcess) start() (err error) { +// execSetns runs the process that executes C code to perform the setns calls +// because setns support requires the C process to fork off a child and perform the setns +// before the go runtime boots, we wait on the process to die and receive the child's pid +// over the provided pipe. 
+// This is called by initProcess.start function +func (p *initProcess) execSetns() error { + status, err := p.cmd.Process.Wait() + if err != nil { + p.cmd.Wait() + return err + } + if !status.Success() { + p.cmd.Wait() + return &exec.ExitError{ProcessState: status} + } + var pid *pid + if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { + p.cmd.Wait() + return err + } + process, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + p.cmd.Process = process + return nil +} + +func (p *initProcess) start() error { defer p.parentPipe.Close() - err = p.cmd.Start() + err := p.cmd.Start() p.process.ops = p p.childPipe.Close() if err != nil { p.process.ops = nil return newSystemError(err) } + if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { + return err + } + if err := p.execSetns(); err != nil { + return newSystemError(err) + } // Save the standard descriptor names before the container process // can potentially move them (e.g., via dup2()). If we don't do this now, // we won't know at checkpoint time which file descriptor to look up. 
@@ -317,7 +353,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) { return p.cmd.ProcessState, err } // we should kill all processes in cgroup when init is died if we use host PID namespace - if p.cmd.SysProcAttr.Cloneflags&syscall.CLONE_NEWPID == 0 { + if p.sharePidns { killCgroupProcesses(p.manager) } return p.cmd.ProcessState, nil diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 27ad8caf..935b2eea 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -55,10 +55,6 @@ func (l *linuxStandardInit) Init() error { return err } - // join any namespaces via a path to the namespace fd if provided - if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil { - return err - } var console *linuxConsole if l.config.Console != "" { console = newConsoleFromPath(l.config.Console) @@ -66,9 +62,6 @@ func (l *linuxStandardInit) Init() error { return err } } - if _, err := syscall.Setsid(); err != nil { - return err - } if console != nil { if err := system.Setctty(); err != nil { return err From 002b6c2fe8e7fb80e2d09c9ce164c36251e37324 Mon Sep 17 00:00:00 2001 From: "Daniel, Dao Quang Minh" Date: Mon, 14 Sep 2015 00:55:52 +0000 Subject: [PATCH 6/8] Reorder and remove unused imports in nsexec.c Signed-off-by: Daniel, Dao Quang Minh --- libcontainer/nsenter/nsexec.c | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 286c653c..95b6b90c 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -1,28 +1,19 @@ #define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include - -// netlink related -#include -#include +#include +#include +#include #include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include #include #include From 08c3c6ebe25091b3e452ef63790aa1396bff23d4 Mon Sep 17 00:00:00 2001 From: Kenfe-Mickael Laventure Date: Thu, 14 Jan 2016 17:08:45 -0800 Subject: [PATCH 7/8] Refactor nsexec Cut nsexec into smaller routines to make it more readable. Signed-off-by: Kenfe-Mickael Laventure --- libcontainer/nsenter/nsexec.c | 455 +++++++++++++++++++--------------- 1 file changed, 249 insertions(+), 206 deletions(-) diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 95b6b90c..555e7245 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -21,50 +22,71 @@ #include #include -/* All arguments should be above stack, because it grows down */ +// All arguments should be above the stack because it grows down struct clone_arg { /* * Reserve some space for clone() to locate arguments * and retcode in this place */ - char stack[4096] __attribute__ ((aligned(16))); - char stack_ptr[0]; + char stack[4096] __attribute__((aligned(16))); + char stack_ptr[0]; jmp_buf *env; }; -#define pr_perror(fmt, ...) 
fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__) +struct nsenter_config { + uint32_t cloneflags; + char *uidmap; + int uidmap_len; + char *gidmap; + int gidmap_len; + uint8_t is_setgroup; +}; + +// list of known message types we want to send to bootstrap program +// These are defined in libcontainer/message_linux.go +#define INIT_MSG 62000 +#define CLONE_FLAGS_ATTR 27281 +#define CONSOLE_PATH_ATTR 27282 +#define NS_PATHS_ATTR 27283 +#define UIDMAP_ATTR 27284 +#define GIDMAP_ATTR 27285 +#define SETGROUP_ATTR 27286 + +// Use raw setns syscall for versions of glibc that don't include it +// (namely glibc-2.12) +#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 + #define _GNU_SOURCE + #include "syscall.h" + #if defined(__NR_setns) && !defined(SYS_setns) + #define SYS_setns __NR_setns + #endif + + #ifdef SYS_setns + int setns(int fd, int nstype) + { + return syscall(SYS_setns, fd, nstype); + } + #endif +#endif + +#define pr_perror(fmt, ...) \ + fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__) static int child_func(void *_arg) { - struct clone_arg *arg = (struct clone_arg *)_arg; - longjmp(*arg->env, 1); + struct clone_arg *arg = (struct clone_arg *)_arg; + longjmp(*arg->env, 1); } -// Use raw setns syscall for versions of glibc that don't include it (namely glibc-2.12) -#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 -#define _GNU_SOURCE -#include "syscall.h" -#if defined(__NR_setns) && !defined(SYS_setns) -#define SYS_setns __NR_setns -#endif -#ifdef SYS_setns -int setns(int fd, int nstype) -{ - return syscall(SYS_setns, fd, nstype); -} -#endif -#endif - -static int clone_parent(jmp_buf * env, int flags) __attribute__ ((noinline)); -static int clone_parent(jmp_buf * env, int flags) +static int clone_parent(jmp_buf *env, int flags) __attribute__((noinline)); +static int clone_parent(jmp_buf *env, int flags) { struct clone_arg ca; - int child; + int child; ca.env = env; - child = - clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, - &ca); + child = 
clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, + &ca); return child; } @@ -72,8 +94,9 @@ static int clone_parent(jmp_buf * env, int flags) // write pid to after nsexec finishes setting up the environment. static int get_init_pipe() { - char buf[PATH_MAX], *initpipe; - int pipenum = -1; + char buf[PATH_MAX]; + char *initpipe; + int pipenum = -1; initpipe = getenv("_LIBCONTAINER_INITPIPE"); if (initpipe == NULL) { @@ -94,7 +117,8 @@ static int get_init_pipe() // argument is a comma-separated string of namespace paths. static int num_namespaces(char *nspaths) { - int size = 0, i = 0; + int i; + int size = 0; for (i = 0; nspaths[i]; i++) { if (nspaths[i] == ',') { @@ -107,100 +131,154 @@ static int num_namespaces(char *nspaths) static uint32_t readint32(char *buf) { - return *(uint32_t *) buf; + return *(uint32_t *)buf; } static uint8_t readint8(char *buf) { - return *(uint8_t *) buf; + return *(uint8_t *)buf; } -static void writedata(int fd, char *data, int start, int len) +static void update_process_idmap(char *pathfmt, int pid, char *map, int map_len) { - int written = 0; - while (written < len) { - size_t nbyte, i; - if ((len - written) < 1024) { - nbyte = len - written; - } else { - nbyte = 1024; - } - i = write(fd, data + start + written, nbyte); - if (i == -1) { - pr_perror("failed to write data to %d", fd); - exit(1); - } - written += i; + char buf[PATH_MAX]; + int len; + int fd; + + len = snprintf(buf, sizeof(buf), pathfmt, pid); + if (len < 0) { + pr_perror("failed to construct '%s' for %d", pathfmt, pid); + exit(1); } + + fd = open(buf, O_RDWR); + if (fd == -1) { + pr_perror("failed to open %s", buf); + exit(1); + } + + len = write(fd, map, map_len); + if (len == -1) { + pr_perror("failed to write to %s", buf); + exit(1); + } else if (len != map_len) { + fprintf(stderr, "Failed to write data to %s (%d/%d)", + buf, len, map_len); + exit(1); + } + + close(fd); } -// list of known message types we want to send to bootstrap program -// These are 
defined in libcontainer/message_linux.go -#define INIT_MSG 62000 -#define CLONE_FLAGS_ATTR 27281 -#define CONSOLE_PATH_ATTR 27282 -#define NS_PATHS_ATTR 27283 -#define UIDMAP_ATTR 27284 -#define GIDMAP_ATTR 27285 -#define SETGROUP_ATTR 27286 - -void nsexec() +static void update_process_uidmap(int pid, char *map, int map_len) { - jmp_buf env; - int pipenum; - - // if we dont have init pipe, then just return to the parent - pipenum = get_init_pipe(); - if (pipenum == -1) { + if ((map == NULL) || (map_len <= 0)) { return; } - // Retrieve the netlink header - struct nlmsghdr nl_msg_hdr; - int len; - if ((len = read(pipenum, &nl_msg_hdr, NLMSG_HDRLEN)) != NLMSG_HDRLEN) { - pr_perror("Failed to read netlink header, got %d instead of %d", - len, NLMSG_HDRLEN); + update_process_idmap("/proc/%d/uid_map", pid, map, map_len); +} + +static void update_process_gidmap(int pid, uint8_t is_setgroup, char *map, int map_len) +{ + if ((map == NULL) || (map_len <= 0)) { + return; + } + + if (is_setgroup == 1) { + int fd; + int len; + char buf[PATH_MAX]; + + len = snprintf(buf, sizeof(buf), "/proc/%d/setgroups", pid); + if (len < 0) { + pr_perror("failed to get setgroups path for %d", pid); + exit(1); + } + + fd = open(buf, O_RDWR); + if (fd == -1) { + pr_perror("failed to open %s", buf); + exit(1); + } + if (write(fd, "allow", 5) != 5) { + // If the kernel is too old to support + // /proc/PID/setgroups, write will return + // ENOENT; this is OK. 
+ if (errno != ENOENT) { + pr_perror("failed to write allow to %s", buf); + exit(1); + } + } + close(fd); + } + + update_process_idmap("/proc/%d/gid_map", pid, map, map_len); +} + + +static void start_child(int pipenum, jmp_buf *env, int syncpipe[2], + struct nsenter_config *config) +{ + int len; + int childpid; + char buf[PATH_MAX]; + uint8_t syncbyte = 1; + + // We must fork to actually enter the PID namespace, use CLONE_PARENT + // so the child can have the right parent, and we don't need to forward + // the child's exit code or resend its death signal. + childpid = clone_parent(env, config->cloneflags); + if (childpid < 0) { + pr_perror("Unable to fork"); exit(1); } - if (nl_msg_hdr.nlmsg_type == NLMSG_ERROR) { - pr_perror("failed to read netlink message"); - exit(1); - } - if (nl_msg_hdr.nlmsg_type != INIT_MSG) { - pr_perror("unexpected msg type %d", nl_msg_hdr.nlmsg_type); - exit(1); - } - // Retrieve data - int nl_total_size = NLMSG_PAYLOAD(&nl_msg_hdr, 0); - char data[nl_total_size]; + // update uid_map and gid_map for the child process if they + // were provided + update_process_uidmap(childpid, config->uidmap, config->uidmap_len); - if ((len = read(pipenum, data, nl_total_size)) != nl_total_size) { - pr_perror - ("Failed to read netlink payload, got %d instead of %d", - len, nl_total_size); + update_process_gidmap(childpid, config->is_setgroup, config->gidmap, config->gidmap_len); + + // Send the sync signal to the child + close(syncpipe[0]); + syncbyte = 1; + if (write(syncpipe[1], &syncbyte, 1) != 1) { + pr_perror("failed to write sync byte to child"); exit(1); } - // Process the passed attributes - int start = 0; - uint32_t cloneflags = -1; - uint8_t is_setgroup = 0; - int consolefd = -1; - int uidmap_start = -1, uidmap_len = -1; - int gidmap_start = -1, gidmap_len = -1; - int payload_len; - struct nlattr *nlattr; - while (start < nl_total_size) { + // Send the child pid back to our parent + len = snprintf(buf, sizeof(buf), "{ \"pid\" : %d }\n", 
childpid); + if ((len < 0) || (write(pipenum, buf, len) != len)) { + pr_perror("Unable to send a child pid"); + kill(childpid, SIGKILL); + exit(1); + } + + exit(0); +} + +static void process_nl_attributes(int pipenum, char *data, int data_size) +{ + jmp_buf env; + struct nsenter_config config = {0}; + struct nlattr *nlattr; + int payload_len; + int start = 0; + int consolefd = -1; + int syncpipe[2] = {-1, -1}; + + while (start < data_size) { nlattr = (struct nlattr *)(data + start); start += NLA_HDRLEN; payload_len = nlattr->nla_len - NLA_HDRLEN; if (nlattr->nla_type == CLONE_FLAGS_ATTR) { - cloneflags = readint32(data + start); + config.cloneflags = readint32(data + start); } else if (nlattr->nla_type == CONSOLE_PATH_ATTR) { - // get the console path before setns because it may change mnt namespace + // get the console path before setns because it may + // change mnt namespace consolefd = open(data + start, O_RDWR); if (consolefd < 0) { pr_perror("Failed to open console %s", @@ -208,24 +286,20 @@ void nsexec() exit(1); } } else if (nlattr->nla_type == NS_PATHS_ATTR) { - char nspaths[payload_len + 1]; - - strncpy(nspaths, data + start, payload_len); - nspaths[payload_len] = '\0'; - - // if custom namespaces are required, open all descriptors and perform - // setns on them - int nslen = num_namespaces(nspaths); - int fds[nslen]; - char *nslist[nslen]; - int i; - char *ns, *saveptr; + // if custom namespaces are required, open all + // descriptors and perform setns on them + int i; + int nslen = num_namespaces(data + start); + int fds[nslen]; + char *nslist[nslen]; + char *ns; + char *saveptr; for (i = 0; i < nslen; i++) { char *str = NULL; if (i == 0) { - str = nspaths; + str = data + start; } ns = strtok_r(str, ",", &saveptr); if (ns == NULL) { @@ -241,22 +315,21 @@ void nsexec() for (i = 0; i < nslen; i++) { if (setns(fds[i], 0) != 0) { - pr_perror("Failed to setns to %s", - nslist[i]); + pr_perror("Failed to setns to %s", nslist[i]); exit(1); } close(fds[i]); } } 
else if (nlattr->nla_type == UIDMAP_ATTR) { - uidmap_len = payload_len; - uidmap_start = start; + config.uidmap = data + start; + config.uidmap_len = payload_len; } else if (nlattr->nla_type == GIDMAP_ATTR) { - gidmap_len = payload_len; - gidmap_start = start; + config.gidmap = data + start; + config.gidmap_len = payload_len; } else if (nlattr->nla_type == SETGROUP_ATTR) { - is_setgroup = readint8(data + start); + config.is_setgroup = readint8(data + start); } else { - pr_perror("unknown netlink message type %d", + pr_perror("Unknown netlink message type %d", nlattr->nla_type); exit(1); } @@ -265,30 +338,30 @@ void nsexec() } // required clone_flags to be passed - if (cloneflags == -1) { - pr_perror("missing clone_flags"); + if (config.cloneflags == -1) { + pr_perror("Missing clone_flags"); exit(1); } - // prepare sync pipe between parent and child. We need this to let the child + // prepare sync pipe between parent and child. We need this to let the + // child // know that the parent has finished setting up - int syncpipe[2] = { -1, -1 }; if (pipe(syncpipe) != 0) { - pr_perror("failed to setup sync pipe between parent and child"); + pr_perror("Failed to setup sync pipe between parent and child"); exit(1); - }; + } if (setjmp(env) == 1) { // Child - uint8_t s; + uint8_t s = 0; // close the writing side of pipe close(syncpipe[1]); // sync with parent - if (read(syncpipe[0], &s, 1) != 1 || s != 1) { - pr_perror("failed to read sync byte from parent"); + if ((read(syncpipe[0], &s, 1) != 1) || (s != 1)) { + pr_perror("Failed to read sync byte from parent"); exit(1); - }; + } if (setsid() == -1) { pr_perror("setsid failed"); @@ -301,95 +374,65 @@ void nsexec() exit(1); } if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO) { - pr_perror("Failed to dup 0"); + pr_perror("Failed to dup stdin"); exit(1); } if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO) { - pr_perror("Failed to dup 1"); + pr_perror("Failed to dup stdout"); exit(1); } if (dup3(consolefd, 
STDERR_FILENO, 0) != STDERR_FILENO) { - pr_perror("Failed to dup 2"); + pr_perror("Failed to dup stderr"); exit(1); } } + // Finish executing, let the Go runtime take over. return; } + // Parent - - // We must fork to actually enter the PID namespace, use CLONE_PARENT - // so the child can have the right parent, and we don't need to forward - // the child's exit code or resend its death signal. - int child = clone_parent(&env, cloneflags); - if (child < 0) { - pr_perror("Unable to fork"); - exit(1); - } - // if uid_map and gid_map were specified, writes the data to /proc files - if (uidmap_start > 0 && uidmap_len > 0) { - char buf[PATH_MAX]; - if (snprintf(buf, sizeof(buf), "/proc/%d/uid_map", child) < 0) { - pr_perror("failed to construct uid_map file for %d", - child); - exit(1); - } - - int fd = open(buf, O_RDWR); - writedata(fd, data, uidmap_start, uidmap_len); - } - - if (gidmap_start > 0 && gidmap_len > 0) { - if (is_setgroup == 1) { - char buf[PATH_MAX]; - if (snprintf - (buf, sizeof(buf), "/proc/%d/setgroups", - child) < 0) { - pr_perror - ("failed to construct setgroups file for %d", - child); - exit(1); - } - - int fd = open(buf, O_RDWR); - if (write(fd, "allow", 5) != 5) { - // If the kernel is too old to support /proc/PID/setgroups, - // write will return ENOENT; this is OK. 
- if (errno != ENOENT) { - pr_perror("failed to write allow to %s", - buf); - exit(1); - } - } - } - // write gid mappings - char buf[PATH_MAX]; - if (snprintf(buf, sizeof(buf), "/proc/%d/gid_map", child) < 0) { - pr_perror("failed to construct gid_map file for %d", - child); - exit(1); - } - - int fd = open(buf, O_RDWR); - writedata(fd, data, gidmap_start, gidmap_len); - } - // Send the sync signal to the child - close(syncpipe[0]); - uint8_t s = 1; - if (write(syncpipe[1], &s, 1) != 1) { - pr_perror("failed to write sync byte to child"); - exit(1); - }; - - // parent to finish the bootstrap process - char child_data[PATH_MAX]; - len = - snprintf(child_data, sizeof(child_data), "{ \"pid\" : %d }\n", - child); - if (write(pipenum, child_data, len) != len) { - pr_perror("Unable to send a child pid"); - kill(child, SIGKILL); - exit(1); - } - exit(0); + start_child(pipenum, &env, syncpipe, &config); +} + +void nsexec(void) +{ + int pipenum; + + // if we dont have init pipe, then just return to the parent + pipenum = get_init_pipe(); + if (pipenum == -1) { + return; + } + + // Retrieve the netlink header + struct nlmsghdr nl_msg_hdr; + int len; + + if ((len = read(pipenum, &nl_msg_hdr, NLMSG_HDRLEN)) != NLMSG_HDRLEN) { + pr_perror("Invalid netlink header length %d", len); + exit(1); + } + + if (nl_msg_hdr.nlmsg_type == NLMSG_ERROR) { + pr_perror("Failed to read netlink message"); + exit(1); + } + + if (nl_msg_hdr.nlmsg_type != INIT_MSG) { + pr_perror("Unexpected msg type %d", nl_msg_hdr.nlmsg_type); + exit(1); + } + + // Retrieve data + int nl_total_size = NLMSG_PAYLOAD(&nl_msg_hdr, 0); + char data[nl_total_size]; + + if ((len = read(pipenum, data, nl_total_size)) != nl_total_size) { + pr_perror("Failed to read netlink payload, %d != %d", len, + nl_total_size); + exit(1); + } + + process_nl_attributes(pipenum, data, nl_total_size); } From 6325ab96e7291713e55ab4165ad8e0978a194e72 Mon Sep 17 00:00:00 2001 From: Kenfe-Mickael Laventure Date: Tue, 2 Feb 2016 17:27:44 -0800 
Subject: [PATCH 8/8] Call Prestart hook after namespaces have been set This simply moves the call to the Prestart hooks to be made once we receive the procReady message from the client. This is necessary as we had to move the setns calls within nsexec in order to accommodate joining namespaces that only affect future children (e.g. NEWPID). Signed-off-by: Kenfe-Mickael Laventure --- libcontainer/process_linux.go | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 1a4d4b04..ebae030b 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -249,21 +249,6 @@ func (p *initProcess) start() error { p.manager.Destroy() } }() - if !p.config.Config.Namespaces.Contains(configs.NEWNS) { - if p.config.Config.Hooks != nil { - s := configs.HookState{ - Version: p.container.config.Version, - ID: p.container.id, - Pid: p.pid(), - Root: p.config.Config.Rootfs, - } - for _, hook := range p.config.Config.Hooks.Prestart { - if err := hook.Run(s); err != nil { - return newSystemError(err) - } - } - } - } if err := p.createNetworkInterfaces(); err != nil { return newSystemError(err) } @@ -291,6 +276,22 @@ loop: if err := p.manager.Set(p.config.Config); err != nil { return newSystemError(err) } + // call prestart hooks + if !p.config.Config.Namespaces.Contains(configs.NEWNS) { + if p.config.Config.Hooks != nil { + s := configs.HookState{ + Version: p.container.config.Version, + ID: p.container.id, + Pid: p.pid(), + Root: p.config.Config.Rootfs, + } + for _, hook := range p.config.Config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemError(err) + } + } + } + } // Sync with child. if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil { return newSystemError(err)