Merge pull request #454 from mlaventure/libcontainer-pidns
Move setns within nsexec
This commit is contained in:
commit
b1872a068e
|
@ -2,7 +2,11 @@
|
|||
|
||||
package configs
|
||||
|
||||
import "fmt"
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
const (
|
||||
NEWNET NamespaceType = "NEWNET"
|
||||
|
@ -13,6 +17,51 @@ const (
|
|||
NEWUSER NamespaceType = "NEWUSER"
|
||||
)
|
||||
|
||||
var (
|
||||
nsLock sync.Mutex
|
||||
supportedNamespaces = make(map[NamespaceType]bool)
|
||||
)
|
||||
|
||||
// nsToFile converts the namespace type to its filename
|
||||
func nsToFile(ns NamespaceType) string {
|
||||
switch ns {
|
||||
case NEWNET:
|
||||
return "net"
|
||||
case NEWNS:
|
||||
return "mnt"
|
||||
case NEWPID:
|
||||
return "pid"
|
||||
case NEWIPC:
|
||||
return "ipc"
|
||||
case NEWUSER:
|
||||
return "user"
|
||||
case NEWUTS:
|
||||
return "uts"
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// IsNamespaceSupported returns whether a namespace is available or
|
||||
// not
|
||||
func IsNamespaceSupported(ns NamespaceType) bool {
|
||||
nsLock.Lock()
|
||||
defer nsLock.Unlock()
|
||||
supported, ok := supportedNamespaces[ns]
|
||||
if ok {
|
||||
return supported
|
||||
}
|
||||
nsFile := nsToFile(ns)
|
||||
// if the namespace type is unknown, just return false
|
||||
if nsFile == "" {
|
||||
return false
|
||||
}
|
||||
_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
|
||||
// a namespace is supported if it exists and we have permissions to read it
|
||||
supported = err == nil
|
||||
supportedNamespaces[ns] = supported
|
||||
return supported
|
||||
}
|
||||
|
||||
func NamespaceTypes() []NamespaceType {
|
||||
return []NamespaceType{
|
||||
NEWNET,
|
||||
|
@ -35,26 +84,7 @@ func (n *Namespace) GetPath(pid int) string {
|
|||
if n.Path != "" {
|
||||
return n.Path
|
||||
}
|
||||
return fmt.Sprintf("/proc/%d/ns/%s", pid, n.file())
|
||||
}
|
||||
|
||||
func (n *Namespace) file() string {
|
||||
file := ""
|
||||
switch n.Type {
|
||||
case NEWNET:
|
||||
file = "net"
|
||||
case NEWNS:
|
||||
file = "mnt"
|
||||
case NEWPID:
|
||||
file = "pid"
|
||||
case NEWIPC:
|
||||
file = "ipc"
|
||||
case NEWUSER:
|
||||
file = "user"
|
||||
case NEWUTS:
|
||||
file = "uts"
|
||||
}
|
||||
return file
|
||||
return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type))
|
||||
}
|
||||
|
||||
func (n *Namespaces) Remove(t NamespaceType) bool {
|
||||
|
@ -87,3 +117,11 @@ func (n *Namespaces) index(t NamespaceType) int {
|
|||
func (n *Namespaces) Contains(t NamespaceType) bool {
|
||||
return n.index(t) != -1
|
||||
}
|
||||
|
||||
func (n *Namespaces) PathOf(t NamespaceType) string {
|
||||
i := n.index(t)
|
||||
if i == -1 {
|
||||
return ""
|
||||
}
|
||||
return (*n)[i].Path
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import (
|
|||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"github.com/opencontainers/runc/libcontainer/criurpc"
|
||||
"github.com/opencontainers/runc/libcontainer/utils"
|
||||
"github.com/syndtr/gocapability/capability"
|
||||
"github.com/vishvananda/netlink/nl"
|
||||
)
|
||||
|
||||
|
@ -268,37 +269,40 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
|
|||
}
|
||||
|
||||
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
|
||||
t := "_LIBCONTAINER_INITTYPE=" + string(initStandard)
|
||||
cloneFlags := c.config.Namespaces.CloneFlags()
|
||||
if cloneFlags&syscall.CLONE_NEWUSER != 0 {
|
||||
if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
|
||||
// user mappings are not supported
|
||||
return nil, err
|
||||
}
|
||||
enableSetgroups(cmd.SysProcAttr)
|
||||
// Default to root user when user namespaces are enabled.
|
||||
if cmd.SysProcAttr.Credential == nil {
|
||||
cmd.SysProcAttr.Credential = &syscall.Credential{}
|
||||
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
|
||||
nsMaps := make(map[configs.NamespaceType]string)
|
||||
for _, ns := range c.config.Namespaces {
|
||||
if ns.Path != "" {
|
||||
nsMaps[ns.Type] = ns.Path
|
||||
}
|
||||
}
|
||||
cmd.Env = append(cmd.Env, t)
|
||||
cmd.SysProcAttr.Cloneflags = cloneFlags
|
||||
_, sharePidns := nsMaps[configs.NEWPID]
|
||||
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &initProcess{
|
||||
cmd: cmd,
|
||||
childPipe: childPipe,
|
||||
parentPipe: parentPipe,
|
||||
manager: c.cgroupManager,
|
||||
config: c.newInitConfig(p),
|
||||
container: c,
|
||||
process: p,
|
||||
cmd: cmd,
|
||||
childPipe: childPipe,
|
||||
parentPipe: parentPipe,
|
||||
manager: c.cgroupManager,
|
||||
config: c.newInitConfig(p),
|
||||
container: c,
|
||||
process: p,
|
||||
bootstrapData: data,
|
||||
sharePidns: sharePidns,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
|
||||
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
|
||||
state, err := c.currentState()
|
||||
if err != nil {
|
||||
return nil, newSystemError(err)
|
||||
}
|
||||
// for setns process, we dont have to set cloneflags as the process namespaces
|
||||
// will only be set via setns syscall
|
||||
data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
|
||||
data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -1069,18 +1073,69 @@ func (c *linuxContainer) currentState() (*State, error) {
|
|||
return state, nil
|
||||
}
|
||||
|
||||
// bootstrapData encodes the necessary data in netlink binary format as a io.Reader.
|
||||
// Consumer can write the data to a bootstrap program such as one that uses
|
||||
// nsenter package to bootstrap the container's init process correctly, i.e. with
|
||||
// correct namespaces, uid/gid mapping etc.
|
||||
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) {
|
||||
// orderNamespacePaths sorts namespace paths into a list of paths that we
|
||||
// can setns in order.
|
||||
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
|
||||
paths := []string{}
|
||||
nsTypes := []configs.NamespaceType{
|
||||
configs.NEWIPC,
|
||||
configs.NEWUTS,
|
||||
configs.NEWNET,
|
||||
configs.NEWPID,
|
||||
configs.NEWNS,
|
||||
}
|
||||
// join userns if the init process explicitly requires NEWUSER
|
||||
if c.config.Namespaces.Contains(configs.NEWUSER) {
|
||||
nsTypes = append(nsTypes, configs.NEWUSER)
|
||||
}
|
||||
for _, nsType := range nsTypes {
|
||||
if p, ok := namespaces[nsType]; ok && p != "" {
|
||||
// check if the requested namespace is supported
|
||||
if !configs.IsNamespaceSupported(nsType) {
|
||||
return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType))
|
||||
}
|
||||
// only set to join this namespace if it exists
|
||||
if _, err := os.Lstat(p); err != nil {
|
||||
return nil, newSystemError(err)
|
||||
}
|
||||
// do not allow namespace path with comma as we use it to separate
|
||||
// the namespace paths
|
||||
if strings.ContainsRune(p, ',') {
|
||||
return nil, newSystemError(fmt.Errorf("invalid path %s", p))
|
||||
}
|
||||
paths = append(paths, p)
|
||||
}
|
||||
}
|
||||
return paths, nil
|
||||
}
|
||||
|
||||
func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
|
||||
data := bytes.NewBuffer(nil)
|
||||
for _, im := range idMap {
|
||||
line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
|
||||
if _, err := data.WriteString(line); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return data.Bytes(), nil
|
||||
}
|
||||
|
||||
// bootstrapData encodes the necessary data in netlink binary format
|
||||
// as a io.Reader.
|
||||
// Consumer can write the data to a bootstrap program
|
||||
// such as one that uses nsenter package to bootstrap the container's
|
||||
// init process correctly, i.e. with correct namespaces, uid/gid
|
||||
// mapping etc.
|
||||
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) {
|
||||
// create the netlink message
|
||||
r := nl.NewNetlinkRequest(int(InitMsg), 0)
|
||||
// write pid
|
||||
|
||||
// write cloneFlags
|
||||
r.AddData(&Int32msg{
|
||||
Type: PidAttr,
|
||||
Value: uint32(pid),
|
||||
Type: CloneFlagsAttr,
|
||||
Value: uint32(cloneFlags),
|
||||
})
|
||||
|
||||
// write console path
|
||||
if consolePath != "" {
|
||||
r.AddData(&Bytemsg{
|
||||
|
@ -1088,5 +1143,57 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath
|
|||
Value: []byte(consolePath),
|
||||
})
|
||||
}
|
||||
|
||||
// write custom namespace paths
|
||||
if len(nsMaps) > 0 {
|
||||
nsPaths, err := c.orderNamespacePaths(nsMaps)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
r.AddData(&Bytemsg{
|
||||
Type: NsPathsAttr,
|
||||
Value: []byte(strings.Join(nsPaths, ",")),
|
||||
})
|
||||
}
|
||||
|
||||
// write namespace paths only when we are not joining an existing user ns
|
||||
_, joinExistingUser := nsMaps[configs.NEWUSER]
|
||||
if !joinExistingUser {
|
||||
// write uid mappings
|
||||
if len(c.config.UidMappings) > 0 {
|
||||
b, err := encodeIDMapping(c.config.UidMappings)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
r.AddData(&Bytemsg{
|
||||
Type: UidmapAttr,
|
||||
Value: b,
|
||||
})
|
||||
}
|
||||
|
||||
// write gid mappings
|
||||
if len(c.config.GidMappings) > 0 {
|
||||
b, err := encodeIDMapping(c.config.UidMappings)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
r.AddData(&Bytemsg{
|
||||
Type: GidmapAttr,
|
||||
Value: b,
|
||||
})
|
||||
// check if we have CAP_SETGID to setgroup properly
|
||||
pid, err := capability.NewPid(os.Getpid())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
|
||||
r.AddData(&Boolmsg{
|
||||
Type: SetgroupAttr,
|
||||
Value: true,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return bytes.NewReader(r.Serialize()), nil
|
||||
}
|
||||
|
|
|
@ -185,25 +185,6 @@ func syncParentHooks(pipe io.ReadWriter) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// joinExistingNamespaces gets all the namespace paths specified for the container and
|
||||
// does a setns on the namespace fd so that the current process joins the namespace.
|
||||
func joinExistingNamespaces(namespaces []configs.Namespace) error {
|
||||
for _, ns := range namespaces {
|
||||
if ns.Path != "" {
|
||||
f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = system.Setns(f.Fd(), uintptr(ns.Syscall()))
|
||||
f.Close()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// setupUser changes the groups, gid, and uid for the user inside the container
|
||||
func setupUser(config *initConfig) error {
|
||||
// Set up defaults.
|
||||
|
|
|
@ -2,10 +2,12 @@ package integration
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
@ -1209,7 +1211,6 @@ func TestRootfsPropagationSlaveMount(t *testing.T) {
|
|||
defer stdinW2.Close()
|
||||
ok(t, err)
|
||||
|
||||
// Wait for process
|
||||
stdinW2.Close()
|
||||
waitProcess(pconfig2, t)
|
||||
stdinW.Close()
|
||||
|
@ -1375,3 +1376,195 @@ func TestPIDHost(t *testing.T) {
|
|||
t.Fatalf("ipc link not equal to host link %q %q", actual, l)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInitJoinPID(t *testing.T) {
|
||||
if testing.Short() {
|
||||
return
|
||||
}
|
||||
rootfs, err := newRootfs()
|
||||
ok(t, err)
|
||||
defer remove(rootfs)
|
||||
|
||||
// Execute a long-running container
|
||||
container1, err := newContainer(newTemplateConfig(rootfs))
|
||||
ok(t, err)
|
||||
defer container1.Destroy()
|
||||
|
||||
stdinR1, stdinW1, err := os.Pipe()
|
||||
ok(t, err)
|
||||
init1 := &libcontainer.Process{
|
||||
Cwd: "/",
|
||||
Args: []string{"cat"},
|
||||
Env: standardEnvironment,
|
||||
Stdin: stdinR1,
|
||||
}
|
||||
err = container1.Start(init1)
|
||||
stdinR1.Close()
|
||||
defer stdinW1.Close()
|
||||
ok(t, err)
|
||||
|
||||
// get the state of the first container
|
||||
state1, err := container1.State()
|
||||
ok(t, err)
|
||||
pidns1 := state1.NamespacePaths[configs.NEWPID]
|
||||
|
||||
// Start a container inside the existing pidns but with different cgroups
|
||||
config2 := newTemplateConfig(rootfs)
|
||||
config2.Namespaces.Add(configs.NEWPID, pidns1)
|
||||
config2.Cgroups.Path = "integration/test2"
|
||||
container2, err := newContainerWithName("testCT2", config2)
|
||||
ok(t, err)
|
||||
defer container2.Destroy()
|
||||
|
||||
stdinR2, stdinW2, err := os.Pipe()
|
||||
ok(t, err)
|
||||
init2 := &libcontainer.Process{
|
||||
Cwd: "/",
|
||||
Args: []string{"cat"},
|
||||
Env: standardEnvironment,
|
||||
Stdin: stdinR2,
|
||||
}
|
||||
err = container2.Start(init2)
|
||||
stdinR2.Close()
|
||||
defer stdinW2.Close()
|
||||
ok(t, err)
|
||||
// get the state of the second container
|
||||
state2, err := container2.State()
|
||||
ok(t, err)
|
||||
|
||||
ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state1.InitProcessPid))
|
||||
ok(t, err)
|
||||
ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state2.InitProcessPid))
|
||||
ok(t, err)
|
||||
if ns1 != ns2 {
|
||||
t.Errorf("pidns(%s), wanted %s", ns2, ns1)
|
||||
}
|
||||
|
||||
// check that namespaces are not the same
|
||||
if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) {
|
||||
t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths,
|
||||
state1.NamespacePaths)
|
||||
}
|
||||
// check that pidns is joined correctly. The initial container process list
|
||||
// should contain the second container's init process
|
||||
buffers := newStdBuffers()
|
||||
ps := &libcontainer.Process{
|
||||
Cwd: "/",
|
||||
Args: []string{"ps"},
|
||||
Env: standardEnvironment,
|
||||
Stdout: buffers.Stdout,
|
||||
}
|
||||
err = container1.Start(ps)
|
||||
ok(t, err)
|
||||
waitProcess(ps, t)
|
||||
|
||||
// Stop init processes one by one. Stop the second container should
|
||||
// not stop the first.
|
||||
stdinW2.Close()
|
||||
waitProcess(init2, t)
|
||||
stdinW1.Close()
|
||||
waitProcess(init1, t)
|
||||
|
||||
out := strings.TrimSpace(buffers.Stdout.String())
|
||||
// output of ps inside the initial PID namespace should have
|
||||
// 1 line of header,
|
||||
// 2 lines of init processes,
|
||||
// 1 line of ps process
|
||||
if len(strings.Split(out, "\n")) != 4 {
|
||||
t.Errorf("unexpected running process, output %q", out)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInitJoinNetworkAndUser(t *testing.T) {
|
||||
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
|
||||
t.Skip("userns is unsupported")
|
||||
}
|
||||
if testing.Short() {
|
||||
return
|
||||
}
|
||||
rootfs, err := newRootfs()
|
||||
ok(t, err)
|
||||
defer remove(rootfs)
|
||||
|
||||
// Execute a long-running container
|
||||
config1 := newTemplateConfig(rootfs)
|
||||
config1.UidMappings = []configs.IDMap{{0, 0, 1000}}
|
||||
config1.GidMappings = []configs.IDMap{{0, 0, 1000}}
|
||||
config1.Namespaces = append(config1.Namespaces, configs.Namespace{Type: configs.NEWUSER})
|
||||
container1, err := newContainer(config1)
|
||||
ok(t, err)
|
||||
defer container1.Destroy()
|
||||
|
||||
stdinR1, stdinW1, err := os.Pipe()
|
||||
ok(t, err)
|
||||
init1 := &libcontainer.Process{
|
||||
Cwd: "/",
|
||||
Args: []string{"cat"},
|
||||
Env: standardEnvironment,
|
||||
Stdin: stdinR1,
|
||||
}
|
||||
err = container1.Start(init1)
|
||||
stdinR1.Close()
|
||||
defer stdinW1.Close()
|
||||
ok(t, err)
|
||||
|
||||
// get the state of the first container
|
||||
state1, err := container1.State()
|
||||
ok(t, err)
|
||||
netns1 := state1.NamespacePaths[configs.NEWNET]
|
||||
userns1 := state1.NamespacePaths[configs.NEWUSER]
|
||||
|
||||
// Start a container inside the existing pidns but with different cgroups
|
||||
rootfs2, err := newRootfs()
|
||||
ok(t, err)
|
||||
defer remove(rootfs2)
|
||||
|
||||
config2 := newTemplateConfig(rootfs2)
|
||||
config2.UidMappings = []configs.IDMap{{0, 0, 1000}}
|
||||
config2.GidMappings = []configs.IDMap{{0, 0, 1000}}
|
||||
config2.Namespaces.Add(configs.NEWNET, netns1)
|
||||
config2.Namespaces.Add(configs.NEWUSER, userns1)
|
||||
config2.Cgroups.Path = "integration/test2"
|
||||
container2, err := newContainerWithName("testCT2", config2)
|
||||
ok(t, err)
|
||||
defer container2.Destroy()
|
||||
|
||||
stdinR2, stdinW2, err := os.Pipe()
|
||||
ok(t, err)
|
||||
init2 := &libcontainer.Process{
|
||||
Cwd: "/",
|
||||
Args: []string{"cat"},
|
||||
Env: standardEnvironment,
|
||||
Stdin: stdinR2,
|
||||
}
|
||||
err = container2.Start(init2)
|
||||
stdinR2.Close()
|
||||
defer stdinW2.Close()
|
||||
ok(t, err)
|
||||
|
||||
// get the state of the second container
|
||||
state2, err := container2.State()
|
||||
ok(t, err)
|
||||
|
||||
for _, ns := range []string{"net", "user"} {
|
||||
ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state1.InitProcessPid, ns))
|
||||
ok(t, err)
|
||||
ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state2.InitProcessPid, ns))
|
||||
ok(t, err)
|
||||
if ns1 != ns2 {
|
||||
t.Errorf("%s(%s), wanted %s", ns, ns2, ns1)
|
||||
}
|
||||
}
|
||||
|
||||
// check that namespaces are not the same
|
||||
if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) {
|
||||
t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths,
|
||||
state1.NamespacePaths)
|
||||
}
|
||||
// Stop init processes one by one. Stop the second container should
|
||||
// not stop the first.
|
||||
stdinW2.Close()
|
||||
waitProcess(init2, t)
|
||||
stdinW1.Close()
|
||||
waitProcess(init1, t)
|
||||
}
|
||||
|
|
|
@ -2,6 +2,7 @@ package integration
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
|
@ -12,6 +13,7 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer"
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
)
|
||||
|
||||
func TestExecIn(t *testing.T) {
|
||||
|
@ -404,3 +406,62 @@ func TestExecInOomScoreAdj(t *testing.T) {
|
|||
t.Fatalf("expected oomScoreAdj to be %d, got %s", config.OomScoreAdj, oomScoreAdj)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExecInUserns(t *testing.T) {
|
||||
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
|
||||
t.Skip("userns is unsupported")
|
||||
}
|
||||
if testing.Short() {
|
||||
return
|
||||
}
|
||||
rootfs, err := newRootfs()
|
||||
ok(t, err)
|
||||
defer remove(rootfs)
|
||||
config := newTemplateConfig(rootfs)
|
||||
config.UidMappings = []configs.IDMap{{0, 0, 1000}}
|
||||
config.GidMappings = []configs.IDMap{{0, 0, 1000}}
|
||||
config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
|
||||
container, err := newContainer(config)
|
||||
ok(t, err)
|
||||
defer container.Destroy()
|
||||
|
||||
// Execute a first process in the container
|
||||
stdinR, stdinW, err := os.Pipe()
|
||||
ok(t, err)
|
||||
|
||||
process := &libcontainer.Process{
|
||||
Cwd: "/",
|
||||
Args: []string{"cat"},
|
||||
Env: standardEnvironment,
|
||||
Stdin: stdinR,
|
||||
}
|
||||
err = container.Start(process)
|
||||
stdinR.Close()
|
||||
defer stdinW.Close()
|
||||
ok(t, err)
|
||||
|
||||
initPID, err := process.Pid()
|
||||
ok(t, err)
|
||||
initUserns, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/user", initPID))
|
||||
ok(t, err)
|
||||
|
||||
buffers := newStdBuffers()
|
||||
process2 := &libcontainer.Process{
|
||||
Cwd: "/",
|
||||
Args: []string{"readlink", "/proc/self/ns/user"},
|
||||
Env: []string{
|
||||
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
|
||||
},
|
||||
Stdout: buffers.Stdout,
|
||||
Stderr: os.Stderr,
|
||||
}
|
||||
err = container.Start(process2)
|
||||
ok(t, err)
|
||||
waitProcess(process2, t)
|
||||
stdinW.Close()
|
||||
waitProcess(process, t)
|
||||
|
||||
if out := strings.TrimSpace(buffers.Stdout.String()); out != initUserns {
|
||||
t.Errorf("execin userns(%s), wanted %s", out, initUserns)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -92,13 +92,15 @@ func copyBusybox(dest string) error {
|
|||
}
|
||||
|
||||
func newContainer(config *configs.Config) (libcontainer.Container, error) {
|
||||
f := factory
|
||||
return newContainerWithName("testCT", config)
|
||||
}
|
||||
|
||||
func newContainerWithName(name string, config *configs.Config) (libcontainer.Container, error) {
|
||||
f := factory
|
||||
if config.Cgroups != nil && config.Cgroups.Parent == "system.slice" {
|
||||
f = systemdFactory
|
||||
}
|
||||
|
||||
return f.Create("testCT", config)
|
||||
return f.Create(name, config)
|
||||
}
|
||||
|
||||
// runContainer runs the container with the specific config and arguments
|
||||
|
|
|
@ -12,8 +12,12 @@ import (
|
|||
// The number is randomly chosen to not conflict with known netlink types
|
||||
const (
|
||||
InitMsg uint16 = 62000
|
||||
PidAttr uint16 = 27281
|
||||
CloneFlagsAttr uint16 = 27281
|
||||
ConsolePathAttr uint16 = 27282
|
||||
NsPathsAttr uint16 = 27283
|
||||
UidmapAttr uint16 = 27284
|
||||
GidmapAttr uint16 = 27285
|
||||
SetgroupAttr uint16 = 27286
|
||||
// When syscall.NLA_HDRLEN is in gccgo, take this out.
|
||||
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
|
||||
)
|
||||
|
@ -60,3 +64,25 @@ func (msg *Bytemsg) Serialize() []byte {
|
|||
func (msg *Bytemsg) Len() int {
|
||||
return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
|
||||
}
|
||||
|
||||
type Boolmsg struct {
|
||||
Type uint16
|
||||
Value bool
|
||||
}
|
||||
|
||||
func (msg *Boolmsg) Serialize() []byte {
|
||||
buf := make([]byte, msg.Len())
|
||||
native := nl.NativeEndian()
|
||||
native.PutUint16(buf[0:2], uint16(msg.Len()))
|
||||
native.PutUint16(buf[2:4], msg.Type)
|
||||
if msg.Value {
|
||||
buf[4] = 1
|
||||
} else {
|
||||
buf[4] = 0
|
||||
}
|
||||
return buf
|
||||
}
|
||||
|
||||
func (msg *Boolmsg) Len() int {
|
||||
return syscall_NLA_HDRLEN + 1
|
||||
}
|
||||
|
|
|
@ -3,7 +3,9 @@ package nsenter
|
|||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
|
@ -18,35 +20,51 @@ type pid struct {
|
|||
Pid int `json:"Pid"`
|
||||
}
|
||||
|
||||
func TestNsenterAlivePid(t *testing.T) {
|
||||
func TestNsenterValidPaths(t *testing.T) {
|
||||
args := []string{"nsenter-exec"}
|
||||
parent, child, err := newPipe()
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create pipe %v", err)
|
||||
}
|
||||
|
||||
namespaces := []string{
|
||||
// join pid ns of the current process
|
||||
fmt.Sprintf("/proc/%d/ns/pid", os.Getpid()),
|
||||
}
|
||||
cmd := &exec.Cmd{
|
||||
Path: os.Args[0],
|
||||
Args: args,
|
||||
ExtraFiles: []*os.File{child},
|
||||
Env: []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"},
|
||||
Env: []string{"_LIBCONTAINER_INITPIPE=3"},
|
||||
Stdout: os.Stdout,
|
||||
Stderr: os.Stderr,
|
||||
}
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
t.Fatalf("nsenter failed to start %v", err)
|
||||
}
|
||||
// write cloneFlags
|
||||
r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
|
||||
r.AddData(&libcontainer.Int32msg{
|
||||
Type: libcontainer.PidAttr,
|
||||
Value: uint32(os.Getpid()),
|
||||
Type: libcontainer.CloneFlagsAttr,
|
||||
Value: uint32(syscall.CLONE_NEWNET),
|
||||
})
|
||||
r.AddData(&libcontainer.Bytemsg{
|
||||
Type: libcontainer.NsPathsAttr,
|
||||
Value: []byte(strings.Join(namespaces, ",")),
|
||||
})
|
||||
if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
decoder := json.NewDecoder(parent)
|
||||
var pid *pid
|
||||
|
||||
if err := decoder.Decode(&pid); err != nil {
|
||||
dir, _ := ioutil.ReadDir(fmt.Sprintf("/proc/%d/ns", os.Getpid()))
|
||||
for _, d := range dir {
|
||||
t.Log(d.Name())
|
||||
}
|
||||
t.Fatalf("%v", err)
|
||||
}
|
||||
|
||||
|
@ -60,70 +78,43 @@ func TestNsenterAlivePid(t *testing.T) {
|
|||
p.Wait()
|
||||
}
|
||||
|
||||
func TestNsenterInvalidPid(t *testing.T) {
|
||||
func TestNsenterInvalidPaths(t *testing.T) {
|
||||
args := []string{"nsenter-exec"}
|
||||
parent, child, err := newPipe()
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create pipe %v", err)
|
||||
}
|
||||
|
||||
namespaces := []string{
|
||||
// join pid ns of the current process
|
||||
fmt.Sprintf("/proc/%d/ns/pid", -1),
|
||||
}
|
||||
cmd := &exec.Cmd{
|
||||
Path: os.Args[0],
|
||||
Args: args,
|
||||
ExtraFiles: []*os.File{child},
|
||||
Env: []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"},
|
||||
Env: []string{"_LIBCONTAINER_INITPIPE=3"},
|
||||
}
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
t.Fatal("nsenter exits with a zero exit status")
|
||||
t.Fatal(err)
|
||||
}
|
||||
// write cloneFlags
|
||||
r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
|
||||
r.AddData(&libcontainer.Int32msg{
|
||||
Type: libcontainer.PidAttr,
|
||||
Value: 0,
|
||||
Type: libcontainer.CloneFlagsAttr,
|
||||
Value: uint32(syscall.CLONE_NEWNET),
|
||||
})
|
||||
r.AddData(&libcontainer.Bytemsg{
|
||||
Type: libcontainer.NsPathsAttr,
|
||||
Value: []byte(strings.Join(namespaces, ",")),
|
||||
})
|
||||
if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := cmd.Wait(); err == nil {
|
||||
t.Fatal("nsenter exits with a zero exit status")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNsenterDeadPid(t *testing.T) {
|
||||
deadCmd := exec.Command("true")
|
||||
if err := deadCmd.Run(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
args := []string{"nsenter-exec"}
|
||||
parent, child, err := newPipe()
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create pipe %v", err)
|
||||
}
|
||||
|
||||
cmd := &exec.Cmd{
|
||||
Path: os.Args[0],
|
||||
Args: args,
|
||||
ExtraFiles: []*os.File{child},
|
||||
Env: []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"},
|
||||
}
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
t.Fatal("nsenter exits with a zero exit status")
|
||||
}
|
||||
|
||||
r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
|
||||
r.AddData(&libcontainer.Int32msg{
|
||||
Type: libcontainer.PidAttr,
|
||||
Value: uint32(deadCmd.Process.Pid),
|
||||
})
|
||||
if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := cmd.Wait(); err == nil {
|
||||
t.Fatal("nsenter exits with a zero exit status")
|
||||
t.Fatalf("nsenter exits with a zero exit status")
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,21 +1,20 @@
|
|||
#define _GNU_SOURCE
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <endian.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <linux/limits.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <fcntl.h>
|
||||
#include <signal.h>
|
||||
#include <setjmp.h>
|
||||
#include <linux/limits.h>
|
||||
#include <linux/netlink.h>
|
||||
#include <sched.h>
|
||||
#include <setjmp.h>
|
||||
#include <signal.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <bits/sockaddr.h>
|
||||
#include <linux/netlink.h>
|
||||
|
@ -23,239 +22,417 @@
|
|||
#include <stdint.h>
|
||||
#include <sys/socket.h>
|
||||
|
||||
/* All arguments should be above stack, because it grows down */
|
||||
// All arguments should be above the stack because it grows down
|
||||
struct clone_arg {
|
||||
/*
|
||||
* Reserve some space for clone() to locate arguments
|
||||
* and retcode in this place
|
||||
*/
|
||||
char stack[4096] __attribute__ ((aligned(16)));
|
||||
char stack_ptr[0];
|
||||
char stack[4096] __attribute__((aligned(16)));
|
||||
char stack_ptr[0];
|
||||
jmp_buf *env;
|
||||
};
|
||||
|
||||
#define pr_perror(fmt, ...) fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__)
|
||||
|
||||
static int child_func(void *_arg)
|
||||
{
|
||||
struct clone_arg *arg = (struct clone_arg *)_arg;
|
||||
longjmp(*arg->env, 1);
|
||||
}
|
||||
|
||||
// Use raw setns syscall for versions of glibc that don't include it (namely glibc-2.12)
|
||||
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
|
||||
#define _GNU_SOURCE
|
||||
#include "syscall.h"
|
||||
#if defined(__NR_setns) && !defined(SYS_setns)
|
||||
#define SYS_setns __NR_setns
|
||||
#endif
|
||||
#ifdef SYS_setns
|
||||
int setns(int fd, int nstype)
|
||||
{
|
||||
return syscall(SYS_setns, fd, nstype);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static int clone_parent(jmp_buf * env) __attribute__ ((noinline));
|
||||
static int clone_parent(jmp_buf * env)
|
||||
{
|
||||
struct clone_arg ca;
|
||||
int child;
|
||||
|
||||
ca.env = env;
|
||||
child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
|
||||
|
||||
return child;
|
||||
}
|
||||
|
||||
static uint32_t readint32(char *buf)
|
||||
{
|
||||
return *(uint32_t *) buf;
|
||||
}
|
||||
struct nsenter_config {
|
||||
uint32_t cloneflags;
|
||||
char *uidmap;
|
||||
int uidmap_len;
|
||||
char *gidmap;
|
||||
int gidmap_len;
|
||||
uint8_t is_setgroup;
|
||||
};
|
||||
|
||||
// list of known message types we want to send to bootstrap program
|
||||
// These are defined in libcontainer/message_linux.go
|
||||
#define INIT_MSG 62000
|
||||
#define PID_ATTR 27281
|
||||
#define CONSOLE_PATH_ATTR 27282
|
||||
#define INIT_MSG 62000
|
||||
#define CLONE_FLAGS_ATTR 27281
|
||||
#define CONSOLE_PATH_ATTR 27282
|
||||
#define NS_PATHS_ATTR 27283
|
||||
#define UIDMAP_ATTR 27284
|
||||
#define GIDMAP_ATTR 27285
|
||||
#define SETGROUP_ATTR 27286
|
||||
|
||||
void nsexec()
|
||||
// Use raw setns syscall for versions of glibc that don't include it
|
||||
// (namely glibc-2.12)
|
||||
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
|
||||
#define _GNU_SOURCE
|
||||
#include "syscall.h"
|
||||
#if defined(__NR_setns) && !defined(SYS_setns)
|
||||
#define SYS_setns __NR_setns
|
||||
#endif
|
||||
|
||||
#ifdef SYS_setns
|
||||
int setns(int fd, int nstype)
|
||||
{
|
||||
return syscall(SYS_setns, fd, nstype);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define pr_perror(fmt, ...) \
|
||||
fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__)
|
||||
|
||||
static int child_func(void *_arg)
|
||||
{
|
||||
char *namespaces[] = { "ipc", "uts", "net", "pid", "mnt", "user" };
|
||||
const int num = sizeof(namespaces) / sizeof(char *);
|
||||
jmp_buf env;
|
||||
char buf[PATH_MAX], *val;
|
||||
int i, tfd, self_tfd, child, n, len, pipenum, consolefd = -1;
|
||||
pid_t pid = 0;
|
||||
struct clone_arg *arg = (struct clone_arg *)_arg;
|
||||
longjmp(*arg->env, 1);
|
||||
}
|
||||
|
||||
// if we dont have INITTYPE or this is the init process, skip the bootstrap process
|
||||
val = getenv("_LIBCONTAINER_INITTYPE");
|
||||
if (val == NULL || strcmp(val, "standard") == 0) {
|
||||
return;
|
||||
}
|
||||
if (strcmp(val, "setns") != 0) {
|
||||
pr_perror("Invalid inittype %s", val);
|
||||
exit(1);
|
||||
static int clone_parent(jmp_buf *env, int flags) __attribute__((noinline));
|
||||
static int clone_parent(jmp_buf *env, int flags)
|
||||
{
|
||||
struct clone_arg ca;
|
||||
int child;
|
||||
|
||||
ca.env = env;
|
||||
child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags,
|
||||
&ca);
|
||||
return child;
|
||||
}
|
||||
|
||||
// get init pipe from the parent. It's used to read bootstrap data, and to
|
||||
// write pid to after nsexec finishes setting up the environment.
|
||||
static int get_init_pipe()
|
||||
{
|
||||
char buf[PATH_MAX];
|
||||
char *initpipe;
|
||||
int pipenum = -1;
|
||||
|
||||
initpipe = getenv("_LIBCONTAINER_INITPIPE");
|
||||
if (initpipe == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
val = getenv("_LIBCONTAINER_INITPIPE");
|
||||
if (val == NULL) {
|
||||
pr_perror("Child pipe not found");
|
||||
exit(1);
|
||||
}
|
||||
pipenum = atoi(val);
|
||||
pipenum = atoi(initpipe);
|
||||
snprintf(buf, sizeof(buf), "%d", pipenum);
|
||||
if (strcmp(val, buf)) {
|
||||
if (strcmp(initpipe, buf)) {
|
||||
pr_perror("Unable to parse _LIBCONTAINER_INITPIPE");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
char nlbuf[NLMSG_HDRLEN];
|
||||
struct nlmsghdr *nh;
|
||||
if ((n = read(pipenum, nlbuf, NLMSG_HDRLEN)) != NLMSG_HDRLEN) {
|
||||
pr_perror("Failed to read netlink header, got %d", n);
|
||||
return pipenum;
|
||||
}
|
||||
|
||||
// num_namespaces returns the number of additional namespaces to setns. The
|
||||
// argument is a comma-separated string of namespace paths.
|
||||
static int num_namespaces(char *nspaths)
|
||||
{
|
||||
int i;
|
||||
int size = 0;
|
||||
|
||||
for (i = 0; nspaths[i]; i++) {
|
||||
if (nspaths[i] == ',') {
|
||||
size += 1;
|
||||
}
|
||||
}
|
||||
|
||||
return size + 1;
|
||||
}
|
||||
|
||||
static uint32_t readint32(char *buf)
|
||||
{
|
||||
return *(uint32_t *)buf;
|
||||
}
|
||||
|
||||
static uint8_t readint8(char *buf)
|
||||
{
|
||||
return *(uint8_t *)buf;
|
||||
}
|
||||
|
||||
static void update_process_idmap(char *pathfmt, int pid, char *map, int map_len)
|
||||
{
|
||||
char buf[PATH_MAX];
|
||||
int len;
|
||||
int fd;
|
||||
|
||||
len = snprintf(buf, sizeof(buf), pathfmt, pid);
|
||||
if (len < 0) {
|
||||
pr_perror("failed to construct '%s' for %d", pathfmt, pid);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
nh = (struct nlmsghdr *)nlbuf;
|
||||
if (nh->nlmsg_type == NLMSG_ERROR) {
|
||||
pr_perror("Invalid netlink header message");
|
||||
exit(1);
|
||||
}
|
||||
if (nh->nlmsg_type != INIT_MSG) {
|
||||
pr_perror("Unexpected netlink message type %d", nh->nlmsg_type);
|
||||
exit(1);
|
||||
}
|
||||
// read the netlink payload
|
||||
len = NLMSG_PAYLOAD(nh, 0);
|
||||
char data[len];
|
||||
if ((n = read(pipenum, data, len)) != len) {
|
||||
pr_perror("Failed to read netlink payload, got %d", n);
|
||||
fd = open(buf, O_RDWR);
|
||||
if (fd == -1) {
|
||||
pr_perror("failed to open %s", buf);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int start = 0;
|
||||
struct nlattr *attr;
|
||||
while (start < len) {
|
||||
int payload_len;
|
||||
attr = (struct nlattr *)((void *)data + start);
|
||||
start += NLA_HDRLEN;
|
||||
payload_len = attr->nla_len - NLA_HDRLEN;
|
||||
switch (attr->nla_type) {
|
||||
case PID_ATTR:
|
||||
pid = (pid_t) readint32(data + start);
|
||||
break;
|
||||
case CONSOLE_PATH_ATTR:
|
||||
consolefd = open((char *)data + start, O_RDWR);
|
||||
if (consolefd < 0) {
|
||||
pr_perror("Failed to open console %s", (char *)data + start);
|
||||
len = write(fd, map, map_len);
|
||||
if (len == -1) {
|
||||
pr_perror("failed to write to %s", buf);
|
||||
exit(1);
|
||||
} else if (len != map_len) {
|
||||
fprintf(stderr, "Failed to write data to %s (%d/%d)",
|
||||
buf, len, map_len);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
close(fd);
|
||||
}
|
||||
|
||||
static void update_process_uidmap(int pid, char *map, int map_len)
|
||||
{
|
||||
if ((map == NULL) || (map_len <= 0)) {
|
||||
return;
|
||||
}
|
||||
|
||||
update_process_idmap("/proc/%d/uid_map", pid, map, map_len);
|
||||
}
|
||||
|
||||
static void update_process_gidmap(int pid, uint8_t is_setgroup, char *map, int map_len)
|
||||
{
|
||||
if ((map == NULL) || (map_len <= 0)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (is_setgroup == 1) {
|
||||
int fd;
|
||||
int len;
|
||||
char buf[PATH_MAX];
|
||||
|
||||
len = snprintf(buf, sizeof(buf), "/proc/%d/setgroups", pid);
|
||||
if (len < 0) {
|
||||
pr_perror("failed to get setgroups path for %d", pid);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
fd = open(buf, O_RDWR);
|
||||
if (fd == -1) {
|
||||
pr_perror("failed to open %s", buf);
|
||||
exit(1);
|
||||
}
|
||||
if (write(fd, "allow", 5) != 5) {
|
||||
// If the kernel is too old to support
|
||||
// /proc/PID/setgroups, write will return
|
||||
// ENOENT; this is OK.
|
||||
if (errno != ENOENT) {
|
||||
pr_perror("failed to write allow to %s", buf);
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
start += NLA_ALIGN(payload_len);
|
||||
}
|
||||
|
||||
// required pid to be passed
|
||||
if (pid == 0) {
|
||||
pr_perror("missing pid");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* Check that the specified process exists */
|
||||
snprintf(buf, PATH_MAX - 1, "/proc/%d/ns", pid);
|
||||
tfd = open(buf, O_DIRECTORY | O_RDONLY);
|
||||
if (tfd == -1) {
|
||||
pr_perror("Failed to open \"%s\"", buf);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
self_tfd = open("/proc/self/ns", O_DIRECTORY | O_RDONLY);
|
||||
if (self_tfd == -1) {
|
||||
pr_perror("Failed to open /proc/self/ns");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (i = 0; i < num; i++) {
|
||||
struct stat st;
|
||||
struct stat self_st;
|
||||
int fd;
|
||||
|
||||
/* Symlinks on all namespaces exist for dead processes, but they can't be opened */
|
||||
if (fstatat(tfd, namespaces[i], &st, 0) == -1) {
|
||||
// Ignore nonexistent namespaces.
|
||||
if (errno == ENOENT)
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Skip namespaces we're already part of */
|
||||
if (fstatat(self_tfd, namespaces[i], &self_st, 0) != -1 && st.st_ino == self_st.st_ino) {
|
||||
continue;
|
||||
}
|
||||
|
||||
fd = openat(tfd, namespaces[i], O_RDONLY);
|
||||
if (fd == -1) {
|
||||
pr_perror("Failed to open ns file %s for ns %s", buf, namespaces[i]);
|
||||
exit(1);
|
||||
}
|
||||
// Set the namespace.
|
||||
if (setns(fd, 0) == -1) {
|
||||
pr_perror("Failed to setns for %s", namespaces[i]);
|
||||
exit(1);
|
||||
}
|
||||
close(fd);
|
||||
}
|
||||
|
||||
close(self_tfd);
|
||||
close(tfd);
|
||||
update_process_idmap("/proc/%d/gid_map", pid, map, map_len);
|
||||
}
|
||||
|
||||
|
||||
static void start_child(int pipenum, jmp_buf *env, int syncpipe[2],
|
||||
struct nsenter_config *config)
|
||||
{
|
||||
int len;
|
||||
int childpid;
|
||||
char buf[PATH_MAX];
|
||||
uint8_t syncbyte = 1;
|
||||
|
||||
// We must fork to actually enter the PID namespace, use CLONE_PARENT
|
||||
// so the child can have the right parent, and we don't need to forward
|
||||
// the child's exit code or resend its death signal.
|
||||
childpid = clone_parent(env, config->cloneflags);
|
||||
if (childpid < 0) {
|
||||
pr_perror("Unable to fork");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// update uid_map and gid_map for the child process if they
|
||||
// were provided
|
||||
update_process_uidmap(childpid, config->uidmap, config->uidmap_len);
|
||||
|
||||
update_process_gidmap(childpid, config->is_setgroup, config->gidmap, config->gidmap_len);
|
||||
|
||||
// Send the sync signal to the child
|
||||
close(syncpipe[0]);
|
||||
syncbyte = 1;
|
||||
if (write(syncpipe[1], &syncbyte, 1) != 1) {
|
||||
pr_perror("failed to write sync byte to child");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Send the child pid back to our parent
|
||||
len = snprintf(buf, sizeof(buf), "{ \"pid\" : %d }\n", childpid);
|
||||
if ((len < 0) || (write(pipenum, buf, len) != len)) {
|
||||
pr_perror("Unable to send a child pid");
|
||||
kill(childpid, SIGKILL);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
||||
static void process_nl_attributes(int pipenum, char *data, int data_size)
|
||||
{
|
||||
jmp_buf env;
|
||||
struct nsenter_config config = {0};
|
||||
struct nlattr *nlattr;
|
||||
int payload_len;
|
||||
int start = 0;
|
||||
int consolefd = -1;
|
||||
int syncpipe[2] = {-1, -1};
|
||||
|
||||
while (start < data_size) {
|
||||
nlattr = (struct nlattr *)(data + start);
|
||||
start += NLA_HDRLEN;
|
||||
payload_len = nlattr->nla_len - NLA_HDRLEN;
|
||||
|
||||
if (nlattr->nla_type == CLONE_FLAGS_ATTR) {
|
||||
config.cloneflags = readint32(data + start);
|
||||
} else if (nlattr->nla_type == CONSOLE_PATH_ATTR) {
|
||||
// get the console path before setns because it may
|
||||
// change mnt namespace
|
||||
consolefd = open(data + start, O_RDWR);
|
||||
if (consolefd < 0) {
|
||||
pr_perror("Failed to open console %s",
|
||||
data + start);
|
||||
exit(1);
|
||||
}
|
||||
} else if (nlattr->nla_type == NS_PATHS_ATTR) {
|
||||
// if custom namespaces are required, open all
|
||||
// descriptors and perform setns on them
|
||||
int i;
|
||||
int nslen = num_namespaces(data + start);
|
||||
int fds[nslen];
|
||||
char *nslist[nslen];
|
||||
char *ns;
|
||||
char *saveptr;
|
||||
|
||||
for (i = 0; i < nslen; i++) {
|
||||
char *str = NULL;
|
||||
|
||||
if (i == 0) {
|
||||
str = data + start;
|
||||
}
|
||||
ns = strtok_r(str, ",", &saveptr);
|
||||
if (ns == NULL) {
|
||||
break;
|
||||
}
|
||||
fds[i] = open(ns, O_RDONLY);
|
||||
if (fds[i] == -1) {
|
||||
pr_perror("Failed to open %s", ns);
|
||||
exit(1);
|
||||
}
|
||||
nslist[i] = ns;
|
||||
}
|
||||
|
||||
for (i = 0; i < nslen; i++) {
|
||||
if (setns(fds[i], 0) != 0) {
|
||||
pr_perror("Failed to setns to %s", nslist[i]);
|
||||
exit(1);
|
||||
}
|
||||
close(fds[i]);
|
||||
}
|
||||
} else if (nlattr->nla_type == UIDMAP_ATTR) {
|
||||
config.uidmap = data + start;
|
||||
config.uidmap_len = payload_len;
|
||||
} else if (nlattr->nla_type == GIDMAP_ATTR) {
|
||||
config.gidmap = data + start;
|
||||
config.gidmap_len = payload_len;
|
||||
} else if (nlattr->nla_type == SETGROUP_ATTR) {
|
||||
config.is_setgroup = readint8(data + start);
|
||||
} else {
|
||||
pr_perror("Unknown netlink message type %d",
|
||||
nlattr->nla_type);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
start += NLA_ALIGN(payload_len);
|
||||
}
|
||||
|
||||
// required clone_flags to be passed
|
||||
if (config.cloneflags == -1) {
|
||||
pr_perror("Missing clone_flags");
|
||||
exit(1);
|
||||
}
|
||||
// prepare sync pipe between parent and child. We need this to let the
|
||||
// child
|
||||
// know that the parent has finished setting up
|
||||
if (pipe(syncpipe) != 0) {
|
||||
pr_perror("Failed to setup sync pipe between parent and child");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (setjmp(env) == 1) {
|
||||
// Child
|
||||
uint8_t s = 0;
|
||||
|
||||
// close the writing side of pipe
|
||||
close(syncpipe[1]);
|
||||
|
||||
// sync with parent
|
||||
if ((read(syncpipe[0], &s, 1) != 1) || (s != 1)) {
|
||||
pr_perror("Failed to read sync byte from parent");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (setsid() == -1) {
|
||||
pr_perror("setsid failed");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (consolefd != -1) {
|
||||
if (ioctl(consolefd, TIOCSCTTY, 0) == -1) {
|
||||
pr_perror("ioctl TIOCSCTTY failed");
|
||||
exit(1);
|
||||
}
|
||||
if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO) {
|
||||
pr_perror("Failed to dup 0");
|
||||
pr_perror("Failed to dup stdin");
|
||||
exit(1);
|
||||
}
|
||||
if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO) {
|
||||
pr_perror("Failed to dup 1");
|
||||
pr_perror("Failed to dup stdout");
|
||||
exit(1);
|
||||
}
|
||||
if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO) {
|
||||
pr_perror("Failed to dup 2");
|
||||
pr_perror("Failed to dup stderr");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Finish executing, let the Go runtime take over.
|
||||
return;
|
||||
}
|
||||
|
||||
// Parent
|
||||
|
||||
// We must fork to actually enter the PID namespace, use CLONE_PARENT
|
||||
// so the child can have the right parent, and we don't need to forward
|
||||
// the child's exit code or resend its death signal.
|
||||
child = clone_parent(&env);
|
||||
if (child < 0) {
|
||||
pr_perror("Unable to fork");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
len = snprintf(buf, sizeof(buf), "{ \"pid\" : %d }\n", child);
|
||||
|
||||
if (write(pipenum, buf, len) != len) {
|
||||
pr_perror("Unable to send a child pid");
|
||||
kill(child, SIGKILL);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
exit(0);
|
||||
start_child(pipenum, &env, syncpipe, &config);
|
||||
}
|
||||
|
||||
void nsexec(void)
|
||||
{
|
||||
int pipenum;
|
||||
|
||||
// if we dont have init pipe, then just return to the parent
|
||||
pipenum = get_init_pipe();
|
||||
if (pipenum == -1) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Retrieve the netlink header
|
||||
struct nlmsghdr nl_msg_hdr;
|
||||
int len;
|
||||
|
||||
if ((len = read(pipenum, &nl_msg_hdr, NLMSG_HDRLEN)) != NLMSG_HDRLEN) {
|
||||
pr_perror("Invalid netlink header length %d", len);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (nl_msg_hdr.nlmsg_type == NLMSG_ERROR) {
|
||||
pr_perror("Failed to read netlink message");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (nl_msg_hdr.nlmsg_type != INIT_MSG) {
|
||||
pr_perror("Unexpected msg type %d", nl_msg_hdr.nlmsg_type);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Retrieve data
|
||||
int nl_total_size = NLMSG_PAYLOAD(&nl_msg_hdr, 0);
|
||||
char data[nl_total_size];
|
||||
|
||||
if ((len = read(pipenum, data, nl_total_size)) != nl_total_size) {
|
||||
pr_perror("Failed to read netlink payload, %d != %d", len,
|
||||
nl_total_size);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
process_nl_attributes(pipenum, data, nl_total_size);
|
||||
}
|
||||
|
|
|
@ -167,14 +167,16 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) {
|
|||
}
|
||||
|
||||
type initProcess struct {
|
||||
cmd *exec.Cmd
|
||||
parentPipe *os.File
|
||||
childPipe *os.File
|
||||
config *initConfig
|
||||
manager cgroups.Manager
|
||||
container *linuxContainer
|
||||
fds []string
|
||||
process *Process
|
||||
cmd *exec.Cmd
|
||||
parentPipe *os.File
|
||||
childPipe *os.File
|
||||
config *initConfig
|
||||
manager cgroups.Manager
|
||||
container *linuxContainer
|
||||
fds []string
|
||||
process *Process
|
||||
bootstrapData io.Reader
|
||||
sharePidns bool
|
||||
}
|
||||
|
||||
func (p *initProcess) pid() int {
|
||||
|
@ -185,15 +187,49 @@ func (p *initProcess) externalDescriptors() []string {
|
|||
return p.fds
|
||||
}
|
||||
|
||||
func (p *initProcess) start() (err error) {
|
||||
// execSetns runs the process that executes C code to perform the setns calls
|
||||
// because setns support requires the C process to fork off a child and perform the setns
|
||||
// before the go runtime boots, we wait on the process to die and receive the child's pid
|
||||
// over the provided pipe.
|
||||
// This is called by initProcess.start function
|
||||
func (p *initProcess) execSetns() error {
|
||||
status, err := p.cmd.Process.Wait()
|
||||
if err != nil {
|
||||
p.cmd.Wait()
|
||||
return err
|
||||
}
|
||||
if !status.Success() {
|
||||
p.cmd.Wait()
|
||||
return &exec.ExitError{ProcessState: status}
|
||||
}
|
||||
var pid *pid
|
||||
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
|
||||
p.cmd.Wait()
|
||||
return err
|
||||
}
|
||||
process, err := os.FindProcess(pid.Pid)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
p.cmd.Process = process
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *initProcess) start() error {
|
||||
defer p.parentPipe.Close()
|
||||
err = p.cmd.Start()
|
||||
err := p.cmd.Start()
|
||||
p.process.ops = p
|
||||
p.childPipe.Close()
|
||||
if err != nil {
|
||||
p.process.ops = nil
|
||||
return newSystemError(err)
|
||||
}
|
||||
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := p.execSetns(); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
// Save the standard descriptor names before the container process
|
||||
// can potentially move them (e.g., via dup2()). If we don't do this now,
|
||||
// we won't know at checkpoint time which file descriptor to look up.
|
||||
|
@ -213,21 +249,6 @@ func (p *initProcess) start() (err error) {
|
|||
p.manager.Destroy()
|
||||
}
|
||||
}()
|
||||
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
|
||||
if p.config.Config.Hooks != nil {
|
||||
s := configs.HookState{
|
||||
Version: p.container.config.Version,
|
||||
ID: p.container.id,
|
||||
Pid: p.pid(),
|
||||
Root: p.config.Config.Rootfs,
|
||||
}
|
||||
for _, hook := range p.config.Config.Hooks.Prestart {
|
||||
if err := hook.Run(s); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := p.createNetworkInterfaces(); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
|
@ -255,6 +276,22 @@ loop:
|
|||
if err := p.manager.Set(p.config.Config); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
// call prestart hooks
|
||||
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
|
||||
if p.config.Config.Hooks != nil {
|
||||
s := configs.HookState{
|
||||
Version: p.container.config.Version,
|
||||
ID: p.container.id,
|
||||
Pid: p.pid(),
|
||||
Root: p.config.Config.Rootfs,
|
||||
}
|
||||
for _, hook := range p.config.Config.Hooks.Prestart {
|
||||
if err := hook.Run(s); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Sync with child.
|
||||
if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
|
||||
return newSystemError(err)
|
||||
|
@ -317,7 +354,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) {
|
|||
return p.cmd.ProcessState, err
|
||||
}
|
||||
// we should kill all processes in cgroup when init is died if we use host PID namespace
|
||||
if p.cmd.SysProcAttr.Cloneflags&syscall.CLONE_NEWPID == 0 {
|
||||
if p.sharePidns {
|
||||
killCgroupProcesses(p.manager)
|
||||
}
|
||||
return p.cmd.ProcessState, nil
|
||||
|
|
|
@ -55,10 +55,6 @@ func (l *linuxStandardInit) Init() error {
|
|||
return err
|
||||
}
|
||||
|
||||
// join any namespaces via a path to the namespace fd if provided
|
||||
if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil {
|
||||
return err
|
||||
}
|
||||
var console *linuxConsole
|
||||
if l.config.Console != "" {
|
||||
console = newConsoleFromPath(l.config.Console)
|
||||
|
@ -66,9 +62,6 @@ func (l *linuxStandardInit) Init() error {
|
|||
return err
|
||||
}
|
||||
}
|
||||
if _, err := syscall.Setsid(); err != nil {
|
||||
return err
|
||||
}
|
||||
if console != nil {
|
||||
if err := system.Setctty(); err != nil {
|
||||
return err
|
||||
|
|
5
spec.go
5
spec.go
|
@ -593,7 +593,10 @@ func setupUserNamespace(spec *specs.LinuxSpec, config *configs.Config) error {
|
|||
if len(spec.Linux.UIDMappings) == 0 {
|
||||
return nil
|
||||
}
|
||||
config.Namespaces.Add(configs.NEWUSER, "")
|
||||
// do not override the specified user namespace path
|
||||
if config.Namespaces.PathOf(configs.NEWUSER) == "" {
|
||||
config.Namespaces.Add(configs.NEWUSER, "")
|
||||
}
|
||||
create := func(m specs.IDMapping) configs.IDMap {
|
||||
return configs.IDMap{
|
||||
HostID: int(m.HostID),
|
||||
|
|
Loading…
Reference in New Issue