Merge pull request #2391 from cyphar/devices-cgroup

cgroup: devices: major cleanups and minimal transition rules
This commit is contained in:
Akihiro Suda 2020-05-14 09:57:06 +09:00 committed by GitHub
commit 3f1e886991
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
29 changed files with 2468 additions and 580 deletions

View File

@ -65,7 +65,7 @@ func bail(err error) {
os.Exit(1)
}
func handleSingle(path string) error {
func handleSingle(path string, noStdin bool) error {
// Open a socket.
ln, err := net.Listen("unix", path)
if err != nil {
@ -113,10 +113,12 @@ func handleSingle(path string) error {
io.Copy(os.Stdout, c)
quitChan <- struct{}{}
}()
go func() {
io.Copy(c, os.Stdin)
quitChan <- struct{}{}
}()
if !noStdin {
go func() {
io.Copy(c, os.Stdin)
quitChan <- struct{}{}
}()
}
// Only close the master fd once we've stopped copying.
<-quitChan
@ -201,6 +203,10 @@ func main() {
Value: "",
Usage: "Path to write daemon process ID to",
},
cli.BoolFlag{
Name: "no-stdin",
Usage: "Disable stdin handling (no-op for null mode)",
},
}
app.Action = func(ctx *cli.Context) error {
@ -218,9 +224,10 @@ func main() {
}
}
noStdin := ctx.Bool("no-stdin")
switch ctx.String("mode") {
case "single":
if err := handleSingle(path); err != nil {
if err := handleSingle(path, noStdin); err != nil {
return err
}
case "null":

View File

@ -155,8 +155,7 @@ config := &configs.Config{
Parent: "system",
Resources: &configs.Resources{
MemorySwappiness: nil,
AllowAllDevices: nil,
AllowedDevices: configs.DefaultAllowedDevices,
Devices: specconv.AllowedDevices,
},
},
MaskPaths: []string{
@ -166,7 +165,7 @@ config := &configs.Config{
ReadonlyPaths: []string{
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
},
Devices: configs.DefaultAutoCreatedDevices,
Devices: specconv.AllowedDevices,
Hostname: "testing",
Mounts: []*configs.Mount{
{

View File

@ -44,6 +44,9 @@ type Manager interface {
// GetCgroups returns the cgroup data as configured.
GetCgroups() (*configs.Cgroup, error)
// GetFreezerState retrieves the current FreezerState of the cgroup.
GetFreezerState() (configs.FreezerState, error)
}
type NotFoundError struct {

View File

@ -0,0 +1,355 @@
// +build linux
package devices
import (
"bufio"
"io"
"regexp"
"sort"
"strconv"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
)
// deviceMeta is a DeviceRule without the Allow or Permissions fields, and no
// wildcard-type support. It's effectively the "match" portion of a metadata
// rule, for the purposes of our emulation.
type deviceMeta struct {
node configs.DeviceType
major int64
minor int64
}
// deviceRule is effectively the tuple (deviceMeta, DevicePermissions).
type deviceRule struct {
meta deviceMeta
perms configs.DevicePermissions
}
// deviceRules is a mapping of device metadata rules to the associated
// permissions in the ruleset.
type deviceRules map[deviceMeta]configs.DevicePermissions
func (r deviceRules) orderedEntries() []deviceRule {
var rules []deviceRule
for meta, perms := range r {
rules = append(rules, deviceRule{meta: meta, perms: perms})
}
sort.Slice(rules, func(i, j int) bool {
// Sort by (major, minor, type).
a, b := rules[i].meta, rules[j].meta
return a.major < b.major ||
(a.major == b.major && a.minor < b.minor) ||
(a.major == b.major && a.minor == b.minor && a.node < b.node)
})
return rules
}
type Emulator struct {
defaultAllow bool
rules deviceRules
}
func (e *Emulator) IsBlacklist() bool {
return e.defaultAllow
}
func (e *Emulator) IsAllowAll() bool {
return e.IsBlacklist() && len(e.rules) == 0
}
var devicesListRegexp = regexp.MustCompile(`^([abc])\s+(\d+|\*):(\d+|\*)\s+([rwm]+)$`)
func parseLine(line string) (*deviceRule, error) {
matches := devicesListRegexp.FindStringSubmatch(line)
if matches == nil {
return nil, errors.Errorf("line doesn't match devices.list format")
}
var (
rule deviceRule
node = matches[1]
major = matches[2]
minor = matches[3]
perms = matches[4]
)
// Parse the node type.
switch node {
case "a":
// Super-special case -- "a" always means every device with every
// access mode. In fact, for devices.list this actually indicates that
// the cgroup is in black-list mode.
// TODO: Double-check that the entire file is "a *:* rwm".
return nil, nil
case "b":
rule.meta.node = configs.BlockDevice
case "c":
rule.meta.node = configs.CharDevice
default:
// Should never happen!
return nil, errors.Errorf("unknown device type %q", node)
}
// Parse the major number.
if major == "*" {
rule.meta.major = configs.Wildcard
} else {
val, err := strconv.ParseUint(major, 10, 32)
if err != nil {
return nil, errors.Wrap(err, "parse major number")
}
rule.meta.major = int64(val)
}
// Parse the minor number.
if minor == "*" {
rule.meta.minor = configs.Wildcard
} else {
val, err := strconv.ParseUint(minor, 10, 32)
if err != nil {
return nil, errors.Wrap(err, "parse minor number")
}
rule.meta.minor = int64(val)
}
// Parse the access permissions.
rule.perms = configs.DevicePermissions(perms)
if !rule.perms.IsValid() || rule.perms.IsEmpty() {
// Should never happen!
return nil, errors.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
}
return &rule, nil
}
func (e *Emulator) addRule(rule deviceRule) error {
if e.rules == nil {
e.rules = make(map[deviceMeta]configs.DevicePermissions)
}
// Merge with any pre-existing permissions.
oldPerms := e.rules[rule.meta]
newPerms := rule.perms.Union(oldPerms)
e.rules[rule.meta] = newPerms
return nil
}
func (e *Emulator) rmRule(rule deviceRule) error {
// Give an error if any of the permissions requested to be removed are
// present in a partially-matching wildcard rule, because such rules will
// be ignored by cgroupv1.
//
// This is a diversion from cgroupv1, but is necessary to avoid leading
// users into a false sense of security. cgroupv1 will silently(!) ignore
// requests to remove partial exceptions, but we really shouldn't do that.
//
// It may seem like we could just "split" wildcard rules which hit this
// issue, but unfortunately there are 2^32 possible major and minor
// numbers, which would exhaust kernel memory quickly if we did this. Not
// to mention it'd be really slow (the kernel side is implemented as a
// linked-list of exceptions).
for _, partialMeta := range []deviceMeta{
{node: rule.meta.node, major: configs.Wildcard, minor: rule.meta.minor},
{node: rule.meta.node, major: rule.meta.major, minor: configs.Wildcard},
{node: rule.meta.node, major: configs.Wildcard, minor: configs.Wildcard},
} {
// This wildcard rule is equivalent to the requested rule, so skip it.
if rule.meta == partialMeta {
continue
}
// Only give an error if the set of permissions overlap.
partialPerms := e.rules[partialMeta]
if !partialPerms.Intersection(rule.perms).IsEmpty() {
return errors.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
}
}
// Subtract all of the permissions listed from the full match rule. If the
// rule didn't exist, all of this is a no-op.
newPerms := e.rules[rule.meta].Difference(rule.perms)
if newPerms.IsEmpty() {
delete(e.rules, rule.meta)
} else {
e.rules[rule.meta] = newPerms
}
// TODO: The actual cgroup code doesn't care if an exception didn't exist
// during removal, so not erroring out here is /accurate/ but quite
// worrying. Maybe we should do additional validation, but again we
// have to worry about backwards-compatibility.
return nil
}
func (e *Emulator) allow(rule *deviceRule) error {
// This cgroup is configured as a black-list. Reset the entire emulator,
// and put is into black-list mode.
if rule == nil || rule.meta.node == configs.WildcardDevice {
*e = Emulator{
defaultAllow: true,
rules: nil,
}
return nil
}
var err error
if e.defaultAllow {
err = errors.Wrap(e.rmRule(*rule), "remove 'deny' exception")
} else {
err = errors.Wrap(e.addRule(*rule), "add 'allow' exception")
}
return err
}
func (e *Emulator) deny(rule *deviceRule) error {
// This cgroup is configured as a white-list. Reset the entire emulator,
// and put is into white-list mode.
if rule == nil || rule.meta.node == configs.WildcardDevice {
*e = Emulator{
defaultAllow: false,
rules: nil,
}
return nil
}
var err error
if e.defaultAllow {
err = errors.Wrap(e.addRule(*rule), "add 'deny' exception")
} else {
err = errors.Wrap(e.rmRule(*rule), "remove 'allow' exception")
}
return err
}
func (e *Emulator) Apply(rule configs.DeviceRule) error {
if !rule.Type.CanCgroup() {
return errors.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
}
innerRule := &deviceRule{
meta: deviceMeta{
node: rule.Type,
major: rule.Major,
minor: rule.Minor,
},
perms: rule.Permissions,
}
if innerRule.meta.node == configs.WildcardDevice {
innerRule = nil
}
if rule.Allow {
return e.allow(innerRule)
} else {
return e.deny(innerRule)
}
}
// EmulatorFromList takes a reader to a "devices.list"-like source, and returns
// a new Emulator that represents the state of the devices cgroup. Note that
// black-list devices cgroups cannot be fully reconstructed, due to limitations
// in the devices cgroup API. Instead, such cgroups are always treated as
// "allow all" cgroups.
func EmulatorFromList(list io.Reader) (*Emulator, error) {
// Normally cgroups are in black-list mode by default, but the way we
// figure out the current mode is whether or not devices.list has an
// allow-all rule. So we default to a white-list, and the existence of an
// "a *:* rwm" entry will tell us otherwise.
e := &Emulator{
defaultAllow: false,
}
// Parse the "devices.list".
s := bufio.NewScanner(list)
for s.Scan() {
line := s.Text()
deviceRule, err := parseLine(line)
if err != nil {
return nil, errors.Wrapf(err, "parsing line %q", line)
}
// "devices.list" is an allow list. Note that this means that in
// black-list mode, we have no idea what rules are in play. As a
// result, we need to be very careful in Transition().
if err := e.allow(deviceRule); err != nil {
return nil, errors.Wrapf(err, "adding devices.list rule")
}
}
if err := s.Err(); err != nil {
return nil, errors.Wrap(err, "reading devices.list lines")
}
return e, nil
}
// Transition calculates what is the minimally-disruptive set of rules need to
// be applied to a devices cgroup in order to transition to the given target.
// This means that any already-existing rules will not be applied, and
// disruptive rules (like denying all device access) will only be applied if
// necessary.
//
// This function is the sole reason for all of Emulator -- to allow us
// to figure out how to update a containers' cgroups without causing spurrious
// device errors (if possible).
func (source *Emulator) Transition(target *Emulator) ([]*configs.DeviceRule, error) {
var transitionRules []*configs.DeviceRule
oldRules := source.rules
// If the default policy doesn't match, we need to include a "disruptive"
// rule (either allow-all or deny-all) in order to switch the cgroup to the
// correct default policy.
//
// However, due to a limitation in "devices.list" we cannot be sure what
// deny rules are in place in a black-list cgroup. Thus if the source is a
// black-list we also have to include a disruptive rule.
if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
transitionRules = append(transitionRules, &configs.DeviceRule{
Type: 'a',
Major: -1,
Minor: -1,
Permissions: configs.DevicePermissions("rwm"),
Allow: target.defaultAllow,
})
// The old rules are only relevant if we aren't starting out with a
// disruptive rule.
oldRules = nil
}
// NOTE: We traverse through the rules in a sorted order so we always write
// the same set of rules (this is to aid testing).
// First, we create inverse rules for any old rules not in the new set.
// This includes partial-inverse rules for specific permissions. This is a
// no-op if we added a disruptive rule, since oldRules will be empty.
for _, rule := range oldRules.orderedEntries() {
meta, oldPerms := rule.meta, rule.perms
newPerms := target.rules[meta]
droppedPerms := oldPerms.Difference(newPerms)
if !droppedPerms.IsEmpty() {
transitionRules = append(transitionRules, &configs.DeviceRule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
Permissions: droppedPerms,
Allow: target.defaultAllow,
})
}
}
// Add any additional rules which weren't in the old set. We happen to
// filter out rules which are present in both sets, though this isn't
// strictly necessary.
for _, rule := range target.rules.orderedEntries() {
meta, newPerms := rule.meta, rule.perms
oldPerms := oldRules[meta]
gainedPerms := newPerms.Difference(oldPerms)
if !gainedPerms.IsEmpty() {
transitionRules = append(transitionRules, &configs.DeviceRule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
Permissions: gainedPerms,
Allow: !target.defaultAllow,
})
}
}
return transitionRules, nil
}

File diff suppressed because it is too large Load Diff

View File

@ -22,7 +22,7 @@ const (
)
// DeviceFilter returns eBPF device filter program and its license string
func DeviceFilter(devices []*configs.Device) (asm.Instructions, string, error) {
func DeviceFilter(devices []*configs.DeviceRule) (asm.Instructions, string, error) {
p := &program{}
p.init()
for i := len(devices) - 1; i >= 0; i-- {
@ -68,7 +68,7 @@ func (p *program) init() {
}
// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
func (p *program) appendDevice(dev *configs.Device) error {
func (p *program) appendDevice(dev *configs.DeviceRule) error {
if p.blockID < 0 {
return errors.New("the program is finalized")
}

View File

@ -20,7 +20,7 @@ func hash(s, comm string) string {
return strings.Join(res, "\n")
}
func testDeviceFilter(t testing.TB, devices []*configs.Device, expectedStr string) {
func testDeviceFilter(t testing.TB, devices []*configs.DeviceRule, expectedStr string) {
insts, _, err := DeviceFilter(devices)
if err != nil {
t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices)
@ -83,71 +83,69 @@ block-2:
19: Exit
block-3:
20: JNEImm dst: r2 off: -1 imm: 2 <block-4>
21: JNEImm dst: r4 off: -1 imm: 5 <block-4>
22: JNEImm dst: r5 off: -1 imm: 1 <block-4>
21: JNEImm dst: r4 off: -1 imm: 1 <block-4>
22: JNEImm dst: r5 off: -1 imm: 9 <block-4>
23: Mov32Imm dst: r0 imm: 1
24: Exit
block-4:
25: JNEImm dst: r2 off: -1 imm: 2 <block-5>
26: JNEImm dst: r4 off: -1 imm: 1 <block-5>
27: JNEImm dst: r5 off: -1 imm: 9 <block-5>
27: JNEImm dst: r5 off: -1 imm: 5 <block-5>
28: Mov32Imm dst: r0 imm: 1
29: Exit
block-5:
30: JNEImm dst: r2 off: -1 imm: 2 <block-6>
31: JNEImm dst: r4 off: -1 imm: 1 <block-6>
32: JNEImm dst: r5 off: -1 imm: 5 <block-6>
31: JNEImm dst: r4 off: -1 imm: 5 <block-6>
32: JNEImm dst: r5 off: -1 imm: 0 <block-6>
33: Mov32Imm dst: r0 imm: 1
34: Exit
block-6:
35: JNEImm dst: r2 off: -1 imm: 2 <block-7>
36: JNEImm dst: r4 off: -1 imm: 5 <block-7>
37: JNEImm dst: r5 off: -1 imm: 0 <block-7>
36: JNEImm dst: r4 off: -1 imm: 1 <block-7>
37: JNEImm dst: r5 off: -1 imm: 7 <block-7>
38: Mov32Imm dst: r0 imm: 1
39: Exit
block-7:
40: JNEImm dst: r2 off: -1 imm: 2 <block-8>
41: JNEImm dst: r4 off: -1 imm: 1 <block-8>
42: JNEImm dst: r5 off: -1 imm: 7 <block-8>
42: JNEImm dst: r5 off: -1 imm: 8 <block-8>
43: Mov32Imm dst: r0 imm: 1
44: Exit
block-8:
45: JNEImm dst: r2 off: -1 imm: 2 <block-9>
46: JNEImm dst: r4 off: -1 imm: 1 <block-9>
47: JNEImm dst: r5 off: -1 imm: 8 <block-9>
47: JNEImm dst: r5 off: -1 imm: 3 <block-9>
48: Mov32Imm dst: r0 imm: 1
49: Exit
block-9:
50: JNEImm dst: r2 off: -1 imm: 2 <block-10>
51: JNEImm dst: r4 off: -1 imm: 1 <block-10>
52: JNEImm dst: r5 off: -1 imm: 3 <block-10>
53: Mov32Imm dst: r0 imm: 1
54: Exit
block-10:
// (b, wildcard, wildcard, m, true)
55: JNEImm dst: r2 off: -1 imm: 1 <block-11>
56: Mov32Reg dst: r1 src: r3
57: And32Imm dst: r1 imm: 1
58: JEqImm dst: r1 off: -1 imm: 0 <block-11>
59: Mov32Imm dst: r0 imm: 1
60: Exit
block-11:
50: JNEImm dst: r2 off: -1 imm: 1 <block-10>
51: Mov32Reg dst: r1 src: r3
52: And32Imm dst: r1 imm: 1
53: JEqImm dst: r1 off: -1 imm: 0 <block-10>
54: Mov32Imm dst: r0 imm: 1
55: Exit
block-10:
// (c, wildcard, wildcard, m, true)
61: JNEImm dst: r2 off: -1 imm: 2 <block-12>
62: Mov32Reg dst: r1 src: r3
63: And32Imm dst: r1 imm: 1
64: JEqImm dst: r1 off: -1 imm: 0 <block-12>
65: Mov32Imm dst: r0 imm: 1
66: Exit
block-12:
67: Mov32Imm dst: r0 imm: 0
68: Exit
56: JNEImm dst: r2 off: -1 imm: 2 <block-11>
57: Mov32Reg dst: r1 src: r3
58: And32Imm dst: r1 imm: 1
59: JEqImm dst: r1 off: -1 imm: 0 <block-11>
60: Mov32Imm dst: r0 imm: 1
61: Exit
block-11:
62: Mov32Imm dst: r0 imm: 0
63: Exit
`
testDeviceFilter(t, specconv.AllowedDevices, expected)
var devices []*configs.DeviceRule
for _, device := range specconv.AllowedDevices {
devices = append(devices, &device.DeviceRule)
}
testDeviceFilter(t, devices, expected)
}
func TestDeviceFilter_Privileged(t *testing.T) {
devices := []*configs.Device{
devices := []*configs.DeviceRule{
{
Type: 'a',
Major: -1,
@ -174,7 +172,7 @@ block-0:
}
func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) {
devices := []*configs.Device{
devices := []*configs.DeviceRule{
{
Type: 'a',
Major: -1,
@ -214,7 +212,7 @@ block-1:
}
func TestDeviceFilter_Weird(t *testing.T) {
devices := []*configs.Device{
devices := []*configs.DeviceRule{
{
Type: 'b',
Major: 8,

View File

@ -166,9 +166,6 @@ func (m *manager) Apply(pid int) (err error) {
}
for _, sys := range m.getSubsystems() {
// TODO: Apply should, ideally, be reentrant or be broken up into a separate
// create and join phase so that the cgroup hierarchy for a container can be
// created then join consists of writing the process pids to cgroup.procs
p, err := d.path(sys.Name())
if err != nil {
// The non-presence of the devices subsystem is
@ -181,10 +178,10 @@ func (m *manager) Apply(pid int) (err error) {
m.paths[sys.Name()] = p
if err := sys.Apply(d); err != nil {
// In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't
// been set, we don't bail on error in case of permission problems.
// Cases where limits have been set (and we couldn't create our own
// cgroup) are handled by Set.
// In the case of rootless (including euid=0 in userns), where an
// explicit cgroup path hasn't been set, we don't bail on error in
// case of permission problems. Cases where limits have been set
// (and we couldn't create our own cgroup) are handled by Set.
if isIgnorableError(m.rootless, err) && m.cgroups.Path == "" {
delete(m.paths, sys.Name())
continue
@ -272,22 +269,25 @@ func (m *manager) Set(container *configs.Config) error {
// Freeze toggles the container's freezer cgroup depending on the state
// provided
func (m *manager) Freeze(state configs.FreezerState) error {
if m.cgroups == nil {
func (m *manager) Freeze(state configs.FreezerState) (Err error) {
path := m.GetPaths()["freezer"]
if m.cgroups == nil || path == "" {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
paths := m.GetPaths()
dir := paths["freezer"]
prevState := m.cgroups.Resources.Freezer
m.cgroups.Resources.Freezer = state
defer func() {
if Err != nil {
m.cgroups.Resources.Freezer = prevState
}
}()
freezer, err := m.getSubsystems().Get("freezer")
if err != nil {
return err
}
err = freezer.Set(dir, m.cgroups)
if err != nil {
m.cgroups.Resources.Freezer = prevState
if err := freezer.Set(path, m.cgroups); err != nil {
return err
}
return nil
@ -413,3 +413,15 @@ func (m *manager) GetPaths() map[string]string {
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
return m.cgroups, nil
}
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
paths := m.GetPaths()
dir := paths["freezer"]
freezer, err := m.getSubsystems().Get("freezer")
// If the container doesn't have the freezer cgroup, say it's undefined.
if err != nil || dir == "" {
return configs.Undefined, nil
}
return freezer.(*FreezerGroup).GetState(dir)
}

View File

@ -3,13 +3,19 @@
package fs
import (
"bytes"
"fmt"
"reflect"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
)
type DevicesGroup struct {
testingSkipFinalCheck bool
}
func (s *DevicesGroup) Name() string {
@ -26,49 +32,74 @@ func (s *DevicesGroup) Apply(d *cgroupData) error {
return nil
}
func loadEmulator(path string) (*devices.Emulator, error) {
list, err := fscommon.ReadFile(path, "devices.list")
if err != nil {
return nil, err
}
return devices.EmulatorFromList(bytes.NewBufferString(list))
}
func buildEmulator(rules []*configs.DeviceRule) (*devices.Emulator, error) {
// This defaults to a white-list -- which is what we want!
emu := &devices.Emulator{}
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, err
}
}
return emu, nil
}
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if system.RunningInUserNS() {
return nil
}
devices := cgroup.Resources.Devices
if len(devices) > 0 {
for _, dev := range devices {
file := "devices.deny"
if dev.Allow {
file = "devices.allow"
}
if err := fscommon.WriteFile(path, file, dev.CgroupString()); err != nil {
return err
}
}
return nil
// Generate two emulators, one for the current state of the cgroup and one
// for the requested state by the user.
current, err := loadEmulator(path)
if err != nil {
return err
}
target, err := buildEmulator(cgroup.Resources.Devices)
if err != nil {
return err
}
if cgroup.Resources.AllowAllDevices != nil {
if *cgroup.Resources.AllowAllDevices == false {
if err := fscommon.WriteFile(path, "devices.deny", "a"); err != nil {
return err
}
for _, dev := range cgroup.Resources.AllowedDevices {
if err := fscommon.WriteFile(path, "devices.allow", dev.CgroupString()); err != nil {
return err
}
}
return nil
// Compute the minimal set of transition rules needed to achieve the
// requested state.
transitionRules, err := current.Transition(target)
if err != nil {
return err
}
for _, rule := range transitionRules {
file := "devices.deny"
if rule.Allow {
file = "devices.allow"
}
if err := fscommon.WriteFile(path, "devices.allow", "a"); err != nil {
if err := fscommon.WriteFile(path, file, rule.CgroupString()); err != nil {
return err
}
}
for _, dev := range cgroup.Resources.DeniedDevices {
if err := fscommon.WriteFile(path, "devices.deny", dev.CgroupString()); err != nil {
// Final safety check -- ensure that the resulting state is what was
// requested. This is only really correct for white-lists, but for
// black-lists we can at least check that the cgroup is in the right mode.
//
// This safety-check is skipped for the unit tests because we cannot
// currently mock devices.list correctly.
if !s.testingSkipFinalCheck {
currentAfter, err := loadEmulator(path)
if err != nil {
return err
}
if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) {
return fmt.Errorf("resulting devices cgroup doesn't precisely match target")
} else if target.IsBlacklist() != currentAfter.IsBlacklist() {
return fmt.Errorf("resulting devices cgroup doesn't match target mode")
}
}
return nil
}

View File

@ -9,91 +9,44 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
allowedDevices = []*configs.Device{
{
Path: "/dev/zero",
Type: 'c',
Major: 1,
Minor: 5,
Permissions: "rwm",
FileMode: 0666,
},
}
allowedList = "c 1:5 rwm"
deniedDevices = []*configs.Device{
{
Path: "/dev/null",
Type: 'c',
Major: 1,
Minor: 3,
Permissions: "rwm",
FileMode: 0666,
},
}
deniedList = "c 1:3 rwm"
)
func TestDevicesSetAllow(t *testing.T) {
helper := NewCgroupTestUtil("devices", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"devices.deny": "a",
})
allowAllDevices := false
helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices
helper.CgroupData.config.Resources.AllowedDevices = allowedDevices
devices := &DevicesGroup{}
if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow")
if err != nil {
t.Fatalf("Failed to parse devices.allow - %s", err)
}
if value != allowedList {
t.Fatal("Got the wrong value, set devices.allow failed.")
}
// When AllowAllDevices is nil, devices.allow file should not be modified.
helper.CgroupData.config.Resources.AllowAllDevices = nil
if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err = fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow")
if err != nil {
t.Fatalf("Failed to parse devices.allow - %s", err)
}
if value != allowedList {
t.Fatal("devices policy shouldn't have changed on AllowedAllDevices=nil.")
}
}
func TestDevicesSetDeny(t *testing.T) {
helper := NewCgroupTestUtil("devices", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"devices.allow": "a",
"devices.allow": "",
"devices.deny": "",
"devices.list": "a *:* rwm",
})
allowAllDevices := true
helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices
helper.CgroupData.config.Resources.DeniedDevices = deniedDevices
devices := &DevicesGroup{}
helper.CgroupData.config.Resources.Devices = []*configs.DeviceRule{
{
Type: configs.CharDevice,
Major: 1,
Minor: 5,
Permissions: configs.DevicePermissions("rwm"),
Allow: true,
},
}
devices := &DevicesGroup{testingSkipFinalCheck: true}
if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
// The default deny rule must be written.
value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.deny")
if err != nil {
t.Fatalf("Failed to parse devices.deny - %s", err)
t.Fatalf("Failed to parse devices.deny: %s", err)
}
if value[0] != 'a' {
t.Errorf("Got the wrong value (%q), set devices.deny failed.", value)
}
if value != deniedList {
t.Fatal("Got the wrong value, set devices.deny failed.")
// Permitted rule must be written.
if value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow"); err != nil {
t.Fatalf("Failed to parse devices.allow: %s", err)
} else if value != "c 1:5 rwm" {
t.Errorf("Got the wrong value (%q), set devices.allow failed.", value)
}
}

View File

@ -3,13 +3,16 @@
package fs
import (
"errors"
"fmt"
"os"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
type FreezerGroup struct {
@ -39,11 +42,11 @@ func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
return err
}
state, err := fscommon.ReadFile(path, "freezer.state")
state, err := s.GetState(path)
if err != nil {
return err
}
if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) {
if state == cgroup.Resources.Freezer {
break
}
@ -65,3 +68,30 @@ func (s *FreezerGroup) Remove(d *cgroupData) error {
func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) {
for {
state, err := fscommon.ReadFile(path, "freezer.state")
if err != nil {
// If the kernel is too old, then we just treat the freezer as
// being in an "undefined" state.
if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) {
err = nil
}
return configs.Undefined, err
}
switch strings.TrimSpace(state) {
case "THAWED":
return configs.Thawed, nil
case "FROZEN":
return configs.Frozen, nil
case "FREEZING":
// Make sure we get a stable freezer state, so retry if the cgroup
// is still undergoing freezing. This should be a temporary delay.
time.Sleep(1 * time.Millisecond)
continue
default:
return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state)
}
}
}

View File

@ -10,12 +10,10 @@ import (
"golang.org/x/sys/unix"
)
func isRWM(cgroupPermissions string) bool {
r := false
w := false
m := false
for _, rn := range cgroupPermissions {
switch rn {
func isRWM(perms configs.DevicePermissions) bool {
var r, w, m bool
for _, perm := range perms {
switch perm {
case 'r':
r = true
case 'w':
@ -39,26 +37,10 @@ func canSkipEBPFError(cgroup *configs.Cgroup) bool {
}
func setDevices(dirPath string, cgroup *configs.Cgroup) error {
// XXX: This is currently a white-list (but all callers pass a blacklist of
// devices). This is bad for a whole variety of reasons, but will need
// to be fixed with co-ordinated effort with downstreams.
devices := cgroup.Devices
// never set by OCI specconv
if allowAllDevices := cgroup.Resources.AllowAllDevices; allowAllDevices != nil {
if *allowAllDevices == true {
if len(cgroup.Resources.DeniedDevices) != 0 {
return errors.New("libcontainer: can't use DeniedDevices together with AllowAllDevices")
}
return nil
}
// *allowAllDevices=false is still used by the integration test
for _, ad := range cgroup.Resources.AllowedDevices {
d := *ad
d.Allow = true
devices = append(devices, &d)
}
}
if len(cgroup.Resources.DeniedDevices) != 0 {
// never set by OCI specconv
return errors.New("libcontainer DeniedDevices is not supported, use Devices")
}
insts, license, err := devicefilter.DeviceFilter(devices)
if err != nil {
return err
@ -68,6 +50,17 @@ func setDevices(dirPath string, cgroup *configs.Cgroup) error {
return errors.Errorf("cannot get dir FD for %s", dirPath)
}
defer unix.Close(dirFD)
// XXX: This code is currently incorrect when it comes to updating an
// existing cgroup with new rules (new rulesets are just appended to
// the program list because this uses BPF_F_ALLOW_MULTI). If we didn't
// use BPF_F_ALLOW_MULTI we could actually atomically swap the
// programs.
//
// The real issue is that BPF_F_ALLOW_MULTI makes it hard to have a
// race-free blacklist because it acts as a whitelist by default, and
// having a deny-everything program cannot be overriden by other
// programs. You could temporarily insert a deny-everything program
// but that would result in spurrious failures during updates.
if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
if !canSkipEBPFError(cgroup) {
return err

View File

@ -3,32 +3,49 @@
package fs2
import (
"strconv"
stdErrors "errors"
"os"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
func setFreezer(dirPath string, state configs.FreezerState) error {
var desired int
if err := supportsFreezer(dirPath); err != nil {
// We can ignore this request as long as the user didn't ask us to
// freeze the container (since without the freezer cgroup, that's a
// no-op).
if state == configs.Undefined || state == configs.Thawed {
err = nil
}
return errors.Wrap(err, "freezer not supported")
}
var stateStr string
switch state {
case configs.Undefined:
return nil
case configs.Frozen:
desired = 1
stateStr = "1"
case configs.Thawed:
desired = 0
stateStr = "0"
default:
return errors.Errorf("unknown freezer state %+v", state)
return errors.Errorf("invalid freezer state %q requested", state)
}
supportedErr := supportsFreezer(dirPath)
if supportedErr != nil && desired != 0 {
// can ignore error if desired == 1
return errors.Wrap(supportedErr, "freezer not supported")
if err := fscommon.WriteFile(dirPath, "cgroup.freeze", stateStr); err != nil {
return err
}
return freezeWithInt(dirPath, desired)
// Confirm that the cgroup did actually change states.
if actualState, err := getFreezer(dirPath); err != nil {
return err
} else if actualState != state {
return errors.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState)
}
return nil
}
func supportsFreezer(dirPath string) error {
@ -36,18 +53,22 @@ func supportsFreezer(dirPath string) error {
return err
}
// freeze writes desired int to "cgroup.freeze".
func freezeWithInt(dirPath string, desired int) error {
desiredS := strconv.Itoa(desired)
if err := fscommon.WriteFile(dirPath, "cgroup.freeze", desiredS); err != nil {
return err
}
got, err := fscommon.ReadFile(dirPath, "cgroup.freeze")
func getFreezer(dirPath string) (configs.FreezerState, error) {
state, err := fscommon.ReadFile(dirPath, "cgroup.freeze")
if err != nil {
return err
// If the kernel is too old, then we just treat the freezer as being in
// an "undefined" state.
if os.IsNotExist(err) || stdErrors.Is(err, unix.ENODEV) {
err = nil
}
return configs.Undefined, err
}
if gotS := strings.TrimSpace(string(got)); gotS != desiredS {
return errors.Errorf("expected \"cgroup.freeze\" in %q to be %q, got %q", dirPath, desiredS, gotS)
switch strings.TrimSpace(state) {
case "0":
return configs.Thawed, nil
case "1":
return configs.Frozen, nil
default:
return configs.Undefined, errors.Errorf(`unknown "cgroup.freeze" state: %q`, state)
}
return nil
}

View File

@ -240,3 +240,7 @@ func (m *manager) GetPaths() map[string]string {
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
return m.config, nil
}
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
return getFreezer(m.dirPath)
}

View File

@ -1,6 +1,7 @@
package systemd
import (
"bufio"
"fmt"
"os"
"strings"
@ -9,6 +10,7 @@ import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
@ -69,6 +71,209 @@ func ExpandSlice(slice string) (string, error) {
return path, nil
}
func groupPrefix(ruleType configs.DeviceType) (string, error) {
switch ruleType {
case configs.BlockDevice:
return "block-", nil
case configs.CharDevice:
return "char-", nil
default:
return "", errors.Errorf("device type %v has no group prefix", ruleType)
}
}
// findDeviceGroup tries to find the device group name (as listed in
// /proc/devices) with the type prefixed as requried for DeviceAllow, for a
// given (type, major) combination. If more than one device group exists, an
// arbitrary one is chosen.
func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, error) {
fh, err := os.Open("/proc/devices")
if err != nil {
return "", err
}
defer fh.Close()
prefix, err := groupPrefix(ruleType)
if err != nil {
return "", err
}
scanner := bufio.NewScanner(fh)
var currentType configs.DeviceType
for scanner.Scan() {
// We need to strip spaces because the first number is column-aligned.
line := strings.TrimSpace(scanner.Text())
// Handle the "header" lines.
switch line {
case "Block devices:":
currentType = configs.BlockDevice
continue
case "Character devices:":
currentType = configs.CharDevice
continue
case "":
continue
}
// Skip lines unrelated to our type.
if currentType != ruleType {
continue
}
// Parse out the (major, name).
var (
currMajor int64
currName string
)
if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
if err == nil {
err = errors.Errorf("wrong number of fields")
}
return "", errors.Wrapf(err, "scan /proc/devices line %q", line)
}
if currMajor == ruleMajor {
return prefix + currName, nil
}
}
if err := scanner.Err(); err != nil {
return "", errors.Wrap(err, "reading /proc/devices")
}
// Couldn't find the device group.
return "", nil
}
// generateDeviceProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Property, error) {
// DeviceAllow is the type "a(ss)" which means we need a temporary struct
// to represent it in Go.
type deviceAllowEntry struct {
Path string
Perms string
}
properties := []systemdDbus.Property{
// Always run in the strictest white-list mode.
newProp("DevicePolicy", "strict"),
// Empty the DeviceAllow array before filling it.
newProp("DeviceAllow", []deviceAllowEntry{}),
}
// Figure out the set of rules.
configEmu := &devices.Emulator{}
for _, rule := range rules {
if err := configEmu.Apply(*rule); err != nil {
return nil, errors.Wrap(err, "apply rule for systemd")
}
}
// systemd doesn't support blacklists. So we log a warning, and tell
// systemd to act as a deny-all whitelist. This ruleset will be replaced
// with our normal fallback code. This may result in spurrious errors, but
// the only other option is to error out here.
if configEmu.IsBlacklist() {
// However, if we're dealing with an allow-all rule then we can do it.
if configEmu.IsAllowAll() {
return []systemdDbus.Property{
// Run in white-list mode by setting to "auto" and removing all
// DeviceAllow rules.
newProp("DevicePolicy", "auto"),
newProp("DeviceAllow", []deviceAllowEntry{}),
}, nil
}
logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
return properties, nil
}
// Now generate the set of rules we actually need to apply. Unlike the
// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
// whitelist which is the default for devices.Emulator.
baseEmu := &devices.Emulator{}
finalRules, err := baseEmu.Transition(configEmu)
if err != nil {
return nil, errors.Wrap(err, "get simplified rules for systemd")
}
var deviceAllowList []deviceAllowEntry
for _, rule := range finalRules {
if !rule.Allow {
// Should never happen.
return nil, errors.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
}
switch rule.Type {
case configs.BlockDevice, configs.CharDevice:
default:
// Should never happen.
return nil, errors.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
}
entry := deviceAllowEntry{
Perms: string(rule.Permissions),
}
// systemd has a fairly odd (though understandable) syntax here, and
// because of the OCI configuration format we have to do quite a bit of
// trickery to convert things:
//
// * Concrete rules with non-wildcard major/minor numbers have to use
// /dev/{block,char} paths. This is slightly odd because it means
// that we cannot add whitelist rules for devices that don't exist,
// but there's not too much we can do about that.
//
// However, path globbing is not support for path-based rules so we
// need to handle wildcards in some other manner.
//
// * Wildcard-minor rules have to specify a "device group name" (the
// second column in /proc/devices).
//
// * Wildcard (major and minor) rules can just specify a glob with the
// type ("char-*" or "block-*").
//
// The only type of rule we can't handle is wildcard-major rules, and
// so we'll give a warning in that case (note that the fallback code
// will insert any rules systemd couldn't handle). What amazing fun.
if rule.Major == configs.Wildcard {
// "_ *:n _" rules aren't supported by systemd.
if rule.Minor != configs.Wildcard {
logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
continue
}
// "_ *:* _" rules just wildcard everything.
prefix, err := groupPrefix(rule.Type)
if err != nil {
return nil, err
}
entry.Path = prefix + "*"
} else if rule.Minor == configs.Wildcard {
// "_ n:* _" rules require a device group from /proc/devices.
group, err := findDeviceGroup(rule.Type, rule.Major)
if err != nil {
return nil, errors.Wrapf(err, "find device '%v/%d'", rule.Type, rule.Major)
}
if group == "" {
// Couldn't find a group.
logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
continue
}
entry.Path = group
} else {
// "_ n:m _" rules are just a path in /dev/{block,char}/.
switch rule.Type {
case configs.BlockDevice:
entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
case configs.CharDevice:
entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
}
}
deviceAllowList = append(deviceAllowList, entry)
}
properties = append(properties, newProp("DeviceAllow", deviceAllowList))
return properties, nil
}
// getDbusConnection lazy initializes systemd dbus connection
// and returns it
func getDbusConnection(rootless bool) (*systemdDbus.Conn, error) {

View File

@ -15,6 +15,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
)
type legacyManager struct {
@ -70,6 +71,13 @@ var legacySubsystems = subsystemSet{
func genV1ResourcesProperties(c *configs.Cgroup) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property
deviceProperties, err := generateDeviceProperties(c.Resources.Devices)
if err != nil {
return nil, err
}
properties = append(properties, deviceProperties...)
if c.Resources.Memory != 0 {
properties = append(properties,
newProp("MemoryLimit", uint64(c.Resources.Memory)))
@ -381,13 +389,40 @@ func (m *legacyManager) Set(container *configs.Config) error {
if err != nil {
return err
}
dbusConnection, err := getDbusConnection(false)
// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err := m.GetFreezerState()
if err != nil {
return err
}
if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil {
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}
// We have to freeze the container while systemd sets the cgroup settings.
// The reason for this is that systemd's application of DeviceAllow rules
// is done disruptively, resulting in spurrious errors to common devices
// (unlike our fs driver, they will happily write deny-all rules to running
// containers). So we freeze the container to avoid them hitting the cgroup
// error. But if the freezer cgroup isn't supported, we just warn about it.
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
}
dbusConnection, err := getDbusConnection(false)
if err != nil {
_ = m.Freeze(targetFreezerState)
return err
}
if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil {
_ = m.Freeze(targetFreezerState)
return err
}
// Reset freezer state before we apply the configuration, to avoid clashing
// with the freezer setting in the configuration.
_ = m.Freeze(targetFreezerState)
for _, sys := range legacySubsystems {
// Get the subsystem path, but don't error out for not found cgroups.
@ -395,7 +430,6 @@ func (m *legacyManager) Set(container *configs.Config) error {
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := sys.Set(path, container.Cgroups); err != nil {
return err
}
@ -439,3 +473,15 @@ func (m *legacyManager) GetPaths() map[string]string {
func (m *legacyManager) GetCgroups() (*configs.Cgroup, error) {
return m.cgroups, nil
}
func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
path, err := getSubsystemPath(m.cgroups, "freezer")
if err != nil && !cgroups.IsNotFound(err) {
return configs.Undefined, err
}
freezer, err := legacySubsystems.Get("freezer")
if err != nil {
return configs.Undefined, err
}
return freezer.(*fs.FreezerGroup).GetState(path)
}

View File

@ -16,6 +16,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
type unifiedManager struct {
@ -37,6 +38,17 @@ func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgrou
func genV2ResourcesProperties(c *configs.Cgroup) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property
// NOTE: This is of questionable correctness because we insert our own
// devices eBPF program later. Two programs with identical rules
// aren't the end of the world, but it is a bit concerning. However
// it's unclear if systemd removes all eBPF programs attached when
// doing SetUnitProperties...
deviceProperties, err := generateDeviceProperties(c.Resources.Devices)
if err != nil {
return nil, err
}
properties = append(properties, deviceProperties...)
if c.Resources.Memory != 0 {
properties = append(properties,
newProp("MemoryMax", uint64(c.Resources.Memory)))
@ -295,14 +307,41 @@ func (m *unifiedManager) Set(container *configs.Config) error {
if err != nil {
return err
}
dbusConnection, err := getDbusConnection(m.rootless)
// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err := m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}
// We have to freeze the container while systemd sets the cgroup settings.
// The reason for this is that systemd's application of DeviceAllow rules
// is done disruptively, resulting in spurrious errors to common devices
// (unlike our fs driver, they will happily write deny-all rules to running
// containers). So we freeze the container to avoid them hitting the cgroup
// error. But if the freezer cgroup isn't supported, we just warn about it.
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
}
dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
_ = m.Freeze(targetFreezerState)
return err
}
if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil {
_ = m.Freeze(targetFreezerState)
return errors.Wrap(err, "error while setting unit properties")
}
// Reset freezer state before we apply the configuration, to avoid clashing
// with the freezer setting in the configuration.
_ = m.Freeze(targetFreezerState)
fsMgr, err := m.fsManager()
if err != nil {
return err
@ -319,3 +358,11 @@ func (m *unifiedManager) GetPaths() map[string]string {
func (m *unifiedManager) GetCgroups() (*configs.Cgroup, error) {
return m.cgroups, nil
}
func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
fsMgr, err := m.fsManager()
if err != nil {
return configs.Undefined, err
}
return fsMgr.GetFreezerState()
}

View File

@ -41,15 +41,8 @@ type Cgroup struct {
}
type Resources struct {
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
// Deprecated
AllowAllDevices *bool `json:"allow_all_devices,omitempty"`
// Deprecated
AllowedDevices []*Device `json:"allowed_devices,omitempty"`
// Deprecated
DeniedDevices []*Device `json:"denied_devices,omitempty"`
Devices []*Device `json:"devices"`
// Devices is the set of access rules for devices in the container.
Devices []*DeviceRule `json:"devices"`
// Memory limit (in bytes)
Memory int64 `json:"memory"`

View File

@ -1,8 +1,12 @@
package configs
import (
"errors"
"fmt"
"os"
"strconv"
"golang.org/x/sys/unix"
)
const (
@ -12,21 +16,11 @@ const (
// TODO Windows: This can be factored out in the future
type Device struct {
// Device type, block, char, etc.
Type rune `json:"type"`
DeviceRule
// Path to the device.
Path string `json:"path"`
// Major is the device's major number.
Major int64 `json:"major"`
// Minor is the device's minor number.
Minor int64 `json:"minor"`
// Cgroup permissions format, rwm.
Permissions string `json:"permissions"`
// FileMode permission bits for the device.
FileMode os.FileMode `json:"file_mode"`
@ -35,23 +29,154 @@ type Device struct {
// Gid of the device.
Gid uint32 `json:"gid"`
}
// Write the file to the allowed list
// DevicePermissions is a cgroupv1-style string to represent device access. It
// has to be a string for backward compatibility reasons, hence why it has
// methods to do set operations.
type DevicePermissions string
const (
deviceRead uint = (1 << iota)
deviceWrite
deviceMknod
)
func (p DevicePermissions) toSet() uint {
var set uint
for _, perm := range p {
switch perm {
case 'r':
set |= deviceRead
case 'w':
set |= deviceWrite
case 'm':
set |= deviceMknod
}
}
return set
}
func fromSet(set uint) DevicePermissions {
var perm string
if set&deviceRead == deviceRead {
perm += "r"
}
if set&deviceWrite == deviceWrite {
perm += "w"
}
if set&deviceMknod == deviceMknod {
perm += "m"
}
return DevicePermissions(perm)
}
// Union returns the union of the two sets of DevicePermissions.
func (p DevicePermissions) Union(o DevicePermissions) DevicePermissions {
lhs := p.toSet()
rhs := o.toSet()
return fromSet(lhs | rhs)
}
// Difference returns the set difference of the two sets of DevicePermissions.
// In set notation, A.Difference(B) gives you A\B.
func (p DevicePermissions) Difference(o DevicePermissions) DevicePermissions {
lhs := p.toSet()
rhs := o.toSet()
return fromSet(lhs &^ rhs)
}
// Intersection computes the intersection of the two sets of DevicePermissions.
func (p DevicePermissions) Intersection(o DevicePermissions) DevicePermissions {
lhs := p.toSet()
rhs := o.toSet()
return fromSet(lhs & rhs)
}
// IsEmpty returns whether the set of permissions in a DevicePermissions is
// empty.
func (p DevicePermissions) IsEmpty() bool {
return p == DevicePermissions("")
}
// IsValid returns whether the set of permissions is a subset of valid
// permissions (namely, {r,w,m}).
func (p DevicePermissions) IsValid() bool {
return p == fromSet(p.toSet())
}
type DeviceType rune
const (
WildcardDevice DeviceType = 'a'
BlockDevice DeviceType = 'b'
CharDevice DeviceType = 'c' // or 'u'
FifoDevice DeviceType = 'p'
)
func (t DeviceType) IsValid() bool {
switch t {
case WildcardDevice, BlockDevice, CharDevice, FifoDevice:
return true
default:
return false
}
}
func (t DeviceType) CanMknod() bool {
switch t {
case BlockDevice, CharDevice, FifoDevice:
return true
default:
return false
}
}
func (t DeviceType) CanCgroup() bool {
switch t {
case WildcardDevice, BlockDevice, CharDevice:
return true
default:
return false
}
}
type DeviceRule struct {
// Type of device ('c' for char, 'b' for block). If set to 'a', this rule
// acts as a wildcard and all fields other than Allow are ignored.
Type DeviceType `json:"type"`
// Major is the device's major number.
Major int64 `json:"major"`
// Minor is the device's minor number.
Minor int64 `json:"minor"`
// Permissions is the set of permissions that this rule applies to (in the
// cgroupv1 format -- any combination of "rwm").
Permissions DevicePermissions `json:"permissions"`
// Allow specifies whether this rule is allowed.
Allow bool `json:"allow"`
}
func (d *Device) CgroupString() string {
return fmt.Sprintf("%c %s:%s %s", d.Type, deviceNumberString(d.Major), deviceNumberString(d.Minor), d.Permissions)
}
func (d *Device) Mkdev() int {
return int((d.Major << 8) | (d.Minor & 0xff) | ((d.Minor & 0xfff00) << 12))
}
// deviceNumberString converts the device number to a string return result.
func deviceNumberString(number int64) string {
if number == Wildcard {
return "*"
func (d *DeviceRule) CgroupString() string {
var (
major = strconv.FormatInt(d.Major, 10)
minor = strconv.FormatInt(d.Minor, 10)
)
if d.Major == Wildcard {
major = "*"
}
return fmt.Sprint(number)
if d.Minor == Wildcard {
minor = "*"
}
return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
}
func (d *DeviceRule) Mkdev() (uint64, error) {
if d.Major == Wildcard || d.Minor == Wildcard {
return 0, errors.New("cannot mkdev() device with wildcards")
}
return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
}

View File

@ -1,111 +0,0 @@
// +build linux
package configs
var (
// DefaultSimpleDevices are devices that are to be both allowed and created.
DefaultSimpleDevices = []*Device{
// /dev/null and zero
{
Path: "/dev/null",
Type: 'c',
Major: 1,
Minor: 3,
Permissions: "rwm",
FileMode: 0666,
},
{
Path: "/dev/zero",
Type: 'c',
Major: 1,
Minor: 5,
Permissions: "rwm",
FileMode: 0666,
},
{
Path: "/dev/full",
Type: 'c',
Major: 1,
Minor: 7,
Permissions: "rwm",
FileMode: 0666,
},
// consoles and ttys
{
Path: "/dev/tty",
Type: 'c',
Major: 5,
Minor: 0,
Permissions: "rwm",
FileMode: 0666,
},
// /dev/urandom,/dev/random
{
Path: "/dev/urandom",
Type: 'c',
Major: 1,
Minor: 9,
Permissions: "rwm",
FileMode: 0666,
},
{
Path: "/dev/random",
Type: 'c',
Major: 1,
Minor: 8,
Permissions: "rwm",
FileMode: 0666,
},
}
DefaultAllowedDevices = append([]*Device{
// allow mknod for any device
{
Type: 'c',
Major: Wildcard,
Minor: Wildcard,
Permissions: "m",
},
{
Type: 'b',
Major: Wildcard,
Minor: Wildcard,
Permissions: "m",
},
{
Path: "/dev/console",
Type: 'c',
Major: 5,
Minor: 1,
Permissions: "rwm",
},
// /dev/pts/ - pts namespaces are "coming soon"
{
Path: "",
Type: 'c',
Major: 136,
Minor: Wildcard,
Permissions: "rwm",
},
{
Path: "",
Type: 'c',
Major: 5,
Minor: 2,
Permissions: "rwm",
},
// tuntap
{
Path: "",
Type: 'c',
Major: 10,
Minor: 200,
Permissions: "rwm",
},
}, DefaultSimpleDevices...)
DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...)
)

View File

@ -1847,30 +1847,11 @@ func (c *linuxContainer) runType() Status {
}
func (c *linuxContainer) isPaused() (bool, error) {
var filename, pausedState string
fcg := c.cgroupManager.Path("freezer")
if !cgroups.IsCgroup2UnifiedMode() {
if fcg == "" {
// container doesn't have a freezer cgroup
return false, nil
}
filename = "freezer.state"
pausedState = "FROZEN"
} else {
filename = "cgroup.freeze"
pausedState = "1"
}
data, err := ioutil.ReadFile(filepath.Join(fcg, filename))
state, err := c.cgroupManager.GetFreezerState()
if err != nil {
// If freezer cgroup is not mounted, the container would just be not paused.
if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) {
return false, nil
}
return false, newSystemErrorWithCause(err, "checking if container is paused")
return false, err
}
return bytes.Equal(bytes.TrimSpace(data), []byte(pausedState)), nil
return state == configs.Frozen, nil
}
func (c *linuxContainer) currentState() (*State, error) {

View File

@ -61,10 +61,15 @@ func (m *mockCgroupManager) Path(subsys string) string {
func (m *mockCgroupManager) Freeze(state configs.FreezerState) error {
return nil
}
func (m *mockCgroupManager) GetCgroups() (*configs.Cgroup, error) {
return nil, nil
}
func (m *mockCgroupManager) GetFreezerState() (configs.FreezerState, error) {
return configs.Thawed, nil
}
func (m *mockIntelRdtManager) Apply(pid int) error {
return nil
}

View File

@ -31,33 +31,33 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
}
var (
devType configs.DeviceType
mode = stat.Mode
devNumber = uint64(stat.Rdev)
major = unix.Major(devNumber)
minor = unix.Minor(devNumber)
)
if major == 0 {
return nil, ErrNotADevice
}
var (
devType rune
mode = stat.Mode
)
switch {
case mode&unix.S_IFBLK == unix.S_IFBLK:
devType = 'b'
devType = configs.BlockDevice
case mode&unix.S_IFCHR == unix.S_IFCHR:
devType = 'c'
devType = configs.CharDevice
case mode&unix.S_IFIFO == unix.S_IFIFO:
devType = configs.FifoDevice
default:
return nil, ErrNotADevice
}
return &configs.Device{
Type: devType,
Path: path,
Major: int64(major),
Minor: int64(minor),
Permissions: permissions,
FileMode: os.FileMode(mode),
Uid: stat.Uid,
Gid: stat.Gid,
DeviceRule: configs.DeviceRule{
Type: devType,
Major: int64(major),
Minor: int64(minor),
Permissions: configs.DevicePermissions(permissions),
},
Path: path,
FileMode: os.FileMode(mode),
Uid: stat.Uid,
Gid: stat.Gid,
}, nil
}

View File

@ -2,6 +2,7 @@ package integration
import (
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/specconv"
"golang.org/x/sys/unix"
)
@ -20,7 +21,10 @@ const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
// it uses a network strategy of just setting a loopback interface
// and the default setup for devices
func newTemplateConfig(rootfs string) *configs.Config {
allowAllDevices := false
var allowedDevices []*configs.DeviceRule
for _, device := range specconv.AllowedDevices {
allowedDevices = append(allowedDevices, &device.DeviceRule)
}
return &configs.Config{
Rootfs: rootfs,
Capabilities: &configs.Capabilities{
@ -116,8 +120,7 @@ func newTemplateConfig(rootfs string) *configs.Config {
Path: "integration/test",
Resources: &configs.Resources{
MemorySwappiness: nil,
AllowAllDevices: &allowAllDevices,
AllowedDevices: configs.DefaultAllowedDevices,
Devices: allowedDevices,
},
},
MaskPaths: []string{
@ -127,7 +130,7 @@ func newTemplateConfig(rootfs string) *configs.Config {
ReadonlyPaths: []string{
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
},
Devices: configs.DefaultAutoCreatedDevices,
Devices: specconv.AllowedDevices,
Hostname: "integration",
Mounts: []*configs.Mount{
{

View File

@ -606,11 +606,14 @@ func bindMountDeviceNode(dest string, node *configs.Device) error {
// Creates the device node in the rootfs of the container.
func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
if node.Path == "" {
// The node only exists for cgroup reasons, ignore it here.
return nil
}
dest := filepath.Join(rootfs, node.Path)
if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
return err
}
if bind {
return bindMountDeviceNode(dest, node)
}
@ -628,16 +631,20 @@ func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
func mknodDevice(dest string, node *configs.Device) error {
fileMode := node.FileMode
switch node.Type {
case 'c', 'u':
fileMode |= unix.S_IFCHR
case 'b':
case configs.BlockDevice:
fileMode |= unix.S_IFBLK
case 'p':
case configs.CharDevice:
fileMode |= unix.S_IFCHR
case configs.FifoDevice:
fileMode |= unix.S_IFIFO
default:
return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path)
}
if err := unix.Mknod(dest, uint32(fileMode), node.Mkdev()); err != nil {
dev, err := node.Mkdev()
if err != nil {
return err
}
if err := unix.Mknod(dest, uint32(fileMode), int(dev)); err != nil {
return err
}
return unix.Chown(dest, int(node.Uid), int(node.Gid))

View File

@ -48,104 +48,147 @@ var mountPropagationMapping = map[string]int{
"": 0,
}
// AllowedDevices is exposed for devicefilter_test.go
// AllowedDevices is the set of devices which are automatically included for
// all containers.
//
// XXX (cyphar)
// This behaviour is at the very least "questionable" (if not outright
// wrong) according to the runtime-spec.
//
// Yes, we have to include certain devices other than the ones the user
// specifies, but several devices listed here are not part of the spec
// (including "mknod for any device"?!). In addition, these rules are
// appended to the user-provided set which means that users *cannot disable
// this behaviour*.
//
// ... unfortunately I'm too scared to change this now because who knows how
// many people depend on this (incorrect and arguably insecure) behaviour.
var AllowedDevices = []*configs.Device{
// allow mknod for any device
{
Type: 'c',
Major: wildcard,
Minor: wildcard,
Permissions: "m",
Allow: true,
DeviceRule: configs.DeviceRule{
Type: configs.CharDevice,
Major: configs.Wildcard,
Minor: configs.Wildcard,
Permissions: "m",
Allow: true,
},
},
{
Type: 'b',
Major: wildcard,
Minor: wildcard,
Permissions: "m",
Allow: true,
DeviceRule: configs.DeviceRule{
Type: configs.BlockDevice,
Major: configs.Wildcard,
Minor: configs.Wildcard,
Permissions: "m",
Allow: true,
},
},
{
Type: 'c',
Path: "/dev/null",
Major: 1,
Minor: 3,
Permissions: "rwm",
Allow: true,
Path: "/dev/null",
FileMode: 0666,
Uid: 0,
Gid: 0,
DeviceRule: configs.DeviceRule{
Type: configs.CharDevice,
Major: 1,
Minor: 3,
Permissions: "rwm",
Allow: true,
},
},
{
Type: 'c',
Path: "/dev/random",
Major: 1,
Minor: 8,
Permissions: "rwm",
Allow: true,
Path: "/dev/random",
FileMode: 0666,
Uid: 0,
Gid: 0,
DeviceRule: configs.DeviceRule{
Type: configs.CharDevice,
Major: 1,
Minor: 8,
Permissions: "rwm",
Allow: true,
},
},
{
Type: 'c',
Path: "/dev/full",
Major: 1,
Minor: 7,
Permissions: "rwm",
Allow: true,
Path: "/dev/full",
FileMode: 0666,
Uid: 0,
Gid: 0,
DeviceRule: configs.DeviceRule{
Type: configs.CharDevice,
Major: 1,
Minor: 7,
Permissions: "rwm",
Allow: true,
},
},
{
Type: 'c',
Path: "/dev/tty",
Major: 5,
Minor: 0,
Permissions: "rwm",
Allow: true,
Path: "/dev/tty",
FileMode: 0666,
Uid: 0,
Gid: 0,
DeviceRule: configs.DeviceRule{
Type: configs.CharDevice,
Major: 5,
Minor: 0,
Permissions: "rwm",
Allow: true,
},
},
{
Type: 'c',
Path: "/dev/zero",
Major: 1,
Minor: 5,
Permissions: "rwm",
Allow: true,
Path: "/dev/zero",
FileMode: 0666,
Uid: 0,
Gid: 0,
DeviceRule: configs.DeviceRule{
Type: configs.CharDevice,
Major: 1,
Minor: 5,
Permissions: "rwm",
Allow: true,
},
},
{
Type: 'c',
Path: "/dev/urandom",
Major: 1,
Minor: 9,
Permissions: "rwm",
Allow: true,
},
{
Path: "/dev/console",
Type: 'c',
Major: 5,
Minor: 1,
Permissions: "rwm",
Allow: true,
Path: "/dev/urandom",
FileMode: 0666,
Uid: 0,
Gid: 0,
DeviceRule: configs.DeviceRule{
Type: configs.CharDevice,
Major: 1,
Minor: 9,
Permissions: "rwm",
Allow: true,
},
},
// /dev/pts/ - pts namespaces are "coming soon"
{
Path: "",
Type: 'c',
Major: 136,
Minor: wildcard,
Permissions: "rwm",
Allow: true,
DeviceRule: configs.DeviceRule{
Type: configs.CharDevice,
Major: 136,
Minor: configs.Wildcard,
Permissions: "rwm",
Allow: true,
},
},
{
Path: "",
Type: 'c',
Major: 5,
Minor: 2,
Permissions: "rwm",
Allow: true,
DeviceRule: configs.DeviceRule{
Type: configs.CharDevice,
Major: 5,
Minor: 2,
Permissions: "rwm",
Allow: true,
},
},
// tuntap
{
Path: "",
Type: 'c',
Major: 10,
Minor: 200,
Permissions: "rwm",
Allow: true,
DeviceRule: configs.DeviceRule{
Type: configs.CharDevice,
Major: 10,
Minor: 200,
Permissions: "rwm",
Allow: true,
},
},
}
@ -420,7 +463,6 @@ func CreateCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
// In rootless containers, any attempt to make cgroup changes is likely to fail.
// libcontainer will validate this but ignores the error.
c.Resources.AllowedDevices = AllowedDevices
if spec.Linux != nil {
r := spec.Linux.Resources
if r != nil {
@ -446,14 +488,13 @@ func CreateCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
if err != nil {
return nil, err
}
dd := &configs.Device{
c.Resources.Devices = append(c.Resources.Devices, &configs.DeviceRule{
Type: dt,
Major: major,
Minor: minor,
Permissions: d.Access,
Permissions: configs.DevicePermissions(d.Access),
Allow: d.Allow,
}
c.Resources.Devices = append(c.Resources.Devices, dd)
})
}
if r.Memory != nil {
if r.Memory.Limit != nil {
@ -578,98 +619,48 @@ func CreateCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
}
}
}
// append the default allowed devices to the end of the list
c.Resources.Devices = append(c.Resources.Devices, AllowedDevices...)
// Append the default allowed devices to the end of the list.
// XXX: Really this should be prefixed...
for _, device := range AllowedDevices {
c.Resources.Devices = append(c.Resources.Devices, &device.DeviceRule)
}
return c, nil
}
func stringToCgroupDeviceRune(s string) (rune, error) {
func stringToCgroupDeviceRune(s string) (configs.DeviceType, error) {
switch s {
case "a":
return 'a', nil
return configs.WildcardDevice, nil
case "b":
return 'b', nil
return configs.BlockDevice, nil
case "c":
return 'c', nil
return configs.CharDevice, nil
default:
return 0, fmt.Errorf("invalid cgroup device type %q", s)
}
}
func stringToDeviceRune(s string) (rune, error) {
func stringToDeviceRune(s string) (configs.DeviceType, error) {
switch s {
case "p":
return 'p', nil
case "u":
return 'u', nil
return configs.FifoDevice, nil
case "u", "c":
return configs.CharDevice, nil
case "b":
return 'b', nil
case "c":
return 'c', nil
return configs.BlockDevice, nil
default:
return 0, fmt.Errorf("invalid device type %q", s)
}
}
func createDevices(spec *specs.Spec, config *configs.Config) error {
// add whitelisted devices
config.Devices = []*configs.Device{
{
Type: 'c',
Path: "/dev/null",
Major: 1,
Minor: 3,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/random",
Major: 1,
Minor: 8,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/full",
Major: 1,
Minor: 7,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/tty",
Major: 5,
Minor: 0,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/zero",
Major: 1,
Minor: 5,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/urandom",
Major: 1,
Minor: 9,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
// Add default set of devices.
for _, device := range AllowedDevices {
if device.Path != "" {
config.Devices = append(config.Devices, device)
}
}
// merge in additional devices from the spec
// Merge in additional devices from the spec.
if spec.Linux != nil {
for _, d := range spec.Linux.Devices {
var uid, gid uint32
@ -689,10 +680,12 @@ func createDevices(spec *specs.Spec, config *configs.Config) error {
filemode = *d.FileMode
}
device := &configs.Device{
Type: dt,
DeviceRule: configs.DeviceRule{
Type: dt,
Major: d.Major,
Minor: d.Minor,
},
Path: d.Path,
Major: d.Major,
Minor: d.Minor,
FileMode: filemode,
Uid: uid,
Gid: gid,

View File

@ -126,12 +126,19 @@ function init_cgroup_paths() {
if stat -f -c %t /sys/fs/cgroup | grep -qFw 63677270; then
CGROUP_UNIFIED=yes
# "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers.
# - devices (since kernel 4.15)
# - freezer (since kernel 5.2)
# Assume these are always available, as it is hard to detect
CGROUP_SUBSYSTEMS=$(cat /sys/fs/cgroup/cgroup.controllers; echo devices freezer)
# - devices (since kernel 4.15) we must assume to be supported because
# it's quite hard to test.
# - freezer (since kernel 5.2) we can auto-detect by looking for the
# "cgroup.freeze" file a *non-root* cgroup.
CGROUP_SUBSYSTEMS=$(cat /sys/fs/cgroup/cgroup.controllers; echo devices)
CGROUP_BASE_PATH=/sys/fs/cgroup
CGROUP_PATH=${CGROUP_BASE_PATH}${REL_CGROUPS_PATH}
# Find any cgroup.freeze files...
if [ -n "$(find "$CGROUP_BASE_PATH" -type f -name "cgroup.freeze" -print -quit)" ]
then
CGROUP_SUBSYSTEMS+=" freezer"
fi
else
CGROUP_UNIFIED=no
CGROUP_SUBSYSTEMS=$(awk '!/^#/ {print $1}' /proc/cgroups)
@ -200,75 +207,85 @@ function fail() {
# support it, the test is skipped with a message.
function requires() {
for var in "$@"; do
local skip_me
case $var in
criu)
if [ ! -e "$CRIU" ]; then
skip "test requires ${var}"
skip_me=1
fi
;;
root)
if [ "$ROOTLESS" -ne 0 ]; then
skip "test requires ${var}"
skip_me=1
fi
;;
rootless)
if [ "$ROOTLESS" -eq 0 ]; then
skip "test requires ${var}"
skip_me=1
fi
;;
rootless_idmap)
if [[ "$ROOTLESS_FEATURES" != *"idmap"* ]]; then
skip "test requires ${var}"
skip_me=1
fi
;;
rootless_cgroup)
if [[ "$ROOTLESS_FEATURES" != *"cgroup"* ]]; then
skip "test requires ${var}"
skip_me=1
fi
;;
rootless_no_cgroup)
if [[ "$ROOTLESS_FEATURES" == *"cgroup"* ]]; then
skip "test requires ${var}"
skip_me=1
fi
;;
cgroups_freezer)
init_cgroup_paths
if [[ "$CGROUP_SUBSYSTEMS" != *"freezer"* ]]; then
skip_me=1
fi
;;
cgroups_kmem)
init_cgroup_paths
if [ ! -e "${CGROUP_MEMORY_BASE_PATH}/memory.kmem.limit_in_bytes" ]; then
skip "Test requires ${var}"
skip_me=1
fi
;;
cgroups_rt)
init_cgroup_paths
if [ ! -e "${CGROUP_CPU_BASE_PATH}/cpu.rt_period_us" ]; then
skip "Test requires ${var}"
skip_me=1
fi
;;
cgroups_v1)
init_cgroup_paths
if [ "$CGROUP_UNIFIED" != "no" ]; then
skip "Test requires cgroups v1"
skip_me=1
fi
;;
cgroups_v2)
init_cgroup_paths
if [ "$CGROUP_UNIFIED" != "yes" ]; then
skip "Test requires cgroups v2 (unified)"
skip_me=1
fi
;;
systemd)
if [ -z "${RUNC_USE_SYSTEMD}" ]; then
skip "Test requires systemd"
skip_me=1
fi
;;
no_systemd)
if [ -n "${RUNC_USE_SYSTEMD}" ]; then
skip "Test requires no systemd"
skip_me=1
fi
;;
*)
fail "BUG: Invalid requires ${var}."
fail "BUG: Invalid requires $var."
;;
esac
if [ -n "$skip_me" ]; then
skip "test requires $var"
fi
done
}

View File

@ -12,8 +12,12 @@ function teardown() {
}
@test "runc pause and resume" {
# XXX: currently cgroups require root containers.
requires root
if [[ "$ROOTLESS" -ne 0 ]]
then
requires rootless_cgroup
set_cgroups_path "$BUSYBOX_BUNDLE"
fi
requires cgroups_freezer
# run busybox detached
runc run -d --console-socket $CONSOLE_SOCKET test_busybox
@ -37,8 +41,12 @@ function teardown() {
}
@test "runc pause and resume with nonexist container" {
# XXX: currently cgroups require root containers.
requires root
if [[ "$ROOTLESS" -ne 0 ]]
then
requires rootless_cgroup
set_cgroups_path "$BUSYBOX_BUNDLE"
fi
requires cgroups_freezer
# run test_busybox detached
runc run -d --console-socket $CONSOLE_SOCKET test_busybox

View File

@ -312,3 +312,70 @@ EOF
check_cgroup_value "cpu.rt_period_us" 900001
check_cgroup_value "cpu.rt_runtime_us" 600001
}
@test "update devices [minimal transition rules]" {
[[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
# This test currently only makes sense on cgroupv1.
requires cgroups_v1
# Run a basic shell script that tries to write to /dev/null. If "runc
# update" makes use of minimal transition rules, updates should not cause
# writes to fail at any point.
jq '.process.args = ["sh", "-c", "while true; do echo >/dev/null; done"]' config.json > config.json.tmp
mv config.json{.tmp,}
# Set up a temporary console socket and recvtty so we can get the stdio.
TMP_RECVTTY_DIR="$(mktemp -d "$BATS_TMPDIR/runc-tmp-recvtty.XXXXXX")"
TMP_RECVTTY_PID="$TMP_RECVTTY_DIR/recvtty.pid"
TMP_CONSOLE_SOCKET="$TMP_RECVTTY_DIR/console.sock"
CONTAINER_OUTPUT="$TMP_RECVTTY_DIR/output"
("$RECVTTY" --no-stdin --pid-file "$TMP_RECVTTY_PID" \
--mode single "$TMP_CONSOLE_SOCKET" &>"$CONTAINER_OUTPUT" ) &
retry 10 0.1 [ -e "$TMP_CONSOLE_SOCKET" ]
# Run the container in the background.
runc run -d --console-socket "$TMP_CONSOLE_SOCKET" test_update
cat "$CONTAINER_OUTPUT"
[ "$status" -eq 0 ]
# Trigger an update. This update doesn't actually change the device rules,
# but it will trigger the devices cgroup code to reapply the current rules.
# We trigger the update a few times to make sure we hit the race.
for _ in {1..12}
do
# TODO: Update "runc update" so we can change the device rules.
runc update --pids-limit 30 test_update
[ "$status" -eq 0 ]
done
# Kill recvtty.
kill -9 "$(<"$TMP_RECVTTY_PID")"
# There should've been no output from the container.
cat "$CONTAINER_OUTPUT"
[ -z "$(<"$CONTAINER_OUTPUT")" ]
}
@test "update paused container" {
[[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
requires cgroups_freezer
# Run the container in the background.
runc run -d --console-socket "$CONSOLE_SOCKET" test_update
[ "$status" -eq 0 ]
# Pause the container.
runc pause test_update
[ "$status" -eq 0 ]
# Trigger an unrelated update.
runc update --pids-limit 30 test_update
[ "$status" -eq 0 ]
# The container should still be paused.
testcontainer test_update paused
# Resume the container.
runc resume test_update
[ "$status" -eq 0 ]
}