diff --git a/Makefile b/Makefile index b82884af6..5fff5151b 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ SOURCES := $(shell find . 2>&1 | grep -E '.*\.(c|h|go)$$') PREFIX := $(DESTDIR)/usr/local -BINDIR := $(PREFIX)/sbin +BINDIR := $(PREFIX)/bin GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g") RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN)) diff --git a/checkpoint.go b/checkpoint.go index dd7704f61..78977d71a 100644 --- a/checkpoint.go +++ b/checkpoint.go @@ -39,6 +39,11 @@ checkpointed.`, if err := checkArgs(context, 1, exactArgs); err != nil { return err } + // XXX: Currently this is untested with rootless containers. + if isRootless() { + return fmt.Errorf("runc checkpoint requires root") + } + container, err := getContainer(context) if err != nil { return err diff --git a/exec.go b/exec.go index 84061e6b7..22f2689ab 100644 --- a/exec.go +++ b/exec.go @@ -90,9 +90,6 @@ following will output a list of processes running in the container: if err := checkArgs(context, 1, minArgs); err != nil { return err } - if os.Geteuid() != 0 { - return fmt.Errorf("runc should be run as root") - } if err := revisePidFile(context); err != nil { return err } diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index 890cd7d19..98f4b8585 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -183,6 +183,9 @@ type Config struct { // NoNewKeyring will not allocated a new session keyring for the container. It will use the // callers keyring in this case. NoNewKeyring bool `json:"no_new_keyring"` + + // Rootless specifies whether the container is a rootless container. 
+ Rootless bool `json:"rootless"` } type Hooks struct { diff --git a/libcontainer/configs/validate/rootless.go b/libcontainer/configs/validate/rootless.go new file mode 100644 index 000000000..1e83cedd0 --- /dev/null +++ b/libcontainer/configs/validate/rootless.go @@ -0,0 +1,117 @@ +package validate + +import ( + "fmt" + "os" + "reflect" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ( + geteuid = os.Geteuid + getegid = os.Getegid +) + +func (v *ConfigValidator) rootless(config *configs.Config) error { + if err := rootlessMappings(config); err != nil { + return err + } + if err := rootlessMount(config); err != nil { + return err + } + // Currently, cgroups cannot effectively be used in rootless containers. + // The new cgroup namespace doesn't really help us either because it doesn't + // have nice interactions with the user namespace (we're working with upstream + // to fix this). + if err := rootlessCgroup(config); err != nil { + return err + } + + // XXX: We currently can't verify the user config at all, because + // configs.Config doesn't store the user-related configs. So this + // has to be verified by setupUser() in init_linux.go. + + return nil +} + +func rootlessMappings(config *configs.Config) error { + rootuid, err := config.HostUID() + if err != nil { + return fmt.Errorf("failed to get root uid from uidMappings: %v", err) + } + if euid := geteuid(); euid != 0 { + if !config.Namespaces.Contains(configs.NEWUSER) { + return fmt.Errorf("rootless containers require user namespaces") + } + if rootuid != euid { + return fmt.Errorf("rootless containers cannot map container root to a different host user") + } + } + + rootgid, err := config.HostGID() + if err != nil { + return fmt.Errorf("failed to get root gid from gidMappings: %v", err) + } + + // Similar to the above test, we need to make sure that we aren't trying to + // map to a group ID that we don't have the right to be. 
+ if rootgid != getegid() { + return fmt.Errorf("rootless containers cannot map container root to a different host group") + } + + // We can only map one user and group inside a container (our own). + if len(config.UidMappings) != 1 || config.UidMappings[0].Size != 1 { + return fmt.Errorf("rootless containers cannot map more than one user") + } + if len(config.GidMappings) != 1 || config.GidMappings[0].Size != 1 { + return fmt.Errorf("rootless containers cannot map more than one group") + } + + return nil +} + +// rootlessCgroup verifies that the user isn't trying to set any cgroup resource limits. +func rootlessCgroup(config *configs.Config) error { + // Nothing set at all. + if config.Cgroups == nil || config.Cgroups.Resources == nil { + return nil + } + + // Used for comparing to the zero value. + left := reflect.ValueOf(*config.Cgroups.Resources) + right := reflect.Zero(left.Type()) + + // This is all we need to do, since specconv won't add cgroup options in + // rootless mode. + if !reflect.DeepEqual(left.Interface(), right.Interface()) { + return fmt.Errorf("cannot specify resource limits in rootless container") + } + + return nil +} + +// rootlessMount verifies that the user isn't trying to set up any mounts they don't have +// the rights to do. In addition, it makes sure that no mount has a `uid=` or +// `gid=` option that doesn't resolve to root. +func rootlessMount(config *configs.Config) error { + // XXX: We could whitelist allowed devices at this point, but I'm not + // convinced that's a good idea. The kernel is the best arbiter of + // access control. + + for _, mount := range config.Mounts { + // Check that the options list doesn't contain any uid= or gid= entries + // that don't resolve to root. 
+ for _, opt := range strings.Split(mount.Data, ",") { + if strings.HasPrefix(opt, "uid=") && opt != "uid=0" { + return fmt.Errorf("cannot specify uid= mount options in rootless containers where argument isn't 0") + } + if strings.HasPrefix(opt, "gid=") && opt != "gid=0" { + return fmt.Errorf("cannot specify gid= mount options in rootless containers where argument isn't 0") + } + } + } + + return nil +} diff --git a/libcontainer/configs/validate/rootless_test.go b/libcontainer/configs/validate/rootless_test.go new file mode 100644 index 000000000..23d678d97 --- /dev/null +++ b/libcontainer/configs/validate/rootless_test.go @@ -0,0 +1,195 @@ +package validate + +import ( + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +func init() { + geteuid = func() int { return 1337 } + getegid = func() int { return 7331 } +} + +func rootlessConfig() *configs.Config { + return &configs.Config{ + Rootfs: "/var", + Rootless: true, + Namespaces: configs.Namespaces( + []configs.Namespace{ + {Type: configs.NEWUSER}, + }, + ), + UidMappings: []configs.IDMap{ + { + HostID: geteuid(), + ContainerID: 0, + Size: 1, + }, + }, + GidMappings: []configs.IDMap{ + { + HostID: getegid(), + ContainerID: 0, + Size: 1, + }, + }, + } +} + +func TestValidateRootless(t *testing.T) { + validator := New() + + config := rootlessConfig() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur: %+v", err) + } +} + +/* rootlessMappings() */ + +func TestValidateRootlessUserns(t *testing.T) { + validator := New() + + config := rootlessConfig() + config.Namespaces = nil + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if user namespaces not set") + } +} + +func TestValidateRootlessMappingUid(t *testing.T) { + validator := New() + + config := rootlessConfig() + config.UidMappings = nil + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if no uid mappings provided") + } + + 
config = rootlessConfig() + config.UidMappings[0].HostID = geteuid() + 1 + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if geteuid() != mapped uid") + } + + config = rootlessConfig() + config.UidMappings[0].Size = 1024 + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if more than one uid mapped") + } + + config = rootlessConfig() + config.UidMappings = append(config.UidMappings, configs.IDMap{ + HostID: geteuid() + 1, + ContainerID: 0, + Size: 1, + }) + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if more than one uid extent mapped") + } +} + +func TestValidateRootlessMappingGid(t *testing.T) { + validator := New() + + config := rootlessConfig() + config.GidMappings = nil + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if no gid mappings provided") + } + + config = rootlessConfig() + config.GidMappings[0].HostID = getegid() + 1 + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if getegid() != mapped gid") + } + + config = rootlessConfig() + config.GidMappings[0].Size = 1024 + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if more than one gid mapped") + } + + config = rootlessConfig() + config.GidMappings = append(config.GidMappings, configs.IDMap{ + HostID: getegid() + 1, + ContainerID: 0, + Size: 1, + }) + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if more than one gid extent mapped") + } +} + +/* rootlessMount() */ + +func TestValidateRootlessMountUid(t *testing.T) { + config := rootlessConfig() + validator := New() + + config.Mounts = []*configs.Mount{ + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + }, + } + + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when uid= not set in mount options: %+v", err) + } + + 
config.Mounts[0].Data = "uid=5" + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur when setting uid=5 in mount options") + } + + config.Mounts[0].Data = "uid=0" + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when setting uid=0 in mount options: %+v", err) + } +} + +func TestValidateRootlessMountGid(t *testing.T) { + config := rootlessConfig() + validator := New() + + config.Mounts = []*configs.Mount{ + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + }, + } + + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when gid= not set in mount options: %+v", err) + } + + config.Mounts[0].Data = "gid=5" + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur when setting gid=5 in mount options") + } + + config.Mounts[0].Data = "gid=0" + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when setting gid=0 in mount options: %+v", err) + } +} + +/* rootlessCgroup() */ + +func TestValidateRootlessCgroup(t *testing.T) { + validator := New() + + config := rootlessConfig() + config.Cgroups = &configs.Cgroup{ + Resources: &configs.Resources{ + PidsLimit: 1337, + }, + } + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if cgroup limits set") + } +} diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go index f076f506a..0dd580ac9 100644 --- a/libcontainer/configs/validate/validator.go +++ b/libcontainer/configs/validate/validator.go @@ -40,6 +40,11 @@ func (v *ConfigValidator) Validate(config *configs.Config) error { if err := v.sysctl(config); err != nil { return err } + if config.Rootless { + if err := v.rootless(config); err != nil { + return err + } + } return nil } diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index da685402e..c3dd42d27 100644 
--- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -51,6 +51,9 @@ type State struct { // Platform specific fields below here + // Specifies if the container was started under the rootless mode. + Rootless bool `json:"rootless"` + // Path to all the cgroups setup for a container. Key is cgroup subsystem name // with the value as the path. CgroupPaths map[string]string `json:"cgroup_paths"` @@ -452,6 +455,7 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { PassedFilesCount: len(process.ExtraFiles), ContainerId: c.ID(), NoNewPrivileges: c.config.NoNewPrivileges, + Rootless: c.config.Rootless, AppArmorProfile: c.config.AppArmorProfile, ProcessLabel: c.config.ProcessLabel, Rlimits: c.config.Rlimits, @@ -622,6 +626,13 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() + // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has + // support for doing unprivileged dumps, but the setup of + // rootless containers might make this complicated. + if c.config.Rootless { + return fmt.Errorf("cannot checkpoint a rootless container") + } + if err := c.checkCriuVersion("1.5.2"); err != nil { return err } @@ -791,6 +802,13 @@ func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() + + // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have + // support for unprivileged restore at the moment. + if c.config.Rootless { + return fmt.Errorf("cannot restore a rootless container") + } + if err := c.checkCriuVersion("1.5.2"); err != nil { return err } @@ -918,6 +936,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { } func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { + // XXX: Do we need to deal with this case? AFAIK criu still requires root. 
if err := c.cgroupManager.Apply(pid); err != nil { return err } @@ -1314,6 +1333,7 @@ func (c *linuxContainer) currentState() (*State, error) { InitProcessStartTime: startTime, Created: c.created, }, + Rootless: c.config.Rootless, CgroupPaths: c.cgroupManager.GetPaths(), NamespacePaths: make(map[configs.NamespaceType]string), ExternalDescriptors: externalDescriptors, @@ -1441,16 +1461,19 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na Type: GidmapAttr, Value: b, }) - // check if we have CAP_SETGID to setgroup properly - pid, err := capability.NewPid(os.Getpid()) - if err != nil { - return nil, err - } - if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { - r.AddData(&Boolmsg{ - Type: SetgroupAttr, - Value: true, - }) + // The following only applies if we are root. + if !c.config.Rootless { + // check if we have CAP_SETGID to setgroup properly + pid, err := capability.NewPid(os.Getpid()) + if err != nil { + return nil, err + } + if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { + r.AddData(&Boolmsg{ + Type: SetgroupAttr, + Value: true, + }) + } } } } @@ -1461,5 +1484,11 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)), }) + // write rootless + r.AddData(&Boolmsg{ + Type: RootlessAttr, + Value: c.config.Rootless, + }) + return bytes.NewReader(r.Serialize()), nil } diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 0f5d412ac..118783516 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -58,6 +58,7 @@ type initConfig struct { ContainerId string `json:"containerid"` Rlimits []configs.Rlimit `json:"rlimits"` CreateConsole bool `json:"create_console"` + Rootless bool `json:"rootless"` } type initer interface { @@ -229,18 +230,21 @@ func syncParentHooks(pipe io.ReadWriter) error { func setupUser(config *initConfig) error { // Set up defaults. 
defaultExecUser := user.ExecUser{ - Uid: syscall.Getuid(), - Gid: syscall.Getgid(), + Uid: 0, + Gid: 0, Home: "/", } + passwdPath, err := user.GetPasswdPath() if err != nil { return err } + groupPath, err := user.GetGroupPath() if err != nil { return err } + execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath) if err != nil { return err @@ -253,22 +257,49 @@ func setupUser(config *initConfig) error { return err } } + + if config.Rootless { + if execUser.Uid != 0 { + return fmt.Errorf("cannot run as a non-root user in a rootless container") + } + + if execUser.Gid != 0 { + return fmt.Errorf("cannot run as a non-root group in a rootless container") + } + + // We cannot set any additional groups in a rootless container and thus we + // bail if the user asked us to do so. TODO: We currently can't do this + // earlier, but if libcontainer.Process.User was typesafe this might work. + if len(addGroups) > 0 { + return fmt.Errorf("cannot set any additional groups in a rootless container") + } + } + // before we change to the container's user make sure that the processes STDIO // is correctly owned by the user that we are switching to. if err := fixStdioPermissions(execUser); err != nil { return err } - suppGroups := append(execUser.Sgids, addGroups...) - if err := syscall.Setgroups(suppGroups); err != nil { - return err + + // This isn't allowed in an unprivileged user namespace since Linux 3.19. + // There's nothing we can do about /etc/group entries, so we silently + // ignore setting groups here (since the user didn't explicitly ask us to + // set the group). + if !config.Rootless { + suppGroups := append(execUser.Sgids, addGroups...) 
+ if err := syscall.Setgroups(suppGroups); err != nil { + return err + } } if err := system.Setgid(execUser.Gid); err != nil { return err } + if err := system.Setuid(execUser.Uid); err != nil { return err } + // if we didn't get HOME already, set it based on the user's HOME if envHome := os.Getenv("HOME"); envHome == "" { if err := os.Setenv("HOME", execUser.Home); err != nil { diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 321d6642f..bc725a227 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -18,6 +18,7 @@ const ( GidmapAttr uint16 = 27284 SetgroupAttr uint16 = 27285 OomScoreAdjAttr uint16 = 27286 + RootlessAttr uint16 = 27287 // When syscall.NLA_HDRLEN is in gccgo, take this out. syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1) diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 9630206e3..0ad688343 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -72,6 +72,7 @@ struct nlconfig_t { char *namespaces; size_t namespaces_len; uint8_t is_setgroup; + uint8_t is_rootless; char *oom_score_adj; size_t oom_score_adj_len; }; @@ -87,6 +88,7 @@ struct nlconfig_t { #define GIDMAP_ATTR 27284 #define SETGROUP_ATTR 27285 #define OOM_SCORE_ADJ_ATTR 27286 +#define ROOTLESS_ATTR 27287 /* * Use the raw syscall for versions of glibc which don't include a function for @@ -175,6 +177,7 @@ static void update_setgroups(int pid, enum policy_t setgroup) policy = "deny"; break; case SETGROUPS_DEFAULT: + default: /* Nothing to do. 
*/ return; } @@ -329,6 +332,9 @@ static void nl_parse(int fd, struct nlconfig_t *config) case CLONE_FLAGS_ATTR: config->cloneflags = readint32(current); break; + case ROOTLESS_ATTR: + config->is_rootless = readint8(current); + break; case OOM_SCORE_ADJ_ATTR: config->oom_score_adj = current; config->oom_score_adj_len = payload_len; @@ -574,9 +580,21 @@ void nsexec(void) exit(ret); case SYNC_USERMAP_PLS: - /* Enable setgroups(2) if we've been asked to. */ + /* + * Enable setgroups(2) if we've been asked to. But we also + * have to explicitly disable setgroups(2) if we're + * creating a rootless container (this is required since + * Linux 3.19). + */ + if (config.is_rootless && config.is_setgroup) { + kill(child, SIGKILL); + bail("cannot allow setgroup in an unprivileged user namespace setup"); + } + if (config.is_setgroup) update_setgroups(child, SETGROUPS_ALLOW); + if (config.is_rootless) + update_setgroups(child, SETGROUPS_DENY); /* Set up mappings. */ update_uidmap(child, config.uidmap, config.uidmap_len); @@ -818,8 +836,10 @@ void nsexec(void) if (setgid(0) < 0) bail("setgid failed"); - if (setgroups(0, NULL) < 0) - bail("setgroups failed"); + if (!config.is_rootless && config.is_setgroup) { + if (setgroups(0, NULL) < 0) + bail("setgroups failed"); + } s = SYNC_CHILD_READY; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index c60f47301..e8b7506db 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -80,7 +80,8 @@ func (p *setnsProcess) start() (err error) { if err = p.execSetns(); err != nil { return newSystemErrorWithCause(err, "executing setns process") } - if len(p.cgroupPaths) > 0 { + // We can't join cgroups if we're in a rootless container. 
+ if !p.config.Rootless && len(p.cgroupPaths) > 0 { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } @@ -253,13 +254,15 @@ func (p *initProcess) start() error { return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid()) } p.setExternalDescriptors(fds) - // Do this before syncing with child so that no children - // can escape the cgroup - if err := p.manager.Apply(p.pid()); err != nil { - return newSystemErrorWithCause(err, "applying cgroup configuration for process") + if !p.container.config.Rootless { + // Do this before syncing with child so that no children can escape the + // cgroup. We can't do this if we're not running as root. + if err := p.manager.Apply(p.pid()); err != nil { + return newSystemErrorWithCause(err, "applying cgroup configuration for process") + } } defer func() { - if err != nil { + if err != nil && !p.container.config.Rootless { // TODO: should not be the responsibility to call here p.manager.Destroy() } @@ -278,8 +281,11 @@ func (p *initProcess) start() error { ierr := parseSync(p.parentPipe, func(sync *syncT) error { switch sync.Type { case procReady: - if err := p.manager.Set(p.config.Config); err != nil { - return newSystemErrorWithCause(err, "setting cgroup config for ready process") + // We can't set cgroups if we're in a rootless container. + if !p.container.config.Rootless { + if err := p.manager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting cgroup config for ready process") + } } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace @@ -424,6 +430,12 @@ func getPipeFds(pid int) ([]string, error) { f := filepath.Join(dirPath, strconv.Itoa(i)) target, err := os.Readlink(f) if err != nil { + // Ignore permission errors, for rootless containers and other + // non-dumpable processes. 
if we can't get the fd for a particular + // file, there's not much we can do. + if os.IsPermission(err) { + continue + } return fds, err } fds[i] = target diff --git a/libcontainer/specconv/example.go b/libcontainer/specconv/example.go new file mode 100644 index 000000000..44fad97e5 --- /dev/null +++ b/libcontainer/specconv/example.go @@ -0,0 +1,160 @@ +package specconv + +import ( + "runtime" + + "github.com/opencontainers/runtime-spec/specs-go" +) + +func sPtr(s string) *string { return &s } + +// ExampleSpec returns an example spec file, with many options set so a user +// can see what a standard spec file looks like. +func ExampleSpec() *specs.Spec { + return &specs.Spec{ + Version: specs.Version, + Platform: specs.Platform{ + OS: runtime.GOOS, + Arch: runtime.GOARCH, + }, + Root: specs.Root{ + Path: "rootfs", + Readonly: true, + }, + Process: specs.Process{ + Terminal: true, + User: specs.User{}, + Args: []string{ + "sh", + }, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + }, + Cwd: "/", + NoNewPrivileges: true, + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Permitted: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Inheritable: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Ambient: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Effective: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + }, + Rlimits: []specs.LinuxRlimit{ + { + Type: "RLIMIT_NOFILE", + Hard: uint64(1024), + Soft: uint64(1024), + }, + }, + }, + Hostname: "runc", + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "proc", + Source: "proc", + Options: nil, + }, + { + Destination: "/dev", + Type: "tmpfs", + Source: "tmpfs", + Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, + }, + { + Destination: 
"/dev/pts", + Type: "devpts", + Source: "devpts", + Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"}, + }, + { + Destination: "/dev/shm", + Type: "tmpfs", + Source: "shm", + Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"}, + }, + { + Destination: "/dev/mqueue", + Type: "mqueue", + Source: "mqueue", + Options: []string{"nosuid", "noexec", "nodev"}, + }, + { + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{"nosuid", "noexec", "nodev", "ro"}, + }, + { + Destination: "/sys/fs/cgroup", + Type: "cgroup", + Source: "cgroup", + Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"}, + }, + }, + Linux: &specs.Linux{ + MaskedPaths: []string{ + "/proc/kcore", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + }, + ReadonlyPaths: []string{ + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger", + }, + Resources: &specs.LinuxResources{ + Devices: []specs.LinuxDeviceCgroup{ + { + Allow: false, + Access: "rwm", + }, + }, + }, + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + { + Type: "network", + }, + { + Type: "ipc", + }, + { + Type: "uts", + }, + { + Type: "mount", + }, + }, + }, + } +} diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 52b3ca112..346b2689d 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -145,6 +145,7 @@ type CreateOpts struct { NoPivotRoot bool NoNewKeyring bool Spec *specs.Spec + Rootless bool } // CreateLibcontainerConfig creates a new libcontainer configuration from a @@ -175,6 +176,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { Hostname: spec.Hostname, Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)), NoNewKeyring: opts.NoNewKeyring, + Rootless: opts.Rootless, } exists := false @@ -208,7 +210,7 @@ func 
CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { if err := setupUserNamespace(spec, config); err != nil { return nil, err } - c, err := createCgroupConfig(opts.CgroupName, opts.UseSystemdCgroup, spec) + c, err := createCgroupConfig(opts) if err != nil { return nil, err } @@ -264,8 +266,14 @@ func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount { } } -func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (*configs.Cgroup, error) { - var myCgroupPath string +func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { + var ( + myCgroupPath string + + spec = opts.Spec + useSystemdCgroup = opts.UseSystemdCgroup + name = opts.CgroupName + ) c := &configs.Cgroup{ Resources: &configs.Resources{}, @@ -301,9 +309,14 @@ func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (* c.Path = myCgroupPath } - c.Resources.AllowedDevices = allowedDevices - if spec.Linux == nil { - return c, nil + // In rootless containers, any attempt to make cgroup changes will fail. + // libcontainer will validate this, so only add the default devices as root. + // The spec.Linux nil check runs in both modes: it guards the dereference below. + if !opts.Rootless { + c.Resources.AllowedDevices = allowedDevices + } + if spec.Linux == nil { + return c, nil } r := spec.Linux.Resources if r == nil { @@ -340,8 +353,10 @@ func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (* } c.Resources.Devices = append(c.Resources.Devices, dd) } - // append the default allowed devices to the end of the list - c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) + if !opts.Rootless { + // append the default allowed devices to the end of the list + c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) 
+ } if r.Memory != nil { if r.Memory.Limit != nil { c.Resources.Memory = *r.Memory.Limit diff --git a/libcontainer/specconv/spec_linux_test.go b/libcontainer/specconv/spec_linux_test.go index baa2638ad..741fae630 100644 --- a/libcontainer/specconv/spec_linux_test.go +++ b/libcontainer/specconv/spec_linux_test.go @@ -3,8 +3,10 @@ package specconv import ( + "os" "testing" + "github.com/opencontainers/runc/libcontainer/configs/validate" "github.com/opencontainers/runtime-spec/specs-go" ) @@ -16,7 +18,13 @@ func TestLinuxCgroupsPathSpecified(t *testing.T) { CgroupsPath: cgroupsPath, } - cgroup, err := createCgroupConfig("ContainerID", false, spec) + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } + + cgroup, err := createCgroupConfig(opts) if err != nil { t.Errorf("Couldn't create Cgroup config: %v", err) } @@ -28,8 +36,13 @@ func TestLinuxCgroupsPathSpecified(t *testing.T) { func TestLinuxCgroupsPathNotSpecified(t *testing.T) { spec := &specs.Spec{} + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } - cgroup, err := createCgroupConfig("ContainerID", false, spec) + cgroup, err := createCgroupConfig(opts) if err != nil { t.Errorf("Couldn't create Cgroup config: %v", err) } @@ -39,6 +52,26 @@ func TestLinuxCgroupsPathNotSpecified(t *testing.T) { } } +func TestSpecconvExampleValidate(t *testing.T) { + spec := ExampleSpec() + spec.Root.Path = "/" + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } + + config, err := CreateLibcontainerConfig(opts) + if err != nil { + t.Errorf("Couldn't create libcontainer config: %v", err) + } + + validator := validate.New() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected specconv to produce valid container config: %v", err) + } +} + func TestDupNamespaces(t *testing.T) { spec := &specs.Spec{ Linux: &specs.Linux{ @@ -62,3 +95,46 @@ func TestDupNamespaces(t *testing.T) { 
t.Errorf("Duplicated namespaces should be forbidden") } } + +func TestRootlessSpecconvValidate(t *testing.T) { + spec := &specs.Spec{ + Linux: &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: specs.UserNamespace, + }, + }, + UIDMappings: []specs.LinuxIDMapping{ + { + HostID: uint32(os.Geteuid()), + ContainerID: 0, + Size: 1, + }, + }, + GIDMappings: []specs.LinuxIDMapping{ + { + HostID: uint32(os.Getegid()), + ContainerID: 0, + Size: 1, + }, + }, + }, + } + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + Rootless: true, + } + + config, err := CreateLibcontainerConfig(opts) + if err != nil { + t.Errorf("Couldn't create libcontainer config: %v", err) + } + + validator := validate.New() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected specconv to produce valid rootless container config: %v", err) + } +} diff --git a/list.go b/list.go index c7550a2a8..1c3b9aa83 100644 --- a/list.go +++ b/list.go @@ -7,12 +7,14 @@ import ( "io/ioutil" "os" "path/filepath" + "syscall" "text/tabwriter" "time" "encoding/json" "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/user" "github.com/opencontainers/runc/libcontainer/utils" "github.com/urfave/cli" ) @@ -38,6 +40,8 @@ type containerState struct { Created time.Time `json:"created"` // Annotations is the user defined annotations added to the config. Annotations map[string]string `json:"annotations,omitempty"` + // The owner of the state directory (the owner of the container). 
+ Owner string `json:"owner"` } var listCommand = cli.Command{ @@ -85,14 +89,15 @@ To list containers created using a non-default value for "--root": switch context.String("format") { case "table": w := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0) - fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\n") + fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n") for _, item := range s { - fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\n", + fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\t%s\n", item.ID, item.InitProcessPid, item.Status, item.Bundle, - item.Created.Format(time.RFC3339Nano)) + item.Created.Format(time.RFC3339Nano), + item.Owner) } if err := w.Flush(); err != nil { return err @@ -126,6 +131,13 @@ func getContainers(context *cli.Context) ([]containerState, error) { var s []containerState for _, item := range list { if item.IsDir() { + // This cast is safe on Linux. Uids with no passwd entry are shown as #<uid>. + stat := item.Sys().(*syscall.Stat_t) + owner, err := user.LookupUid(int(stat.Uid)) + if err != nil { + owner.Name = fmt.Sprintf("#%d", stat.Uid) + } + container, err := factory.Load(item.Name()) if err != nil { fmt.Fprintf(os.Stderr, "load container %s: %v\n", item.Name(), err) @@ -155,6 +167,7 @@ func getContainers(context *cli.Context) ([]containerState, error) { Rootfs: state.BaseState.Config.Rootfs, Created: state.BaseState.Created, Annotations: annotations, + Owner: owner.Name, }) } } diff --git a/ps.go b/ps.go index b8a1b111b..6e0c7376a 100644 --- a/ps.go +++ b/ps.go @@ -28,6 +28,11 @@ var psCommand = cli.Command{ if err := checkArgs(context, 1, minArgs); err != nil { return err } + // XXX: Currently not supported with rootless containers. 
+ if isRootless() { + return fmt.Errorf("runc ps requires root") + } + container, err := getContainer(context) if err != nil { return err diff --git a/restore.go b/restore.go index afc604653..06f635f13 100644 --- a/restore.go +++ b/restore.go @@ -3,6 +3,7 @@ package main import ( + "fmt" "os" "syscall" @@ -86,6 +87,11 @@ using the runc checkpoint command.`, if err := checkArgs(context, 1, exactArgs); err != nil { return err } + // XXX: Currently this is untested with rootless containers. + if isRootless() { + return fmt.Errorf("runc restore requires root") + } + imagePath := context.String("image-path") id := context.Args().First() if id == "" { diff --git a/spec.go b/spec.go index 1b55c6b4c..d7df312a8 100644 --- a/spec.go +++ b/spec.go @@ -10,6 +10,7 @@ import ( "runtime" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/specconv" "github.com/opencontainers/runtime-spec/specs-go" "github.com/urfave/cli" ) @@ -68,152 +69,7 @@ container on your host.`, if err := checkArgs(context, 0, exactArgs); err != nil { return err } - spec := specs.Spec{ - Version: specs.Version, - Platform: specs.Platform{ - OS: runtime.GOOS, - Arch: runtime.GOARCH, - }, - Root: specs.Root{ - Path: "rootfs", - Readonly: true, - }, - Process: specs.Process{ - Terminal: true, - User: specs.User{}, - Args: []string{ - "sh", - }, - Env: []string{ - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "TERM=xterm", - }, - Cwd: "/", - NoNewPrivileges: true, - Capabilities: &specs.LinuxCapabilities{ - Bounding: []string{ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE", - }, - Permitted: []string{ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE", - }, - Inheritable: []string{ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE", - }, - Ambient: []string{ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE", - }, - Effective: []string{ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE", - 
}, - }, - Rlimits: []specs.LinuxRlimit{ - { - Type: "RLIMIT_NOFILE", - Hard: uint64(1024), - Soft: uint64(1024), - }, - }, - }, - Hostname: "runc", - Mounts: []specs.Mount{ - { - Destination: "/proc", - Type: "proc", - Source: "proc", - Options: nil, - }, - { - Destination: "/dev", - Type: "tmpfs", - Source: "tmpfs", - Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, - }, - { - Destination: "/dev/pts", - Type: "devpts", - Source: "devpts", - Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"}, - }, - { - Destination: "/dev/shm", - Type: "tmpfs", - Source: "shm", - Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"}, - }, - { - Destination: "/dev/mqueue", - Type: "mqueue", - Source: "mqueue", - Options: []string{"nosuid", "noexec", "nodev"}, - }, - { - Destination: "/sys", - Type: "sysfs", - Source: "sysfs", - Options: []string{"nosuid", "noexec", "nodev", "ro"}, - }, - { - Destination: "/sys/fs/cgroup", - Type: "cgroup", - Source: "cgroup", - Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"}, - }, - }, - Linux: &specs.Linux{ - MaskedPaths: []string{ - "/proc/kcore", - "/proc/latency_stats", - "/proc/timer_list", - "/proc/timer_stats", - "/proc/sched_debug", - "/sys/firmware", - }, - ReadonlyPaths: []string{ - "/proc/asound", - "/proc/bus", - "/proc/fs", - "/proc/irq", - "/proc/sys", - "/proc/sysrq-trigger", - }, - Resources: &specs.LinuxResources{ - Devices: []specs.LinuxDeviceCgroup{ - { - Allow: false, - Access: "rwm", - }, - }, - }, - Namespaces: []specs.LinuxNamespace{ - { - Type: "pid", - }, - { - Type: "network", - }, - { - Type: "ipc", - }, - { - Type: "uts", - }, - { - Type: "mount", - }, - }, - }, - } + spec := specconv.ExampleSpec() checkNoFile := func(name string) error { _, err := os.Stat(name) @@ -234,7 +90,7 @@ container on your host.`, if err := checkNoFile(specConfig); err != nil { return err } - data, err := json.MarshalIndent(&spec, "", "\t") + 
+		data, err := json.MarshalIndent(spec, "", "\t")
 		if err != nil {
 			return err
 		}
diff --git a/utils.go b/utils.go
index 1286fd6f2..98f93a4cf 100644
--- a/utils.go
+++ b/utils.go
@@ -63,9 +63,6 @@ func setupSpec(context *cli.Context) (*specs.Spec, error) {
 	if err != nil {
 		return nil, err
 	}
-	if os.Geteuid() != 0 {
-		return nil, fmt.Errorf("runc should be run as root")
-	}
 	return spec, nil
 }
 
diff --git a/utils_linux.go b/utils_linux.go
index dcf156c8c..767015ed0 100644
--- a/utils_linux.go
+++ b/utils_linux.go
@@ -186,6 +186,11 @@ func createPidFile(path string, process *libcontainer.Process) error {
 	return os.Rename(tmpName, path)
 }
 
+// isRootless reports whether the effective UID is non-zero. XXX: Currently we autodetect rootless mode.
+func isRootless() bool {
+	return os.Geteuid() != 0
+}
+
 func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
 	config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
 		CgroupName:       id,
@@ -193,6 +198,7 @@ func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcont
 		NoPivotRoot:      context.Bool("no-pivot"),
 		NoNewKeyring:     context.Bool("no-new-keyring"),
 		Spec:             spec,
+		Rootless:         isRootless(),
 	})
 	if err != nil {
 		return nil, err