1
0
mirror of https://github.com/opencontainers/runc.git synced 2026-02-05 18:45:28 +01:00
Files
runc/libcontainer/init_linux.go

737 lines
24 KiB
Go
Raw Permalink Normal View History

package libcontainer
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"os"
"path/filepath"
"runtime"
"runtime/debug"
"strconv"
"syscall"
"github.com/containerd/console"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
"github.com/opencontainers/cgroups"
"github.com/opencontainers/runc/internal/linux"
"github.com/opencontainers/runc/internal/pathrs"
"github.com/opencontainers/runc/libcontainer/capabilities"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
// initType names the flavour of "runc init" to run. The value is taken
// verbatim from the _LIBCONTAINER_INITTYPE environment variable.
type initType string

const (
	// initSetns selects the setns init implementation.
	initSetns = initType("setns")
	// initStandard selects the standard init implementation; it is the only
	// type for which a FIFO fd is passed.
	initStandard = initType("standard")
)
// pid holds a pair of process IDs. In JSON form, Pid is stored under the
// "stage2_pid" key and PidFirstChild under the "stage1_pid" key.
type pid struct {
	Pid           int `json:"stage2_pid"`
	PidFirstChild int `json:"stage1_pid"`
}
// network is an internal struct used to set up container networks. It is
// the embedded configs.Network plus state that only init needs.
type network struct {
	configs.Network

	// TempVethPeerName is the unique, temporary name of the veth peer that
	// was placed into the container's namespace.
	TempVethPeerName string `json:"temp_veth_peer_name"`
}
// initConfig carries parameters from Exec() to Init(); the init side decodes
// it as JSON from the init pipe. Its fields fall into four groups:
//   - the original container config;
//   - properties taken from [Process] only;
//   - properties present in both the container config ([configs.Config])
//     and the process ([Process]), already merged;
//   - miscellaneous properties that come from the container.
//
// New fields must be added to the group they belong to.
type initConfig struct {
	// Config is the container config, unmodified.
	Config *configs.Config `json:"config"`

	// --- Process-only properties (unique to and sourced from [Process]). ---

	Args             []string `json:"args"`
	Env              []string `json:"env"`
	UID              int      `json:"uid"`
	GID              int      `json:"gid"`
	AdditionalGroups []int    `json:"additional_groups"`
	Cwd              string   `json:"cwd"`
	CreateConsole    bool     `json:"create_console"`
	ConsoleWidth     uint16   `json:"console_width"`
	ConsoleHeight    uint16   `json:"console_height"`
	PassedFilesCount int      `json:"passed_files_count"`

	// --- Merged properties: exist in both the container config and the
	// process; [Container.newInitConfig] merges them, with the process
	// values taking preference. ---

	AppArmorProfile string                `json:"apparmor_profile"`
	Capabilities    *configs.Capabilities `json:"capabilities"`
	NoNewPrivileges bool                  `json:"no_new_privileges"`
	ProcessLabel    string                `json:"process_label"`
	Rlimits         []configs.Rlimit      `json:"rlimits"`
	IOPriority      *configs.IOPriority   `json:"io_priority,omitempty"`
	Scheduler       *configs.Scheduler    `json:"scheduler,omitempty"`
	CPUAffinity     *configs.CPUAffinity  `json:"cpu_affinity,omitempty"`

	// --- Miscellaneous properties: set by [Container.newInitConfig] unless
	// a field says otherwise. ---

	ContainerID string `json:"containerid"`
	Cgroup2Path string `json:"cgroup2_path,omitempty"`

	// Networks is populated from the container config by
	// [initProcess.createNetworkInterfaces].
	Networks []*network `json:"network"`

	// SpecState is populated by [initProcess.Start].
	SpecState *specs.State `json:"spec_state,omitempty"`
}
// Init is part of "runc init" implementation.
func Init() {
runtime.GOMAXPROCS(1)
runtime.LockOSThread()
if err := startInitialization(); err != nil {
// If the error is returned, it was not communicated
// back to the parent (which is not a common case),
// so print it to stderr here as a last resort.
//
// Do not use logrus as we are not sure if it has been
// set up yet, but most important, if the parent is
// alive (and its log forwarding is working).
fmt.Fprintln(os.Stderr, err)
}
// Normally, StartInitialization() never returns, meaning
// if we are here, it had failed.
init: close internal fds before execve If we leak a file descriptor referencing the host filesystem, an attacker could use a /proc/self/fd magic-link as the source for execve to execute a host binary in the container. This would allow the binary itself (or a process inside the container in the 'runc exec' case) to write to a host binary, leading to a container escape. The simple solution is to make sure we close all file descriptors immediately before the execve(2) step. Doing this earlier can lead to very serious issues in Go (as file descriptors can be reused, any (*os.File) reference could start silently operating on a different file) so we have to do it as late as possible. Unfortunately, there are some Go runtime file descriptors that we must not close (otherwise the Go scheduler panics randomly). The only way of being sure which file descriptors cannot be closed is to sneakily go:linkname the runtime internal "internal/poll.IsPollDescriptor" function. This is almost certainly not recommended but there isn't any other way to be absolutely sure, while also closing any other possible files. In addition, we can keep the logrus forwarding logfd open because you cannot execve a pipe and the contents of the pipe are so restricted (JSON-encoded in a format we pick) that it seems unlikely you could even construct shellcode. Closing the logfd causes issues if there is an error returned from execve. In mainline runc, runc-dmz protects us against this attack because the intermediate execve(2) closes all of the O_CLOEXEC internal runc file descriptors and thus runc-dmz cannot access them to attack the host. Fixes: GHSA-xr7r-f8xq-vfvv CVE-2024-21626 Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2024-01-02 14:58:28 +11:00
os.Exit(255)
}
// Normally, this function does not return. If it returns, with or without an
// error, it means the initialization has failed. If the error is returned,
// it means the error can not be communicated back to the parent.
func startInitialization() (retErr error) {
// Get the synchronisation pipe.
envSyncPipe := os.Getenv("_LIBCONTAINER_SYNCPIPE")
syncPipeFd, err := strconv.Atoi(envSyncPipe)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_SYNCPIPE: %w", err)
}
syncPipe := newSyncSocket(os.NewFile(uintptr(syncPipeFd), "sync"))
defer syncPipe.Close()
defer func() {
// If this defer is ever called, this means initialization has failed.
// Send the error back to the parent process in the form of an initError
// if the sync socket has not been closed.
if syncPipe.isClosed() {
return
}
ierr := initError{Message: retErr.Error()}
if err := writeSyncArg(syncPipe, procError, ierr); err != nil {
fmt.Fprintln(os.Stderr, err)
return
}
// The error is sent, no need to also return it (or it will be reported twice).
retErr = nil
}()
// Get the INITPIPE.
envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
initPipeFd, err := strconv.Atoi(envInitPipe)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err)
}
initPipe := os.NewFile(uintptr(initPipeFd), "init")
defer initPipe.Close()
// Set up logging. This is used rarely, and mostly for init debugging.
// Passing log level is optional; currently libcontainer/integration does not do it.
if levelStr := os.Getenv("_LIBCONTAINER_LOGLEVEL"); levelStr != "" {
logLevel, err := strconv.Atoi(levelStr)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_LOGLEVEL: %w", err)
}
logrus.SetLevel(logrus.Level(logLevel))
}
logFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGPIPE"))
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err)
}
logPipe := os.NewFile(uintptr(logFd), "logpipe")
defer logPipe.Close()
logrus.SetOutput(logPipe)
logrus.SetFormatter(new(logrus.JSONFormatter))
logrus.Debug("child process in init()")
// Only init processes have FIFOFD.
var fifoFile *os.File
envInitType := os.Getenv("_LIBCONTAINER_INITTYPE")
it := initType(envInitType)
if it == initStandard {
fifoFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_FIFOFD"))
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD: %w", err)
}
fifoFile = os.NewFile(uintptr(fifoFd), "initfifo")
defer fifoFile.Close()
}
var consoleSocket *os.File
if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" {
console, err := strconv.Atoi(envConsole)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE: %w", err)
}
consoleSocket = os.NewFile(uintptr(console), "console-socket")
defer consoleSocket.Close()
}
var pidfdSocket *os.File
if envSockFd := os.Getenv("_LIBCONTAINER_PIDFD_SOCK"); envSockFd != "" {
sockFd, err := strconv.Atoi(envSockFd)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_PIDFD_SOCK: %w", err)
}
pidfdSocket = os.NewFile(uintptr(sockFd), "pidfd-socket")
defer pidfdSocket.Close()
}
libct: speedup process.Env handling The current implementation sets all the environment variables passed in Process.Env in the current process, one by one, then uses os.Environ to read those back. As pointed out in [1], this is slow, as runc calls os.Setenv for every variable, and there may be a few thousands of those. Looking into how os.Setenv is implemented, it is indeed slow, especially when cgo is enabled. Looking into why it was implemented the way it is, I found commit 9744d72c and traced it to [2], which discusses the actual reasons. It boils down to these two: - HOME is not passed into container as it is set in setupUser by os.Setenv and has no effect on config.Env; - there is a need to deduplicate the environment variables. Yet it was decided in [2] to not go ahead with this patch, but later [3] was opened with the carry of this patch, and merged. Now, from what I see: 1. Passing environment to exec is way faster than using os.Setenv and os.Environ (tests show ~20x speed improvement in a simple Go test, and ~3x improvement in real-world test, see below). 2. Setting environment variables in the runc context may result is some ugly side effects (think GODEBUG, LD_PRELOAD, or _LIBCONTAINER_*). 3. Nothing in runtime spec says that the environment needs to be deduplicated, or the order of preference (whether the first or the last value of a variable with the same name is to be used). We should stick to what we have in order to maintain backward compatibility. So, this patch: - switches to passing env directly to exec; - adds deduplication mechanism to retain backward compatibility; - takes care to set PATH from process.Env in the current process (so that supplied PATH is used to find the binary to execute), also to retain backward compatibility; - adds HOME to process.Env if not set; - ensures any StartContainer CommandHook entries with no environment set explicitly are run with the same environment as before. Thanks to @lifubang who noticed that peculiarity. 
The benchmark added by the previous commit shows ~3x improvement: │ before │ after │ │ sec/op │ sec/op vs base │ ExecInBigEnv-20 61.53m ± 1% 21.87m ± 16% -64.46% (p=0.000 n=10) [1]: https://github.com/opencontainers/runc/pull/1983 [2]: https://github.com/docker-archive/libcontainer/pull/418 [3]: https://github.com/docker-archive/libcontainer/pull/432 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2024-06-23 16:31:57 -07:00
// From here on, we don't need current process environment. It is not
// used directly anywhere below this point, but let's clear it anyway.
os.Clearenv()
defer func() {
if err := recover(); err != nil {
if err2, ok := err.(error); ok {
retErr = fmt.Errorf("panic from initialization: %w, %s", err2, debug.Stack())
} else {
retErr = fmt.Errorf("panic from initialization: %v, %s", err, debug.Stack())
}
}
}()
var config initConfig
if err := json.NewDecoder(initPipe).Decode(&config); err != nil {
return err
}
// If init succeeds, it will not return, hence none of the defers will be called.
return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifoFile, logPipe)
}
// containerInit builds the init implementation matching t and runs it.
// It returns whatever that implementation's Init method returns, or an
// "unknown init type" error when t is neither initSetns nor initStandard.
func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket, fifoFile, logPipe *os.File) error {
	// Clean the RLIMIT_NOFILE cache in go runtime.
	// Issue: https://github.com/opencontainers/runc/issues/4195
	maybeClearRlimitNofileCache(config.Rlimits)

	if t == initSetns {
		setnsInit := &linuxSetnsInit{
			pipe:          pipe,
			consoleSocket: consoleSocket,
			pidfdSocket:   pidfdSocket,
			config:        config,
			logPipe:       logPipe,
		}
		return setnsInit.Init()
	}

	if t == initStandard {
		standardInit := &linuxStandardInit{
			pipe:          pipe,
			consoleSocket: consoleSocket,
			pidfdSocket:   pidfdSocket,
			parentPid:     unix.Getppid(),
			config:        config,
			fifoFile:      fifoFile,
			logPipe:       logPipe,
		}
		return standardInit.Init()
	}

	return fmt.Errorf("unknown init type %q", t)
}
// verifyCwd checks that the current working directory lies inside the mount
// namespace root of the current process, returning an error if it does not
// or if this cannot be determined.
func verifyCwd() error {
	// On Linux, getcwd(2) notices when cwd is outside of the rootfs of the
	// current mount namespace root and then prefixes the returned string
	// with "(unreachable)". Both glibc's getcwd(3) and Go's Getwd() detect
	// that and return ENOENT instead of a non-absolute path, so an invalid
	// cwd is easy to spot from the return value alone. See getcwd(3) for
	// details, and CVE-2024-21626 for the security issue behind this check.
	//
	// os.Getwd() is deliberately not used: its $PWD workaround does a
	// stat(.), which can fail if the current directory is inaccessible to
	// the container process.
	wd, err := linux.Getwd()
	switch {
	case errors.Is(err, unix.ENOENT):
		return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
	case err != nil:
		return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
	case !filepath.IsAbs(wd):
		// We shouldn't ever hit this, but check just in case.
		return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
	}
	return nil
}
// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaked file descriptors
libct: speedup process.Env handling The current implementation sets all the environment variables passed in Process.Env in the current process, one by one, then uses os.Environ to read those back. As pointed out in [1], this is slow, as runc calls os.Setenv for every variable, and there may be a few thousands of those. Looking into how os.Setenv is implemented, it is indeed slow, especially when cgo is enabled. Looking into why it was implemented the way it is, I found commit 9744d72c and traced it to [2], which discusses the actual reasons. It boils down to these two: - HOME is not passed into container as it is set in setupUser by os.Setenv and has no effect on config.Env; - there is a need to deduplicate the environment variables. Yet it was decided in [2] to not go ahead with this patch, but later [3] was opened with the carry of this patch, and merged. Now, from what I see: 1. Passing environment to exec is way faster than using os.Setenv and os.Environ (tests show ~20x speed improvement in a simple Go test, and ~3x improvement in real-world test, see below). 2. Setting environment variables in the runc context may result is some ugly side effects (think GODEBUG, LD_PRELOAD, or _LIBCONTAINER_*). 3. Nothing in runtime spec says that the environment needs to be deduplicated, or the order of preference (whether the first or the last value of a variable with the same name is to be used). We should stick to what we have in order to maintain backward compatibility. So, this patch: - switches to passing env directly to exec; - adds deduplication mechanism to retain backward compatibility; - takes care to set PATH from process.Env in the current process (so that supplied PATH is used to find the binary to execute), also to retain backward compatibility; - adds HOME to process.Env if not set; - ensures any StartContainer CommandHook entries with no environment set explicitly are run with the same environment as before. Thanks to @lifubang who noticed that peculiarity. 
The benchmark added by the previous commit shows ~3x improvement: │ before │ after │ │ sec/op │ sec/op vs base │ ExecInBigEnv-20 61.53m ± 1% 21.87m ± 16% -64.46% (p=0.000 n=10) [1]: https://github.com/opencontainers/runc/pull/1983 [2]: https://github.com/docker-archive/libcontainer/pull/418 [3]: https://github.com/docker-archive/libcontainer/pull/432 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2024-06-23 16:31:57 -07:00
// before executing the command inside the namespace.
libct: switch to numeric UID/GID/groups This addresses the following TODO in the code (added back in 2015 by commit 845fc65e5): > // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. Historically, libcontainer internally uses strings for user, group, and additional (aka supplementary) groups. Yet, runc receives those credentials as part of runtime-spec's process, which uses integers for all of them (see [1], [2]). What happens next is: 1. runc start/run/exec converts those credentials to strings (a User string containing "UID:GID", and a []string for additional GIDs) and passes those onto runc init. 2. runc init converts them back to int, in the most complicated way possible (parsing container's /etc/passwd and /etc/group). All this conversion and, especially, parsing is totally unnecessary, but is performed on every container exec (and start). The only benefit of all this is, a libcontainer user could use user and group names instead of numeric IDs (but runc itself is not using this feature, and we don't know if there are any other users of this). Let's remove this back and forth translation, hopefully increasing runc exec performance. The only remaining need to parse /etc/passwd is to set HOME environment variable for a specified UID, in case $HOME is not explicitly set in process.Env. This can now be done right in prepareEnv, which simplifies the code flow a lot. Alas, we can not use standard os/user.LookupId, as it could cache host's /etc/passwd or the current user (even with the osusergo tag). PS Note that the structures being changed (initConfig and Process) are never saved to disk as JSON by runc, so there is no compatibility issue for runc users. Still, this is a breaking change in libcontainer, but we never promised that libcontainer API will be stable (and there's a special package that can handle it -- github.com/moby/sys/user). Reflect this in CHANGELOG. For 3998. 
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user [2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
func finalizeNamespace(config *initConfig) error {
// Ensure that all unwanted fds we may have accidentally
// inherited are marked close-on-exec so they stay out of the
// container
if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
return fmt.Errorf("error closing exec fds: %w", err)
}
// we only do chdir if it's specified
doChdir := config.Cwd != ""
if doChdir {
// First, attempt the chdir before setting up the user.
// This could allow us to access a directory that the user running runc can access
// but the container user cannot.
err := unix.Chdir(config.Cwd)
switch {
case err == nil:
doChdir = false
case errors.Is(err, os.ErrPermission):
// If we hit an EPERM, we should attempt again after setting up user.
// This will allow us to successfully chdir if the container user has access
// to the directory, but the user running runc does not.
// This is useful in cases where the cwd is also a volume that's been chowned to the container user.
default:
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
}
}
// We should set envs after we are in the jail of the container.
// Please see https://github.com/opencontainers/runc/issues/4688
env, err := prepareEnv(config.Env, config.UID)
if err != nil {
return err
}
config.Env = env
w, err := capabilities.New(config.Capabilities)
if err != nil {
return err
}
// drop capabilities in bounding set before changing user
if err := w.ApplyBoundingSet(); err != nil {
return fmt.Errorf("unable to apply bounding set: %w", err)
}
// preserve existing capabilities while we change users
if err := system.SetKeepCaps(); err != nil {
return fmt.Errorf("unable to set keep caps: %w", err)
}
libct: switch to numeric UID/GID/groups This addresses the following TODO in the code (added back in 2015 by commit 845fc65e5): > // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. Historically, libcontainer internally uses strings for user, group, and additional (aka supplementary) groups. Yet, runc receives those credentials as part of runtime-spec's process, which uses integers for all of them (see [1], [2]). What happens next is: 1. runc start/run/exec converts those credentials to strings (a User string containing "UID:GID", and a []string for additional GIDs) and passes those onto runc init. 2. runc init converts them back to int, in the most complicated way possible (parsing container's /etc/passwd and /etc/group). All this conversion and, especially, parsing is totally unnecessary, but is performed on every container exec (and start). The only benefit of all this is, a libcontainer user could use user and group names instead of numeric IDs (but runc itself is not using this feature, and we don't know if there are any other users of this). Let's remove this back and forth translation, hopefully increasing runc exec performance. The only remaining need to parse /etc/passwd is to set HOME environment variable for a specified UID, in case $HOME is not explicitly set in process.Env. This can now be done right in prepareEnv, which simplifies the code flow a lot. Alas, we can not use standard os/user.LookupId, as it could cache host's /etc/passwd or the current user (even with the osusergo tag). PS Note that the structures being changed (initConfig and Process) are never saved to disk as JSON by runc, so there is no compatibility issue for runc users. Still, this is a breaking change in libcontainer, but we never promised that libcontainer API will be stable (and there's a special package that can handle it -- github.com/moby/sys/user). Reflect this in CHANGELOG. For 3998. 
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user [2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
if err := setupUser(config); err != nil {
return fmt.Errorf("unable to setup user: %w", err)
}
// Change working directory AFTER the user has been set up, if we haven't done it yet.
if doChdir {
if err := unix.Chdir(config.Cwd); err != nil {
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
}
}
// Make sure our final working directory is inside the container.
if err := verifyCwd(); err != nil {
return err
}
if err := system.ClearKeepCaps(); err != nil {
return fmt.Errorf("unable to clear keep caps: %w", err)
}
if err := w.ApplyCaps(); err != nil {
return fmt.Errorf("unable to apply caps: %w", err)
}
return nil
}
// setupConsole sets up the console from inside the container, and sends the
// master pty fd to the config.Pipe (using cmsg). This is done to ensure that
// consoles are scoped to a container properly (see runc#814 and the many
// issues related to that). This has to be run *after* we've pivoted to the new
// rootfs (and the users' configuration is entirely set up).
func setupConsole(socket *os.File, config *initConfig, mount bool) error {
defer socket.Close()
// At this point, /dev/ptmx points to something that we would expect. We
// used to change the owner of the slave path, but the /dev/pts mount can
// have gid=X set (at the users' option), so touching the owner of the
// slave PTY is not necessary, as the kernel will handle that for us. Note
// however, that setupUser (specifically fixStdioPermissions) *will* change
// the UID owner of the console to be the user the process will run as (so
// they can actually control their console).
console: use TIOCGPTPEER when allocating peer PTY When opening the peer end of a pty, the old kernel API required us to open /dev/pts/$num inside the container (at least since we fixed console handling many years ago in commit 244c9fc426ae ("*: console rewrite")). The problem is that in a hostile container it is possible for /dev/pts/$num to be an attacker-controlled symlink that runc can be tricked into resolving when doing bind-mounts. This allows the attacker to (among other things) persist /proc/... entries that are later masked by runc, allowing an attacker to escape through the kernel.core_pattern sysctl (/proc/sys/kernel/core_pattern). This is the original issue reported by Lei Wang and Li Fu Bang in CVE-2025-52565. However, it should be noted that this is not entirely a newly-discovered problem. Way back in Linux 4.13 (2017), I added the TIOCGPTPEER ioctl, which allows us to get a pty peer without touching the /dev/pts inside the container. The original threat model was around an attacker replacing /dev/pts/$n or /dev/pts/ptmx with some malicious inode (a DoS inode, or possibly a PTY they wanted a confused deputy to operate on). Unfortunately, there was no practical way for runc to cache a safe O_PATH handle to /dev/pts/ptmx (unlike other runtimes like LXC, which switched to TIOCGPTPEER way back in 2017). Since it wasn't clear how we could protect against the main attack TIOCGPTPEER was meant to protect against, we never switched to it (even though I implemented it specifically to harden container runtimes). Unfortunately, It turns out that mount *sources* are a threat we didn't fully consider. Since TIOCGPTPEER already solves this problem entirely for us in a race free way, we should just use that. In a later patch, we will add some hardening for /dev/pts/$num opening to maintain support for very old kernels (Linux 4.13 is very old at this point, but RHEL 7 is still kicking and is stuck on Linux 3.10). 
Fixes: GHSA-qw9x-cqr3-wc7r CVE-2025-52565 Reported-by: Lei Wang <ssst0n3@gmail.com> (CVE-2025-52565) Reported-by: lfbzhm <lifubang@acmcoder.com> (CVE-2025-52565) Reported-by: Aleksa Sarai <cyphar@cyphar.com> (TIOCGPTPEER) Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2025-05-15 16:12:21 +10:00
pty, peerPty, err := safeAllocPty()
if err != nil {
return err
}
// After we return from here, we don't need the console anymore.
defer pty.Close()
console: use TIOCGPTPEER when allocating peer PTY When opening the peer end of a pty, the old kernel API required us to open /dev/pts/$num inside the container (at least since we fixed console handling many years ago in commit 244c9fc426ae ("*: console rewrite")). The problem is that in a hostile container it is possible for /dev/pts/$num to be an attacker-controlled symlink that runc can be tricked into resolving when doing bind-mounts. This allows the attacker to (among other things) persist /proc/... entries that are later masked by runc, allowing an attacker to escape through the kernel.core_pattern sysctl (/proc/sys/kernel/core_pattern). This is the original issue reported by Lei Wang and Li Fu Bang in CVE-2025-52565. However, it should be noted that this is not entirely a newly-discovered problem. Way back in Linux 4.13 (2017), I added the TIOCGPTPEER ioctl, which allows us to get a pty peer without touching the /dev/pts inside the container. The original threat model was around an attacker replacing /dev/pts/$n or /dev/pts/ptmx with some malicious inode (a DoS inode, or possibly a PTY they wanted a confused deputy to operate on). Unfortunately, there was no practical way for runc to cache a safe O_PATH handle to /dev/pts/ptmx (unlike other runtimes like LXC, which switched to TIOCGPTPEER way back in 2017). Since it wasn't clear how we could protect against the main attack TIOCGPTPEER was meant to protect against, we never switched to it (even though I implemented it specifically to harden container runtimes). Unfortunately, It turns out that mount *sources* are a threat we didn't fully consider. Since TIOCGPTPEER already solves this problem entirely for us in a race free way, we should just use that. In a later patch, we will add some hardening for /dev/pts/$num opening to maintain support for very old kernels (Linux 4.13 is very old at this point, but RHEL 7 is still kicking and is stuck on Linux 3.10). 
Fixes: GHSA-qw9x-cqr3-wc7r CVE-2025-52565 Reported-by: Lei Wang <ssst0n3@gmail.com> (CVE-2025-52565) Reported-by: lfbzhm <lifubang@acmcoder.com> (CVE-2025-52565) Reported-by: Aleksa Sarai <cyphar@cyphar.com> (TIOCGPTPEER) Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2025-05-15 16:12:21 +10:00
defer peerPty.Close()
if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
err = pty.Resize(console.WinSize{
Height: config.ConsoleHeight,
Width: config.ConsoleWidth,
})
if err != nil {
return err
}
}
// Mount the console inside our rootfs.
if mount {
console: use TIOCGPTPEER when allocating peer PTY When opening the peer end of a pty, the old kernel API required us to open /dev/pts/$num inside the container (at least since we fixed console handling many years ago in commit 244c9fc426ae ("*: console rewrite")). The problem is that in a hostile container it is possible for /dev/pts/$num to be an attacker-controlled symlink that runc can be tricked into resolving when doing bind-mounts. This allows the attacker to (among other things) persist /proc/... entries that are later masked by runc, allowing an attacker to escape through the kernel.core_pattern sysctl (/proc/sys/kernel/core_pattern). This is the original issue reported by Lei Wang and Li Fu Bang in CVE-2025-52565. However, it should be noted that this is not entirely a newly-discovered problem. Way back in Linux 4.13 (2017), I added the TIOCGPTPEER ioctl, which allows us to get a pty peer without touching the /dev/pts inside the container. The original threat model was around an attacker replacing /dev/pts/$n or /dev/pts/ptmx with some malicious inode (a DoS inode, or possibly a PTY they wanted a confused deputy to operate on). Unfortunately, there was no practical way for runc to cache a safe O_PATH handle to /dev/pts/ptmx (unlike other runtimes like LXC, which switched to TIOCGPTPEER way back in 2017). Since it wasn't clear how we could protect against the main attack TIOCGPTPEER was meant to protect against, we never switched to it (even though I implemented it specifically to harden container runtimes). Unfortunately, It turns out that mount *sources* are a threat we didn't fully consider. Since TIOCGPTPEER already solves this problem entirely for us in a race free way, we should just use that. In a later patch, we will add some hardening for /dev/pts/$num opening to maintain support for very old kernels (Linux 4.13 is very old at this point, but RHEL 7 is still kicking and is stuck on Linux 3.10). 
Fixes: GHSA-qw9x-cqr3-wc7r CVE-2025-52565 Reported-by: Lei Wang <ssst0n3@gmail.com> (CVE-2025-52565) Reported-by: lfbzhm <lifubang@acmcoder.com> (CVE-2025-52565) Reported-by: Aleksa Sarai <cyphar@cyphar.com> (TIOCGPTPEER) Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2025-05-15 16:12:21 +10:00
if err := mountConsole(peerPty); err != nil {
return err
}
}
// While we can access console.master, using the API is a good idea.
if err := utils.SendRawFd(socket, pty.Name(), pty.Fd()); err != nil {
return err
}
runtime.KeepAlive(pty)
// Now, dup over all the things.
console: use TIOCGPTPEER when allocating peer PTY When opening the peer end of a pty, the old kernel API required us to open /dev/pts/$num inside the container (at least since we fixed console handling many years ago in commit 244c9fc426ae ("*: console rewrite")). The problem is that in a hostile container it is possible for /dev/pts/$num to be an attacker-controlled symlink that runc can be tricked into resolving when doing bind-mounts. This allows the attacker to (among other things) persist /proc/... entries that are later masked by runc, allowing an attacker to escape through the kernel.core_pattern sysctl (/proc/sys/kernel/core_pattern). This is the original issue reported by Lei Wang and Li Fu Bang in CVE-2025-52565. However, it should be noted that this is not entirely a newly-discovered problem. Way back in Linux 4.13 (2017), I added the TIOCGPTPEER ioctl, which allows us to get a pty peer without touching the /dev/pts inside the container. The original threat model was around an attacker replacing /dev/pts/$n or /dev/pts/ptmx with some malicious inode (a DoS inode, or possibly a PTY they wanted a confused deputy to operate on). Unfortunately, there was no practical way for runc to cache a safe O_PATH handle to /dev/pts/ptmx (unlike other runtimes like LXC, which switched to TIOCGPTPEER way back in 2017). Since it wasn't clear how we could protect against the main attack TIOCGPTPEER was meant to protect against, we never switched to it (even though I implemented it specifically to harden container runtimes). Unfortunately, It turns out that mount *sources* are a threat we didn't fully consider. Since TIOCGPTPEER already solves this problem entirely for us in a race free way, we should just use that. In a later patch, we will add some hardening for /dev/pts/$num opening to maintain support for very old kernels (Linux 4.13 is very old at this point, but RHEL 7 is still kicking and is stuck on Linux 3.10). 
Fixes: GHSA-qw9x-cqr3-wc7r CVE-2025-52565 Reported-by: Lei Wang <ssst0n3@gmail.com> (CVE-2025-52565) Reported-by: lfbzhm <lifubang@acmcoder.com> (CVE-2025-52565) Reported-by: Aleksa Sarai <cyphar@cyphar.com> (TIOCGPTPEER) Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2025-05-15 16:12:21 +10:00
return dupStdio(peerPty)
}
// syncParentReady performs the "ready" handshake with the parent over pipe:
// it first reports procReady (init is prepared to exec the container
// process) and then blocks until the parent replies with procRun.
// Any error from either half of the handshake is returned unchanged.
func syncParentReady(pipe *syncSocket) error {
	// Announce readiness; without this the parent never sends procRun.
	err := writeSync(pipe, procReady)
	if err != nil {
		return err
	}

	// Block until the parent allows us to proceed.
	return readSync(pipe, procRun)
}
// syncParentHooks asks the parent (via a procHooks message on pipe) to run
// the pre-start hooks, then blocks until the parent answers with
// procHooksDone. Any error from writing or reading is returned unchanged.
func syncParentHooks(pipe *syncSocket) error {
	// Request hook execution from the parent.
	err := writeSync(pipe, procHooks)
	if err != nil {
		return err
	}

	// Resume only once the parent reports the hooks are done.
	return readSync(pipe, procHooksDone)
}
// syncParentSeccomp hands the seccomp file descriptor to the parent and
// blocks until the parent confirms (procSeccompDone) that it has obtained its
// own copy via pidfd_getfd(). A seccompFd of -1 means there is nothing to
// hand over, and the function returns nil immediately.
//
// The local seccompFd is closed on every return path once it was valid.
func syncParentSeccomp(pipe *syncSocket, seccompFd int) error {
	if seccompFd == -1 {
		// No seccomp fd was created, so no handshake is needed.
		return nil
	}
	defer unix.Close(seccompFd)

	// Ask the parent to fetch our fd.
	//
	// writeSyncFile is deliberately NOT used here. The container may have
	// an SCMP_ACT_NOTIFY action on sendmsg(2), and every syscall issued
	// before the parent holds the fd would deadlock "runc init" if it were
	// subject to SCMP_ACT_NOTIFY. Hence the syscall count on this path is
	// kept to a minimum. See seccomp.InitSeccomp() for more details.
	err := writeSyncArg(pipe, procSeccomp, seccompFd)
	if err != nil {
		return err
	}

	// Block until the parent says it has its copy of the seccomp fd.
	return readSync(pipe, procSeccompDone)
}
libct: switch to numeric UID/GID/groups This addresses the following TODO in the code (added back in 2015 by commit 845fc65e5): > // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. Historically, libcontainer internally uses strings for user, group, and additional (aka supplementary) groups. Yet, runc receives those credentials as part of runtime-spec's process, which uses integers for all of them (see [1], [2]). What happens next is: 1. runc start/run/exec converts those credentials to strings (a User string containing "UID:GID", and a []string for additional GIDs) and passes those onto runc init. 2. runc init converts them back to int, in the most complicated way possible (parsing container's /etc/passwd and /etc/group). All this conversion and, especially, parsing is totally unnecessary, but is performed on every container exec (and start). The only benefit of all this is, a libcontainer user could use user and group names instead of numeric IDs (but runc itself is not using this feature, and we don't know if there are any other users of this). Let's remove this back and forth translation, hopefully increasing runc exec performance. The only remaining need to parse /etc/passwd is to set HOME environment variable for a specified UID, in case $HOME is not explicitly set in process.Env. This can now be done right in prepareEnv, which simplifies the code flow a lot. Alas, we can not use standard os/user.LookupId, as it could cache host's /etc/passwd or the current user (even with the osusergo tag). PS Note that the structures being changed (initConfig and Process) are never saved to disk as JSON by runc, so there is no compatibility issue for runc users. Still, this is a breaking change in libcontainer, but we never promised that libcontainer API will be stable (and there's a special package that can handle it -- github.com/moby/sys/user). Reflect this in CHANGELOG. For 3998. 
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user [2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
// setupUser changes the groups, gid, and uid for the user inside the container.
func setupUser(config *initConfig) error {
// Before we change to the container's user make sure that the processes
// STDIO is correctly owned by the user that we are switching to.
libct: switch to numeric UID/GID/groups This addresses the following TODO in the code (added back in 2015 by commit 845fc65e5): > // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. Historically, libcontainer internally uses strings for user, group, and additional (aka supplementary) groups. Yet, runc receives those credentials as part of runtime-spec's process, which uses integers for all of them (see [1], [2]). What happens next is: 1. runc start/run/exec converts those credentials to strings (a User string containing "UID:GID", and a []string for additional GIDs) and passes those onto runc init. 2. runc init converts them back to int, in the most complicated way possible (parsing container's /etc/passwd and /etc/group). All this conversion and, especially, parsing is totally unnecessary, but is performed on every container exec (and start). The only benefit of all this is, a libcontainer user could use user and group names instead of numeric IDs (but runc itself is not using this feature, and we don't know if there are any other users of this). Let's remove this back and forth translation, hopefully increasing runc exec performance. The only remaining need to parse /etc/passwd is to set HOME environment variable for a specified UID, in case $HOME is not explicitly set in process.Env. This can now be done right in prepareEnv, which simplifies the code flow a lot. Alas, we can not use standard os/user.LookupId, as it could cache host's /etc/passwd or the current user (even with the osusergo tag). PS Note that the structures being changed (initConfig and Process) are never saved to disk as JSON by runc, so there is no compatibility issue for runc users. Still, this is a breaking change in libcontainer, but we never promised that libcontainer API will be stable (and there's a special package that can handle it -- github.com/moby/sys/user). Reflect this in CHANGELOG. For 3998. 
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user [2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
if err := fixStdioPermissions(config.UID); err != nil {
return err
}
tree-wide: use /proc/thread-self for thread-local state With the idmap work, we will have a tainted Go thread in our thread-group that has a different mount namespace to the other threads. It seems that (due to some bad luck) the Go scheduler tends to make this thread the thread-group leader in our tests, which results in very baffling failures where /proc/self/mountinfo produces gibberish results. In order to avoid this, switch to using /proc/thread-self for everything that is thread-local. This primarily includes switching all file descriptor paths (CLONE_FS), all of the places that check the current cgroup (technically we never will run a single runc thread in a separate cgroup, but better to be safe than sorry), and the aforementioned mountinfo code. We don't need to do anything for the following because the results we need aren't thread-local: * Checks that certain namespaces are supported by stat(2)ing /proc/self/ns/... * /proc/self/exe and /proc/self/cmdline are not thread-local. * While threads can be in different cgroups, we do not do this for the runc binary (or libcontainer) and thus we do not need to switch to the thread-local version of /proc/self/cgroups. * All of the CLONE_NEWUSER files are not thread-local because you cannot set the usernamespace of a single thread (setns(CLONE_NEWUSER) is blocked for multi-threaded programs). Note that we have to use runtime.LockOSThread when we have an open handle to a tid-specific procfs file that we are operating on multiple times. Go can reschedule us such that we are running on a different thread and then kill the original thread (causing -ENOENT or similarly confusing errors). This is not strictly necessary for most usages of /proc/thread-self (such as using /proc/thread-self/fd/$n directly) since only operating on the actual inodes associated with the tid requires this locking, but because of the pre-3.17 fallback for CentOS, we have to do this in most cases. 
In addition, CentOS's kernel is too old for /proc/thread-self, which requires us to emulate it -- however in rootfs_linux.go, we are in the container pid namespace but /proc is the host's procfs. This leads to the incredibly frustrating situation where there is no way (on pre-4.1 Linux) to figure out which /proc/self/task/... entry refers to the current tid. We can just use /proc/self in this case. Yes this is all pretty ugly. I also wish it wasn't necessary. Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2023-08-24 12:53:53 +10:00
// We don't need to use /proc/thread-self here because setgroups is a
// per-userns file and thus is global to all threads in a thread-group.
// This lets us avoid having to do runtime.LockOSThread.
var setgroups []byte
setgroupsFile, err := pathrs.ProcSelfOpen("setgroups", unix.O_RDONLY)
if err == nil {
setgroups, err = io.ReadAll(setgroupsFile)
_ = setgroupsFile.Close()
}
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
// This isn't allowed in an unprivileged user namespace since Linux 3.19.
// There's nothing we can do about /etc/group entries, so we silently
// ignore setting groups here (since the user didn't explicitly ask us to
// set the group).
allowSupGroups := !config.Config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny"
if allowSupGroups {
libct: switch to numeric UID/GID/groups This addresses the following TODO in the code (added back in 2015 by commit 845fc65e5): > // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. Historically, libcontainer internally uses strings for user, group, and additional (aka supplementary) groups. Yet, runc receives those credentials as part of runtime-spec's process, which uses integers for all of them (see [1], [2]). What happens next is: 1. runc start/run/exec converts those credentials to strings (a User string containing "UID:GID", and a []string for additional GIDs) and passes those onto runc init. 2. runc init converts them back to int, in the most complicated way possible (parsing container's /etc/passwd and /etc/group). All this conversion and, especially, parsing is totally unnecessary, but is performed on every container exec (and start). The only benefit of all this is, a libcontainer user could use user and group names instead of numeric IDs (but runc itself is not using this feature, and we don't know if there are any other users of this). Let's remove this back and forth translation, hopefully increasing runc exec performance. The only remaining need to parse /etc/passwd is to set HOME environment variable for a specified UID, in case $HOME is not explicitly set in process.Env. This can now be done right in prepareEnv, which simplifies the code flow a lot. Alas, we can not use standard os/user.LookupId, as it could cache host's /etc/passwd or the current user (even with the osusergo tag). PS Note that the structures being changed (initConfig and Process) are never saved to disk as JSON by runc, so there is no compatibility issue for runc users. Still, this is a breaking change in libcontainer, but we never promised that libcontainer API will be stable (and there's a special package that can handle it -- github.com/moby/sys/user). Reflect this in CHANGELOG. For 3998. 
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user [2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
if err := unix.Setgroups(config.AdditionalGroups); err != nil {
return &os.SyscallError{Syscall: "setgroups", Err: err}
}
}
libct: switch to numeric UID/GID/groups This addresses the following TODO in the code (added back in 2015 by commit 845fc65e5): > // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. Historically, libcontainer internally uses strings for user, group, and additional (aka supplementary) groups. Yet, runc receives those credentials as part of runtime-spec's process, which uses integers for all of them (see [1], [2]). What happens next is: 1. runc start/run/exec converts those credentials to strings (a User string containing "UID:GID", and a []string for additional GIDs) and passes those onto runc init. 2. runc init converts them back to int, in the most complicated way possible (parsing container's /etc/passwd and /etc/group). All this conversion and, especially, parsing is totally unnecessary, but is performed on every container exec (and start). The only benefit of all this is, a libcontainer user could use user and group names instead of numeric IDs (but runc itself is not using this feature, and we don't know if there are any other users of this). Let's remove this back and forth translation, hopefully increasing runc exec performance. The only remaining need to parse /etc/passwd is to set HOME environment variable for a specified UID, in case $HOME is not explicitly set in process.Env. This can now be done right in prepareEnv, which simplifies the code flow a lot. Alas, we can not use standard os/user.LookupId, as it could cache host's /etc/passwd or the current user (even with the osusergo tag). PS Note that the structures being changed (initConfig and Process) are never saved to disk as JSON by runc, so there is no compatibility issue for runc users. Still, this is a breaking change in libcontainer, but we never promised that libcontainer API will be stable (and there's a special package that can handle it -- github.com/moby/sys/user). Reflect this in CHANGELOG. For 3998. 
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user [2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
if err := unix.Setgid(config.GID); err != nil {
if err == unix.EINVAL {
libct: switch to numeric UID/GID/groups This addresses the following TODO in the code (added back in 2015 by commit 845fc65e5): > // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. Historically, libcontainer internally uses strings for user, group, and additional (aka supplementary) groups. Yet, runc receives those credentials as part of runtime-spec's process, which uses integers for all of them (see [1], [2]). What happens next is: 1. runc start/run/exec converts those credentials to strings (a User string containing "UID:GID", and a []string for additional GIDs) and passes those onto runc init. 2. runc init converts them back to int, in the most complicated way possible (parsing container's /etc/passwd and /etc/group). All this conversion and, especially, parsing is totally unnecessary, but is performed on every container exec (and start). The only benefit of all this is, a libcontainer user could use user and group names instead of numeric IDs (but runc itself is not using this feature, and we don't know if there are any other users of this). Let's remove this back and forth translation, hopefully increasing runc exec performance. The only remaining need to parse /etc/passwd is to set HOME environment variable for a specified UID, in case $HOME is not explicitly set in process.Env. This can now be done right in prepareEnv, which simplifies the code flow a lot. Alas, we can not use standard os/user.LookupId, as it could cache host's /etc/passwd or the current user (even with the osusergo tag). PS Note that the structures being changed (initConfig and Process) are never saved to disk as JSON by runc, so there is no compatibility issue for runc users. Still, this is a breaking change in libcontainer, but we never promised that libcontainer API will be stable (and there's a special package that can handle it -- github.com/moby/sys/user). Reflect this in CHANGELOG. For 3998. 
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user [2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
return fmt.Errorf("cannot setgid to unmapped gid %d in user namespace", config.GID)
}
return err
}
libct: switch to numeric UID/GID/groups This addresses the following TODO in the code (added back in 2015 by commit 845fc65e5): > // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. Historically, libcontainer internally uses strings for user, group, and additional (aka supplementary) groups. Yet, runc receives those credentials as part of runtime-spec's process, which uses integers for all of them (see [1], [2]). What happens next is: 1. runc start/run/exec converts those credentials to strings (a User string containing "UID:GID", and a []string for additional GIDs) and passes those onto runc init. 2. runc init converts them back to int, in the most complicated way possible (parsing container's /etc/passwd and /etc/group). All this conversion and, especially, parsing is totally unnecessary, but is performed on every container exec (and start). The only benefit of all this is, a libcontainer user could use user and group names instead of numeric IDs (but runc itself is not using this feature, and we don't know if there are any other users of this). Let's remove this back and forth translation, hopefully increasing runc exec performance. The only remaining need to parse /etc/passwd is to set HOME environment variable for a specified UID, in case $HOME is not explicitly set in process.Env. This can now be done right in prepareEnv, which simplifies the code flow a lot. Alas, we can not use standard os/user.LookupId, as it could cache host's /etc/passwd or the current user (even with the osusergo tag). PS Note that the structures being changed (initConfig and Process) are never saved to disk as JSON by runc, so there is no compatibility issue for runc users. Still, this is a breaking change in libcontainer, but we never promised that libcontainer API will be stable (and there's a special package that can handle it -- github.com/moby/sys/user). Reflect this in CHANGELOG. For 3998. 
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user [2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
if err := unix.Setuid(config.UID); err != nil {
if err == unix.EINVAL {
libct: switch to numeric UID/GID/groups This addresses the following TODO in the code (added back in 2015 by commit 845fc65e5): > // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. Historically, libcontainer internally uses strings for user, group, and additional (aka supplementary) groups. Yet, runc receives those credentials as part of runtime-spec's process, which uses integers for all of them (see [1], [2]). What happens next is: 1. runc start/run/exec converts those credentials to strings (a User string containing "UID:GID", and a []string for additional GIDs) and passes those onto runc init. 2. runc init converts them back to int, in the most complicated way possible (parsing container's /etc/passwd and /etc/group). All this conversion and, especially, parsing is totally unnecessary, but is performed on every container exec (and start). The only benefit of all this is, a libcontainer user could use user and group names instead of numeric IDs (but runc itself is not using this feature, and we don't know if there are any other users of this). Let's remove this back and forth translation, hopefully increasing runc exec performance. The only remaining need to parse /etc/passwd is to set HOME environment variable for a specified UID, in case $HOME is not explicitly set in process.Env. This can now be done right in prepareEnv, which simplifies the code flow a lot. Alas, we can not use standard os/user.LookupId, as it could cache host's /etc/passwd or the current user (even with the osusergo tag). PS Note that the structures being changed (initConfig and Process) are never saved to disk as JSON by runc, so there is no compatibility issue for runc users. Still, this is a breaking change in libcontainer, but we never promised that libcontainer API will be stable (and there's a special package that can handle it -- github.com/moby/sys/user). Reflect this in CHANGELOG. For 3998. 
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user [2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
return fmt.Errorf("cannot setuid to unmapped uid %d in user namespace", config.UID)
}
return err
}
return nil
}
libct: switch to numeric UID/GID/groups This addresses the following TODO in the code (added back in 2015 by commit 845fc65e5): > // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. Historically, libcontainer internally uses strings for user, group, and additional (aka supplementary) groups. Yet, runc receives those credentials as part of runtime-spec's process, which uses integers for all of them (see [1], [2]). What happens next is: 1. runc start/run/exec converts those credentials to strings (a User string containing "UID:GID", and a []string for additional GIDs) and passes those onto runc init. 2. runc init converts them back to int, in the most complicated way possible (parsing container's /etc/passwd and /etc/group). All this conversion and, especially, parsing is totally unnecessary, but is performed on every container exec (and start). The only benefit of all this is, a libcontainer user could use user and group names instead of numeric IDs (but runc itself is not using this feature, and we don't know if there are any other users of this). Let's remove this back and forth translation, hopefully increasing runc exec performance. The only remaining need to parse /etc/passwd is to set HOME environment variable for a specified UID, in case $HOME is not explicitly set in process.Env. This can now be done right in prepareEnv, which simplifies the code flow a lot. Alas, we can not use standard os/user.LookupId, as it could cache host's /etc/passwd or the current user (even with the osusergo tag). PS Note that the structures being changed (initConfig and Process) are never saved to disk as JSON by runc, so there is no compatibility issue for runc users. Still, this is a breaking change in libcontainer, but we never promised that libcontainer API will be stable (and there's a special package that can handle it -- github.com/moby/sys/user). Reflect this in CHANGELOG. For 3998. 
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user [2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified uid.
// The ownership needs to match because it is created outside of the container and needs to be
// localized.
libct: switch to numeric UID/GID/groups This addresses the following TODO in the code (added back in 2015 by commit 845fc65e5): > // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. Historically, libcontainer internally uses strings for user, group, and additional (aka supplementary) groups. Yet, runc receives those credentials as part of runtime-spec's process, which uses integers for all of them (see [1], [2]). What happens next is: 1. runc start/run/exec converts those credentials to strings (a User string containing "UID:GID", and a []string for additional GIDs) and passes those onto runc init. 2. runc init converts them back to int, in the most complicated way possible (parsing container's /etc/passwd and /etc/group). All this conversion and, especially, parsing is totally unnecessary, but is performed on every container exec (and start). The only benefit of all this is, a libcontainer user could use user and group names instead of numeric IDs (but runc itself is not using this feature, and we don't know if there are any other users of this). Let's remove this back and forth translation, hopefully increasing runc exec performance. The only remaining need to parse /etc/passwd is to set HOME environment variable for a specified UID, in case $HOME is not explicitly set in process.Env. This can now be done right in prepareEnv, which simplifies the code flow a lot. Alas, we can not use standard os/user.LookupId, as it could cache host's /etc/passwd or the current user (even with the osusergo tag). PS Note that the structures being changed (initConfig and Process) are never saved to disk as JSON by runc, so there is no compatibility issue for runc users. Still, this is a breaking change in libcontainer, but we never promised that libcontainer API will be stable (and there's a special package that can handle it -- github.com/moby/sys/user). Reflect this in CHANGELOG. For 3998. 
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user [2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
func fixStdioPermissions(uid int) error {
	stdio := [...]*os.File{os.Stdin, os.Stdout, os.Stderr}
	for _, f := range stdio {
		var st unix.Stat_t
		if err := unix.Fstat(int(f.Fd()), &st); err != nil {
			return &os.PathError{Op: "fstat", Path: f.Name(), Err: err}
		}

		// A chown is only needed when the owner differs from the
		// requested uid and the fd does not refer to /dev/null.
		needsChown := int(st.Uid) != uid && !isDevNull(&st)
		if !needsChown {
			continue
		}

		// Only the uid is changed; the gid is left alone (-1) because the
		// mount may prefer a different gid and there is no reason to
		// override it. The uid has to change so that a non-root user
		// inside the container can actually use the console.
		err := f.Chown(uid, -1)
		switch {
		case err == nil:
			// Ownership updated.
		case errors.Is(err, unix.EPERM), errors.Is(err, unix.EROFS):
			// EPERM: the inode's current owner is not mapped in our user
			// namespace (privileged_wrt_inode_uidgid() failed).
			// EROFS: can result from a read-only /dev.
			// Either way it is better to leave this stdio file alone
			// than to fail here.
		default:
			// EINVAL is not expected: it would mean uid is unmapped, and
			// this function is expected to be called with a mapped uid.
			return err
		}
	}
	return nil
}
// setupNetwork initializes every network listed in config.Networks, in
// order. For each entry the strategy matching its Type is looked up and its
// initialize method is invoked; the first error encountered is returned.
func setupNetwork(config *initConfig) error {
	for _, nw := range config.Networks {
		strategy, err := getStrategy(nw.Type)
		if err != nil {
			return err
		}
		if err = strategy.initialize(nw); err != nil {
			return err
		}
	}
	return nil
}
// setupRoute adds every route from config.Routes via netlink, in order.
//
// For each route the destination must parse as a CIDR, and both the source
// and the gateway must parse as IP addresses; the named interface must
// exist. The first failure aborts the loop and is returned.
func setupRoute(config *configs.Config) error {
	for _, rt := range config.Routes {
		_, dst, err := net.ParseCIDR(rt.Destination)
		if err != nil {
			return err
		}

		src := net.ParseIP(rt.Source)
		if src == nil {
			return fmt.Errorf("Invalid source for route: %s", rt.Source)
		}

		gw := net.ParseIP(rt.Gateway)
		if gw == nil {
			return fmt.Errorf("Invalid gateway for route: %s", rt.Gateway)
		}

		link, err := netlink.LinkByName(rt.InterfaceName)
		if err != nil {
			return err
		}

		err = netlink.RouteAdd(&netlink.Route{
			Scope:     netlink.SCOPE_UNIVERSE,
			Dst:       dst,
			Src:       src,
			Gw:        gw,
			LinkIndex: link.Attrs().Index,
		})
		if err != nil {
			return err
		}
	}
	return nil
}
// maybeClearRlimitNofileCache looks for an RLIMIT_NOFILE entry in limits and,
// if one is present, passes its soft/hard values to
// system.ClearRlimitNofileCache. Only the first matching entry is used; when
// no entry matches, nothing happens.
func maybeClearRlimitNofileCache(limits []configs.Rlimit) {
	for i := range limits {
		rl := &limits[i]
		if rl.Type != syscall.RLIMIT_NOFILE {
			continue
		}
		system.ClearRlimitNofileCache(&syscall.Rlimit{
			Cur: rl.Soft,
			Max: rl.Hard,
		})
		return
	}
}
// setupRlimits applies each resource limit in limits to the process
// identified by pid using prlimit. It stops at the first failure and returns
// an error that names the offending rlimit type and wraps the cause.
func setupRlimits(limits []configs.Rlimit, pid int) error {
	for _, rl := range limits {
		lim := unix.Rlimit{Cur: rl.Soft, Max: rl.Hard}
		err := unix.Prlimit(pid, rl.Type, &lim, nil)
		if err != nil {
			return fmt.Errorf("error setting rlimit type %v: %w", rl.Type, err)
		}
	}
	return nil
}
// setupScheduler applies the scheduler settings from config.Scheduler using
// sched_setattr. It is a no-op when no scheduler is configured.
//
// An EPERM failure while a cpuset (Cgroups.CpusetCpus) is configured is
// reported with a dedicated message; every other failure is wrapped with
// context.
func setupScheduler(config *initConfig) error {
	sched := config.Scheduler
	if sched == nil {
		return nil
	}

	attr, err := configs.ToSchedAttr(sched)
	if err != nil {
		return err
	}

	err = unix.SchedSetAttr(0, attr, 0)
	switch {
	case err == nil:
		return nil
	case errors.Is(err, unix.EPERM) && config.Config.Cgroups.CpusetCpus != "":
		return errors.New("process scheduler can't be used together with AllowedCPUs")
	default:
		return fmt.Errorf("error setting scheduler: %w", err)
	}
}
// setupIOPriority applies config.IOPriority through the ioprio_set syscall.
// It is a no-op when no I/O priority is configured, and fails on an
// unrecognised priority class or when the syscall reports an error.
func setupIOPriority(config *initConfig) error {
	// "which" argument for ioprio_set. The value 1 is IOPRIO_WHO_PROCESS in
	// the kernel's linux/ioprio.h (IOPRIO_WHO_PGRP is 2).
	const ioprioWhoProcess = 1

	prio := config.IOPriority
	if prio == nil {
		return nil
	}

	// Translate the spec's class name into the kernel's numeric class.
	var class int
	switch prio.Class {
	case specs.IOPRIO_CLASS_RT:
		class = 1
	case specs.IOPRIO_CLASS_BE:
		class = 2
	case specs.IOPRIO_CLASS_IDLE:
		class = 3
	default:
		return fmt.Errorf("invalid io priority class: %s", prio.Class)
	}

	// The class lives in the bits above bit 13, the priority level below:
	// https://github.com/torvalds/linux/blob/v5.18/include/uapi/linux/ioprio.h#L5-L17
	value := class<<13 | prio.Priority

	if _, _, errno := unix.RawSyscall(unix.SYS_IOPRIO_SET, ioprioWhoProcess, 0, uintptr(value)); errno != 0 {
		return fmt.Errorf("failed to set io priority: %w", errno)
	}
	return nil
}
// setupMemoryPolicy applies the container's memory policy, if one is
// configured, by passing the OR of its mode and flags together with its
// node list to linux.SetMempolicy. It returns nil when config.MemoryPolicy
// is unset.
func setupMemoryPolicy(config *configs.Config) error {
	if policy := config.MemoryPolicy; policy != nil {
		modeWithFlags := policy.Mode | policy.Flags
		return linux.SetMempolicy(modeWithFlags, policy.Nodes)
	}
	return nil
}
// setupPersonality applies the personality domain from config.Personality
// via system.SetLinuxPersonality.
//
// Like the other setup helpers in this file, it is a no-op when the option
// is not configured, so a nil config.Personality yields nil instead of a
// nil-pointer dereference.
func setupPersonality(config *configs.Config) error {
	personality := config.Personality
	if personality == nil {
		return nil
	}
	return system.SetLinuxPersonality(personality.Domain)
}
// signalAllProcesses delivers the signal s to every process in the cgroups
// managed by m.
//
// For SIGKILL it first tries writing "1" to cgroup.kill under m.Path("");
// only when that file does not exist does it fall back to the generic path.
// The generic path attempts to freeze the cgroup, lists its pids, signals
// each of them and thaws the cgroup again. Freeze/thaw failures and per-pid
// kill failures (other than ESRCH) are logged as warnings, not returned.
//
// It returns ErrCgroupNotExist if the cgroup does not exist, the cgroup.kill
// write error (other than "not exist"), or the error from listing pids.
func signalAllProcesses(m cgroups.Manager, s unix.Signal) error {
	if !m.Exists() {
		return ErrCgroupNotExist
	}

	// Use cgroup.kill, if available.
	if s == unix.SIGKILL {
		if cgPath := m.Path(""); cgPath != "" { // Either cgroup v2 or hybrid.
			killErr := cgroups.WriteFile(cgPath, "cgroup.kill", "1")
			// A nil error is never ErrNotExist, so this covers both
			// success and every failure except a missing file.
			if !errors.Is(killErr, os.ErrNotExist) {
				return killErr
			}
			// Fallback to old implementation.
		}
	}

	// thaw is best-effort: a failure is only logged.
	thaw := func() {
		if thawErr := m.Freeze(cgroups.Thawed); thawErr != nil {
			logrus.Warn(thawErr)
		}
	}

	// Freezing is best-effort as well; carry on even if it fails.
	if freezeErr := m.Freeze(cgroups.Frozen); freezeErr != nil {
		logrus.Warn(freezeErr)
	}

	pids, err := m.GetAllPids()
	if err != nil {
		thaw()
		return err
	}

	for _, pid := range pids {
		switch killErr := unix.Kill(pid, s); killErr {
		case nil, unix.ESRCH:
			// Signalled, or the process is already gone.
		default:
			logrus.Warnf("kill %d: %v", pid, killErr)
		}
	}

	thaw()
	return nil
}
// setupPidfd opens a pidfd for the current process (os.Getpid) and sends it
// over socket using utils.SendRawFd, passing initType along with it.
//
// The socket is closed before returning in every case, and the local pidfd
// is closed once the send has been attempted. On success the result of
// closing the pidfd is returned; a failed open or send is returned wrapped.
func setupPidfd(socket *os.File, initType string) error {
	defer socket.Close()

	self := os.Getpid()
	fd, openErr := unix.PidfdOpen(self, 0)
	if openErr != nil {
		return fmt.Errorf("failed to pidfd_open: %w", openErr)
	}

	// The local copy of the descriptor is no longer needed after the send
	// attempt, whether or not it succeeded.
	sendErr := utils.SendRawFd(socket, initType, uintptr(fd))
	closeErr := unix.Close(fd)

	if sendErr != nil {
		// The send failure takes precedence; the close result is dropped.
		return fmt.Errorf("failed to send pidfd on socket: %w", sendErr)
	}
	return closeErr
}