2015-02-06 12:48:57 -08:00
|
|
|
package libcontainer
|
|
|
|
|
|
|
|
|
|
import (
|
2020-09-29 12:12:10 +02:00
|
|
|
"bytes"
|
2015-02-06 12:48:57 -08:00
|
|
|
"encoding/json"
|
2021-06-11 16:32:40 -07:00
|
|
|
"errors"
|
2015-02-06 12:48:57 -08:00
|
|
|
"fmt"
|
2025-07-18 15:33:56 +10:00
|
|
|
"io"
|
2015-06-26 11:38:23 -07:00
|
|
|
"net"
|
2015-02-06 12:48:57 -08:00
|
|
|
"os"
|
2023-12-26 23:53:07 +11:00
|
|
|
"path/filepath"
|
2023-04-19 14:52:30 -07:00
|
|
|
"runtime"
|
2023-04-11 16:38:50 -07:00
|
|
|
"runtime/debug"
|
|
|
|
|
"strconv"
|
2024-05-23 16:29:58 -07:00
|
|
|
"syscall"
|
2015-02-06 12:48:57 -08:00
|
|
|
|
2017-05-19 18:18:43 +01:00
|
|
|
"github.com/containerd/console"
|
2021-06-11 16:32:40 -07:00
|
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
|
|
|
"github.com/sirupsen/logrus"
|
|
|
|
|
"github.com/vishvananda/netlink"
|
|
|
|
|
"golang.org/x/sys/unix"
|
|
|
|
|
|
2025-02-25 17:32:28 -08:00
|
|
|
"github.com/opencontainers/cgroups"
|
2025-03-25 14:36:42 -07:00
|
|
|
"github.com/opencontainers/runc/internal/linux"
|
2025-07-18 15:33:56 +10:00
|
|
|
"github.com/opencontainers/runc/internal/pathrs"
|
2020-09-29 11:02:48 +02:00
|
|
|
"github.com/opencontainers/runc/libcontainer/capabilities"
|
2015-06-21 19:29:59 -07:00
|
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
|
|
|
|
"github.com/opencontainers/runc/libcontainer/system"
|
|
|
|
|
"github.com/opencontainers/runc/libcontainer/utils"
|
2015-02-06 12:48:57 -08:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// initType identifies which kind of init is to be performed. Its value is
// taken from the _LIBCONTAINER_INITTYPE environment variable by
// startInitialization and is used by containerInit to pick the
// implementation to run.
type initType string

const (
	// initSetns makes containerInit run linuxSetnsInit.
	initSetns initType = "setns"
	// initStandard makes containerInit run linuxStandardInit. This is the
	// only init type for which _LIBCONTAINER_FIFOFD is read.
	initStandard initType = "standard"
)
|
|
|
|
|
|
2015-02-09 15:16:27 -08:00
|
|
|
type pid struct {
|
2020-06-08 01:24:30 +10:00
|
|
|
Pid int `json:"stage2_pid"`
|
|
|
|
|
PidFirstChild int `json:"stage1_pid"`
|
2015-02-09 15:16:27 -08:00
|
|
|
}
|
|
|
|
|
|
2015-02-10 11:51:45 -08:00
|
|
|
// network is an internal struct used to setup container networks.
|
|
|
|
|
type network struct {
|
|
|
|
|
configs.Network
|
|
|
|
|
|
2015-07-24 11:19:25 +09:00
|
|
|
// TempVethPeerName is a unique temporary veth peer name that was placed into
|
2015-02-10 11:51:45 -08:00
|
|
|
// the container's namespace.
|
|
|
|
|
TempVethPeerName string `json:"temp_veth_peer_name"`
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-07 19:15:33 -08:00
|
|
|
// initConfig is used for transferring parameters from Exec() to Init().
|
|
|
|
|
// It contains:
|
|
|
|
|
// - original container config;
|
|
|
|
|
// - some [Process] properties;
|
|
|
|
|
// - set of properties merged from the container config ([configs.Config])
|
|
|
|
|
// and the process ([Process]);
|
|
|
|
|
// - some properties that come from the container.
|
|
|
|
|
//
|
|
|
|
|
// When adding new fields, please make sure they go into the relevant section.
|
2015-02-06 12:48:57 -08:00
|
|
|
type initConfig struct {
|
2025-01-07 19:15:33 -08:00
|
|
|
// Config is the original container config.
|
|
|
|
|
Config *configs.Config `json:"config"`
|
|
|
|
|
|
|
|
|
|
// Properties that are unique to and come from [Process].
|
|
|
|
|
|
|
|
|
|
Args []string `json:"args"`
|
|
|
|
|
Env []string `json:"env"`
|
|
|
|
|
UID int `json:"uid"`
|
|
|
|
|
GID int `json:"gid"`
|
|
|
|
|
AdditionalGroups []int `json:"additional_groups"`
|
|
|
|
|
Cwd string `json:"cwd"`
|
|
|
|
|
CreateConsole bool `json:"create_console"`
|
|
|
|
|
ConsoleWidth uint16 `json:"console_width"`
|
|
|
|
|
ConsoleHeight uint16 `json:"console_height"`
|
|
|
|
|
PassedFilesCount int `json:"passed_files_count"`
|
|
|
|
|
|
|
|
|
|
// Properties that exists both in the container config and the process,
|
|
|
|
|
// as merged by [Container.newInitConfig] (process properties has preference).
|
|
|
|
|
|
|
|
|
|
AppArmorProfile string `json:"apparmor_profile"`
|
|
|
|
|
Capabilities *configs.Capabilities `json:"capabilities"`
|
|
|
|
|
NoNewPrivileges bool `json:"no_new_privileges"`
|
|
|
|
|
ProcessLabel string `json:"process_label"`
|
|
|
|
|
Rlimits []configs.Rlimit `json:"rlimits"`
|
2025-01-07 19:48:58 -08:00
|
|
|
IOPriority *configs.IOPriority `json:"io_priority,omitempty"`
|
2025-01-08 14:11:02 -08:00
|
|
|
Scheduler *configs.Scheduler `json:"scheduler,omitempty"`
|
runc exec: implement CPU affinity
As per
- https://github.com/opencontainers/runtime-spec/pull/1253
- https://github.com/opencontainers/runtime-spec/pull/1261
CPU affinity can be set in two ways:
1. When creating/starting a container, in config.json's
Process.ExecCPUAffinity, which is when applied to all execs.
2. When running an exec, in process.json's CPUAffinity, which
applied to a given exec and overrides the value from (1).
Add some basic tests.
Note that older kernels (RHEL8, Ubuntu 20.04) change CPU affinity of a
process to that of a container's cgroup, as soon as it is moved to that
cgroup, while newer kernels (Ubuntu 24.04, Fedora 41) don't do that.
Because of the above,
- it's impossible to really test initial CPU affinity without adding
debug logging to libcontainer/nsenter;
- for older kernels, there can be a brief moment when exec's affinity
is different than either initial or final affinity being set;
- exec's final CPU affinity, if not specified, can be different
depending on the kernel, therefore we don't test it.
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2024-10-21 15:50:38 -07:00
|
|
|
CPUAffinity *configs.CPUAffinity `json:"cpu_affinity,omitempty"`
|
2025-01-07 19:15:33 -08:00
|
|
|
|
|
|
|
|
// Miscellaneous properties, filled in by [Container.newInitConfig]
|
|
|
|
|
// unless documented otherwise.
|
|
|
|
|
|
|
|
|
|
ContainerID string `json:"containerid"`
|
|
|
|
|
Cgroup2Path string `json:"cgroup2_path,omitempty"`
|
|
|
|
|
|
|
|
|
|
// Networks is filled in from container config by [initProcess.createNetworkInterfaces].
|
|
|
|
|
Networks []*network `json:"network"`
|
|
|
|
|
|
|
|
|
|
// SpecState is filled in by [initProcess.Start].
|
|
|
|
|
SpecState *specs.State `json:"spec_state,omitempty"`
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
|
|
|
|
|
2023-04-19 14:52:30 -07:00
|
|
|
// Init is part of "runc init" implementation.
|
|
|
|
|
func Init() {
|
|
|
|
|
runtime.GOMAXPROCS(1)
|
|
|
|
|
runtime.LockOSThread()
|
|
|
|
|
|
|
|
|
|
if err := startInitialization(); err != nil {
|
|
|
|
|
// If the error is returned, it was not communicated
|
|
|
|
|
// back to the parent (which is not a common case),
|
|
|
|
|
// so print it to stderr here as a last resort.
|
|
|
|
|
//
|
|
|
|
|
// Do not use logrus as we are not sure if it has been
|
|
|
|
|
// set up yet, but most important, if the parent is
|
|
|
|
|
// alive (and its log forwarding is working).
|
|
|
|
|
fmt.Fprintln(os.Stderr, err)
|
|
|
|
|
}
|
|
|
|
|
// Normally, StartInitialization() never returns, meaning
|
|
|
|
|
// if we are here, it had failed.
|
2024-01-02 14:58:28 +11:00
|
|
|
os.Exit(255)
|
2023-04-19 14:52:30 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Normally, this function does not return. If it returns, with or without an
|
|
|
|
|
// error, it means the initialization has failed. If the error is returned,
|
|
|
|
|
// it means the error can not be communicated back to the parent.
|
|
|
|
|
func startInitialization() (retErr error) {
|
2023-10-03 17:48:04 -07:00
|
|
|
// Get the synchronisation pipe.
|
2023-08-17 21:20:33 +10:00
|
|
|
envSyncPipe := os.Getenv("_LIBCONTAINER_SYNCPIPE")
|
|
|
|
|
syncPipeFd, err := strconv.Atoi(envSyncPipe)
|
2023-04-11 16:38:50 -07:00
|
|
|
if err != nil {
|
2023-08-17 21:20:33 +10:00
|
|
|
return fmt.Errorf("unable to convert _LIBCONTAINER_SYNCPIPE: %w", err)
|
2023-04-11 16:38:50 -07:00
|
|
|
}
|
2023-08-17 21:20:33 +10:00
|
|
|
syncPipe := newSyncSocket(os.NewFile(uintptr(syncPipeFd), "sync"))
|
|
|
|
|
defer syncPipe.Close()
|
2023-04-11 16:38:50 -07:00
|
|
|
|
|
|
|
|
defer func() {
|
2023-04-19 14:52:30 -07:00
|
|
|
// If this defer is ever called, this means initialization has failed.
|
2024-01-14 12:54:32 +08:00
|
|
|
// Send the error back to the parent process in the form of an initError
|
|
|
|
|
// if the sync socket has not been closed.
|
|
|
|
|
if syncPipe.isClosed() {
|
|
|
|
|
return
|
|
|
|
|
}
|
2023-08-08 11:33:06 +10:00
|
|
|
ierr := initError{Message: retErr.Error()}
|
2023-08-17 21:20:33 +10:00
|
|
|
if err := writeSyncArg(syncPipe, procError, ierr); err != nil {
|
2023-04-19 14:52:30 -07:00
|
|
|
fmt.Fprintln(os.Stderr, err)
|
2023-04-11 16:38:50 -07:00
|
|
|
return
|
|
|
|
|
}
|
2023-04-19 14:52:30 -07:00
|
|
|
// The error is sent, no need to also return it (or it will be reported twice).
|
|
|
|
|
retErr = nil
|
2023-04-11 16:38:50 -07:00
|
|
|
}()
|
|
|
|
|
|
2023-08-17 21:20:33 +10:00
|
|
|
// Get the INITPIPE.
|
|
|
|
|
envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
|
|
|
|
|
initPipeFd, err := strconv.Atoi(envInitPipe)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err)
|
|
|
|
|
}
|
|
|
|
|
initPipe := os.NewFile(uintptr(initPipeFd), "init")
|
|
|
|
|
defer initPipe.Close()
|
|
|
|
|
|
2021-12-09 13:01:03 -08:00
|
|
|
// Set up logging. This is used rarely, and mostly for init debugging.
|
|
|
|
|
|
|
|
|
|
// Passing log level is optional; currently libcontainer/integration does not do it.
|
|
|
|
|
if levelStr := os.Getenv("_LIBCONTAINER_LOGLEVEL"); levelStr != "" {
|
|
|
|
|
logLevel, err := strconv.Atoi(levelStr)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return fmt.Errorf("unable to convert _LIBCONTAINER_LOGLEVEL: %w", err)
|
|
|
|
|
}
|
|
|
|
|
logrus.SetLevel(logrus.Level(logLevel))
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-20 16:58:28 +11:00
|
|
|
logFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGPIPE"))
|
2021-12-09 13:01:03 -08:00
|
|
|
if err != nil {
|
|
|
|
|
return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err)
|
|
|
|
|
}
|
2024-01-20 16:58:28 +11:00
|
|
|
logPipe := os.NewFile(uintptr(logFd), "logpipe")
|
2025-11-12 13:13:48 -08:00
|
|
|
defer logPipe.Close()
|
2021-12-09 13:01:03 -08:00
|
|
|
|
2024-01-20 16:58:28 +11:00
|
|
|
logrus.SetOutput(logPipe)
|
2021-12-09 13:01:03 -08:00
|
|
|
logrus.SetFormatter(new(logrus.JSONFormatter))
|
|
|
|
|
logrus.Debug("child process in init()")
|
|
|
|
|
|
2023-04-11 16:38:50 -07:00
|
|
|
// Only init processes have FIFOFD.
|
2024-01-20 16:58:28 +11:00
|
|
|
var fifoFile *os.File
|
2023-04-11 16:38:50 -07:00
|
|
|
envInitType := os.Getenv("_LIBCONTAINER_INITTYPE")
|
|
|
|
|
it := initType(envInitType)
|
|
|
|
|
if it == initStandard {
|
2024-01-20 16:58:28 +11:00
|
|
|
fifoFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_FIFOFD"))
|
|
|
|
|
if err != nil {
|
2023-04-11 16:38:50 -07:00
|
|
|
return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD: %w", err)
|
|
|
|
|
}
|
2024-01-20 16:58:28 +11:00
|
|
|
fifoFile = os.NewFile(uintptr(fifoFd), "initfifo")
|
2025-11-12 13:13:48 -08:00
|
|
|
defer fifoFile.Close()
|
2023-04-11 16:38:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var consoleSocket *os.File
|
|
|
|
|
if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" {
|
|
|
|
|
console, err := strconv.Atoi(envConsole)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE: %w", err)
|
|
|
|
|
}
|
|
|
|
|
consoleSocket = os.NewFile(uintptr(console), "console-socket")
|
|
|
|
|
defer consoleSocket.Close()
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-01 11:09:49 +08:00
|
|
|
var pidfdSocket *os.File
|
|
|
|
|
if envSockFd := os.Getenv("_LIBCONTAINER_PIDFD_SOCK"); envSockFd != "" {
|
|
|
|
|
sockFd, err := strconv.Atoi(envSockFd)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return fmt.Errorf("unable to convert _LIBCONTAINER_PIDFD_SOCK: %w", err)
|
|
|
|
|
}
|
|
|
|
|
pidfdSocket = os.NewFile(uintptr(sockFd), "pidfd-socket")
|
|
|
|
|
defer pidfdSocket.Close()
|
|
|
|
|
}
|
|
|
|
|
|
libct: speedup process.Env handling
The current implementation sets all the environment variables passed in
Process.Env in the current process, one by one, then uses os.Environ to
read those back.
As pointed out in [1], this is slow, as runc calls os.Setenv for every
variable, and there may be a few thousands of those. Looking into how
os.Setenv is implemented, it is indeed slow, especially when cgo is
enabled.
Looking into why it was implemented the way it is, I found commit
9744d72c and traced it to [2], which discusses the actual reasons.
It boils down to these two:
- HOME is not passed into container as it is set in setupUser by
os.Setenv and has no effect on config.Env;
- there is a need to deduplicate the environment variables.
Yet it was decided in [2] to not go ahead with this patch, but
later [3] was opened with the carry of this patch, and merged.
Now, from what I see:
1. Passing environment to exec is way faster than using os.Setenv and
os.Environ (tests show ~20x speed improvement in a simple Go test,
and ~3x improvement in real-world test, see below).
2. Setting environment variables in the runc context may result is some
ugly side effects (think GODEBUG, LD_PRELOAD, or _LIBCONTAINER_*).
3. Nothing in runtime spec says that the environment needs to be
deduplicated, or the order of preference (whether the first or the
last value of a variable with the same name is to be used). We should
stick to what we have in order to maintain backward compatibility.
So, this patch:
- switches to passing env directly to exec;
- adds deduplication mechanism to retain backward compatibility;
- takes care to set PATH from process.Env in the current process
(so that supplied PATH is used to find the binary to execute),
also to retain backward compatibility;
- adds HOME to process.Env if not set;
- ensures any StartContainer CommandHook entries with no environment
set explicitly are run with the same environment as before. Thanks
to @lifubang who noticed that peculiarity.
The benchmark added by the previous commit shows ~3x improvement:
│ before │ after │
│ sec/op │ sec/op vs base │
ExecInBigEnv-20 61.53m ± 1% 21.87m ± 16% -64.46% (p=0.000 n=10)
[1]: https://github.com/opencontainers/runc/pull/1983
[2]: https://github.com/docker-archive/libcontainer/pull/418
[3]: https://github.com/docker-archive/libcontainer/pull/432
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2024-06-23 16:31:57 -07:00
|
|
|
// From here on, we don't need current process environment. It is not
|
|
|
|
|
// used directly anywhere below this point, but let's clear it anyway.
|
2023-04-11 16:38:50 -07:00
|
|
|
os.Clearenv()
|
|
|
|
|
|
|
|
|
|
defer func() {
|
2022-08-04 16:33:57 -07:00
|
|
|
if err := recover(); err != nil {
|
|
|
|
|
if err2, ok := err.(error); ok {
|
|
|
|
|
retErr = fmt.Errorf("panic from initialization: %w, %s", err2, debug.Stack())
|
2023-04-11 16:38:50 -07:00
|
|
|
} else {
|
2022-08-04 16:33:57 -07:00
|
|
|
retErr = fmt.Errorf("panic from initialization: %v, %s", err, debug.Stack())
|
2023-04-11 16:38:50 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}()
|
|
|
|
|
|
2023-08-17 21:20:33 +10:00
|
|
|
var config initConfig
|
|
|
|
|
if err := json.NewDecoder(initPipe).Decode(&config); err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
|
2021-12-09 14:22:34 -08:00
|
|
|
// If init succeeds, it will not return, hence none of the defers will be called.
|
2024-10-28 17:22:19 +08:00
|
|
|
return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifoFile, logPipe)
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
|
|
|
|
|
2024-10-28 17:22:19 +08:00
|
|
|
func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket, fifoFile, logPipe *os.File) error {
|
2024-03-29 18:12:08 +08:00
|
|
|
// Clean the RLIMIT_NOFILE cache in go runtime.
|
|
|
|
|
// Issue: https://github.com/opencontainers/runc/issues/4195
|
2024-05-23 16:29:58 -07:00
|
|
|
maybeClearRlimitNofileCache(config.Rlimits)
|
2024-03-29 18:12:08 +08:00
|
|
|
|
2015-02-06 12:48:57 -08:00
|
|
|
switch t {
|
|
|
|
|
case initSetns:
|
2021-12-09 14:22:34 -08:00
|
|
|
i := &linuxSetnsInit{
|
2017-03-02 12:53:06 -08:00
|
|
|
pipe: pipe,
|
|
|
|
|
consoleSocket: consoleSocket,
|
2023-10-01 11:09:49 +08:00
|
|
|
pidfdSocket: pidfdSocket,
|
2017-03-02 12:53:06 -08:00
|
|
|
config: config,
|
2024-01-20 16:58:28 +11:00
|
|
|
logPipe: logPipe,
|
2021-12-09 14:22:34 -08:00
|
|
|
}
|
|
|
|
|
return i.Init()
|
2015-02-06 12:48:57 -08:00
|
|
|
case initStandard:
|
2021-12-09 14:22:34 -08:00
|
|
|
i := &linuxStandardInit{
|
2017-03-02 12:53:06 -08:00
|
|
|
pipe: pipe,
|
|
|
|
|
consoleSocket: consoleSocket,
|
2023-10-01 11:09:49 +08:00
|
|
|
pidfdSocket: pidfdSocket,
|
2017-05-09 17:38:27 -04:00
|
|
|
parentPid: unix.Getppid(),
|
2017-03-02 12:53:06 -08:00
|
|
|
config: config,
|
2024-01-20 16:58:28 +11:00
|
|
|
fifoFile: fifoFile,
|
|
|
|
|
logPipe: logPipe,
|
2021-12-09 14:22:34 -08:00
|
|
|
}
|
|
|
|
|
return i.Init()
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
2021-12-09 14:22:34 -08:00
|
|
|
return fmt.Errorf("unknown init type %q", t)
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
|
|
|
|
|
2023-12-26 23:53:07 +11:00
|
|
|
// verifyCwd ensures that the current directory is actually inside the mount
|
|
|
|
|
// namespace root of the current process.
|
|
|
|
|
func verifyCwd() error {
|
|
|
|
|
// getcwd(2) on Linux detects if cwd is outside of the rootfs of the
|
|
|
|
|
// current mount namespace root, and in that case prefixes "(unreachable)"
|
|
|
|
|
// to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
|
|
|
|
|
// when this happens and return ENOENT rather than returning a non-absolute
|
|
|
|
|
// path. In both cases we can therefore easily detect if we have an invalid
|
|
|
|
|
// cwd by checking the return value of getcwd(3). See getcwd(3) for more
|
|
|
|
|
// details, and CVE-2024-21626 for the security issue that motivated this
|
|
|
|
|
// check.
|
|
|
|
|
//
|
2025-03-25 14:36:42 -07:00
|
|
|
// We do not use os.Getwd() here because it has a workaround for
|
2023-12-26 23:53:07 +11:00
|
|
|
// $PWD which involves doing stat(.), which can fail if the current
|
|
|
|
|
// directory is inaccessible to the container process.
|
2025-03-25 14:36:42 -07:00
|
|
|
if wd, err := linux.Getwd(); errors.Is(err, unix.ENOENT) {
|
2023-12-26 23:53:07 +11:00
|
|
|
return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
|
|
|
|
|
} else if err != nil {
|
|
|
|
|
return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
|
|
|
|
|
} else if !filepath.IsAbs(wd) {
|
|
|
|
|
// We shouldn't ever hit this, but check just in case.
|
|
|
|
|
return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2015-02-06 12:48:57 -08:00
|
|
|
// finalizeNamespace drops the caps, sets the correct user
|
2015-02-13 19:06:17 -05:00
|
|
|
// and working dir, and closes any leaked file descriptors
|
libct: speedup process.Env handling
The current implementation sets all the environment variables passed in
Process.Env in the current process, one by one, then uses os.Environ to
read those back.
As pointed out in [1], this is slow, as runc calls os.Setenv for every
variable, and there may be a few thousands of those. Looking into how
os.Setenv is implemented, it is indeed slow, especially when cgo is
enabled.
Looking into why it was implemented the way it is, I found commit
9744d72c and traced it to [2], which discusses the actual reasons.
It boils down to these two:
- HOME is not passed into container as it is set in setupUser by
os.Setenv and has no effect on config.Env;
- there is a need to deduplicate the environment variables.
Yet it was decided in [2] to not go ahead with this patch, but
later [3] was opened with the carry of this patch, and merged.
Now, from what I see:
1. Passing environment to exec is way faster than using os.Setenv and
os.Environ (tests show ~20x speed improvement in a simple Go test,
and ~3x improvement in real-world test, see below).
2. Setting environment variables in the runc context may result is some
ugly side effects (think GODEBUG, LD_PRELOAD, or _LIBCONTAINER_*).
3. Nothing in runtime spec says that the environment needs to be
deduplicated, or the order of preference (whether the first or the
last value of a variable with the same name is to be used). We should
stick to what we have in order to maintain backward compatibility.
So, this patch:
- switches to passing env directly to exec;
- adds deduplication mechanism to retain backward compatibility;
- takes care to set PATH from process.Env in the current process
(so that supplied PATH is used to find the binary to execute),
also to retain backward compatibility;
- adds HOME to process.Env if not set;
- ensures any StartContainer CommandHook entries with no environment
set explicitly are run with the same environment as before. Thanks
to @lifubang who noticed that peculiarity.
The benchmark added by the previous commit shows ~3x improvement:
│ before │ after │
│ sec/op │ sec/op vs base │
ExecInBigEnv-20 61.53m ± 1% 21.87m ± 16% -64.46% (p=0.000 n=10)
[1]: https://github.com/opencontainers/runc/pull/1983
[2]: https://github.com/docker-archive/libcontainer/pull/418
[3]: https://github.com/docker-archive/libcontainer/pull/432
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2024-06-23 16:31:57 -07:00
|
|
|
// before executing the command inside the namespace.
|
libct: switch to numeric UID/GID/groups
This addresses the following TODO in the code (added back in 2015
by commit 845fc65e5):
> // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
Historically, libcontainer internally uses strings for user, group, and
additional (aka supplementary) groups.
Yet, runc receives those credentials as part of runtime-spec's process,
which uses integers for all of them (see [1], [2]).
What happens next is:
1. runc start/run/exec converts those credentials to strings (a User
string containing "UID:GID", and a []string for additional GIDs) and
passes those onto runc init.
2. runc init converts them back to int, in the most complicated way
possible (parsing container's /etc/passwd and /etc/group).
All this conversion and, especially, parsing is totally unnecessary,
but is performed on every container exec (and start).
The only benefit of all this is, a libcontainer user could use user and
group names instead of numeric IDs (but runc itself is not using this
feature, and we don't know if there are any other users of this).
Let's remove this back and forth translation, hopefully increasing
runc exec performance.
The only remaining need to parse /etc/passwd is to set HOME environment
variable for a specified UID, in case $HOME is not explicitly set in
process.Env. This can now be done right in prepareEnv, which simplifies
the code flow a lot. Alas, we can not use standard os/user.LookupId, as
it could cache host's /etc/passwd or the current user (even with the
osusergo tag).
PS Note that the structures being changed (initConfig and Process) are
never saved to disk as JSON by runc, so there is no compatibility issue
for runc users.
Still, this is a breaking change in libcontainer, but we never promised
that libcontainer API will be stable (and there's a special package
that can handle it -- github.com/moby/sys/user). Reflect this in
CHANGELOG.
For 3998.
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user
[2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
|
|
|
func finalizeNamespace(config *initConfig) error {
|
2015-03-31 23:40:05 +02:00
|
|
|
// Ensure that all unwanted fds we may have accidentally
|
2015-02-06 12:48:57 -08:00
|
|
|
// inherited are marked close-on-exec so they stay out of the
|
|
|
|
|
// container
|
2015-03-31 23:40:05 +02:00
|
|
|
if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
|
2021-06-11 16:32:40 -07:00
|
|
|
return fmt.Errorf("error closing exec fds: %w", err)
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
2015-03-25 15:40:32 -04:00
|
|
|
|
2021-04-06 13:48:16 -04:00
|
|
|
// we only do chdir if it's specified
|
|
|
|
|
doChdir := config.Cwd != ""
|
|
|
|
|
if doChdir {
|
|
|
|
|
// First, attempt the chdir before setting up the user.
|
|
|
|
|
// This could allow us to access a directory that the user running runc can access
|
|
|
|
|
// but the container user cannot.
|
|
|
|
|
err := unix.Chdir(config.Cwd)
|
|
|
|
|
switch {
|
|
|
|
|
case err == nil:
|
|
|
|
|
doChdir = false
|
2025-11-17 00:28:05 +01:00
|
|
|
case errors.Is(err, os.ErrPermission):
|
2021-04-06 13:48:16 -04:00
|
|
|
// If we hit an EPERM, we should attempt again after setting up user.
|
|
|
|
|
// This will allow us to successfully chdir if the container user has access
|
|
|
|
|
// to the directory, but the user running runc does not.
|
|
|
|
|
// This is useful in cases where the cwd is also a volume that's been chowned to the container user.
|
|
|
|
|
default:
|
2021-06-08 20:05:54 -07:00
|
|
|
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
|
2021-04-06 13:48:16 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-03-25 02:48:44 +00:00
|
|
|
// We should set envs after we are in the jail of the container.
|
|
|
|
|
// Please see https://github.com/opencontainers/runc/issues/4688
|
|
|
|
|
env, err := prepareEnv(config.Env, config.UID)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
config.Env = env
|
|
|
|
|
|
2025-01-07 19:32:27 -08:00
|
|
|
w, err := capabilities.New(config.Capabilities)
|
2015-02-09 15:38:28 -08:00
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
2015-02-06 12:48:57 -08:00
|
|
|
// drop capabilities in bounding set before changing user
|
2017-03-14 09:36:38 -07:00
|
|
|
if err := w.ApplyBoundingSet(); err != nil {
|
2021-06-11 16:32:40 -07:00
|
|
|
return fmt.Errorf("unable to apply bounding set: %w", err)
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
|
|
|
|
// preserve existing capabilities while we change users
|
|
|
|
|
if err := system.SetKeepCaps(); err != nil {
|
2021-06-11 16:32:40 -07:00
|
|
|
return fmt.Errorf("unable to set keep caps: %w", err)
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
libct: switch to numeric UID/GID/groups
This addresses the following TODO in the code (added back in 2015
by commit 845fc65e5):
> // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
Historically, libcontainer internally uses strings for user, group, and
additional (aka supplementary) groups.
Yet, runc receives those credentials as part of runtime-spec's process,
which uses integers for all of them (see [1], [2]).
What happens next is:
1. runc start/run/exec converts those credentials to strings (a User
string containing "UID:GID", and a []string for additional GIDs) and
passes those onto runc init.
2. runc init converts them back to int, in the most complicated way
possible (parsing container's /etc/passwd and /etc/group).
All this conversion and, especially, parsing is totally unnecessary,
but is performed on every container exec (and start).
The only benefit of all this is, a libcontainer user could use user and
group names instead of numeric IDs (but runc itself is not using this
feature, and we don't know if there are any other users of this).
Let's remove this back and forth translation, hopefully increasing
runc exec performance.
The only remaining need to parse /etc/passwd is to set HOME environment
variable for a specified UID, in case $HOME is not explicitly set in
process.Env. This can now be done right in prepareEnv, which simplifies
the code flow a lot. Alas, we can not use standard os/user.LookupId, as
it could cache host's /etc/passwd or the current user (even with the
osusergo tag).
PS Note that the structures being changed (initConfig and Process) are
never saved to disk as JSON by runc, so there is no compatibility issue
for runc users.
Still, this is a breaking change in libcontainer, but we never promised
that libcontainer API will be stable (and there's a special package
that can handle it -- github.com/moby/sys/user). Reflect this in
CHANGELOG.
For 3998.
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user
[2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
|
|
|
if err := setupUser(config); err != nil {
|
2021-06-11 16:32:40 -07:00
|
|
|
return fmt.Errorf("unable to setup user: %w", err)
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
2021-04-06 13:48:16 -04:00
|
|
|
// Change working directory AFTER the user has been set up, if we haven't done it yet.
|
|
|
|
|
if doChdir {
|
2020-11-19 09:38:05 -05:00
|
|
|
if err := unix.Chdir(config.Cwd); err != nil {
|
2021-06-08 20:05:54 -07:00
|
|
|
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
|
2020-11-19 09:38:05 -05:00
|
|
|
}
|
|
|
|
|
}
|
2023-12-26 23:53:07 +11:00
|
|
|
// Make sure our final working directory is inside the container.
|
|
|
|
|
if err := verifyCwd(); err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
2015-02-06 12:48:57 -08:00
|
|
|
if err := system.ClearKeepCaps(); err != nil {
|
2021-06-11 16:32:40 -07:00
|
|
|
return fmt.Errorf("unable to clear keep caps: %w", err)
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
2017-03-14 09:36:38 -07:00
|
|
|
if err := w.ApplyCaps(); err != nil {
|
2021-06-11 16:32:40 -07:00
|
|
|
return fmt.Errorf("unable to apply caps: %w", err)
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2016-06-04 01:29:34 +10:00
|
|
|
// setupConsole sets up the console from inside the container, and sends the
|
|
|
|
|
// master pty fd to the config.Pipe (using cmsg). This is done to ensure that
|
|
|
|
|
// consoles are scoped to a container properly (see runc#814 and the many
|
|
|
|
|
// issues related to that). This has to be run *after* we've pivoted to the new
|
|
|
|
|
// rootfs (and the users' configuration is entirely set up).
|
2017-03-02 12:53:06 -08:00
|
|
|
func setupConsole(socket *os.File, config *initConfig, mount bool) error {
|
|
|
|
|
defer socket.Close()
|
2016-09-14 19:02:53 +10:00
|
|
|
// At this point, /dev/ptmx points to something that we would expect. We
|
|
|
|
|
// used to change the owner of the slave path, but since the /dev/pts mount
|
|
|
|
|
// can have gid=X set (at the users' option). So touching the owner of the
|
|
|
|
|
// slave PTY is not necessary, as the kernel will handle that for us. Note
|
|
|
|
|
// however, that setupUser (specifically fixStdioPermissions) *will* change
|
|
|
|
|
// the UID owner of the console to be the user the process will run as (so
|
|
|
|
|
// they can actually control their console).
|
2017-09-26 15:39:46 +02:00
|
|
|
|
console: use TIOCGPTPEER when allocating peer PTY
When opening the peer end of a pty, the old kernel API required us to
open /dev/pts/$num inside the container (at least since we fixed console
handling many years ago in commit 244c9fc426ae ("*: console rewrite")).
The problem is that in a hostile container it is possible for
/dev/pts/$num to be an attacker-controlled symlink that runc can be
tricked into resolving when doing bind-mounts. This allows the attacker
to (among other things) persist /proc/... entries that are later masked
by runc, allowing an attacker to escape through the kernel.core_pattern
sysctl (/proc/sys/kernel/core_pattern). This is the original issue
reported by Lei Wang and Li Fu Bang in CVE-2025-52565.
However, it should be noted that this is not entirely a newly-discovered
problem. Way back in Linux 4.13 (2017), I added the TIOCGPTPEER ioctl,
which allows us to get a pty peer without touching the /dev/pts inside
the container. The original threat model was around an attacker
replacing /dev/pts/$n or /dev/pts/ptmx with some malicious inode (a DoS
inode, or possibly a PTY they wanted a confused deputy to operate on).
Unfortunately, there was no practical way for runc to cache a safe
O_PATH handle to /dev/pts/ptmx (unlike other runtimes like LXC, which
switched to TIOCGPTPEER way back in 2017). Since it wasn't clear how we
could protect against the main attack TIOCGPTPEER was meant to protect
against, we never switched to it (even though I implemented it
specifically to harden container runtimes).
Unfortunately, It turns out that mount *sources* are a threat we didn't
fully consider. Since TIOCGPTPEER already solves this problem entirely
for us in a race free way, we should just use that. In a later patch, we
will add some hardening for /dev/pts/$num opening to maintain support
for very old kernels (Linux 4.13 is very old at this point, but RHEL 7
is still kicking and is stuck on Linux 3.10).
Fixes: GHSA-qw9x-cqr3-wc7r CVE-2025-52565
Reported-by: Lei Wang <ssst0n3@gmail.com> (CVE-2025-52565)
Reported-by: lfbzhm <lifubang@acmcoder.com> (CVE-2025-52565)
Reported-by: Aleksa Sarai <cyphar@cyphar.com> (TIOCGPTPEER)
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2025-05-15 16:12:21 +10:00
|
|
|
pty, peerPty, err := safeAllocPty()
|
2016-06-04 01:29:34 +10:00
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
2020-07-05 12:10:47 +08:00
|
|
|
// After we return from here, we don't need the console anymore.
|
|
|
|
|
defer pty.Close()
|
console: use TIOCGPTPEER when allocating peer PTY
When opening the peer end of a pty, the old kernel API required us to
open /dev/pts/$num inside the container (at least since we fixed console
handling many years ago in commit 244c9fc426ae ("*: console rewrite")).
The problem is that in a hostile container it is possible for
/dev/pts/$num to be an attacker-controlled symlink that runc can be
tricked into resolving when doing bind-mounts. This allows the attacker
to (among other things) persist /proc/... entries that are later masked
by runc, allowing an attacker to escape through the kernel.core_pattern
sysctl (/proc/sys/kernel/core_pattern). This is the original issue
reported by Lei Wang and Li Fu Bang in CVE-2025-52565.
However, it should be noted that this is not entirely a newly-discovered
problem. Way back in Linux 4.13 (2017), I added the TIOCGPTPEER ioctl,
which allows us to get a pty peer without touching the /dev/pts inside
the container. The original threat model was around an attacker
replacing /dev/pts/$n or /dev/pts/ptmx with some malicious inode (a DoS
inode, or possibly a PTY they wanted a confused deputy to operate on).
Unfortunately, there was no practical way for runc to cache a safe
O_PATH handle to /dev/pts/ptmx (unlike other runtimes like LXC, which
switched to TIOCGPTPEER way back in 2017). Since it wasn't clear how we
could protect against the main attack TIOCGPTPEER was meant to protect
against, we never switched to it (even though I implemented it
specifically to harden container runtimes).
Unfortunately, It turns out that mount *sources* are a threat we didn't
fully consider. Since TIOCGPTPEER already solves this problem entirely
for us in a race free way, we should just use that. In a later patch, we
will add some hardening for /dev/pts/$num opening to maintain support
for very old kernels (Linux 4.13 is very old at this point, but RHEL 7
is still kicking and is stuck on Linux 3.10).
Fixes: GHSA-qw9x-cqr3-wc7r CVE-2025-52565
Reported-by: Lei Wang <ssst0n3@gmail.com> (CVE-2025-52565)
Reported-by: lfbzhm <lifubang@acmcoder.com> (CVE-2025-52565)
Reported-by: Aleksa Sarai <cyphar@cyphar.com> (TIOCGPTPEER)
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2025-05-15 16:12:21 +10:00
|
|
|
defer peerPty.Close()
|
2020-07-05 12:10:47 +08:00
|
|
|
|
2017-09-26 15:39:46 +02:00
|
|
|
if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
|
|
|
|
|
err = pty.Resize(console.WinSize{
|
|
|
|
|
Height: config.ConsoleHeight,
|
|
|
|
|
Width: config.ConsoleWidth,
|
|
|
|
|
})
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-06-04 01:29:34 +10:00
|
|
|
// Mount the console inside our rootfs.
|
|
|
|
|
if mount {
|
console: use TIOCGPTPEER when allocating peer PTY
When opening the peer end of a pty, the old kernel API required us to
open /dev/pts/$num inside the container (at least since we fixed console
handling many years ago in commit 244c9fc426ae ("*: console rewrite")).
The problem is that in a hostile container it is possible for
/dev/pts/$num to be an attacker-controlled symlink that runc can be
tricked into resolving when doing bind-mounts. This allows the attacker
to (among other things) persist /proc/... entries that are later masked
by runc, allowing an attacker to escape through the kernel.core_pattern
sysctl (/proc/sys/kernel/core_pattern). This is the original issue
reported by Lei Wang and Li Fu Bang in CVE-2025-52565.
However, it should be noted that this is not entirely a newly-discovered
problem. Way back in Linux 4.13 (2017), I added the TIOCGPTPEER ioctl,
which allows us to get a pty peer without touching the /dev/pts inside
the container. The original threat model was around an attacker
replacing /dev/pts/$n or /dev/pts/ptmx with some malicious inode (a DoS
inode, or possibly a PTY they wanted a confused deputy to operate on).
Unfortunately, there was no practical way for runc to cache a safe
O_PATH handle to /dev/pts/ptmx (unlike other runtimes like LXC, which
switched to TIOCGPTPEER way back in 2017). Since it wasn't clear how we
could protect against the main attack TIOCGPTPEER was meant to protect
against, we never switched to it (even though I implemented it
specifically to harden container runtimes).
Unfortunately, It turns out that mount *sources* are a threat we didn't
fully consider. Since TIOCGPTPEER already solves this problem entirely
for us in a race free way, we should just use that. In a later patch, we
will add some hardening for /dev/pts/$num opening to maintain support
for very old kernels (Linux 4.13 is very old at this point, but RHEL 7
is still kicking and is stuck on Linux 3.10).
Fixes: GHSA-qw9x-cqr3-wc7r CVE-2025-52565
Reported-by: Lei Wang <ssst0n3@gmail.com> (CVE-2025-52565)
Reported-by: lfbzhm <lifubang@acmcoder.com> (CVE-2025-52565)
Reported-by: Aleksa Sarai <cyphar@cyphar.com> (TIOCGPTPEER)
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2025-05-15 16:12:21 +10:00
|
|
|
if err := mountConsole(peerPty); err != nil {
|
2016-06-04 01:29:34 +10:00
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// While we can access console.master, using the API is a good idea.
|
2023-08-08 11:33:06 +10:00
|
|
|
if err := utils.SendRawFd(socket, pty.Name(), pty.Fd()); err != nil {
|
2016-06-04 01:29:34 +10:00
|
|
|
return err
|
|
|
|
|
}
|
2023-08-08 11:33:06 +10:00
|
|
|
runtime.KeepAlive(pty)
|
|
|
|
|
|
2016-06-04 01:29:34 +10:00
|
|
|
// Now, dup over all the things.
|
console: use TIOCGPTPEER when allocating peer PTY
When opening the peer end of a pty, the old kernel API required us to
open /dev/pts/$num inside the container (at least since we fixed console
handling many years ago in commit 244c9fc426ae ("*: console rewrite")).
The problem is that in a hostile container it is possible for
/dev/pts/$num to be an attacker-controlled symlink that runc can be
tricked into resolving when doing bind-mounts. This allows the attacker
to (among other things) persist /proc/... entries that are later masked
by runc, allowing an attacker to escape through the kernel.core_pattern
sysctl (/proc/sys/kernel/core_pattern). This is the original issue
reported by Lei Wang and Li Fu Bang in CVE-2025-52565.
However, it should be noted that this is not entirely a newly-discovered
problem. Way back in Linux 4.13 (2017), I added the TIOCGPTPEER ioctl,
which allows us to get a pty peer without touching the /dev/pts inside
the container. The original threat model was around an attacker
replacing /dev/pts/$n or /dev/pts/ptmx with some malicious inode (a DoS
inode, or possibly a PTY they wanted a confused deputy to operate on).
Unfortunately, there was no practical way for runc to cache a safe
O_PATH handle to /dev/pts/ptmx (unlike other runtimes like LXC, which
switched to TIOCGPTPEER way back in 2017). Since it wasn't clear how we
could protect against the main attack TIOCGPTPEER was meant to protect
against, we never switched to it (even though I implemented it
specifically to harden container runtimes).
Unfortunately, It turns out that mount *sources* are a threat we didn't
fully consider. Since TIOCGPTPEER already solves this problem entirely
for us in a race free way, we should just use that. In a later patch, we
will add some hardening for /dev/pts/$num opening to maintain support
for very old kernels (Linux 4.13 is very old at this point, but RHEL 7
is still kicking and is stuck on Linux 3.10).
Fixes: GHSA-qw9x-cqr3-wc7r CVE-2025-52565
Reported-by: Lei Wang <ssst0n3@gmail.com> (CVE-2025-52565)
Reported-by: lfbzhm <lifubang@acmcoder.com> (CVE-2025-52565)
Reported-by: Aleksa Sarai <cyphar@cyphar.com> (TIOCGPTPEER)
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2025-05-15 16:12:21 +10:00
|
|
|
return dupStdio(peerPty)
|
2016-06-04 01:29:34 +10:00
|
|
|
}
|
|
|
|
|
|
2015-12-17 20:16:34 +11:00
|
|
|
// syncParentReady sends to the given pipe a JSON payload which indicates that
|
|
|
|
|
// the init is ready to Exec the child process. It then waits for the parent to
|
|
|
|
|
// indicate that it is cleared to Exec.
|
2023-08-17 21:20:33 +10:00
|
|
|
func syncParentReady(pipe *syncSocket) error {
|
2015-12-17 20:16:34 +11:00
|
|
|
// Tell parent.
|
2016-06-06 20:26:35 +10:00
|
|
|
if err := writeSync(pipe, procReady); err != nil {
|
2015-12-17 20:16:34 +11:00
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
// Wait for parent to give the all-clear.
|
2018-10-13 21:14:03 +02:00
|
|
|
return readSync(pipe, procRun)
|
2015-12-17 20:16:34 +11:00
|
|
|
}
|
|
|
|
|
|
2016-02-17 02:20:06 -08:00
|
|
|
// syncParentHooks sends to the given pipe a JSON payload which indicates that
|
|
|
|
|
// the parent should execute pre-start hooks. It then waits for the parent to
|
|
|
|
|
// indicate that it is cleared to resume.
|
2023-08-17 21:20:33 +10:00
|
|
|
func syncParentHooks(pipe *syncSocket) error {
|
2016-02-17 02:20:06 -08:00
|
|
|
// Tell parent.
|
2016-06-06 20:26:35 +10:00
|
|
|
if err := writeSync(pipe, procHooks); err != nil {
|
2016-02-17 02:20:06 -08:00
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
// Wait for parent to give the all-clear.
|
2023-08-17 15:13:33 +10:00
|
|
|
return readSync(pipe, procHooksDone)
|
2016-02-17 02:20:06 -08:00
|
|
|
}
|
|
|
|
|
|
2023-08-08 11:33:06 +10:00
|
|
|
// syncParentSeccomp sends the fd associated with the seccomp file descriptor
|
|
|
|
|
// to the parent, and wait for the parent to do pidfd_getfd() to grab a copy.
|
2024-07-03 17:25:10 +09:00
|
|
|
func syncParentSeccomp(pipe *syncSocket, seccompFd int) error {
|
|
|
|
|
if seccompFd == -1 {
|
2020-09-09 14:22:48 +02:00
|
|
|
return nil
|
|
|
|
|
}
|
2024-07-03 17:25:10 +09:00
|
|
|
defer unix.Close(seccompFd)
|
2023-08-08 11:33:06 +10:00
|
|
|
|
|
|
|
|
// Tell parent to grab our fd.
|
|
|
|
|
//
|
|
|
|
|
// Notably, we do not use writeSyncFile here because a container might have
|
|
|
|
|
// an SCMP_ACT_NOTIFY action on sendmsg(2) so we need to use the smallest
|
|
|
|
|
// possible number of system calls here because all of those syscalls
|
|
|
|
|
// cannot be used with SCMP_ACT_NOTIFY as a result (any syscall we use here
|
|
|
|
|
// before the parent gets the file descriptor would deadlock "runc init" if
|
|
|
|
|
// we allowed it for SCMP_ACT_NOTIFY). See seccomp.InitSeccomp() for more
|
|
|
|
|
// details.
|
2024-07-03 17:25:10 +09:00
|
|
|
if err := writeSyncArg(pipe, procSeccomp, seccompFd); err != nil {
|
2020-09-09 14:22:48 +02:00
|
|
|
return err
|
|
|
|
|
}
|
2023-08-08 11:33:06 +10:00
|
|
|
// Wait for parent to tell us they've grabbed the seccompfd.
|
|
|
|
|
return readSync(pipe, procSeccompDone)
|
2020-09-09 14:22:48 +02:00
|
|
|
}
|
|
|
|
|
|
libct: switch to numeric UID/GID/groups
This addresses the following TODO in the code (added back in 2015
by commit 845fc65e5):
> // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
Historically, libcontainer internally uses strings for user, group, and
additional (aka supplementary) groups.
Yet, runc receives those credentials as part of runtime-spec's process,
which uses integers for all of them (see [1], [2]).
What happens next is:
1. runc start/run/exec converts those credentials to strings (a User
string containing "UID:GID", and a []string for additional GIDs) and
passes those onto runc init.
2. runc init converts them back to int, in the most complicated way
possible (parsing container's /etc/passwd and /etc/group).
All this conversion and, especially, parsing is totally unnecessary,
but is performed on every container exec (and start).
The only benefit of all this is, a libcontainer user could use user and
group names instead of numeric IDs (but runc itself is not using this
feature, and we don't know if there are any other users of this).
Let's remove this back and forth translation, hopefully increasing
runc exec performance.
The only remaining need to parse /etc/passwd is to set HOME environment
variable for a specified UID, in case $HOME is not explicitly set in
process.Env. This can now be done right in prepareEnv, which simplifies
the code flow a lot. Alas, we can not use standard os/user.LookupId, as
it could cache host's /etc/passwd or the current user (even with the
osusergo tag).
PS Note that the structures being changed (initConfig and Process) are
never saved to disk as JSON by runc, so there is no compatibility issue
for runc users.
Still, this is a breaking change in libcontainer, but we never promised
that libcontainer API will be stable (and there's a special package
that can handle it -- github.com/moby/sys/user). Reflect this in
CHANGELOG.
For 3998.
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user
[2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
|
|
|
// setupUser changes the groups, gid, and uid for the user inside the container.
|
|
|
|
|
func setupUser(config *initConfig) error {
|
2017-09-07 06:58:52 +10:00
|
|
|
// Before we change to the container's user make sure that the processes
|
|
|
|
|
// STDIO is correctly owned by the user that we are switching to.
|
libct: switch to numeric UID/GID/groups
This addresses the following TODO in the code (added back in 2015
by commit 845fc65e5):
> // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
Historically, libcontainer internally uses strings for user, group, and
additional (aka supplementary) groups.
Yet, runc receives those credentials as part of runtime-spec's process,
which uses integers for all of them (see [1], [2]).
What happens next is:
1. runc start/run/exec converts those credentials to strings (a User
string containing "UID:GID", and a []string for additional GIDs) and
passes those onto runc init.
2. runc init converts them back to int, in the most complicated way
possible (parsing container's /etc/passwd and /etc/group).
All this conversion and, especially, parsing is totally unnecessary,
but is performed on every container exec (and start).
The only benefit of all this is, a libcontainer user could use user and
group names instead of numeric IDs (but runc itself is not using this
feature, and we don't know if there are any other users of this).
Let's remove this back and forth translation, hopefully increasing
runc exec performance.
The only remaining need to parse /etc/passwd is to set HOME environment
variable for a specified UID, in case $HOME is not explicitly set in
process.Env. This can now be done right in prepareEnv, which simplifies
the code flow a lot. Alas, we can not use standard os/user.LookupId, as
it could cache host's /etc/passwd or the current user (even with the
osusergo tag).
PS Note that the structures being changed (initConfig and Process) are
never saved to disk as JSON by runc, so there is no compatibility issue
for runc users.
Still, this is a breaking change in libcontainer, but we never promised
that libcontainer API will be stable (and there's a special package
that can handle it -- github.com/moby/sys/user). Reflect this in
CHANGELOG.
For 3998.
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user
[2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
|
|
|
if err := fixStdioPermissions(config.UID); err != nil {
|
2015-09-22 11:12:55 -07:00
|
|
|
return err
|
2015-09-18 13:55:49 -07:00
|
|
|
}
|
2016-04-23 23:39:42 +10:00
|
|
|
|
tree-wide: use /proc/thread-self for thread-local state
With the idmap work, we will have a tainted Go thread in our
thread-group that has a different mount namespace to the other threads.
It seems that (due to some bad luck) the Go scheduler tends to make this
thread the thread-group leader in our tests, which results in very
baffling failures where /proc/self/mountinfo produces gibberish results.
In order to avoid this, switch to using /proc/thread-self for everything
that is thread-local. This primarily includes switching all file
descriptor paths (CLONE_FS), all of the places that check the current
cgroup (technically we never will run a single runc thread in a separate
cgroup, but better to be safe than sorry), and the aforementioned
mountinfo code. We don't need to do anything for the following because
the results we need aren't thread-local:
* Checks that certain namespaces are supported by stat(2)ing
/proc/self/ns/...
* /proc/self/exe and /proc/self/cmdline are not thread-local.
* While threads can be in different cgroups, we do not do this for the
runc binary (or libcontainer) and thus we do not need to switch to
the thread-local version of /proc/self/cgroups.
* All of the CLONE_NEWUSER files are not thread-local because you
cannot set the usernamespace of a single thread (setns(CLONE_NEWUSER)
is blocked for multi-threaded programs).
Note that we have to use runtime.LockOSThread when we have an open
handle to a tid-specific procfs file that we are operating on multiple
times. Go can reschedule us such that we are running on a different
thread and then kill the original thread (causing -ENOENT or similarly
confusing errors). This is not strictly necessary for most usages of
/proc/thread-self (such as using /proc/thread-self/fd/$n directly) since
only operating on the actual inodes associated with the tid requires
this locking, but because of the pre-3.17 fallback for CentOS, we have
to do this in most cases.
In addition, CentOS's kernel is too old for /proc/thread-self, which
requires us to emulate it -- however in rootfs_linux.go, we are in the
container pid namespace but /proc is the host's procfs. This leads to
the incredibly frustrating situation where there is no way (on pre-4.1
Linux) to figure out which /proc/self/task/... entry refers to the
current tid. We can just use /proc/self in this case.
Yes this is all pretty ugly. I also wish it wasn't necessary.
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2023-08-24 12:53:53 +10:00
|
|
|
// We don't need to use /proc/thread-self here because setgroups is a
|
|
|
|
|
// per-userns file and thus is global to all threads in a thread-group.
|
|
|
|
|
// This lets us avoid having to do runtime.LockOSThread.
|
2025-07-18 15:33:56 +10:00
|
|
|
var setgroups []byte
|
|
|
|
|
setgroupsFile, err := pathrs.ProcSelfOpen("setgroups", unix.O_RDONLY)
|
|
|
|
|
if err == nil {
|
|
|
|
|
setgroups, err = io.ReadAll(setgroupsFile)
|
|
|
|
|
_ = setgroupsFile.Close()
|
|
|
|
|
}
|
2025-11-17 00:28:05 +01:00
|
|
|
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
2018-10-25 15:39:35 +02:00
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-23 23:39:42 +10:00
|
|
|
// This isn't allowed in an unprivileged user namespace since Linux 3.19.
|
|
|
|
|
// There's nothing we can do about /etc/group entries, so we silently
|
|
|
|
|
// ignore setting groups here (since the user didn't explicitly ask us to
|
|
|
|
|
// set the group).
|
2025-01-15 23:28:08 -08:00
|
|
|
allowSupGroups := !config.Config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny"
|
2018-10-25 15:39:35 +02:00
|
|
|
|
|
|
|
|
if allowSupGroups {
|
libct: switch to numeric UID/GID/groups
This addresses the following TODO in the code (added back in 2015
by commit 845fc65e5):
> // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
Historically, libcontainer internally uses strings for user, group, and
additional (aka supplementary) groups.
Yet, runc receives those credentials as part of runtime-spec's process,
which uses integers for all of them (see [1], [2]).
What happens next is:
1. runc start/run/exec converts those credentials to strings (a User
string containing "UID:GID", and a []string for additional GIDs) and
passes those onto runc init.
2. runc init converts them back to int, in the most complicated way
possible (parsing container's /etc/passwd and /etc/group).
All this conversion and, especially, parsing is totally unnecessary,
but is performed on every container exec (and start).
The only benefit of all this is, a libcontainer user could use user and
group names instead of numeric IDs (but runc itself is not using this
feature, and we don't know if there are any other users of this).
Let's remove this back and forth translation, hopefully increasing
runc exec performance.
The only remaining need to parse /etc/passwd is to set HOME environment
variable for a specified UID, in case $HOME is not explicitly set in
process.Env. This can now be done right in prepareEnv, which simplifies
the code flow a lot. Alas, we can not use standard os/user.LookupId, as
it could cache host's /etc/passwd or the current user (even with the
osusergo tag).
PS Note that the structures being changed (initConfig and Process) are
never saved to disk as JSON by runc, so there is no compatibility issue
for runc users.
Still, this is a breaking change in libcontainer, but we never promised
that libcontainer API will be stable (and there's a special package
that can handle it -- github.com/moby/sys/user). Reflect this in
CHANGELOG.
For 3998.
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user
[2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
|
|
|
if err := unix.Setgroups(config.AdditionalGroups); err != nil {
|
2021-11-10 17:36:48 -08:00
|
|
|
return &os.SyscallError{Syscall: "setgroups", Err: err}
|
2016-04-23 23:39:42 +10:00
|
|
|
}
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
2015-04-30 19:02:31 -04:00
|
|
|
|
libct: switch to numeric UID/GID/groups
This addresses the following TODO in the code (added back in 2015
by commit 845fc65e5):
> // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
Historically, libcontainer internally uses strings for user, group, and
additional (aka supplementary) groups.
Yet, runc receives those credentials as part of runtime-spec's process,
which uses integers for all of them (see [1], [2]).
What happens next is:
1. runc start/run/exec converts those credentials to strings (a User
string containing "UID:GID", and a []string for additional GIDs) and
passes those onto runc init.
2. runc init converts them back to int, in the most complicated way
possible (parsing container's /etc/passwd and /etc/group).
All this conversion and, especially, parsing is totally unnecessary,
but is performed on every container exec (and start).
The only benefit of all this is, a libcontainer user could use user and
group names instead of numeric IDs (but runc itself is not using this
feature, and we don't know if there are any other users of this).
Let's remove this back and forth translation, hopefully increasing
runc exec performance.
The only remaining need to parse /etc/passwd is to set HOME environment
variable for a specified UID, in case $HOME is not explicitly set in
process.Env. This can now be done right in prepareEnv, which simplifies
the code flow a lot. Alas, we can not use standard os/user.LookupId, as
it could cache host's /etc/passwd or the current user (even with the
osusergo tag).
PS Note that the structures being changed (initConfig and Process) are
never saved to disk as JSON by runc, so there is no compatibility issue
for runc users.
Still, this is a breaking change in libcontainer, but we never promised
that libcontainer API will be stable (and there's a special package
that can handle it -- github.com/moby/sys/user). Reflect this in
CHANGELOG.
For 3998.
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user
[2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
|
|
|
if err := unix.Setgid(config.GID); err != nil {
|
2023-12-02 02:19:00 +11:00
|
|
|
if err == unix.EINVAL {
|
libct: switch to numeric UID/GID/groups
This addresses the following TODO in the code (added back in 2015
by commit 845fc65e5):
> // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
Historically, libcontainer internally uses strings for user, group, and
additional (aka supplementary) groups.
Yet, runc receives those credentials as part of runtime-spec's process,
which uses integers for all of them (see [1], [2]).
What happens next is:
1. runc start/run/exec converts those credentials to strings (a User
string containing "UID:GID", and a []string for additional GIDs) and
passes those onto runc init.
2. runc init converts them back to int, in the most complicated way
possible (parsing container's /etc/passwd and /etc/group).
All this conversion and, especially, parsing is totally unnecessary,
but is performed on every container exec (and start).
The only benefit of all this is, a libcontainer user could use user and
group names instead of numeric IDs (but runc itself is not using this
feature, and we don't know if there are any other users of this).
Let's remove this back and forth translation, hopefully increasing
runc exec performance.
The only remaining need to parse /etc/passwd is to set HOME environment
variable for a specified UID, in case $HOME is not explicitly set in
process.Env. This can now be done right in prepareEnv, which simplifies
the code flow a lot. Alas, we can not use standard os/user.LookupId, as
it could cache host's /etc/passwd or the current user (even with the
osusergo tag).
PS Note that the structures being changed (initConfig and Process) are
never saved to disk as JSON by runc, so there is no compatibility issue
for runc users.
Still, this is a breaking change in libcontainer, but we never promised
that libcontainer API will be stable (and there's a special package
that can handle it -- github.com/moby/sys/user). Reflect this in
CHANGELOG.
For 3998.
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user
[2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
|
|
|
return fmt.Errorf("cannot setgid to unmapped gid %d in user namespace", config.GID)
|
2023-12-02 02:19:00 +11:00
|
|
|
}
|
2015-02-09 13:11:57 -08:00
|
|
|
return err
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
libct: switch to numeric UID/GID/groups
This addresses the following TODO in the code (added back in 2015
by commit 845fc65e5):
> // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
Historically, libcontainer internally uses strings for user, group, and
additional (aka supplementary) groups.
Yet, runc receives those credentials as part of runtime-spec's process,
which uses integers for all of them (see [1], [2]).
What happens next is:
1. runc start/run/exec converts those credentials to strings (a User
string containing "UID:GID", and a []string for additional GIDs) and
passes those onto runc init.
2. runc init converts them back to int, in the most complicated way
possible (parsing container's /etc/passwd and /etc/group).
All this conversion and, especially, parsing is totally unnecessary,
but is performed on every container exec (and start).
The only benefit of all this is, a libcontainer user could use user and
group names instead of numeric IDs (but runc itself is not using this
feature, and we don't know if there are any other users of this).
Let's remove this back and forth translation, hopefully increasing
runc exec performance.
The only remaining need to parse /etc/passwd is to set HOME environment
variable for a specified UID, in case $HOME is not explicitly set in
process.Env. This can now be done right in prepareEnv, which simplifies
the code flow a lot. Alas, we can not use standard os/user.LookupId, as
it could cache host's /etc/passwd or the current user (even with the
osusergo tag).
PS Note that the structures being changed (initConfig and Process) are
never saved to disk as JSON by runc, so there is no compatibility issue
for runc users.
Still, this is a breaking change in libcontainer, but we never promised
that libcontainer API will be stable (and there's a special package
that can handle it -- github.com/moby/sys/user). Reflect this in
CHANGELOG.
For 3998.
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user
[2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
|
|
|
if err := unix.Setuid(config.UID); err != nil {
|
2023-12-02 02:19:00 +11:00
|
|
|
if err == unix.EINVAL {
|
libct: switch to numeric UID/GID/groups
This addresses the following TODO in the code (added back in 2015
by commit 845fc65e5):
> // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
Historically, libcontainer internally uses strings for user, group, and
additional (aka supplementary) groups.
Yet, runc receives those credentials as part of runtime-spec's process,
which uses integers for all of them (see [1], [2]).
What happens next is:
1. runc start/run/exec converts those credentials to strings (a User
string containing "UID:GID", and a []string for additional GIDs) and
passes those onto runc init.
2. runc init converts them back to int, in the most complicated way
possible (parsing container's /etc/passwd and /etc/group).
All this conversion and, especially, parsing is totally unnecessary,
but is performed on every container exec (and start).
The only benefit of all this is, a libcontainer user could use user and
group names instead of numeric IDs (but runc itself is not using this
feature, and we don't know if there are any other users of this).
Let's remove this back and forth translation, hopefully increasing
runc exec performance.
The only remaining need to parse /etc/passwd is to set HOME environment
variable for a specified UID, in case $HOME is not explicitly set in
process.Env. This can now be done right in prepareEnv, which simplifies
the code flow a lot. Alas, we can not use standard os/user.LookupId, as
it could cache host's /etc/passwd or the current user (even with the
osusergo tag).
PS Note that the structures being changed (initConfig and Process) are
never saved to disk as JSON by runc, so there is no compatibility issue
for runc users.
Still, this is a breaking change in libcontainer, but we never promised
that libcontainer API will be stable (and there's a special package
that can handle it -- github.com/moby/sys/user). Reflect this in
CHANGELOG.
For 3998.
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user
[2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
|
|
|
return fmt.Errorf("cannot setuid to unmapped uid %d in user namespace", config.UID)
|
2023-12-02 02:19:00 +11:00
|
|
|
}
|
2015-02-09 13:11:57 -08:00
|
|
|
return err
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
libct: switch to numeric UID/GID/groups
This addresses the following TODO in the code (added back in 2015
by commit 845fc65e5):
> // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
Historically, libcontainer internally uses strings for user, group, and
additional (aka supplementary) groups.
Yet, runc receives those credentials as part of runtime-spec's process,
which uses integers for all of them (see [1], [2]).
What happens next is:
1. runc start/run/exec converts those credentials to strings (a User
string containing "UID:GID", and a []string for additional GIDs) and
passes those onto runc init.
2. runc init converts them back to int, in the most complicated way
possible (parsing container's /etc/passwd and /etc/group).
All this conversion and, especially, parsing is totally unnecessary,
but is performed on every container exec (and start).
The only benefit of all this is, a libcontainer user could use user and
group names instead of numeric IDs (but runc itself is not using this
feature, and we don't know if there are any other users of this).
Let's remove this back and forth translation, hopefully increasing
runc exec performance.
The only remaining need to parse /etc/passwd is to set HOME environment
variable for a specified UID, in case $HOME is not explicitly set in
process.Env. This can now be done right in prepareEnv, which simplifies
the code flow a lot. Alas, we can not use standard os/user.LookupId, as
it could cache host's /etc/passwd or the current user (even with the
osusergo tag).
PS Note that the structures being changed (initConfig and Process) are
never saved to disk as JSON by runc, so there is no compatibility issue
for runc users.
Still, this is a breaking change in libcontainer, but we never promised
that libcontainer API will be stable (and there's a special package
that can handle it -- github.com/moby/sys/user). Reflect this in
CHANGELOG.
For 3998.
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user
[2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
|
|
|
// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified uid.
|
2015-09-22 11:12:55 -07:00
|
|
|
// The ownership needs to match because it is created outside of the container and needs to be
|
|
|
|
|
// localized.
|
libct: switch to numeric UID/GID/groups
This addresses the following TODO in the code (added back in 2015
by commit 845fc65e5):
> // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
Historically, libcontainer internally uses strings for user, group, and
additional (aka supplementary) groups.
Yet, runc receives those credentials as part of runtime-spec's process,
which uses integers for all of them (see [1], [2]).
What happens next is:
1. runc start/run/exec converts those credentials to strings (a User
string containing "UID:GID", and a []string for additional GIDs) and
passes those onto runc init.
2. runc init converts them back to int, in the most complicated way
possible (parsing container's /etc/passwd and /etc/group).
All this conversion and, especially, parsing is totally unnecessary,
but is performed on every container exec (and start).
The only benefit of all this is, a libcontainer user could use user and
group names instead of numeric IDs (but runc itself is not using this
feature, and we don't know if there are any other users of this).
Let's remove this back and forth translation, hopefully increasing
runc exec performance.
The only remaining need to parse /etc/passwd is to set HOME environment
variable for a specified UID, in case $HOME is not explicitly set in
process.Env. This can now be done right in prepareEnv, which simplifies
the code flow a lot. Alas, we can not use standard os/user.LookupId, as
it could cache host's /etc/passwd or the current user (even with the
osusergo tag).
PS Note that the structures being changed (initConfig and Process) are
never saved to disk as JSON by runc, so there is no compatibility issue
for runc users.
Still, this is a breaking change in libcontainer, but we never promised
that libcontainer API will be stable (and there's a special package
that can handle it -- github.com/moby/sys/user). Reflect this in
CHANGELOG.
For 3998.
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user
[2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-08-28 20:05:56 -07:00
|
|
|
func fixStdioPermissions(uid int) error {
|
2022-01-21 17:35:52 -08:00
|
|
|
for _, file := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
|
2017-05-09 17:38:27 -04:00
|
|
|
var s unix.Stat_t
|
2022-01-21 17:35:52 -08:00
|
|
|
if err := unix.Fstat(int(file.Fd()), &s); err != nil {
|
|
|
|
|
return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
|
2015-09-22 11:12:55 -07:00
|
|
|
}
|
2017-03-18 04:33:14 +11:00
|
|
|
|
2025-03-06 08:19:45 -08:00
|
|
|
// Skip chown if:
|
|
|
|
|
// - uid is already the one we want, or
|
|
|
|
|
// - fd is opened to /dev/null.
|
|
|
|
|
if int(s.Uid) == uid || isDevNull(&s) {
|
2015-09-22 11:12:55 -07:00
|
|
|
continue
|
|
|
|
|
}
|
2017-03-18 04:33:14 +11:00
|
|
|
|
2022-01-21 17:43:09 -08:00
|
|
|
// We only change the uid (as it is possible for the mount to
|
2016-09-14 19:02:53 +10:00
|
|
|
// prefer a different gid, and there's no reason for us to change it).
|
|
|
|
|
// The reason why we don't just leave the default uid=X mount setup is
|
|
|
|
|
// that users expect to be able to actually use their console. Without
|
|
|
|
|
// this code, you couldn't effectively run as a non-root user inside a
|
|
|
|
|
// container and also have a console set up.
|
2025-03-06 15:02:50 +01:00
|
|
|
if err := file.Chown(uid, -1); err != nil {
|
|
|
|
|
// If we've hit an EPERM then the inode's current owner
|
2017-10-25 00:10:35 +11:00
|
|
|
// is not mapped in our user namespace (in particular,
|
2022-01-21 17:53:03 -08:00
|
|
|
// privileged_wrt_inode_uidgid() has failed). Read-only
|
|
|
|
|
// /dev can result in EROFS error. In any case, it's
|
|
|
|
|
// better for us to just not touch the stdio rather
|
|
|
|
|
// than bail at this point.
|
2025-03-06 15:02:50 +01:00
|
|
|
// EINVAL should never happen, as it would mean the uid
|
|
|
|
|
// is not mapped, we expect this function to be called
|
|
|
|
|
// with a mapped uid.
|
|
|
|
|
if errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
|
2017-10-25 00:10:35 +11:00
|
|
|
continue
|
|
|
|
|
}
|
2022-01-21 17:35:52 -08:00
|
|
|
return err
|
2015-09-22 11:12:55 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2015-02-10 11:51:45 -08:00
|
|
|
// setupNetwork sets up and initializes any network interface inside the container.
|
|
|
|
|
func setupNetwork(config *initConfig) error {
|
2015-02-06 12:48:57 -08:00
|
|
|
for _, config := range config.Networks {
|
2015-02-09 15:16:27 -08:00
|
|
|
strategy, err := getStrategy(config.Type)
|
2015-02-06 12:48:57 -08:00
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
2015-02-10 11:51:45 -08:00
|
|
|
if err := strategy.initialize(config); err != nil {
|
|
|
|
|
return err
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func setupRoute(config *configs.Config) error {
|
|
|
|
|
for _, config := range config.Routes {
|
2015-06-26 11:38:23 -07:00
|
|
|
_, dst, err := net.ParseCIDR(config.Destination)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
src := net.ParseIP(config.Source)
|
|
|
|
|
if src == nil {
|
|
|
|
|
return fmt.Errorf("Invalid source for route: %s", config.Source)
|
|
|
|
|
}
|
|
|
|
|
gw := net.ParseIP(config.Gateway)
|
|
|
|
|
if gw == nil {
|
|
|
|
|
return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
|
|
|
|
|
}
|
|
|
|
|
l, err := netlink.LinkByName(config.InterfaceName)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
route := &netlink.Route{
|
|
|
|
|
Scope: netlink.SCOPE_UNIVERSE,
|
|
|
|
|
Dst: dst,
|
|
|
|
|
Src: src,
|
|
|
|
|
Gw: gw,
|
|
|
|
|
LinkIndex: l.Attrs().Index,
|
|
|
|
|
}
|
|
|
|
|
if err := netlink.RouteAdd(route); err != nil {
|
2015-02-06 12:48:57 -08:00
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2024-05-23 16:29:58 -07:00
|
|
|
func maybeClearRlimitNofileCache(limits []configs.Rlimit) {
|
2024-03-29 18:12:08 +08:00
|
|
|
for _, rlimit := range limits {
|
2024-05-23 16:29:58 -07:00
|
|
|
if rlimit.Type == syscall.RLIMIT_NOFILE {
|
|
|
|
|
system.ClearRlimitNofileCache(&syscall.Rlimit{
|
|
|
|
|
Cur: rlimit.Soft,
|
|
|
|
|
Max: rlimit.Hard,
|
|
|
|
|
})
|
|
|
|
|
return
|
2024-03-29 18:12:08 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-25 15:03:30 +00:00
|
|
|
func setupRlimits(limits []configs.Rlimit, pid int) error {
|
2016-03-10 14:35:16 -08:00
|
|
|
for _, rlimit := range limits {
|
2021-11-11 19:57:56 -08:00
|
|
|
if err := unix.Prlimit(pid, rlimit.Type, &unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}, nil); err != nil {
|
2021-06-08 20:05:54 -07:00
|
|
|
return fmt.Errorf("error setting rlimit type %v: %w", rlimit.Type, err)
|
2015-02-06 12:48:57 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
2015-02-11 16:45:23 -08:00
|
|
|
|
2025-01-08 14:11:02 -08:00
|
|
|
func setupScheduler(config *initConfig) error {
|
2024-06-26 16:14:44 -07:00
|
|
|
if config.Scheduler == nil {
|
|
|
|
|
return nil
|
|
|
|
|
}
|
2023-08-04 02:04:56 +00:00
|
|
|
attr, err := configs.ToSchedAttr(config.Scheduler)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
if err := unix.SchedSetAttr(0, attr, 0); err != nil {
|
2025-01-08 14:11:02 -08:00
|
|
|
if errors.Is(err, unix.EPERM) && config.Config.Cgroups.CpusetCpus != "" {
|
2023-08-04 02:04:56 +00:00
|
|
|
return errors.New("process scheduler can't be used together with AllowedCPUs")
|
|
|
|
|
}
|
|
|
|
|
return fmt.Errorf("error setting scheduler: %w", err)
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-07 19:48:58 -08:00
|
|
|
func setupIOPriority(config *initConfig) error {
|
libct: unify IOPriority setting
For some reason, io priority is set in different places between runc
start/run and runc exec:
- for runc start/run, it is done in the middle of (*linuxStandardInit).Init,
close to the place where we exec runc init.
- for runc exec, it is done much earlier, in (*setnsProcess) start().
Let's move setIOPriority call for runc exec to (*linuxSetnsInit).Init,
so it is in the same logical place as for runc start/run.
Also, move the function itself to init_linux.go as it's part of init.
Should not have any visible effect, except part of runc init is run with
a different I/O priority.
While at it, rename setIOPriority to setupIOPriority, and make it accept
the whole *configs.Config, for uniformity with other similar functions.
Fixes: bfbd0305 ("Add I/O priority")
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2024-06-26 17:22:02 -07:00
|
|
|
const ioprioWhoPgrp = 1
|
|
|
|
|
|
|
|
|
|
ioprio := config.IOPriority
|
|
|
|
|
if ioprio == nil {
|
|
|
|
|
return nil
|
|
|
|
|
}
|
2024-10-21 15:00:11 -07:00
|
|
|
class := 0
|
|
|
|
|
switch ioprio.Class {
|
|
|
|
|
case specs.IOPRIO_CLASS_RT:
|
|
|
|
|
class = 1
|
|
|
|
|
case specs.IOPRIO_CLASS_BE:
|
|
|
|
|
class = 2
|
|
|
|
|
case specs.IOPRIO_CLASS_IDLE:
|
|
|
|
|
class = 3
|
|
|
|
|
default:
|
libct: unify IOPriority setting
For some reason, io priority is set in different places between runc
start/run and runc exec:
- for runc start/run, it is done in the middle of (*linuxStandardInit).Init,
close to the place where we exec runc init.
- for runc exec, it is done much earlier, in (*setnsProcess) start().
Let's move setIOPriority call for runc exec to (*linuxSetnsInit).Init,
so it is in the same logical place as for runc start/run.
Also, move the function itself to init_linux.go as it's part of init.
Should not have any visible effect, except part of runc init is run with
a different I/O priority.
While at it, rename setIOPriority to setupIOPriority, and make it accept
the whole *configs.Config, for uniformity with other similar functions.
Fixes: bfbd0305 ("Add I/O priority")
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2024-06-26 17:22:02 -07:00
|
|
|
return fmt.Errorf("invalid io priority class: %s", ioprio.Class)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Combine class and priority into a single value
|
|
|
|
|
// https://github.com/torvalds/linux/blob/v5.18/include/uapi/linux/ioprio.h#L5-L17
|
|
|
|
|
iop := (class << 13) | ioprio.Priority
|
|
|
|
|
_, _, errno := unix.RawSyscall(unix.SYS_IOPRIO_SET, ioprioWhoPgrp, 0, uintptr(iop))
|
|
|
|
|
if errno != 0 {
|
|
|
|
|
return fmt.Errorf("failed to set io priority: %w", errno)
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2025-03-27 15:42:24 +02:00
|
|
|
func setupMemoryPolicy(config *configs.Config) error {
|
|
|
|
|
mpol := config.MemoryPolicy
|
|
|
|
|
if mpol == nil {
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
return linux.SetMempolicy(mpol.Mode|mpol.Flags, config.MemoryPolicy.Nodes)
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-20 20:25:10 +08:00
|
|
|
func setupPersonality(config *configs.Config) error {
|
|
|
|
|
return system.SetLinuxPersonality(config.Personality.Domain)
|
|
|
|
|
}
|
|
|
|
|
|
2016-11-07 15:22:27 -08:00
|
|
|
// signalAllProcesses freezes then iterates over all the processes inside the
|
2017-01-21 18:02:01 +00:00
|
|
|
// manager's cgroups sending the signal s to them.
|
libct: signalAllProcesses: remove child reaping
There are two very distinct usage scenarios for signalAllProcesses:
* when used from the runc binary ("runc kill" command), the processes
that it kills are not the children of "runc kill", and so calling
wait(2) on each process is totally useless, as it will return ECHLD;
* when used from a program that have created the container (such as
libcontainer/integration test suite), that program can and should call
wait(2), not the signalling code.
So, the child reaping code is totally useless in the first case, and
should be implemented by the program using libcontainer in the second
case. I was not able to track down how this code was added, my best
guess is it happened when this code was part of dockerd, which did not
have a proper child reaper implemented at that time.
Remove it, and add a proper documentation piece.
Change the integration test accordingly.
PS the first attempt to disable the child reaping code in
signalAllProcesses was made in commit bb912eb00c51f, which used a
questionable heuristic to figure out whether wait(2) should be called.
This heuristic worked for a particular use case, but is not correct in
general.
While at it:
- simplify signalAllProcesses to use unix.Kill;
- document (container).Signal.
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-05-12 11:04:21 -07:00
|
|
|
func signalAllProcesses(m cgroups.Manager, s unix.Signal) error {
|
2023-10-30 18:09:07 -07:00
|
|
|
if !m.Exists() {
|
2024-09-23 20:59:57 +08:00
|
|
|
return ErrCgroupNotExist
|
2023-10-30 18:09:07 -07:00
|
|
|
}
|
2023-05-12 12:57:50 -07:00
|
|
|
// Use cgroup.kill, if available.
|
|
|
|
|
if s == unix.SIGKILL {
|
|
|
|
|
if p := m.Path(""); p != "" { // Either cgroup v2 or hybrid.
|
|
|
|
|
err := cgroups.WriteFile(p, "cgroup.kill", "1")
|
|
|
|
|
if err == nil || !errors.Is(err, os.ErrNotExist) {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
// Fallback to old implementation.
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-10-23 16:47:01 -07:00
|
|
|
if err := m.Freeze(cgroups.Frozen); err != nil {
|
2015-05-06 21:14:04 +08:00
|
|
|
logrus.Warn(err)
|
2015-02-11 16:45:23 -08:00
|
|
|
}
|
2016-01-08 19:37:18 +00:00
|
|
|
pids, err := m.GetAllPids()
|
2015-02-11 16:45:23 -08:00
|
|
|
if err != nil {
|
2024-10-23 16:47:01 -07:00
|
|
|
if err := m.Freeze(cgroups.Thawed); err != nil {
|
2021-01-07 11:45:11 +01:00
|
|
|
logrus.Warn(err)
|
|
|
|
|
}
|
2015-02-11 16:45:23 -08:00
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
for _, pid := range pids {
|
libct: signalAllProcesses: remove child reaping
There are two very distinct usage scenarios for signalAllProcesses:
* when used from the runc binary ("runc kill" command), the processes
that it kills are not the children of "runc kill", and so calling
wait(2) on each process is totally useless, as it will return ECHLD;
* when used from a program that have created the container (such as
libcontainer/integration test suite), that program can and should call
wait(2), not the signalling code.
So, the child reaping code is totally useless in the first case, and
should be implemented by the program using libcontainer in the second
case. I was not able to track down how this code was added, my best
guess is it happened when this code was part of dockerd, which did not
have a proper child reaper implemented at that time.
Remove it, and add a proper documentation piece.
Change the integration test accordingly.
PS the first attempt to disable the child reaping code in
signalAllProcesses was made in commit bb912eb00c51f, which used a
questionable heuristic to figure out whether wait(2) should be called.
This heuristic worked for a particular use case, but is not correct in
general.
While at it:
- simplify signalAllProcesses to use unix.Kill;
- document (container).Signal.
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-05-12 11:04:21 -07:00
|
|
|
err := unix.Kill(pid, s)
|
2023-08-24 17:27:14 -07:00
|
|
|
if err != nil && err != unix.ESRCH {
|
libct: signalAllProcesses: remove child reaping
There are two very distinct usage scenarios for signalAllProcesses:
* when used from the runc binary ("runc kill" command), the processes
that it kills are not the children of "runc kill", and so calling
wait(2) on each process is totally useless, as it will return ECHLD;
* when used from a program that have created the container (such as
libcontainer/integration test suite), that program can and should call
wait(2), not the signalling code.
So, the child reaping code is totally useless in the first case, and
should be implemented by the program using libcontainer in the second
case. I was not able to track down how this code was added, my best
guess is it happened when this code was part of dockerd, which did not
have a proper child reaper implemented at that time.
Remove it, and add a proper documentation piece.
Change the integration test accordingly.
PS the first attempt to disable the child reaping code in
signalAllProcesses was made in commit bb912eb00c51f, which used a
questionable heuristic to figure out whether wait(2) should be called.
This heuristic worked for a particular use case, but is not correct in
general.
While at it:
- simplify signalAllProcesses to use unix.Kill;
- document (container).Signal.
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2023-05-12 11:04:21 -07:00
|
|
|
logrus.Warnf("kill %d: %v", pid, err)
|
2015-02-11 16:45:23 -08:00
|
|
|
}
|
|
|
|
|
}
|
2024-10-23 16:47:01 -07:00
|
|
|
if err := m.Freeze(cgroups.Thawed); err != nil {
|
2015-05-06 21:14:04 +08:00
|
|
|
logrus.Warn(err)
|
2015-02-11 16:45:23 -08:00
|
|
|
}
|
2017-01-21 18:02:01 +00:00
|
|
|
|
2015-02-11 16:45:23 -08:00
|
|
|
return nil
|
|
|
|
|
}
|
2023-10-01 11:09:49 +08:00
|
|
|
|
|
|
|
|
// setupPidfd opens a process file descriptor of init process, and sends the
|
|
|
|
|
// file descriptor back to the socket.
|
|
|
|
|
func setupPidfd(socket *os.File, initType string) error {
|
|
|
|
|
defer socket.Close()
|
|
|
|
|
|
|
|
|
|
pidFd, err := unix.PidfdOpen(os.Getpid(), 0)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return fmt.Errorf("failed to pidfd_open: %w", err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if err := utils.SendRawFd(socket, initType, uintptr(pidFd)); err != nil {
|
|
|
|
|
unix.Close(pidFd)
|
|
|
|
|
return fmt.Errorf("failed to send pidfd on socket: %w", err)
|
|
|
|
|
}
|
|
|
|
|
return unix.Close(pidFd)
|
|
|
|
|
}
|