1
0
mirror of https://github.com/opencontainers/runc.git synced 2026-02-06 03:45:41 +01:00
Files

97 lines
3.0 KiB
Go
Raw Permalink Normal View History

runc-dmz: reduce memfd binary cloning cost with small C binary The idea is to remove the need for cloning the entire runc binary by replacing the final execve() call of the container process with an execve() call to a clone of a small C binary which just does an execve() of its arguments. This provides similar protection against CVE-2019-5736 but without requiring a >10MB binary copy for each "runc init". When compiled with musl, runc-dmz is 13kB (though unfortunately with glibc, it is 1.1MB which is still quite large). It should be noted that there is still a window where the container processes could get access to the host runc binary, but because we set ourselves as non-dumpable the container would need CAP_SYS_PTRACE (which is not enabled by default in Docker) in order to get around the proc_fd_access_allowed() checks. In addition, since Linux 4.10[1] the kernel blocks access entirely for user namespaced containers in this scenario. For those cases we cannot use runc-dmz, but most containers won't have this issue. This new runc-dmz binary can be opted out of at compile time by setting the "runc_nodmz" buildtag, and at runtime by setting the RUNC_DMZ=legacy environment variable. In both cases, runc will fall back to the classic /proc/self/exe-based cloning trick. If /proc/self/exe is already a sealed memfd (namely if the user is using contrib/cmd/memfd-bind to create a persistent sealed memfd for runc), neither runc-dmz nor /proc/self/exe cloning will be used because they are not necessary. 
[1]: https://github.com/torvalds/linux/commit/bfedb589252c01fa505ac9f6f2a3d5d68d707ef4 Co-authored-by: lifubang <lifubang@acmcoder.com> Signed-off-by: lifubang <lifubang@acmcoder.com> [cyphar: address various review nits] [cyphar: fix runc-dmz cross-compilation] [cyphar: embed runc-dmz into runc binary and clone in Go code] [cyphar: make runc-dmz optional, with fallback to /proc/self/exe cloning] [cyphar: do not use runc-dmz when the container has certain privs] Co-authored-by: Aleksa Sarai <cyphar@cyphar.com> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2023-08-15 17:00:22 +08:00
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
File copied and customized based on
https://github.com/moby/moby/tree/v20.10.14/profiles/seccomp/kernel_linux.go
File copied from
https://github.com/containerd/containerd/blob/v1.7.5/contrib/seccomp/kernelversion/kernel_linux.go
*/
// Package kernelversion provides a method to check whether the running kernel
// version is at least a minimum kernel version.
runc-dmz: reduce memfd binary cloning cost with small C binary The idea is to remove the need for cloning the entire runc binary by replacing the final execve() call of the container process with an execve() call to a clone of a small C binary which just does an execve() of its arguments. This provides similar protection against CVE-2019-5736 but without requiring a >10MB binary copy for each "runc init". When compiled with musl, runc-dmz is 13kB (though unfortunately with glibc, it is 1.1MB which is still quite large). It should be noted that there is still a window where the container processes could get access to the host runc binary, but because we set ourselves as non-dumpable the container would need CAP_SYS_PTRACE (which is not enabled by default in Docker) in order to get around the proc_fd_access_allowed() checks. In addition, since Linux 4.10[1] the kernel blocks access entirely for user namespaced containers in this scenario. For those cases we cannot use runc-dmz, but most containers won't have this issue. This new runc-dmz binary can be opted out of at compile time by setting the "runc_nodmz" buildtag, and at runtime by setting the RUNC_DMZ=legacy environment variable. In both cases, runc will fall back to the classic /proc/self/exe-based cloning trick. If /proc/self/exe is already a sealed memfd (namely if the user is using contrib/cmd/memfd-bind to create a persistent sealed memfd for runc), neither runc-dmz nor /proc/self/exe cloning will be used because they are not necessary. 
[1]: https://github.com/torvalds/linux/commit/bfedb589252c01fa505ac9f6f2a3d5d68d707ef4 Co-authored-by: lifubang <lifubang@acmcoder.com> Signed-off-by: lifubang <lifubang@acmcoder.com> [cyphar: address various review nits] [cyphar: fix runc-dmz cross-compilation] [cyphar: embed runc-dmz into runc binary and clone in Go code] [cyphar: make runc-dmz optional, with fallback to /proc/self/exe cloning] [cyphar: do not use runc-dmz when the container has certain privs] Co-authored-by: Aleksa Sarai <cyphar@cyphar.com> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2023-08-15 17:00:22 +08:00
package kernelversion
import (
"bytes"
"fmt"
"sync"
"golang.org/x/sys/unix"
)
// KernelVersion describes a kernel release using only its first two numeric
// components; anything after them (patch level, distro suffix) is not stored.
type KernelVersion struct {
	// Kernel is the leading version number, e.g. the "4" in "4.1.2-generic".
	Kernel uint64
	// Major is the major revision, e.g. the "1" in "4.1.2-generic".
	Major uint64
}

// String renders the version as "<Kernel>.<Major>". The zero value (both
// components equal to 0) is rendered as the empty string.
func (k *KernelVersion) String() string {
	if k.Kernel == 0 && k.Major == 0 {
		return ""
	}
	return fmt.Sprintf("%d.%d", k.Kernel, k.Major)
}
// Package-level cache for the host kernel version lookup. getKernelVersion
// assigns these inside once.Do and returns the two cached values on every call.
var (
// currentKernelVersion holds the *KernelVersion produced by parseRelease.
currentKernelVersion *KernelVersion
// kernelVersionError holds the error produced by parseRelease, if any.
kernelVersionError error
// once guards the lookup in getKernelVersion so it runs a single time.
once sync.Once
)
// getKernelVersion returns the kernel version of the running host.
//
// The uname lookup and the parsing of its release string are performed at
// most once; the outcome (version or error) is stored in the package-level
// cache variables and handed back on every subsequent call. On failure the
// returned *KernelVersion is nil and the error is non-nil.
func getKernelVersion() (*KernelVersion, error) {
	once.Do(func() {
		var uts unix.Utsname
		if err := unix.Uname(&uts); err != nil {
			// Record the failure. Returning without setting the error would
			// make callers receive (nil, nil) and dereference a nil version.
			kernelVersionError = fmt.Errorf("failed to get kernel release: %w", err)
			return
		}
		// Release is a fixed-size byte array; keep only the bytes before the
		// first NUL so the parser sees a clean string. If no NUL is present,
		// use the whole array instead of slicing with a negative index.
		release := uts.Release[:]
		if end := bytes.IndexByte(release, 0); end >= 0 {
			release = release[:end]
		}
		currentKernelVersion, kernelVersionError = parseRelease(string(release))
	})
	return currentKernelVersion, kernelVersionError
}
// parseRelease extracts the leading "<kernel>.<major>" pair from a kernel
// release string and returns it as a KernelVersion. It returns a nil version
// and a wrapped error when the string does not start with two dot-separated
// decimal numbers.
func parseRelease(release string) (*KernelVersion, error) {
	kv := new(KernelVersion)
	// Only the "kernel" and "major revision" components are read: releases
	// come in shapes such as 3.12.25-gentoo as well as 3.12-1-amd64, so
	// whatever follows the second number is deliberately left unparsed.
	if _, err := fmt.Sscanf(release, "%d.%d", &kv.Kernel, &kv.Major); err != nil {
		return nil, fmt.Errorf("failed to parse kernel version %q: %w", release, err)
	}
	return kv, nil
}
// GreaterEqualThan reports whether the host's kernel version is at least
// minVersion. Only the "kernel version" and "major revision" components
// (e.g., "3.12") take part in the comparison, so 3.12.25-gentoo and
// 3.12-1-amd64 compare as equal (kernel: 3, major: 12).
//
// If the host kernel version cannot be determined, it returns false together
// with the error from getKernelVersion.
func GreaterEqualThan(minVersion KernelVersion) (bool, error) {
	host, err := getKernelVersion()
	if err != nil {
		return false, err
	}
	// A difference in the kernel component decides the result on its own;
	// the major revision only matters when the kernel components are equal.
	if host.Kernel != minVersion.Kernel {
		return host.Kernel > minVersion.Kernel, nil
	}
	return host.Major >= minVersion.Major, nil
}