From 9f8d6880a8d339960c4e255f49ce0f902ecf98bc Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Mon, 15 Feb 2016 15:12:57 +0100 Subject: [PATCH] Initial version --- Makefile.am | 14 + autogen.sh | 19 + build-root.c | 1566 ++++++++++++++++++++++++++++++++++++++++++++++++++ configure.ac | 34 ++ 4 files changed, 1633 insertions(+) create mode 100644 Makefile.am create mode 100755 autogen.sh create mode 100644 build-root.c create mode 100644 configure.ac diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..990297c --- /dev/null +++ b/Makefile.am @@ -0,0 +1,14 @@ +NULL = + +ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS} +AM_CPPFLAGS = -DDATADIR='"$(datadir)"' -DLIBEXECDIR='"$(libexecdir)"' +AM_CFLAGS = $(WARN_CFLAGS) + +bin_PROGRAMS = build-root + +build_root_SOURCES = \ + build-root.c \ + $(NULL) + +build_root_CFLAGS = $(AM_CFLAGS) +build_root_LDFLAGS = diff --git a/autogen.sh b/autogen.sh new file mode 100755 index 0000000..4674ca8 --- /dev/null +++ b/autogen.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +test -n "$srcdir" || srcdir=`dirname "$0"` +test -n "$srcdir" || srcdir=. + +olddir=`pwd` +cd $srcdir + +if ! (autoreconf --version >/dev/null 2>&1); then + echo "*** No autoreconf found, please install it ***" + exit 1 +fi + +mkdir -p m4 + +autoreconf --force --install --verbose + +cd $olddir +test -n "$NOCONFIGURE" || "$srcdir/configure" "$@" diff --git a/build-root.c b/build-root.c new file mode 100644 index 0000000..02903d7 --- /dev/null +++ b/build-root.c @@ -0,0 +1,1566 @@ +/* build-root + * Copyright (C) 2016 Alexander Larsson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define __debug__(x) printf x +#else +#define __debug__(x) +#endif + +#define N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0])) + +#define TRUE 1 +#define FALSE 0 +typedef int bool; + +#define READ_END 0 +#define WRITE_END 1 + +/* Globals to avoid having to use getuid(), since the uid/gid changes during runtime */ +static uid_t uid; +static gid_t gid; +static bool is_privileged; + +static inline void +cleanup_fdp (int *fdp) +{ + int fd; + + assert (fdp); + + fd = *fdp; + if (fd != -1) + (void) close (fd); +} + +#define cleanup_fd __attribute__((cleanup(cleanup_fdp))) + +static void +die_with_error (const char *format, ...) +{ + va_list args; + int errsv; + + errsv = errno; + + va_start (args, format); + vfprintf (stderr, format, args); + va_end (args); + + fprintf (stderr, ": %s\n", strerror (errsv)); + + exit (1); +} + +static void +die (const char *format, ...) +{ + va_list args; + + va_start (args, format); + vfprintf (stderr, format, args); + va_end (args); + + fprintf (stderr, "\n"); + + exit (1); +} + +static void +die_oom (void) +{ + die ("Out of memory"); +} + +static void * +xmalloc (size_t size) +{ + void *res = malloc (size); + if (res == NULL) + die_oom (); + return res; +} + +static void * +xrealloc (void *ptr, size_t size) +{ + void *res = realloc (ptr, size); + if (size != 0 && res == NULL) + die_oom (); + return res; +} + +static char * +xstrdup (const char *str) +{ + char *res; + + assert (str != NULL); + + res = strdup (str); + if (res == NULL) + die_oom (); + + return res; +} + +static void +xsetenv (const char *name, const char *value, int overwrite) +{ + if (setenv (name, value, overwrite)) + die ("setenv failed"); +} + +static void +xunsetenv (const char *name) +{ + if (unsetenv(name)) + die ("unsetenv failed"); +} + +static char * +strconcat (const char *s1, + const char *s2) +{ + size_t len = 0; + char *res; + + if (s1) + len += strlen (s1); + if (s2) + len += strlen (s2); + + res = xmalloc (len + 1); + *res = 0; + if (s1) + strcat (res, s1); + if (s2) + strcat (res, s2); + + return res; +} + +static char * +strconcat3 (const char *s1, + const char *s2, + const char *s3) +{ + size_t len = 0; + char *res; + + if (s1) + len += strlen (s1); + if (s2) + len += strlen (s2); + if (s3) + len += strlen (s3); + + res = xmalloc (len + 1); + *res = 0; + if (s1) + strcat (res, s1); + if (s2) + strcat (res, s2); + if (s3) + strcat (res, s3); + + return res; +} + +static char* +strdup_printf (const char *format, + ...) +{ + char *buffer = NULL; + va_list args; + + va_start (args, format); + vasprintf (&buffer, format, args); + va_end (args); + + if (buffer == NULL) + die_oom (); + + return buffer; +} + +static const char * +get_relative_path (const char *path) +{ + while (*path == '/') + path++; + return path; +} + +static int +fdwalk (int (*cb)(void *data, int fd), void *data) +{ + int open_max; + int fd; + int res = 0; + DIR *d; + + if ((d = opendir ("/proc/self/fd"))) + { + struct dirent *de; + + while ((de = readdir (d))) + { + long l; + char *e = NULL; + + if (de->d_name[0] == '.') + continue; + + errno = 0; + l = strtol (de->d_name, &e, 10); + if (errno != 0 || !e || *e) + continue; + + fd = (int) l; + + if ((long) fd != l) + continue; + + if (fd == dirfd (d)) + continue; + + if ((res = cb (data, fd)) != 0) + break; + } + + closedir (d); + return res; + } + + open_max = sysconf (_SC_OPEN_MAX); + + for (fd = 0; fd < open_max; fd++) + if ((res = cb (data, fd)) != 0) + break; + + return res; +} + +static inline int raw_clone(unsigned long flags, void *child_stack) { +#if defined(__s390__) || defined(__CRIS__) + /* On s390 and cris the order of the first and second arguments + * of the raw clone() system call is reversed. */ + return (int) syscall(__NR_clone, child_stack, flags); +#else + return (int) syscall(__NR_clone, flags, child_stack); +#endif +} + +static void +usage (char **argv) +{ + fprintf (stderr, "usage: %s [OPTIONS...] RUNTIMEPATH COMMAND [ARGS...]\n\n", argv[0]); + + fprintf (stderr, "TODO...." + ); + exit (1); +} + +static int +pivot_root (const char * new_root, const char * put_old) +{ +#ifdef __NR_pivot_root + return syscall(__NR_pivot_root, new_root, put_old); +#else + errno = ENOSYS; + return -1; +#endif +} + +typedef enum { + BIND_READONLY = (1<<0), + BIND_PRIVATE = (1<<1), + BIND_DEVICES = (1<<2), + BIND_RECURSIVE = (1<<3), +} bind_option_t; + + +static char * +load_file (const char *path) +{ + cleanup_fd int fd = -1; + char *data; + ssize_t data_read; + ssize_t data_len; + ssize_t res; + + fd = open (path, O_CLOEXEC | O_RDONLY); + if (fd == -1) + return NULL; + + data_read = 0; + data_len = 4080; + data = xmalloc (data_len); + + do + { + if (data_len >= data_read + 1) + { + data_len *= 2; + data = xrealloc (data, data_len); + } + + do + res = read (fd, data + data_read, data_len - data_read - 1); + while (res < 0 && errno == EINTR); + + if (res < 0) + { + int errsv = errno; + free (data); + errno = errsv; + return NULL; + } + + data_read += res; + } + while (res > 0); + + data[data_read] = 0; + + return data; +} + +static char * +skip_line (char *line) +{ + while (*line != 0 && *line != '\n') + line++; + + if (*line == '\n') + line++; + + return line; +} + +static char * +skip_token (char *line, bool eat_whitespace) +{ + while (*line != ' ' && *line != '\n') + line++; + + if (eat_whitespace && *line == ' ') + line++; + + return line; +} + +static bool +str_has_prefix (const char *str, + const char *prefix) +{ + return strncmp (str, prefix, strlen (prefix)) == 0; +} + +static char * +unescape_mountpoint (const char *escaped, ssize_t len) +{ + char *unescaped, *res; + const char *end; + + if (len < 0) + len = strlen (escaped); + end = escaped + len; + + unescaped = res = xmalloc (len + 1); + while (escaped < end) + { + if (*escaped == '\\') + { + *unescaped++ = + ((escaped[1] - '0') << 6) | + ((escaped[2] - '0') << 3) | + ((escaped[3] - '0') << 0); + escaped += 4; + } + else + *unescaped++ = *escaped++; + } + *unescaped = 0; + return res; +} + +static char * +get_mountinfo (const char *mountpoint) +{ + char *line_mountpoint, *line_mountpoint_end; + char *mountinfo; + char *free_me = NULL; + char *line, *line_start; + char *res = NULL; + int i; + + if (mountpoint[0] != '/') + { + char *cwd = getcwd(NULL, 0); + if (cwd == NULL) + die_oom (); + + mountpoint = free_me = strconcat3 (cwd, "/", mountpoint); + free (cwd); + } + + mountinfo = load_file ("/proc/self/mountinfo"); + if (mountinfo == NULL) + return NULL; + + line = mountinfo; + + while (*line != 0) + { + char *unescaped; + + line_start = line; + for (i = 0; i < 4; i++) + line = skip_token (line, TRUE); + line_mountpoint = line; + line = skip_token (line, FALSE); + line_mountpoint_end = line; + line = skip_line (line); + + unescaped = unescape_mountpoint (line_mountpoint, line_mountpoint_end - line_mountpoint); + if (strcmp (mountpoint, unescaped) == 0) + { + free (unescaped); + res = line_start; + line[-1] = 0; + break; + } + free (unescaped); + } + + if (free_me) + free (free_me); + free (mountinfo); + + if (res) + return xstrdup (res); + return NULL; +} + +static unsigned long +get_mountflags (const char *mountpoint) +{ + char *line, *token, *end_token; + int i; + unsigned long flags = 0; + static const struct { int flag; char *name; } flags_data[] = { + { 0, "rw" }, + { MS_RDONLY, "ro" }, + { MS_NOSUID, "nosuid" }, + { MS_NODEV, "nodev" }, + { MS_NOEXEC, "noexec" }, + { MS_NOATIME, "noatime" }, + { MS_NODIRATIME, "nodiratime" }, + { MS_RELATIME, "relatime" }, + { 0, NULL } + }; + + line = get_mountinfo (mountpoint); + if (line == NULL) + return 0; + + token = line; + for (i = 0; i < 5; i++) + token = skip_token (token, TRUE); + + end_token = skip_token (token, FALSE); + *end_token = 0; + + do { + end_token = strchr (token, ','); + if (end_token != NULL) + *end_token = 0; + + for (i = 0; flags_data[i].name != NULL; i++) + { + if (strcmp (token, flags_data[i].name) == 0) + flags |= flags_data[i].flag; + } + + if (end_token) + token = end_token + 1; + else + token = NULL; + } while (token != NULL); + + free (line); + + return flags; +} + + +static char ** +get_submounts (const char *parent_mount) +{ + char *mountpoint, *mountpoint_end; + char **submounts; + int i, n_submounts, submounts_size; + char *mountinfo; + char *line; + + mountinfo = load_file ("/proc/self/mountinfo"); + if (mountinfo == NULL) + return NULL; + + submounts_size = 8; + n_submounts = 0; + submounts = xmalloc (sizeof (char *) * submounts_size); + + line = mountinfo; + + while (*line != 0) + { + char *unescaped; + for (i = 0; i < 4; i++) + line = skip_token (line, TRUE); + mountpoint = line; + line = skip_token (line, FALSE); + mountpoint_end = line; + line = skip_line (line); + *mountpoint_end = 0; + + unescaped = unescape_mountpoint (mountpoint, -1); + + if (*unescaped == '/' && + str_has_prefix (unescaped + 1, parent_mount) && + *(unescaped + 1 + strlen (parent_mount)) == '/') + { + if (n_submounts + 1 >= submounts_size) + { + submounts_size *= 2; + submounts = xrealloc (submounts, sizeof (char *) * submounts_size); + } + submounts[n_submounts++] = xstrdup (unescaped + 1); + } + free (unescaped); + } + + submounts[n_submounts] = NULL; + + free (mountinfo); + + return submounts; +} + +static int +bind_mount (const char *src, const char *dest, bind_option_t options) +{ + bool readonly = (options & BIND_READONLY) != 0; + bool private = (options & BIND_PRIVATE) != 0; + bool devices = (options & BIND_DEVICES) != 0; + bool recursive = (options & BIND_RECURSIVE) != 0; + unsigned long current_flags; + char **submounts; + int i; + + if (mount (src, dest, NULL, MS_MGC_VAL|MS_BIND|(recursive?MS_REC:0), NULL) != 0) + return 1; + + if (private) + { + if (mount ("none", dest, + NULL, MS_REC|MS_PRIVATE, NULL) != 0) + return 2; + } + + current_flags = get_mountflags (dest); + + if (mount ("none", dest, + NULL, MS_MGC_VAL|MS_BIND|MS_REMOUNT|current_flags|(devices?0:MS_NODEV)|MS_NOSUID|(readonly?MS_RDONLY:0), NULL) != 0) + return 3; + + /* We need to work around the fact that a bind mount does not apply the flags, so we need to manually + * apply the flags to all submounts in the recursive case. + * Note: This does not apply the flags to mounts which are later propagated into this namespace. + */ + if (recursive) + { + submounts = get_submounts (dest); + if (submounts == NULL) + return 4; + + for (i = 0; submounts[i] != NULL; i++) + { + current_flags = get_mountflags (submounts[i]); + if (mount ("none", submounts[i], + NULL, MS_MGC_VAL|MS_BIND|MS_REMOUNT|current_flags|(devices?0:MS_NODEV)|MS_NOSUID|(readonly?MS_RDONLY:0), NULL) != 0) + return 5; + free (submounts[i]); + } + + free (submounts); + } + + return 0; +} + +static bool +stat_is_dir (const char *pathname) +{ + struct stat buf; + + if (stat (pathname, &buf) != 0) + return FALSE; + + return S_ISDIR (buf.st_mode); +} + +static int +mkdir_with_parents (const char *pathname, + int mode, + bool create_last) +{ + char *fn, *p; + struct stat buf; + + if (pathname == NULL || *pathname == '\0') + { + errno = EINVAL; + return 1; + } + + fn = xstrdup (pathname); + + p = fn; + while (*p == '/') + p++; + + do + { + while (*p && *p != '/') + p++; + + if (!*p) + p = NULL; + else + *p = '\0'; + + if (!create_last && p == NULL) + break; + + if (stat (fn, &buf) != 0) + { + if (mkdir (fn, mode) == -1 && errno != EEXIST) + { + int errsave = errno; + free (fn); + errno = errsave; + return -1; + } + } + else if (!S_ISDIR (buf.st_mode)) + { + free (fn); + errno = ENOTDIR; + return -1; + } + + if (p) + { + *p++ = '/'; + while (*p && *p == '/') + p++; + } + } + while (p); + + free (fn); + + return 0; +} + +static bool +write_to_file (int fd, + const char *content, + ssize_t len) +{ + ssize_t res; + + while (len > 0) + { + res = write (fd, content, len); + if (res < 0 && errno == EINTR) + continue; + if (res <= 0) + return FALSE; + len -= res; + content += res; + } + + return TRUE; +} + +#define BUFSIZE 8192 +static bool +copy_file_data (int sfd, + int dfd) +{ + char buffer[BUFSIZE]; + ssize_t bytes_read; + + while (TRUE) + { + bytes_read = read (sfd, buffer, BUFSIZE); + if (bytes_read == -1) + { + if (errno == EINTR) + continue; + + return FALSE; + } + + if (bytes_read == 0) + break; + + if (!write_to_file (dfd, buffer, bytes_read)) + return FALSE; + } + + return TRUE; +} + +static bool +copy_file (const char *src_path, + const char *dst_path, + mode_t mode) +{ + cleanup_fd int sfd = -1; + cleanup_fd int dfd = -1; + bool res; + int errsv; + + sfd = open (src_path, O_CLOEXEC | O_RDONLY); + if (sfd == -1) + return FALSE; + + dfd = creat (dst_path, mode); + if (dfd == -1) + return FALSE; + + res = copy_file_data (sfd, dfd); + + errsv = errno; + errno = errsv; + + return res; +} + +static bool +write_file_at (int dirfd, + const char *path, + const char *content) +{ + cleanup_fd int fd = -1; + bool res; + int errsv; + + fd = openat (dirfd, path, O_RDWR | O_CLOEXEC, 0); + if (fd == -1) + return FALSE; + + res = TRUE; + if (content) + res = write_to_file (fd, content, strlen (content)); + + errsv = errno; + errno = errsv; + + return res; +} + +static bool +create_file (const char *path, + mode_t mode, + const char *content) +{ + cleanup_fd int fd = -1; + bool res; + int errsv; + + fd = creat (path, mode); + if (fd == -1) + return FALSE; + + res = TRUE; + if (content) + res = write_to_file (fd, content, strlen (content)); + + errsv = errno; + errno = errsv; + + return res; +} + +static void * +add_rta (struct nlmsghdr *header, + int type, + size_t size) +{ + struct rtattr *rta; + size_t rta_size = RTA_LENGTH(size); + + rta = (struct rtattr*)((char *)header + NLMSG_ALIGN(header->nlmsg_len)); + rta->rta_type = type; + rta->rta_len = rta_size; + + header->nlmsg_len = NLMSG_ALIGN(header->nlmsg_len) + rta_size; + + return RTA_DATA(rta); +} + +static int +rtnl_send_request (int rtnl_fd, + struct nlmsghdr *header) +{ + struct sockaddr_nl dst_addr = { AF_NETLINK, 0 }; + ssize_t sent; + + sent = sendto (rtnl_fd, (void *)header, header->nlmsg_len, 0, + (struct sockaddr *)&dst_addr, sizeof (dst_addr)); + if (sent < 0) + return 1; + + return 0; +} + +static int +rtnl_read_reply (int rtnl_fd, + int seq_nr) +{ + char buffer[1024]; + ssize_t received; + struct nlmsghdr *rheader; + + while (1) + { + received = recv (rtnl_fd, buffer, sizeof(buffer), 0); + if (received < 0) + return 1; + + rheader = (struct nlmsghdr *)buffer; + while (received >= NLMSG_HDRLEN) + { + if (rheader->nlmsg_seq != seq_nr) + return 1; + if (rheader->nlmsg_pid != getpid ()) + return 1; + if (rheader->nlmsg_type == NLMSG_ERROR) + { + uint32_t *err = NLMSG_DATA(rheader); + if (*err == 0) + return 0; + + return 1; + } + if (rheader->nlmsg_type == NLMSG_DONE) + return 0; + + rheader = NLMSG_NEXT(rheader, received); + } + } +} + +static int +rtnl_do_request (int rtnl_fd, + struct nlmsghdr *header) +{ + if (!rtnl_send_request (rtnl_fd, header)) + return 1; + + if (!rtnl_read_reply (rtnl_fd, header->nlmsg_seq)) + return 1; + + return 0; +} + +static struct nlmsghdr * +rtnl_setup_request (char *buffer, + int type, + int flags, + size_t size) +{ + struct nlmsghdr *header; + size_t len = NLMSG_LENGTH (size); + static uint32_t counter = 0; + + memset (buffer, 0, len); + + header = (struct nlmsghdr *)buffer; + header->nlmsg_len = len; + header->nlmsg_type = type; + header->nlmsg_flags = flags | NLM_F_REQUEST; + header->nlmsg_seq = counter++; + header->nlmsg_pid = getpid (); + + return (struct nlmsghdr *)header; +} + +static int +loopback_setup (void) +{ + int r, if_loopback; + cleanup_fd int rtnl_fd = -1; + char buffer[1024]; + struct sockaddr_nl src_addr = { AF_NETLINK, 0 }; + struct nlmsghdr *header; + struct ifaddrmsg *addmsg; + struct ifinfomsg *infomsg; + struct in_addr *ip_addr; + + src_addr.nl_pid = getpid (); + + if_loopback = (int) if_nametoindex ("lo"); + if (if_loopback <= 0) + return -1; + + rtnl_fd = socket (PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC, NETLINK_ROUTE); + if (rtnl_fd < 0) + return -1; + + r = bind (rtnl_fd, (struct sockaddr *)&src_addr, sizeof (src_addr)); + if (r < 0) + return -1; + + header = rtnl_setup_request (buffer, RTM_NEWADDR, + NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, + sizeof (struct ifaddrmsg)); + addmsg = NLMSG_DATA(header); + + addmsg->ifa_family = AF_INET; + addmsg->ifa_prefixlen = 8; + addmsg->ifa_flags = IFA_F_PERMANENT; + addmsg->ifa_scope = RT_SCOPE_HOST; + addmsg->ifa_index = if_loopback; + + ip_addr = add_rta (header, IFA_LOCAL, sizeof (*ip_addr)); + ip_addr->s_addr = htonl(INADDR_LOOPBACK); + + ip_addr = add_rta (header, IFA_ADDRESS, sizeof (*ip_addr)); + ip_addr->s_addr = htonl(INADDR_LOOPBACK); + + assert (header->nlmsg_len < sizeof (buffer)); + + if (rtnl_do_request (rtnl_fd, header)) + return -1; + + header = rtnl_setup_request (buffer, RTM_NEWLINK, + NLM_F_ACK, + sizeof (struct ifinfomsg)); + infomsg = NLMSG_DATA(header); + + infomsg->ifi_family = AF_UNSPEC; + infomsg->ifi_type = 0; + infomsg->ifi_index = if_loopback; + infomsg->ifi_flags = IFF_UP; + infomsg->ifi_change = IFF_UP; + + assert (header->nlmsg_len < sizeof (buffer)); + + if (rtnl_do_request (rtnl_fd, header)) + return -1; + + return 0; +} + +static void +block_sigchild (void) +{ + sigset_t mask; + + sigemptyset (&mask); + sigaddset (&mask, SIGCHLD); + + if (sigprocmask (SIG_BLOCK, &mask, NULL) == -1) + die_with_error ("sigprocmask"); +} + +static void +unblock_sigchild (void) +{ + sigset_t mask; + + sigemptyset (&mask); + sigaddset (&mask, SIGCHLD); + + if (sigprocmask (SIG_UNBLOCK, &mask, NULL) == -1) + die_with_error ("sigprocmask"); +} + +/* Closes all fd:s except 0,1,2 and the passed in array of extra fds */ +static int +close_extra_fds (void *data, int fd) +{ + int *extra_fds = (int *)data; + int i; + + for (i = 0; extra_fds[i] != -1; i++) + if (fd == extra_fds[i]) + return 0; + + if (fd <= 2) + return 0; + + close (fd); + return 0; +} + +/* This stays around for as long as the initial process in the app does + * and when that exits it exits, propagating the exit status. We do this + * by having pid1 in the sandbox detect this exit and tell the monitor + * the exit status via a eventfd. We also track the exit of the sandbox + * pid1 via a signalfd for SIGCHLD, and exit with an error in this case. + * This is to catch e.g. problems during setup. */ +static void +monitor_child (int event_fd) +{ + int res; + uint64_t val; + ssize_t s; + int signal_fd; + sigset_t mask; + struct pollfd fds[2]; + int num_fds; + struct signalfd_siginfo fdsi; + int dont_close[] = { event_fd, -1 }; + + /* Close all extra fds in the monitoring process. + Any passed in fds have been passed on to the child anyway. */ + fdwalk (close_extra_fds, dont_close); + + sigemptyset (&mask); + sigaddset (&mask, SIGCHLD); + + signal_fd = signalfd (-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK); + if (signal_fd == -1) + die_with_error ("Can't create signalfd"); + + num_fds = 1; + fds[0].fd = signal_fd; + fds[0].events = POLLIN; + if (event_fd != -1) + { + fds[1].fd = event_fd; + fds[1].events = POLLIN; + num_fds++; + } + + while (1) + { + fds[0].revents = fds[1].revents = 0; + res = poll (fds, num_fds, -1); + if (res == -1 && errno != EINTR) + die_with_error ("poll"); + + /* Always read from the eventfd first, if pid2 died then pid1 often + * dies too, and we could race, reporting that first and we'd lose + * the real exit status. */ + if (event_fd != -1) + { + s = read (event_fd, &val, 8); + if (s == -1 && errno != EINTR && errno != EAGAIN) + die_with_error ("read eventfd"); + else if (s == 8) + exit ((int)val - 1); + } + + s = read (signal_fd, &fdsi, sizeof (struct signalfd_siginfo)); + if (s == -1 && errno != EINTR && errno != EAGAIN) + die_with_error ("read signalfd"); + else if (s == sizeof(struct signalfd_siginfo)) + { + if (fdsi.ssi_signo != SIGCHLD) + die ("Read unexpected signal\n"); + exit (1); + } + } +} + +/* This is pid1 in the app sandbox. It is needed because we're using + * pid namespaces, and someone has to reap zombies in it. We also detect + * when the initial process (pid 2) dies and report its exit status to + * the monitor so that it can return it to the original spawner. + * + * When there are no other processes in the sandbox the wait will return + * ECHILD, and we then exit pid1 to clean up the sandbox. */ +static int +do_init (int event_fd, pid_t initial_pid) +{ + int initial_exit_status = 1; + + /* Grab a read on all .ref files to make it possible to detect that + it is in use. This lock will automatically go away when this + process dies */ + lock_all_dirs (); + + while (1) + { + pid_t child; + int status; + + child = wait (&status); + if (child == initial_pid) + { + uint64_t val; + + if (WIFEXITED (status)) + initial_exit_status = WEXITSTATUS(status); + + val = initial_exit_status + 1; + write (event_fd, &val, 8); + } + + if (child == -1 && errno != EINTR) + { + if (errno != ECHILD) + die_with_error ("init wait()"); + break; + } + } + + return initial_exit_status; +} + +#define REQUIRED_CAPS (CAP_TO_MASK(CAP_SYS_ADMIN)) + +static void +acquire_caps (void) +{ + struct __user_cap_header_struct hdr; + struct __user_cap_data_struct data; + + memset (&hdr, 0, sizeof(hdr)); + hdr.version = _LINUX_CAPABILITY_VERSION; + + if (capget (&hdr, &data) < 0) + die_with_error ("capget failed"); + + if (((data.effective & REQUIRED_CAPS) == REQUIRED_CAPS) && + ((data.permitted & REQUIRED_CAPS) == REQUIRED_CAPS)) + is_privileged = TRUE; + + if (getuid () != geteuid ()) + { + /* Tell kernel not clear capabilities when dropping root */ + if (prctl (PR_SET_KEEPCAPS, 1, 0, 0, 0) < 0) + die_with_error ("prctl(PR_SET_KEEPCAPS) failed"); + + /* Drop root uid, but retain the required permitted caps */ + if (setuid (getuid ()) < 0) + die_with_error ("unable to drop privs"); + } + + if (is_privileged) + { + memset (&hdr, 0, sizeof(hdr)); + hdr.version = _LINUX_CAPABILITY_VERSION; + + /* Drop all non-require capabilities */ + data.effective = REQUIRED_CAPS; + data.permitted = REQUIRED_CAPS; + data.inheritable = 0; + if (capset (&hdr, &data) < 0) + die_with_error ("capset failed"); + } + /* Else, we try unprivileged user namespaces */ +} + +static void +drop_caps (void) +{ + struct __user_cap_header_struct hdr; + struct __user_cap_data_struct data; + + if (!is_privileged) + return; + + memset (&hdr, 0, sizeof(hdr)); + hdr.version = _LINUX_CAPABILITY_VERSION; + data.effective = 0; + data.permitted = 0; + data.inheritable = 0; + + if (capset (&hdr, &data) < 0) + die_with_error ("capset failed"); + + if (prctl (PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) + die_with_error ("prctl(PR_SET_DUMPABLE) failed"); +} + +int +main (int argc, + char **argv) +{ + mode_t old_umask; + char *newroot; + char *runtime_path = NULL; + char *app_path = NULL; + char *chdir_path = NULL; + char *monitor_path = NULL; + char *app_id = NULL; + char *var_path = NULL; + char *pulseaudio_socket = NULL; + char *x11_socket = NULL; + char *wayland_socket = NULL; + char *system_dbus_socket = NULL; + char *session_dbus_socket = NULL; + char *xdg_runtime_dir; + char *tz_val; + char **args; + char *tmp; + int n_args; + bool devel = FALSE; + bool share_shm = FALSE; + bool unshare_pid = FALSE; + bool unshare_ipc = FALSE; + bool unshare_net = FALSE; + bool unshare_uts = FALSE; + bool mount_host_fs = FALSE; + bool mount_host_fs_ro = FALSE; + bool mount_home = FALSE; + bool mount_home_ro = FALSE; + bool lock_files = FALSE; + bool writable = FALSE; + bool writable_app = FALSE; + bool writable_exports = FALSE; + int clone_flags; + char *old_cwd = NULL; + int c, i; + pid_t pid; + int event_fd = -1; + int sync_fd = -1; + char *endp; + char *uid_map, *gid_map; + const char *new_cwd; + uid_t ns_uid; + gid_t ns_gid; + int proc_fd = -1; + + /* Get the (optional) capabilities we need, drop root */ + acquire_caps (); + + /* Never gain any more privs during exec */ + if (prctl (PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) + die_with_error ("prctl(PR_SET_NO_NEW_CAPS) failed"); + + /* The initial code is run with high permissions + (i.e. CAP_SYS_ADMIN), so take lots of care. */ + + argv++; + argc--; + + if (argc == 0) + usage (); + + while (argc > 0) + { + const char *arg = argv[0]; + + if (strcmp (arg, "--help") == 0) + usage (); + else if (strcmp (arg, "--version") == 0) + { + printf ("%s\n", PACKAGE_STRING); + exit (0); + } + else if (strcmp (arg, "--unshare-ipc") == 0) + unshare_ipc = TRUE; + else if (strcmp (arg, "--unshare-pid") == 0) + unshare_pid = TRUE; + else if (strcmp (arg, "--unshare-net") == 0) + unshare_net = TRUE; + else if (strcmp (arg, "--unshare-uts") == 0) + unshare_uts = TRUE; + else if (strcmp (arg, "--chdir") == 0) + { + if (argc < 2) + die ("--chdir takes one argument"); + + chdir_path = argv[1]; + argv++; + argc--; + } + else if (*arg == '-') + die ("Unknown option %s", arg); + else + break; + + argv++; + argc--; + } + + if (argc == 0) + usage (); + + __debug__(("Creating xdg-app-root dir\n")); + + uid = getuid (); + gid = getgid (); + + /* We need *some* mountpoint where we can mount the root tmpfs. + We first try in /run, and if that fails, try in /tmp. */ + newroot = strdup_printf ("/run/user/%d/.build-root", uid); + if (mkdir (newroot, 0755) && errno != EEXIST) + { + free (newroot); + newroot = "/tmp/.build-root"; + if (mkdir (newroot, 0755) && errno != EEXIST) + die_with_error ("Creating root mountpoint failed"); + } + + __debug__(("creating new namespace\n")); + + if (unshare_pid) + event_fd = eventfd (0, EFD_CLOEXEC | EFD_NONBLOCK); + + /* We block sigchild here so that we can use signalfd in the monitor. */ + block_sigchild (); + + clone_flags = SIGCHLD | CLONE_NEWNS; + if (!is_privileged) + clone_flags |= CLONE_NEWUSER; + if (unshare_pid) + clone_flags |= CLONE_NEWPID; + if (unshare_net) + clone_flags |= CLONE_NEWNET; + if (unshare_ipc) + clone_flags |= CLONE_NEWIPC; + if (unshare_uts) + clone_flags |= CLONE_NEWUTS; + + pid = raw_clone (clone_flags, NULL); + if (pid == -1) + { + if (!is_privileged) + { + if (errno == EINVAL) + die ("Creating new namespace failed, likely because the kernel does not support user namespaces. Give the build-root setuid root or cap_sys_admin+ep rights, or switch to a kernel with user namespace support."); + else if (errno == EPERM) + die ("No permissions to creating new namespace, likely because the kernel does not allow non-privileged user namespaces. On e.g. debian this can be enabled with 'sysctl kernel.unprivileged_userns_clone=1'."); + } + + die_with_error ("Creating new namespace failed"); + } + + if (pid != 0) + { + /* We don't need any caps in the launcher, drop them immediately. */ + drop_caps (); + monitor_child (event_fd); + exit (0); /* Should not be reached, but better safe... */ + } + + ns_uid = uid; + ns_gid = gid; + if (!is_privileged) + { + /* This is a bit hacky, but we need to first map the real uid/gid to + 0, otherwise we can't mount the devpts filesystem because root is + not mapped. Later we will create another child user namespace and + map back to the real uid */ + ns_uid = 0; + ns_gid = 0; + + proc_fd = open ("/proc", O_RDONLY | O_PATH); + if (proc_fd == -1) + die_with_error ("Can't open /proc"); + + uid_map = strdup_printf ("%d %d 1\n", ns_uid, uid); + if (!write_file_at (proc_fd, "self/uid_map", uid_map)) + die_with_error ("setting up uid map"); + free (uid_map); + + if (!write_file_at (proc_fd, "self/setgroups", "deny\n")) + die_with_error ("error writing to setgroups"); + + gid_map = strdup_printf ("%d %d 1\n", ns_gid, gid); + if (!write_file_at (proc_fd, "self/gid_map", gid_map)) + die_with_error ("setting up gid map"); + free (gid_map); + } + + old_umask = umask (0); + + /* Mark everything as slave, so that we still + * receive mounts from the real root, but don't + * propagate mounts to the real root. */ + if (mount (NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) + die_with_error ("Failed to make / slave"); + + /* Create a tmpfs which we will use as / in the namespace */ + if (mount ("", newroot, "tmpfs", MS_NODEV|MS_NOSUID, NULL) != 0) + die_with_error ("Failed to mount tmpfs"); + + old_cwd = get_current_dir_name (); + + /* Chdir to the new root tmpfs mount. This will be the CWD during + the entire setup. So, relative paths are in the new root, but + absolute paths are an the old root. */ + if (chdir (newroot) != 0) + die_with_error ("chdir"); + + if (unshare_net && loopback_setup () != 0) + die ("Can't create loopback device"); + + if (mkdir (".oldroot", 0755)) + die_with_error ("Creating .oldroot failed"); + + if (pivot_root (newroot, ".oldroot")) + die_with_error ("pivot_root"); + + chdir ("/"); + new_cwd = "/"; + + /* The old root better be rprivate or we will send unmount events to the parent namespace */ + if (mount (".oldroot", ".oldroot", NULL, MS_REC|MS_PRIVATE, NULL) != 0) + die_with_error ("Failed to make old root rprivate"); + + if (umount2 (".oldroot", MNT_DETACH)) + die_with_error ("unmount oldroot"); + + umask (old_umask); + + /* Now we have everything we need CAP_SYS_ADMIN for, so drop it */ + drop_caps (); + + if (chdir_path) + { + if (chdir (chdir_path)) + die_with_error ("Can't chdir to %s", chdir_path); + new_cwd = chdir_path; + } + else if (chdir (old_cwd) == 0) + { + /* If the old cwd is mapped in the sandbox, go there */ + new_cwd = old_cwd; + } + else + { + /* If the old cwd is not mapped, go to home */ + const char *home = getenv ("HOME"); + if (home != NULL && + chdir (home) == 0) + new_cwd = home; + } + xsetenv ("PWD", new_cwd, 1); + free (old_cwd); + + /* We can't pass regular LD_LIBRARY_PATH, as it would affect the + setuid helper aspect, so we use _LD_LIBRARY_PATH */ + if (getenv("_LD_LIBRARY_PATH")) + { + xsetenv ("LD_LIBRARY_PATH", getenv("_LD_LIBRARY_PATH"), 1); + xunsetenv ("_LD_LIBRARY_PATH"); + } + else + xunsetenv ("LD_LIBRARY_PATH"); /* Make sure to unset if it was not (i.e. unprivileged mode) */ + + __debug__(("forking for child\n")); + + pid = fork (); + if (pid == -1) + die_with_error("Can't fork for child"); + + if (pid == 0) + { + __debug__(("launch executable %s\n", args[0])); + + if (ns_uid != uid || ns_gid != gid) + { + /* Now that devpts is mounted we can create a new userspace + and map our uid 1:1 */ + + if (unshare (CLONE_NEWUSER)) + die_with_error ("unshare user ns"); + + uid_map = strdup_printf ("%d 0 1\n", uid); + if (!write_file_at (proc_fd, "self/uid_map", uid_map)) + die_with_error ("setting up uid map"); + free (uid_map); + + gid_map = strdup_printf ("%d 0 1\n", gid); + if (!write_file_at (proc_fd, "self/gid_map", gid_map)) + die_with_error ("setting up gid map"); + free (gid_map); + } + + if (proc_fd != -1) + close (proc_fd); + + if (sync_fd != -1) + close (sync_fd); + + /* We want sigchild in the child */ + unblock_sigchild (); + + if (execvp (args[0], args) == -1) + die_with_error ("execvp %s", args[0]); + + return 0; + } + + /* Close all extra fds in pid 1. + Any passed in fds have been passed on to the child anyway. */ + { + int dont_close[] = { event_fd, sync_fd, -1 }; + fdwalk (close_extra_fds, dont_close); + } + + return do_init (event_fd, pid); +} diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..2e3464d --- /dev/null +++ b/configure.ac @@ -0,0 +1,34 @@ +AC_PREREQ([2.63]) +AC_INIT([build-root], [0.1], [alexl@redhat.com]) +AC_CONFIG_HEADER([config.h]) +AC_CONFIG_MACRO_DIR([m4]) +AC_CONFIG_AUX_DIR([build-aux]) + +AM_INIT_AUTOMAKE([1.11 -Wno-portability foreign no-define tar-ustar no-dist-gzip dist-xz]) +AM_MAINTAINER_MODE([enable]) +AM_SILENT_RULES([yes]) + +AC_SYS_LARGEFILE + +AC_PROG_CC +AM_PROG_CC_C_O + +changequote(,)dnl +if test "x$GCC" = "xyes"; then + WARN_CFLAGS="-Wall -Werror=missing-prototypes" +fi +changequote([,])dnl +AC_SUBST(WARN_CFLAGS) + +# Initialize libtool +LT_PREREQ([2.2.4]) +LT_INIT([disable-static]) + +PKG_CHECK_MODULES(LIBSECCOMP, [libseccomp]) +AC_SUBST(LIBSECCOMP_CFLAGS) +AC_SUBST(LIBSECCOMP_LIBS) + +AC_CONFIG_FILES([ +Makefile +]) +AC_OUTPUT