+/* SPDX-License-Identifier: LGPL-2.1+ */
/***
This file is part of systemd.
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
-#ifdef HAVE_BLKID
-#include <blkid/blkid.h>
+#if HAVE_BLKID
+#include <blkid.h>
#endif
#include <errno.h>
#include <getopt.h>
#include <linux/loop.h>
#include <pwd.h>
#include <sched.h>
-#ifdef HAVE_SELINUX
+#if HAVE_SELINUX
#include <selinux/selinux.h>
#endif
#include <signal.h>
#include <sys/wait.h>
#include <unistd.h>
+#include "sd-bus.h"
#include "sd-daemon.h"
#include "sd-id128.h"
#include "base-filesystem.h"
#include "blkid-util.h"
#include "btrfs-util.h"
+#include "bus-util.h"
#include "cap-list.h"
#include "capability-util.h"
#include "cgroup-util.h"
#include "mount-util.h"
#include "netlink-util.h"
#include "nspawn-cgroup.h"
+#include "nspawn-def.h"
#include "nspawn-expose-ports.h"
#include "nspawn-mount.h"
#include "nspawn-network.h"
#include "user-util.h"
#include "util.h"
-/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
- * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
- * may have their own allocation ranges too. */
-#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
-#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
-
/* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path
* nspawn_notify_socket_path is relative to the container
* the init process in the container pid can send messages to nspawn following the sd_notify(3) protocol */
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
static void *arg_root_hash = NULL;
static size_t arg_root_hash_size = 0;
+static char **arg_syscall_whitelist = NULL;
+static char **arg_syscall_blacklist = NULL;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
" --capability=CAP In addition to the default, retain specified\n"
" capability\n"
" --drop-capability=CAP Drop the specified capability from the default set\n"
+ " --system-call-filter=LIST|~LIST\n"
+ " Permit/prohibit specific system calls\n"
" --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
" --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
" host, try-guest, try-host\n"
static int detect_unified_cgroup_hierarchy(const char *directory) {
const char *e;
- int r, all_unified, systemd_unified;
+ int r;
/* Allow the user to control whether the unified hierarchy is used */
e = getenv("UNIFIED_CGROUP_HIERARCHY");
return 0;
}
- all_unified = cg_all_unified();
- systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
-
- if (all_unified < 0 || systemd_unified < 0)
- return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
- "Failed to determine whether the unified cgroups hierarchy is used: %m");
-
/* Otherwise inherit the default from the host system */
- if (all_unified > 0) {
+ r = cg_all_unified();
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
+ if (r > 0) {
/* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
* routine only detects 231, so we'll have a false negative here for 230. */
r = systemd_installation_has_version(directory, 230);
arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
else
arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
- } else if (systemd_unified > 0) {
- /* Mixed cgroup hierarchy support was added in 232 */
- r = systemd_installation_has_version(directory, 232);
+ } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
+ /* Mixed cgroup hierarchy support was added in 233 */
+ r = systemd_installation_has_version(directory, 233);
if (r < 0)
return log_error_errno(r, "Failed to determine systemd version in container: %m");
if (r > 0)
if (r < 0) {
log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
return;
- } else if (r > 0)
- arg_mount_settings &= ~MOUNT_APPLY_APIVFS_RO;
- else
- arg_mount_settings |= MOUNT_APPLY_APIVFS_RO;
+ }
- arg_mount_settings &= ~MOUNT_APPLY_APIVFS_NETNS;
+ SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
+ SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
}
static int parse_argv(int argc, char *argv[]) {
ARG_PRIVATE_USERS_CHOWN,
ARG_NOTIFY_READY,
ARG_ROOT_HASH,
+ ARG_SYSTEM_CALL_FILTER,
};
static const struct option options[] = {
{ "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
{ "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
{ "root-hash", required_argument, NULL, ARG_ROOT_HASH },
+ { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
{}
};
break;
}
+ case ARG_SYSTEM_CALL_FILTER: {
+ bool negative;
+ const char *items;
+
+ negative = optarg[0] == '~';
+ items = negative ? optarg + 1 : optarg;
+
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&items, &word, NULL, 0);
+ if (r == 0)
+ break;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse system call filter: %m");
+
+ if (negative)
+ r = strv_extend(&arg_syscall_blacklist, word);
+ else
+ r = strv_extend(&arg_syscall_whitelist, word);
+ if (r < 0)
+ return log_oom();
+ }
+
+ arg_settings_mask |= SETTING_SYSCALL_FILTER;
+ break;
+ }
+
case '?':
return -EINVAL;
if (arg_userns_mode == USER_NAMESPACE_PICK)
arg_userns_chown = true;
- if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
- log_error("--keep-unit may not be used when invoked from a user session.");
+ if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) {
+ /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
+ * The latter is not technically a user session, but we don't need to labour the point. */
+ log_error("--keep-unit --register=yes may not be used when invoked from a user session.");
return -EINVAL;
}
arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
+ r = cg_unified_flush();
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+
e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
if (e)
arg_container_service_name = e;
return -EINVAL;
}
-#ifndef HAVE_LIBIPTC
+#if ! HAVE_LIBIPTC
if (arg_expose_ports) {
log_error("--port= is not supported, compiled without libiptc support.");
return -EOPNOTSUPP;
return 0;
}
+static int resolved_listening(void) {
+ _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+ _cleanup_free_ char *dns_stub_listener_mode = NULL;
+ int r;
+
+ /* Check if resolved is listening */
+
+ r = sd_bus_open_system(&bus);
+ if (r < 0)
+ return r;
+
+ r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
+ if (r <= 0)
+ return r;
+
+ r = sd_bus_get_property_string(bus,
+ "org.freedesktop.resolve1",
+ "/org/freedesktop/resolve1",
+ "org.freedesktop.resolve1.Manager",
+ "DNSStubListener",
+ NULL,
+ &dns_stub_listener_mode);
+ if (r < 0)
+ return r;
+
+ return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
+}
+
static int setup_resolv_conf(const char *dest) {
_cleanup_free_ char *resolved = NULL, *etc = NULL;
const char *where;
return 0;
}
- if (access("/run/systemd/resolve/resolv.conf", F_OK) >= 0 &&
- access("/usr/lib/systemd/resolv.conf", F_OK) >= 0) {
+ if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0 &&
+ resolved_listening() > 0) {
/* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
* container, so that the container can use the host's resolver. Given that network namespacing is
const char *p;
int r;
-#ifdef HAVE_SELINUX
+#if HAVE_SELINUX
if (arg_selinux_apifs_context)
(void) asprintf(&options,
"newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
}
+static int setup_keyring(void) {
+ key_serial_t keyring;
+
+ /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
+ * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
+ * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
+ * these system calls let's make sure we don't leak anything into the container. */
+
+ keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
+ if (keyring == -1) {
+ if (errno == ENOSYS)
+ log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
+ else if (IN_SET(errno, EACCES, EPERM))
+ log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
+ else
+ return log_error_errno(errno, "Setting up kernel keyring failed: %m");
+ }
+
+ return 0;
+}
+
static int setup_kmsg(const char *dest, int kmsg_socket) {
const char *from, *to;
_cleanup_umask_ mode_t u;
r = readlink_and_make_absolute(p, &d);
if (r >= 0) {
- if ((arg_link_journal == LINK_GUEST ||
- arg_link_journal == LINK_AUTO) &&
+ if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
path_equal(d, q)) {
r = userns_mkdir(directory, p, 0755, 0, 0);
if (r < 0)
return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
if (r == 0) {
- log_error("No image for machine '%s': %m", arg_machine);
+ log_error("No image for machine '%s'.", arg_machine);
return -ENOENT;
}
- if (i->type == IMAGE_RAW)
+ if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
r = free_and_strdup(&arg_image, i->path);
else
r = free_and_strdup(&arg_directory, i->path);
assert(directory);
assert(kmsg_socket >= 0);
- cg_unified_flush();
-
if (arg_userns_mode != USER_NAMESPACE_NO) {
/* Tell the parent, that it now can write the UID map. */
(void) barrier_place(barrier); /* #1 */
setup_hostname();
if (arg_personality != PERSONALITY_INVALID) {
- if (personality(arg_personality) < 0)
- return log_error_errno(errno, "personality() failed: %m");
+ r = safe_personality(arg_personality);
+ if (r < 0)
+ return log_error_errno(r, "personality() failed: %m");
} else if (secondary) {
- if (personality(PER_LINUX32) < 0)
- return log_error_errno(errno, "personality() failed: %m");
+ r = safe_personality(PER_LINUX32);
+ if (r < 0)
+ return log_error_errno(r, "personality() failed: %m");
}
-#ifdef HAVE_SELINUX
+#if HAVE_SELINUX
if (arg_selinux_context)
if (setexeccon(arg_selinux_context) < 0)
return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
assert(notify_socket >= 0);
assert(kmsg_socket >= 0);
- cg_unified_flush();
-
if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
if (r < 0)
return r;
- r = detect_unified_cgroup_hierarchy(directory);
- if (r < 0)
- return r;
-
if (arg_userns_mode != USER_NAMESPACE_NO) {
/* Let the parent know which UID shift we read from the image */
l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
* inside the containter that create a new mount namespace.
* See https://github.com/systemd/systemd/issues/3860
* Further submounts (such as /dev) done after this will inherit the
- * shared propagation mode.*/
+ * shared propagation mode. */
r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
if (r < 0)
return r;
if (r < 0)
return r;
- r = setup_seccomp(arg_caps_retain);
+ r = setup_keyring();
+ if (r < 0)
+ return r;
+
+ r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
if (r < 0)
return r;
n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
if (n < 0) {
- if (errno == EAGAIN || errno == EINTR)
+ if (IN_SET(errno, EAGAIN, EINTR))
return 0;
return log_warning_errno(errno, "Couldn't read notification socket: %m");
}
if (!ucred || ucred->pid != inner_child_pid) {
- log_warning("Received notify message without valid credentials. Ignoring.");
+ log_debug("Received notify message without valid credentials. Ignoring.");
return 0;
}
if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
arg_notify_ready = settings->notify_ready;
+ if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
+
+ if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
+ log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p);
+ else {
+ strv_free(arg_syscall_whitelist);
+ strv_free(arg_syscall_blacklist);
+
+ arg_syscall_whitelist = settings->syscall_whitelist;
+ arg_syscall_blacklist = settings->syscall_blacklist;
+
+ settings->syscall_whitelist = settings->syscall_blacklist = NULL;
+ }
+ }
+
return 0;
}
arg_container_service_name);
if (r < 0)
return r;
- }
+ } else if (!arg_keep_unit) {
+ r = allocate_scope(
+ arg_machine,
+ *pid,
+ arg_slice,
+ arg_custom_mounts, arg_n_custom_mounts,
+ arg_kill_signal,
+ arg_property);
+ if (r < 0)
+ return r;
+
+ } else if (arg_slice || arg_property)
+ log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
if (r < 0)
log_error_errno(r, "--image= is not supported, compiled without blkid support.");
goto finish;
}
+ if (r == -EPROTONOSUPPORT) {
+ log_error_errno(r, "Device is loopback block device with partition scanning turned off, please turn it on.");
+ goto finish;
+ }
if (r < 0) {
log_error_errno(r, "Failed to dissect image: %m");
goto finish;
if (r < 0)
goto finish;
+ r = detect_unified_cgroup_hierarchy(arg_directory);
+ if (r < 0)
+ goto finish;
+
interactive =
isatty(STDIN_FILENO) > 0 &&
isatty(STDOUT_FILENO) > 0;