}
int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
- const char *full, *top;
- int r;
+ _cleanup_free_ char *top = NULL, *full = NULL; /* both are allocated via path_join() below and released automatically on return */
unsigned long extra_flags = 0;
+ int r;
- top = prefix_roota(dest, "/sys");
- r = path_is_fs_type(top, SYSFS_MAGIC);
+ top = path_join(dest, "/sys");
+ if (!top)
+ return log_oom();
+
+ r = path_is_mount_point(top);
if (r < 0)
- return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
- /* /sys might already be mounted as sysfs by the outer child in the
- * !netns case. In this case, it's all good. Don't touch it because we
- * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
- */
- if (r > 0)
- return 0;
+ return log_error_errno(r, "Failed to determine if '%s' is a mountpoint: %m", top);
+ if (r == 0) {
+ /* If this is not a mount point yet, then mount a tmpfs there */
+ r = mount_nofollow_verbose(LOG_ERR, "tmpfs", top, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0555" TMPFS_LIMITS_SYS);
+ if (r < 0)
+ return r;
+ } else {
+ r = path_is_fs_type(top, SYSFS_MAGIC);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
+
+ /* /sys/ might already be mounted as sysfs by the outer child in the !netns case. In this case, it's
+ * all good. Don't touch it because we don't have the right to do so, see
+ * https://github.com/systemd/systemd/issues/1555.
+ */
+ if (r > 0)
+ return 0;
+ }
- full = prefix_roota(top, "/full");
+ full = path_join(top, "/full");
+ if (!full)
+ return log_oom();
(void) mkdir(full, 0755);
if (rmdir(full) < 0)
return log_error_errno(errno, "Failed to remove %s: %m", full);
- /* Create mountpoint for cgroups. Otherwise we are not allowed since we
- * remount /sys read-only.
- */
- const char *x = prefix_roota(top, "/fs/cgroup");
+ /* Create mountpoint for cgroups. Otherwise we are not allowed since we remount /sys/ read-only. */
+ _cleanup_free_ char *x = path_join(top, "/fs/cgroup");
+ if (!x)
+ return log_oom();
+
(void) mkdir_p(x, 0755);
return mount_nofollow_verbose(LOG_ERR, NULL, top, NULL,
} MountPoint;
static const MountPoint mount_table[] = {
- /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
+ /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing when we are privileged) */
{ "proc", "/proc", "proc", NULL, PROC_DEFAULT_MOUNT_FLAGS,
MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_MKDIR|MOUNT_FOLLOW_SYMLINKS }, /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. */
{ "mqueue", "/dev/mqueue", "mqueue", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_IN_USERNS|MOUNT_MKDIR },
- /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
+ /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing when we are privileged) */
{ "tmpfs", "/tmp", "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR },
{ "tmpfs", "/sys", "tmpfs", "mode=0555" TMPFS_LIMITS_SYS, MS_NOSUID|MS_NOEXEC|MS_NODEV,
- MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR },
+ MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR|MOUNT_PRIVILEGED },
{ "sysfs", "/sys", "sysfs", NULL, SYS_DEFAULT_MOUNT_FLAGS,
- MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR }, /* skipped if above was mounted */
+ MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR|MOUNT_PRIVILEGED }, /* skipped if above was mounted */
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
- MOUNT_FATAL|MOUNT_MKDIR }, /* skipped if above was mounted */
+ MOUNT_FATAL|MOUNT_MKDIR|MOUNT_PRIVILEGED }, /* skipped if above was mounted */
{ "tmpfs", "/dev", "tmpfs", "mode=0755" TMPFS_LIMITS_PRIVATE_DEV, MS_NOSUID|MS_STRICTATIME,
MOUNT_FATAL|MOUNT_MKDIR },
{ "tmpfs", "/dev/shm", "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
MOUNT_FATAL|MOUNT_IN_USERNS },
#if HAVE_SELINUX
{ "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,
- MOUNT_MKDIR }, /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */
+ MOUNT_MKDIR|MOUNT_PRIVILEGED }, /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
- 0 }, /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
+ MOUNT_PRIVILEGED }, /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_PRIVATE,
- 0 }, /* Turn off propagation (we only want that for the mount propagation tunnel dir) */
+ MOUNT_PRIVILEGED }, /* Turn off propagation (we only want that for the mount propagation tunnel dir) */
#endif
};
bool ro = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO);
bool in_userns = FLAGS_SET(mount_settings, MOUNT_IN_USERNS);
bool tmpfs_tmp = FLAGS_SET(mount_settings, MOUNT_APPLY_TMPFS_TMP);
+ bool privileged = FLAGS_SET(mount_settings, MOUNT_PRIVILEGED);
int r;
for (size_t k = 0; k < ELEMENTSOF(mount_table); k++) {
bool fatal = FLAGS_SET(mount_table[k].mount_settings, MOUNT_FATAL);
const char *o;
+ /* If we are not privileged but the entry is marked as privileged and to be mounted outside the user namespace, then skip it */
+ if (!privileged && FLAGS_SET(mount_table[k].mount_settings, MOUNT_PRIVILEGED) && !FLAGS_SET(mount_table[k].mount_settings, MOUNT_IN_USERNS))
+ continue;
+
if (in_userns != FLAGS_SET(mount_table[k].mount_settings, MOUNT_IN_USERNS))
continue;
#include "nspawn-stub-pid1.h"
#include "nspawn-util.h"
#include "nspawn.h"
+#include "nsresource.h"
#include "nulstr-util.h"
#include "os-util.h"
#include "pager.h"
static Architecture arg_architecture = _ARCHITECTURE_INVALID;
static ImagePolicy *arg_image_policy = NULL;
static char *arg_background = NULL;
+static bool arg_privileged = false;
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
int r;
+ if (!arg_privileged) {
+ /* We only support the unified mode when running unprivileged */
+ arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
+ return 0;
+ }
+
/* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
* in the image actually supports. */
r = cg_all_unified();
e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
if (streq_ptr(e, "network"))
arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
-
else if (e) {
r = parse_boolean(e);
if (r < 0)
static int verify_arguments(void) {
int r;
+ SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_privileged);
+
+ if (!arg_privileged) {
+ /* machined is not accessible to unpriv clients */
+ if (arg_register) {
+ log_notice("Automatically implying --register=no, since machined is not accessible to unprivileged clients.");
+ arg_register = false;
+ }
+
+ /* Mounting /sys/ from an unprivileged user namespace requires network namespacing, hence force it on */
+ if (!arg_private_network) {
+ log_notice("Automatically implying --private-network, since mounting /sys/ in an unprivileged user namespace requires network namespacing.");
+ arg_private_network = true;
+ }
+ }
+
if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
/* If we are running the stub init in the container, we don't need to look at what the init
* in the container supports, because we are not using it. Let's immediately pick the right
if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
return 0;
+ if (!arg_privileged)
+ return 0;
+
r = read_one_line_file("/proc/self/loginuid", &p);
if (r == -ENOENT)
return 0;
const char *p, *q;
int r;
+ if (!arg_privileged) {
+ log_debug("Not digging mount tunnel, because running unprivileged.");
+ return 0;
+ }
+
(void) mkdir_p("/run/systemd/nspawn/", 0755);
(void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
static int mount_tunnel_open(void) {
int r;
+ if (!arg_privileged) {
+ log_debug("Not opening up mount tunnel, because running unprivileged.");
+ return 0;
+ }
+
r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
if (r < 0)
return r;
return r;
if (!arg_network_namespace_path && arg_private_network) {
- r = unshare(CLONE_NEWNET);
+ _cleanup_close_ int netns_fd = -EBADF;
+
+ if (arg_privileged) {
+ if (unshare(CLONE_NEWNET) < 0)
+ return log_error_errno(errno, "Failed to unshare network namespace: %m");
+ }
+
+ /* Open a reference to the network namespace we are in now, so that it can be handed to the supervisor */
+ netns_fd = namespace_open_by_type(NAMESPACE_NET);
+ if (netns_fd < 0)
+ return log_error_errno(netns_fd, "Failed to open newly allocated network namespace: %m");
+
+ r = send_one_fd(fd_inner_socket, netns_fd, 0);
if (r < 0)
- return log_error_errno(errno, "Failed to unshare network namespace: %m");
+ return log_error_errno(r, "Failed to send network namespace to supervisor: %m");
/* Tell the parent that it can setup network interfaces. */
(void) barrier_place(barrier); /* #3 */
}
- r = mount_sysfs(NULL, arg_mount_settings);
- if (r < 0)
- return r;
+ if (arg_privileged) {
+ r = mount_sysfs(NULL, arg_mount_settings);
+ if (r < 0)
+ return r;
+ }
- /* Wait until we are cgroup-ified, so that we
- * can mount the right cgroup path writable */
+ /* Wait until we are cgroup-ified, so that we can mount the right cgroup path writable */
if (!barrier_place_and_sync(barrier)) /* #4 */
return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
"Parent died too early");
return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
}
-static int setup_notify_child(void) {
+/* Sets up the notification socket. If 'directory' is non-NULL the socket path is NSPAWN_NOTIFY_SOCKET_PATH
+ * prefixed with it, otherwise NSPAWN_NOTIFY_SOCKET_PATH is used as is. Callers treat the return value as a
+ * file descriptor, negative on failure. */
+static int setup_notify_child(const char *directory) {
_cleanup_close_ int fd = -EBADF;
- static const union sockaddr_union sa = {
+ _cleanup_free_ char *j = NULL;
+ union sockaddr_union sa = {
.un.sun_family = AF_UNIX,
- .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
};
int r;
if (fd < 0)
return log_error_errno(errno, "Failed to allocate notification socket: %m");
- (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
+ if (directory) {
+ j = path_join(directory, NSPAWN_NOTIFY_SOCKET_PATH);
+ if (!j)
+ return log_oom();
+ }
+
+ r = sockaddr_un_set_path(&sa.un, j ?: NSPAWN_NOTIFY_SOCKET_PATH);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set AF_UNIX path to %s: %m", j ?: NSPAWN_NOTIFY_SOCKET_PATH);
+
+ (void) mkdir_parents(sa.un.sun_path, 0755);
(void) sockaddr_un_unlink(&sa.un);
WITH_UMASK(0577) { /* only set "w" bit, which is all that's necessary for connecting from the container */
return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
}
- r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
+ r = userns_lchown(sa.un.sun_path, 0, 0);
if (r < 0)
return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
assert(ret);
+ if (!arg_privileged) {
+ log_debug("Not digging socket tunnel, because running unprivileged.");
+ return 0;
+ }
+
_cleanup_free_ char *p = NULL;
p = path_join("/run/systemd/nspawn/unix-export", arg_machine);
if (!p)
int r;
assert(directory);
+
+ if (!arg_privileged)
+ return 0;
+
assert(unix_export_path);
r = make_run_host(directory);
static DissectImageFlags determine_dissect_image_flags(void) {
return
+ DISSECT_IMAGE_GENERIC_ROOT |
+ DISSECT_IMAGE_REQUIRE_ROOT |
+ DISSECT_IMAGE_RELAX_VAR_CHECK |
DISSECT_IMAGE_USR_NO_ROOT |
DISSECT_IMAGE_DISCARD_ON_LOOP |
+ DISSECT_IMAGE_ADD_PARTITION_DEVICES |
+ DISSECT_IMAGE_PIN_PARTITION_DEVICES |
(arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS) |
- DISSECT_IMAGE_ALLOW_USERSPACE_VERITY;
+ DISSECT_IMAGE_ALLOW_USERSPACE_VERITY |
+ (arg_console_mode == CONSOLE_INTERACTIVE ? DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH : 0); /* interactive auth is only allowed with an interactive console; NOTE(review): presumably arg_console_mode must already be resolved when this is called — confirm call order */
}
static int outer_child(
return r;
}
- /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
- * mounts available in systemd services inside the container that create a new mount namespace. See
- * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
- * will inherit the shared propagation mode.
- *
- * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
- * directory mount to root later on.
- * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
- */
- r = mount_switch_root(directory, MS_SHARED);
- if (r < 0)
- return log_error_errno(r, "Failed to move root directory: %m");
+ /* We have different codepaths here for privileged and non-privileged mode. In privileged mode we'll
+ * now switch into the target directory, and then do the final setup from there. If a user namespace
+ * is then allocated for the container, the root mount and everything else will be out of reach for
+ * it. For unprivileged containers we cannot do that however, since we couldn't mount a sysfs and
+ * procfs then anymore, since that only works if there's an unobstructed instance currently
+ * visible. Hence there we do it the other way round: we first allocate a new set of namespaces
+ * (and fork for it) for which we then mount sysfs/procfs, and only then switch root. */
- /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
- * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
- * the container. */
- r = mount_tunnel_open();
- if (r < 0)
- return r;
+ if (arg_privileged) {
+ /* Mark everything as shared so our mounts get propagated down. This is required to make new
+ * bind mounts available in systemd services inside the container that create a new mount
+ * namespace. See https://github.com/systemd/systemd/issues/3860 Further submounts (such as
+ * /dev/) done after this will inherit the shared propagation mode.
+ *
+ * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
+ * directory mount to root later on.
+ * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
+ */
+ r = mount_switch_root(directory, MS_SHARED);
+ if (r < 0)
+ return log_error_errno(r, "Failed to move root directory: %m");
- if (arg_userns_mode != USER_NAMESPACE_NO) {
- /* In order to mount procfs and sysfs in an unprivileged container the kernel
- * requires that a fully visible instance is already present in the target mount
- * namespace. Mount one here so the inner child can mount its own instances. Later
- * we umount the temporary instances created here before we actually exec the
- * payload. Since the rootfs is shared the umount will propagate into the container.
- * Note, the inner child wouldn't be able to unmount the instances on its own since
- * it doesn't own the originating mount namespace. IOW, the outer child needs to do
- * this. */
- r = pin_fully_visible_fs();
+ /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
+ * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
+ * the container. */
+ r = mount_tunnel_open();
if (r < 0)
return r;
- }
- fd = setup_notify_child();
+ if (arg_userns_mode != USER_NAMESPACE_NO) {
+ /* In order to mount procfs and sysfs in an unprivileged container the kernel
+ * requires that a fully visible instance is already present in the target mount
+ * namespace. Mount one here so the inner child can mount its own instances. Later
+ * we umount the temporary instances created here before we actually exec the
+ * payload. Since the rootfs is shared the umount will propagate into the container.
+ * Note, the inner child wouldn't be able to unmount the instances on its own since
+ * it doesn't own the originating mount namespace. IOW, the outer child needs to do
+ * this. */
+ r = pin_fully_visible_fs();
+ if (r < 0)
+ return r;
+ }
+
+ fd = setup_notify_child(NULL);
+ } else
+ fd = setup_notify_child(directory);
if (fd < 0)
return fd;
pid = raw_clone(SIGCHLD|CLONE_NEWNS|
arg_clone_ns_flags |
- (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
+ (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0) |
+ ((arg_private_network && !arg_privileged) ? CLONE_NEWNET : 0));
if (pid < 0)
return log_error_errno(errno, "Failed to fork inner child: %m");
if (pid == 0) {
return log_error_errno(r, "Failed to join network namespace: %m");
}
+ if (!arg_privileged) {
+ /* In unprivileged operation, sysfs + procfs are special, we'll have to mount them
+ * inside the inner namespaces, but before we switch root. Hence do so here. */
+ _cleanup_free_ char *j = path_join(directory, "/proc");
+ if (!j)
+ return log_oom();
+
+ r = mount_follow_verbose(LOG_ERR, "proc", j, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+ if (r < 0)
+ return r;
+
+ r = mount_sysfs(directory, arg_mount_settings);
+ if (r < 0)
+ return r;
+
+ r = mount_switch_root(directory, MS_SHARED);
+ if (r < 0)
+ return log_error_errno(r, "Failed to move root directory: %m");
+ }
+
r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
if (r < 0)
_exit(EXIT_FAILURE);
static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
int r;
+ if (fd < 0)
+ return 0;
+
r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
if (r < 0)
return log_error_errno(r, "Failed to allocate notify event source: %m");
return 0;
/* We first look in the admin's directories in /etc and /run */
- FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
- _cleanup_free_ char *j = NULL;
+ if (arg_privileged) {
+ FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
+ _cleanup_free_ char *j = NULL;
- j = path_join(i, arg_settings_filename);
- if (!j)
- return log_oom();
+ j = path_join(i, arg_settings_filename);
+ if (!j)
+ return log_oom();
- f = fopen(j, "re");
- if (f) {
- p = TAKE_PTR(j);
+ f = fopen(j, "re");
+ if (f) {
+ p = TAKE_PTR(j);
- /* By default, we trust configuration from /etc and /run */
- if (arg_settings_trusted < 0)
- arg_settings_trusted = true;
+ /* By default, we trust configuration from /etc and /run */
+ if (arg_settings_trusted < 0)
+ arg_settings_trusted = true;
- break;
- }
+ break;
+ }
- if (errno != ENOENT)
- return log_error_errno(errno, "Failed to open %s: %m", j);
+ if (errno != ENOENT)
+ return log_error_errno(errno, "Failed to open %s: %m", j);
+ }
}
if (!f) {
static int run_container(
DissectedImage *dissected_image,
+ int userns_fd,
FDSet *fds,
- char veth_name[IFNAMSIZ], bool *veth_created,
+ char veth_name[IFNAMSIZ],
+ bool *veth_created,
struct ExposeArgs *expose_args,
- int *master, pid_t *pid, int *ret) {
+ int *master,
+ pid_t *pid,
+ int *ret) {
static const struct sigaction sa = {
.sa_handler = nop_signal_handler,
"Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
}
- *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
- if (*pid < 0)
- return log_error_errno(errno, "clone() failed%s: %m",
- errno == EINVAL ?
- ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
+ if (arg_privileged) {
+ assert(userns_fd < 0);
+
+ /* If we have no user namespace then we'll clone and create a new mount namespace right away. */
+
+ *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
+ if (*pid < 0)
+ return log_error_errno(errno, "clone() failed%s: %m",
+ errno == EINVAL ?
+ ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
+ } else {
+ assert(userns_fd >= 0);
+
+ /* If we have a user namespace then we'll clone() first, and then join the user namespace,
+ * and then open the mount namespace, so that it is owned by the user namespace */
+
+ *pid = raw_clone(SIGCHLD);
+ if (*pid < 0)
+ return log_error_errno(errno, "clone() failed: %m");
+
+ if (*pid == 0) {
+ /* Child: first join the user namespace handed in by the caller... */
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ log_error_errno(errno, "Failed to join allocated user namespace: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ /* ...then reset our UID/GID, as seen from inside that namespace... */
+ r = reset_uid_gid();
+ if (r < 0) {
+ log_error_errno(r, "Failed to reset UID/GID to root: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ /* ...and only now create the mount namespace, so that it is owned by the user namespace. */
+ if (unshare(CLONE_NEWNS) < 0) {
+ log_error_errno(errno, "Failed to unshare file system namespace: %m");
+ _exit(EXIT_FAILURE);
+ }
+ }
+ }
if (*pid == 0) {
/* The outer child only has a file system namespace. */
/* Wait until the child has unshared its network namespace. */
if (!barrier_place_and_sync(&barrier)) /* #3 */
return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
- }
- if (child_netns_fd < 0) {
- /* Make sure we have an open file descriptor to the child's network
- * namespace so it stays alive even if the child exits. */
- r = namespace_open(*pid,
- /* ret_pidns_fd = */ NULL,
- /* ret_mntns_fd = */ NULL,
- &child_netns_fd,
- /* ret_userns_fd = */ NULL,
- /* ret_root_fd = */ NULL);
- if (r < 0)
- return log_error_errno(r, "Failed to open child network namespace: %m");
+ /* Make sure we have an open file descriptor to the child's network namespace so it
+ * stays alive even if the child exits. The fd is received over the inner socket pair. */
+ assert(child_netns_fd < 0);
+ child_netns_fd = receive_one_fd(fd_inner_socket_pair[0], 0);
+ if (child_netns_fd < 0)
+ /* Report the error returned by receive_one_fd(), not whatever happens to be left in 'r' */
+ return log_error_errno(child_netns_fd, "Failed to receive child network namespace: %m");
}
r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
return r;
if (arg_network_veth) {
- r = setup_veth(arg_machine, *pid, veth_name,
- arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
- if (r < 0)
- return r;
- else if (r > 0)
- ifi = r;
+ if (arg_privileged) {
+ r = setup_veth(arg_machine, *pid, veth_name,
+ arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
+ if (r < 0)
+ return r;
+ else if (r > 0)
+ ifi = r;
+ } else {
+ _cleanup_free_ char *host_ifname = NULL;
+
+ r = nsresource_add_netif(userns_fd, child_netns_fd, /* namespace_ifname= */ NULL, &host_ifname, /* ret_namespace_ifname= */ NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add network interface to container: %m");
+
+ ifi = if_nametoindex(host_ifname);
+ if (ifi == 0)
+ return log_error_errno(errno, "Failed to resolve interface '%s': %m", host_ifname);
+
+ if (strlen(host_ifname) >= IFNAMSIZ)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Host interface name too long?");
+
+ strcpy(veth_name, host_ifname);
+ }
if (arg_network_bridge) {
/* Add the interface to a bridge */
}
if (arg_register || !arg_keep_unit) {
- r = sd_bus_default_system(&bus);
+ if (arg_privileged)
+ r = sd_bus_default_system(&bus);
+ else
+ r = sd_bus_default_user(&bus);
if (r < 0)
- return log_error_errno(r, "Failed to open system bus: %m");
+ return log_error_errno(r, "Failed to open bus: %m");
r = sd_bus_set_close_on_exit(bus, false);
if (r < 0)
} else if (arg_slice || arg_property)
log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
- r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
+ r = create_subcgroup(
+ *pid,
+ arg_keep_unit,
+ arg_unified_cgroup_hierarchy,
+ arg_uid_shift,
+ userns_fd,
+ arg_privileged);
if (r < 0)
return r;
if (r < 0)
return r;
- r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
- if (r < 0)
- return r;
-
- /* Notify the child that the parent is ready with all
- * its setup (including cgroup-ification), and that
- * the child can now hand over control to the code to
- * run inside the container. */
+ /* Notify the child that the parent is ready with all its setup (including cgroup-ification), and
+ * that the child can now hand over control to the code to run inside the container. */
(void) barrier_place(&barrier); /* #4 */
/* Block SIGCHLD here, before notifying child.
fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
- if (arg_private_network) {
+ if (arg_private_network && arg_privileged) {
r = move_back_network_interfaces(child_netns_fd, arg_network_interfaces);
if (r < 0)
return r;
if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r))
return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
"Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
+ if (ERRNO_IS_NEG_PRIVILEGE(r)) {
+ log_debug_errno(r, "Can't connect to udev control socket, assuming we are in same netns.");
+ return 0;
+ }
if (r < 0)
return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
static int run(int argc, char *argv[]) {
bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
- _cleanup_close_ int master = -EBADF;
+ _cleanup_close_ int master = -EBADF, userns_fd = -EBADF;
_cleanup_fdset_free_ FDSet *fds = NULL;
int r, n_fd_passed, ret = EXIT_SUCCESS;
char veth_name[IFNAMSIZ] = "";
log_parse_environment();
log_open();
+ arg_privileged = getuid() == 0;
+
r = parse_argv(argc, argv);
if (r <= 0)
goto finish;
- if (geteuid() != 0) {
- r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
- argc >= 2 ? "Need to be root." :
- "Need to be root (and some arguments are usually required).\nHint: try --help");
- goto finish;
- }
-
r = cant_be_in_netns();
if (r < 0)
goto finish;
if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
- r = cg_unified();
+ r = cg_unified(); /* initialize cache early */
if (r < 0) {
log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
goto finish;
/* Reapply environment settings. */
(void) detect_unified_cgroup_hierarchy_from_environment();
+ if (!arg_privileged) {
+ /* Unprivileged operation is only supported on a fully unified cgroup hierarchy, refuse otherwise */
+ r = cg_all_unified();
+ if (r < 0) {
+ log_error_errno(r, "Failed to determine if we are in unified cgroupv2 mode: %m");
+ goto finish;
+ }
+ if (r == 0) {
+ /* Leave via 'finish' like every other error path here, so that the common cleanup runs */
+ r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Unprivileged operation only supported in unified cgroupv2 mode.");
+ goto finish;
+ }
+ }
+
/* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
* the result is closed. Note that the container payload child will reset signal mask+handler anyway,
* so just turning this off here means we only turn it off in nspawn itself, not any children. */
* the child. Functions like copy_devnodes() change the umask temporarily. */
umask(0022);
+ if (arg_console_mode < 0)
+ arg_console_mode = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO) ?
+ CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
+
+ if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
+ arg_quiet = true;
+
if (arg_directory) {
assert(!arg_image);
+ if (!arg_privileged) {
+ r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Invoking container from plain directory tree is currently not supported if called without privileges.");
+ goto finish;
+ }
+
/* Safety precaution: let's not allow running images from the live host OS image, as long as
* /var from the host will propagate into container dynamically (because bad things happen if
* two systems write to the same /var). Let's allow it for the special cases where /var is
/* We take an exclusive lock on this image, since it's our private, ephemeral copy
* only owned by us and no one else. */
- r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
+ r = image_path_lock(
+ np,
+ LOCK_EX|LOCK_NB,
+ arg_privileged ? &tree_global_lock : NULL,
+ &tree_local_lock);
if (r < 0) {
log_error_errno(r, "Failed to lock %s: %m", np);
goto finish;
if (r < 0)
goto finish;
- r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+ r = image_path_lock(
+ arg_directory,
+ (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB,
+ arg_privileged ? &tree_global_lock : NULL,
+ &tree_local_lock);
if (r == -EBUSY) {
log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
goto finish;
} else {
DissectImageFlags dissect_image_flags =
- DISSECT_IMAGE_GENERIC_ROOT |
- DISSECT_IMAGE_REQUIRE_ROOT |
- DISSECT_IMAGE_RELAX_VAR_CHECK |
- DISSECT_IMAGE_USR_NO_ROOT |
- DISSECT_IMAGE_ADD_PARTITION_DEVICES |
- DISSECT_IMAGE_PIN_PARTITION_DEVICES;
+ determine_dissect_image_flags();
+
assert(arg_image);
assert(!arg_template);
+
r = chase_and_update(&arg_image, 0);
if (r < 0)
goto finish;
}
/* Always take an exclusive lock on our own ephemeral copy. */
- r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
+ r = image_path_lock(
+ np,
+ LOCK_EX|LOCK_NB,
+ arg_privileged ? &tree_global_lock : NULL,
+ &tree_local_lock);
if (r < 0) {
log_error_errno(r, "Failed to create image lock: %m");
goto finish;
free_and_replace(arg_image, np);
remove_image = true;
} else {
- r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+ r = image_path_lock(
+ arg_image,
+ (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB,
+ arg_privileged ? &tree_global_lock : NULL,
+ &tree_local_lock);
if (r == -EBUSY) {
log_error_errno(r, "Disk image %s is currently busy.", arg_image);
goto finish;
goto finish;
}
- r = loop_device_make_by_path(
- arg_image,
- arg_read_only ? O_RDONLY : O_RDWR,
- /* sector_size= */ UINT32_MAX,
- FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
- LOCK_SH,
- &loop);
- if (r < 0) {
- log_error_errno(r, "Failed to set up loopback block device: %m");
- goto finish;
- }
+ if (arg_privileged) {
+ r = loop_device_make_by_path(
+ arg_image,
+ arg_read_only ? O_RDONLY : O_RDWR,
+ /* sector_size= */ UINT32_MAX,
+ FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
+ LOCK_SH,
+ &loop);
+ if (r < 0) {
+ log_error_errno(r, "Failed to set up loopback block device: %m");
+ goto finish;
+ }
- r = dissect_loop_device_and_warn(
- loop,
- &arg_verity_settings,
- /* mount_options=*/ NULL,
- arg_image_policy ?: &image_policy_container,
- dissect_image_flags,
- &dissected_image);
- if (r == -ENOPKG) {
- /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
- log_notice("Note that the disk image needs to\n"
- " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
- " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
- " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
- " d) or contain a file system without a partition table\n"
- "in order to be bootable with systemd-nspawn.");
- goto finish;
- }
- if (r < 0)
- goto finish;
+ r = dissect_loop_device_and_warn(
+ loop,
+ &arg_verity_settings,
+ /* mount_options=*/ NULL,
+ arg_image_policy ?: &image_policy_container,
+ dissect_image_flags,
+ &dissected_image);
+ if (r == -ENOPKG) {
+ /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
+ log_notice("Note that the disk image needs to\n"
+ " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
+ " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
+ " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
+ " d) or contain a file system without a partition table\n"
+ "in order to be bootable with systemd-nspawn.");
+ goto finish;
+ }
+ if (r < 0)
+ goto finish;
- r = dissected_image_load_verity_sig_partition(
- dissected_image,
- loop->fd,
- &arg_verity_settings);
- if (r < 0)
- goto finish;
+ r = dissected_image_load_verity_sig_partition(
+ dissected_image,
+ loop->fd,
+ &arg_verity_settings);
+ if (r < 0)
+ goto finish;
- if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
- log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
- "root hash signature found! Proceeding without integrity checking.", arg_image);
+ if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
+ log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
+ "root hash signature found! Proceeding without integrity checking.", arg_image);
- r = dissected_image_decrypt_interactively(
- dissected_image,
- NULL,
- &arg_verity_settings,
- 0);
- if (r < 0)
- goto finish;
+ r = dissected_image_decrypt_interactively(
+ dissected_image,
+ NULL,
+ &arg_verity_settings,
+ dissect_image_flags);
+ if (r < 0)
+ goto finish;
+ } else {
+ _cleanup_free_ char *userns_name = strjoin("nspawn-", arg_machine);
+ if (!userns_name) {
+ r = log_oom();
+ goto finish;
+ }
+
+ /* if we are unprivileged, let's allocate a 64K userns first */
+ userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000));
+ if (userns_fd < 0) {
+ r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m");
+ goto finish;
+ }
+
+ r = mountfsd_mount_image(
+ arg_image,
+ userns_fd,
+ arg_image_policy,
+ dissect_image_flags,
+ &dissected_image);
+ if (r < 0)
+ goto finish;
+ }
/* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
if (remove_image && unlink(arg_image) >= 0)
if (r < 0)
goto finish;
- if (arg_console_mode < 0)
- arg_console_mode = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO) ?
- CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
-
- if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
- arg_quiet = true;
-
if (!arg_quiet) {
const char *t = arg_image ?: arg_directory;
_cleanup_free_ char *u = NULL;
expose_args.fw_ctx = fw_ctx;
}
for (;;) {
- r = run_container(dissected_image,
- fds,
- veth_name, &veth_created,
- &expose_args, &master,
- &pid, &ret);
+ r = run_container(
+ dissected_image,
+ userns_fd,
+ fds,
+ veth_name, &veth_created,
+ &expose_args, &master,
+ &pid, &ret);
if (r <= 0)
break;
}
log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
}
- if (arg_machine) {
+ if (arg_machine && arg_privileged) {
const char *p;
p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
- if (veth_created)
- (void) remove_veth_links(veth_name, arg_network_veth_extra);
- (void) remove_bridge(arg_network_zone);
+ if (arg_privileged) {
+ if (veth_created)
+ (void) remove_veth_links(veth_name, arg_network_veth_extra);
+ (void) remove_bridge(arg_network_zone);
+ }
custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
expose_port_free_all(arg_expose_ports);