#include "nspawn-settings.h"
#include "nspawn-setuid.h"
#include "nspawn-stub-pid1.h"
+#include "nspawn-util.h"
#include "nspawn.h"
#include "nulstr-util.h"
#include "os-util.h"
#include "pager.h"
#include "parse-argument.h"
#include "parse-util.h"
-#include "path-util.h"
#include "pretty-print.h"
#include "process-util.h"
#include "ptyfwd.h"
static size_t arg_n_credentials = 0;
static char **arg_bind_user = NULL;
static bool arg_suppress_sync = false;
+static char *arg_settings_filename = NULL;
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
static int handle_arg_console(const char *arg) {
if (streq(arg, "help")) {
" --keep-unit Do not register a scope for the machine, reuse\n"
" the service unit nspawn is running in\n\n"
"%3$sUser Namespacing:%4$s\n"
- " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
- " --private-users[=UIDBASE[:NUIDS]]\n"
+ " --private-users=no Run without user namespacing\n"
+ " --private-users=yes|pick|identity\n"
+ " Run within user namespace, autoselect UID/GID range\n"
+ " --private-users=UIDBASE[:NUIDS]\n"
" Similar, but with user configured UID/GID range\n"
" --private-users-ownership=MODE\n"
" Adjust ('chown') or map ('map') OS tree ownership\n"
- " to private UID/GID range\n\n"
+ " to private UID/GID range\n"
+ " -U Equivalent to --private-users=pick and\n"
+ " --private-users-ownership=auto\n\n"
"%3$sNetworking:%4$s\n"
" --private-network Disable network in container\n"
" --network-interface=INTERFACE\n"
if (r > 0) {
/* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
* routine only detects 231, so we'll have a false negative here for 230. */
- r = systemd_installation_has_version(directory, 230);
+ r = systemd_installation_has_version(directory, "230");
if (r < 0)
return log_error_errno(r, "Failed to determine systemd version in container: %m");
if (r > 0)
arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
} else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
/* Mixed cgroup hierarchy support was added in 233 */
- r = systemd_installation_has_version(directory, 233);
+ r = systemd_installation_has_version(directory, "233");
if (r < 0)
return log_error_errno(r, "Failed to determine systemd version in container: %m");
if (r > 0)
};
static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
- struct ExposeArgs *args = userdata;
+ struct ExposeArgs *args = ASSERT_PTR(userdata);
assert(rtnl);
assert(m);
- assert(args);
(void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
(void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
static int setup_journal(const char *directory) {
_cleanup_free_ char *d = NULL;
- const char *dirname, *p, *q;
+ const char *p, *q;
sd_id128_t this_id;
bool try;
int r;
} else if (access(p, F_OK) < 0)
return 0;
- if (dir_is_empty(q) == 0)
+ if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
log_warning("%s is not empty, proceeding anyway.", q);
r = userns_mkdir(directory, p, 0755, 0, 0);
if (!hostname_is_valid(arg_machine, 0))
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
+ /* Copy the machine name before the random suffix is added below, otherwise we won't be able
+ * to match fixed config file names. */
+ arg_settings_filename = strjoin(arg_machine, ".nspawn");
+ if (!arg_settings_filename)
+ return log_oom();
+
/* Add a random suffix when this is an ephemeral machine, so that we can run many
* instances at once without manually having to specify -M each time. */
if (arg_ephemeral)
if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
return log_oom();
+ } else {
+ arg_settings_filename = strjoin(arg_machine, ".nspawn");
+ if (!arg_settings_filename)
+ return log_oom();
}
return 0;
};
unsigned long flags;
- char **k, **v;
int r;
flags = effective_clone_ns_flags();
NULL, /* LISTEN_PID */
NULL, /* NOTIFY_SOCKET */
NULL, /* CREDENTIALS_DIRECTORY */
+ NULL, /* LANG */
NULL
};
const char *exec_target;
n_env++;
}
+ if (arg_start_mode != START_BOOT) {
+ envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
+ if (!envp[n_env])
+ return log_oom();
+ n_env++;
+ }
+
env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
if (!env_use)
return log_oom();
(void) fdset_close_others(fds);
if (arg_start_mode == START_BOOT) {
- const char *init;
char **a;
size_t m;
/* If we cannot change the directory, we'll end up in /, that is expected. */
(void) chdir(home ?: "/root");
- execle("/bin/bash", "-bash", NULL, env_use);
- execle("/bin/sh", "-sh", NULL, env_use);
+ execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
+ if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
+ execle("/bin/bash", "-bash", NULL, env_use);
+ if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
+ execle("/bin/sh", "-sh", NULL, env_use);
- exec_target = "/bin/bash, /bin/sh";
+ exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
}
return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
static int setup_notify_child(void) {
_cleanup_close_ int fd = -1;
- union sockaddr_union sa = {
+ static const union sockaddr_union sa = {
.un.sun_family = AF_UNIX,
.un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
};
ssize_t l;
int r;
- /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
- * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
- * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
- * initializations a second child (the "inner" one) is forked off it, and it exits. */
+ /* This is the "outer" child process, i.e. the one forked off by the container manager itself. It
+ * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
+ * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
+ * namespaces. After it has completed a number of initializations a second child (the "inner" one)
+ * is forked off it, and it exits. */
assert(barrier);
assert(directory);
return r;
if (dissected_image) {
- /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
- * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
- * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
- * makes sure ESP partitions and userns are compatible. */
+ /* If we are operating on a disk image, then mount its root directory now, but leave out the
+ * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
+ * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
+ * right place right away. This makes sure ESP partitions and userns are compatible. */
r = dissected_image_mount_and_warn(
dissected_image,
"Short write while sending UID shift.");
if (arg_userns_mode == USER_NAMESPACE_PICK) {
- /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
- * we just read from the image is available. If yes, it will send the UID shift back to us, if
- * not it will pick a different one, and send it back to us. */
+ /* When we are supposed to pick the UID shift, the parent will check now whether the
+ * UID shift we just read from the image is available. If yes, it will send the UID
+ * shift back to us, if not it will pick a different one, and send it back to us. */
l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
if (l < 0)
return r;
if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
- /* Send the user maps we determined to the parent, so that it installs it in our user namespace UID map table */
+ /* Send the user maps we determined to the parent, so that it installs them in our user
+ * namespace UID map table. */
for (size_t i = 0; i < bind_user_context->n_data; i++) {
uid_t map[] = {
IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
arg_uid_shift != 0) {
- r = remount_idmap(directory, arg_uid_shift, arg_uid_range);
+ r = remount_idmap(directory, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
/* This might fail because the kernel or file system doesn't support idmapping. We
* can't really distinguish this nicely, nor do we have any guarantees about the
unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
}
- /* Mark everything as shared so our mounts get propagated down. This is
- * required to make new bind mounts available in systemd services
- * inside the container that create a new mount namespace.
- * See https://github.com/systemd/systemd/issues/3860
- * Further submounts (such as /dev) done after this will inherit the
- * shared propagation mode.
+ /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
+ * mounts available in systemd services inside the container that create a new mount namespace. See
+ * https://github.com/systemd/systemd/issues/3860. Further submounts (such as /dev) done after this
+ * will inherit the shared propagation mode.
*
- * IMPORTANT: Do not overmount the root directory anymore from now on to
- * enable moving the root directory mount to root later on.
+ * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
+ * directory mount to root later on.
* https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
*/
r = mount_nofollow_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
* quadruplet, consisting of host and container UID + GID. */
for (size_t i = 0; i < n_bind_user_uid; i++) {
- uid_t payload_uid = bind_user_uid[i*2+offset],
- host_uid = bind_user_uid[i*2+offset+1];
+ uid_t payload_uid = bind_user_uid[i*4+offset],
+ host_uid = bind_user_uid[i*4+offset+1];
assert(previous_uid <= payload_uid);
assert(payload_uid < arg_uid_range);
if (!tags)
return log_oom();
- if (strv_find(tags, "READY=1")) {
+ if (strv_contains(tags, "READY=1")) {
r = sd_notify(false, "READY=1\n");
if (r < 0)
log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
_cleanup_(settings_freep) Settings *settings = NULL;
_cleanup_fclose_ FILE *f = NULL;
_cleanup_free_ char *p = NULL;
- const char *fn, *i;
int r;
if (arg_oci_bundle)
if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
return 0;
- fn = strjoina(arg_machine, ".nspawn");
-
/* We first look in the admin's directories in /etc and /run */
FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
_cleanup_free_ char *j = NULL;
- j = path_join(i, fn);
+ j = path_join(i, arg_settings_filename);
if (!j)
return log_oom();
* actual image we shall boot. */
if (arg_image) {
- p = file_in_same_dir(arg_image, fn);
+ p = file_in_same_dir(arg_image, arg_settings_filename);
if (!p)
return log_oom();
} else if (arg_directory && !path_equal(arg_directory, "/")) {
- p = file_in_same_dir(arg_directory, fn);
+ p = file_in_same_dir(arg_directory, arg_settings_filename);
if (!p)
return log_oom();
}
if (l < 0)
return log_error_errno(errno, "Failed to read cgroup mode: %m");
if (l != sizeof(arg_unified_cgroup_hierarchy))
- return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
l, l == 0 ? " The child is most likely dead." : "");
}
}
static int initialize_rlimits(void) {
- /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
+ /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
* the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
* container execution environments. */
static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
- [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
- [RLIMIT_CORE] = { 0, RLIM_INFINITY },
- [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
- [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
- [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
- [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
- [RLIMIT_MEMLOCK] = { 65536, 65536 },
- [RLIMIT_MSGQUEUE] = { 819200, 819200 },
- [RLIMIT_NICE] = { 0, 0 },
- [RLIMIT_NOFILE] = { 1024, 4096 },
- [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
- [RLIMIT_RTPRIO] = { 0, 0 },
- [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
- [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
+ [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
+ [RLIMIT_CORE] = { 0, RLIM_INFINITY },
+ [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
+ [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
+ [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
+ [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
+ [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
+ [RLIMIT_MSGQUEUE] = { 819200, 819200 },
+ [RLIMIT_NICE] = { 0, 0 },
+ [RLIMIT_NOFILE] = { 1024, 4096 },
+ [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
+ [RLIMIT_RTPRIO] = { 0, 0 },
+ [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
+ [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
/* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
* RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
}
static int cant_be_in_netns(void) {
- union sockaddr_union sa = {
- .un = {
- .sun_family = AF_UNIX,
- .sun_path = "/run/udev/control",
- },
- };
char udev_path[STRLEN("/proc//ns/net") + DECIMAL_STR_MAX(pid_t)];
_cleanup_free_ char *udev_ns = NULL, *our_ns = NULL;
_cleanup_close_ int fd = -1;
if (fd < 0)
return log_error_errno(errno, "Failed to allocate udev control socket: %m");
- if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
-
- if (errno == ENOENT || ERRNO_IS_DISCONNECT(errno))
+ r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
+ if (r < 0) {
+ if (r == -ENOENT || ERRNO_IS_DISCONNECT(r))
return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
"Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
- return log_error_errno(errno, "Failed to connect socket to udev control socket: %m");
+ return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
}
r = getpeercred(fd, &ucred);
}
if (arg_start_mode == START_BOOT) {
+ _cleanup_free_ char *b = NULL;
const char *p;
- if (arg_pivot_root_new)
- p = prefix_roota(arg_directory, arg_pivot_root_new);
- else
+ if (arg_pivot_root_new) {
+ b = path_join(arg_directory, arg_pivot_root_new);
+ if (!b)
+ return log_oom();
+
+ p = b;
+ } else
p = arg_directory;
if (path_is_os_tree(p) <= 0) {
- log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
- r = -EINVAL;
+ r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
goto finish;
}
} else {
- const char *p, *q;
+ _cleanup_free_ char *p = NULL;
if (arg_pivot_root_new)
- p = prefix_roota(arg_directory, arg_pivot_root_new);
+ p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
else
- p = arg_directory;
-
- q = strjoina(p, "/usr/");
+ p = path_join(arg_directory, "/usr/");
+ if (!p)
+ return log_oom();
- if (laccess(q, F_OK) < 0) {
- log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
- r = -EINVAL;
+ if (laccess(p, F_OK) < 0) {
+ r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
goto finish;
}
}
arg_image,
arg_read_only ? O_RDONLY : O_RDWR,
FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
+ LOCK_SH,
&loop);
if (r < 0) {
log_error_errno(r, "Failed to set up loopback block device: %m");
goto finish;
}
- r = dissect_image_and_warn(
- loop->fd,
- arg_image,
+ r = dissect_loop_device_and_warn(
+ loop,
&arg_verity_settings,
NULL,
- loop->diskseq,
- loop->uevent_seqnum_not_before,
- loop->timestamp_not_before,
dissect_image_flags,
&dissected_image);
if (r == -ENOPKG) {