static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
static void *arg_root_hash = NULL;
+static char *arg_verity_data = NULL; /* set by --verity-data=; if still NULL it is offered to verity_metadata_load() as an output */
+static char *arg_root_hash_sig_path = NULL; /* set when --root-hash-sig= is given without the "base64:" prefix; reset to NULL by the base64 form */
+static void *arg_root_hash_sig = NULL; /* buffer produced by unbase64mem() for the "base64:" form of --root-hash-sig=; reset to NULL by the path form */
+static size_t arg_root_hash_sig_size = 0; /* length reported by unbase64mem() for arg_root_hash_sig; 0 when the path form is used */
static size_t arg_root_hash_size = 0;
-static char **arg_syscall_whitelist = NULL;
-static char **arg_syscall_blacklist = NULL;
+static char **arg_syscall_allow_list = NULL; /* system call filter words that were not flagged "negative"; later handed to setup_seccomp() */
+static char **arg_syscall_deny_list = NULL; /* system call filter words flagged "negative"; later handed to setup_seccomp() */
#if HAVE_SECCOMP
static scmp_filter_ctx arg_seccomp = NULL;
#endif
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
-STATIC_DESTRUCTOR_REGISTER(arg_syscall_whitelist, strv_freep);
-STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_verity_data, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root_hash_sig_path, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root_hash_sig, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
" --read-only Mount the root directory read-only\n"
" --volatile[=MODE] Run the system in volatile mode\n"
" --root-hash=HASH Specify verity root hash for root disk image\n"
+ " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
+ " as a DER encoded PKCS7, either as a path to a file\n"
+ " or as an ASCII base64 encoded string prefixed by\n"
+ " 'base64:'\n"
+ " --verity-data=PATH Specify hash device for verity\n"
" --pivot-root=PATH[:PATH]\n"
" Pivot root to given directory in the container\n\n"
"%3$sExecution:%4$s\n"
ARG_PIPE,
ARG_OCI_BUNDLE,
ARG_NO_PAGER,
+ ARG_VERITY_DATA,
+ ARG_ROOT_HASH_SIG,
};
static const struct option options[] = {
{ "pipe", no_argument, NULL, ARG_PIPE },
{ "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
{ "no-pager", no_argument, NULL, ARG_NO_PAGER },
+ { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
+ { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
{}
};
break;
}
+ case ARG_VERITY_DATA: /* --verity-data=PATH ("Specify hash device for verity" per the help text) */
+ r = parse_path_argument_and_warn(optarg, false, &arg_verity_data); /* helper receives &arg_verity_data as its output slot */
+ if (r < 0)
+ return r; /* pass the helper's error code on unchanged; judging by its name it has already warned — confirm */
+ break;
+
+ case ARG_ROOT_HASH_SIG: {
+         /* --root-hash-sig=SIG accepts two spellings: a plain argument is handled as a path,
+          * while an argument starting with "base64:" carries the signature inline. Whichever
+          * form is seen resets the state belonging to the other one, so only one of
+          * arg_root_hash_sig_path and arg_root_hash_sig is left set by this option. */
+         char *encoded = startswith(optarg, "base64:");
+
+         if (!encoded) {
+                 /* No prefix: treat the whole argument as a path. */
+                 r = parse_path_argument_and_warn(optarg, false, &arg_root_hash_sig_path);
+                 if (r < 0)
+                         return r;
+
+                 arg_root_hash_sig = mfree(arg_root_hash_sig);
+                 arg_root_hash_sig_size = 0;
+         } else {
+                 void *decoded;
+                 size_t decoded_size;
+
+                 /* Decode everything that follows the "base64:" prefix. */
+                 r = unbase64mem(encoded, strlen(encoded), &decoded, &decoded_size);
+                 if (r < 0)
+                         return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
+
+                 free_and_replace(arg_root_hash_sig, decoded);
+                 arg_root_hash_sig_size = decoded_size;
+                 arg_root_hash_sig_path = mfree(arg_root_hash_sig_path);
+         }
+
+         break;
+ }
+
case ARG_SYSTEM_CALL_FILTER: {
bool negative;
const char *items;
return log_error_errno(r, "Failed to parse system call filter: %m");
if (negative)
- r = strv_extend(&arg_syscall_blacklist, word);
+ r = strv_extend(&arg_syscall_deny_list, word);
else
- r = strv_extend(&arg_syscall_whitelist, word);
+ r = strv_extend(&arg_syscall_allow_list, word);
if (r < 0)
return log_oom();
}
static int setup_keyring(void) {
key_serial_t keyring;
- /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
- * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
- * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
- * these system calls let's make sure we don't leak anything into the container. */
+ /* Allocate a new session keyring for the container. This makes sure the keyring of the session
+ * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
+ * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
+ * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
+ * into the container. */
keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
if (keyring == -1) {
int kmsg_socket,
int rtnl_socket,
int master_pty_socket,
- FDSet *fds) {
+ FDSet *fds,
+ char **os_release_pairs) {
_cleanup_free_ char *home = NULL;
char as_uuid[ID128_UUID_STRING_MAX];
/* Wait until the parent wrote the UID map */
if (!barrier_place_and_sync(barrier)) /* #2 */
- return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
- "Parent died too early");
- }
+ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
- r = reset_uid_gid();
- if (r < 0)
- return log_error_errno(r, "Couldn't become new root: %m");
+ /* Become the new root user inside our namespace */
+ r = reset_uid_gid();
+ if (r < 0)
+ return log_error_errno(r, "Couldn't become new root: %m");
+
+ /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
+ * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
+ * propagation, but simply create new peer groups for all our mounts). */
+ r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
+ if (r < 0)
+ return r;
+ }
r = mount_all(NULL,
arg_mount_settings | MOUNT_IN_USERNS,
} else
#endif
{
- r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
+ r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
if (r < 0)
return r;
}
if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
return log_oom();
- env_use = strv_env_merge(2, envp, arg_setenv);
+ env_use = strv_env_merge(3, envp, os_release_pairs, arg_setenv);
if (!env_use)
return log_oom();
FDSet *fds,
int netns_fd) {
+ _cleanup_strv_free_ char **os_release_pairs = NULL;
_cleanup_close_ int fd = -1;
const char *p;
pid_t pid;
log_debug("Outer child is initializing.");
+ r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
+ if (r < 0)
+ log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
+
if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
if (r < 0)
return r;
- /* Mark everything as slave, so that we still
- * receive mounts from the real root, but don't
- * propagate mounts to the real root. */
+ /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
+ * mounts to the real root. */
r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
if (r < 0)
return r;
(void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
- p = prefix_roota(directory, "/run/systemd");
+ p = prefix_roota(directory, "/run");
(void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
r = setup_pts(directory);
notify_socket = safe_close(notify_socket);
uid_shift_socket = safe_close(uid_shift_socket);
- /* The inner child has all namespaces that are
- * requested, so that we all are owned by the user if
- * user namespaces are turned on. */
+ /* The inner child has all namespaces that are requested, so that we all are owned by the
+ * user if user namespaces are turned on. */
if (arg_network_namespace_path) {
r = namespace_enter(-1, -1, netns_fd, -1, -1);
return log_error_errno(r, "Failed to join network namespace: %m");
}
- r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds);
+ r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
if (r < 0)
_exit(EXIT_FAILURE);
if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
- if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
+ if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
else {
- strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
- strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
+ strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
+ strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
}
#if HAVE_SECCOMP
if (!barrier_place_and_sync(&barrier)) /* #5 */
return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
- /* At this point we have made use of the UID we picked, and thus nss-mymachines
+ /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
* will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
etc_passwd_lock = safe_close(etc_passwd_lock);
}
} else {
+ DissectImageFlags dissect_image_flags = DISSECT_IMAGE_REQUIRE_ROOT | DISSECT_IMAGE_RELAX_VAR_CHECK;
assert(arg_image);
assert(!arg_template);
goto finish;
}
- if (!arg_root_hash) {
- r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
- if (r < 0) {
- log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
- goto finish;
- }
+ r = verity_metadata_load(arg_image, NULL, arg_root_hash ? NULL : &arg_root_hash, &arg_root_hash_size,
+ arg_verity_data ? NULL : &arg_verity_data,
+ arg_root_hash_sig_path || arg_root_hash_sig ? NULL : &arg_root_hash_sig_path);
+ if (r < 0) {
+ log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
+ goto finish;
}
+ dissect_image_flags |= arg_verity_data ? DISSECT_IMAGE_NO_PARTITION_TABLE : 0;
}
if (!mkdtemp(tmprootdir)) {
loop->fd,
arg_image,
arg_root_hash, arg_root_hash_size,
- DISSECT_IMAGE_REQUIRE_ROOT|DISSECT_IMAGE_RELAX_VAR_CHECK,
+ arg_verity_data,
+ dissect_image_flags,
&dissected_image);
if (r == -ENOPKG) {
/* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
if (!arg_root_hash && dissected_image->can_verity)
log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
- r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
+ r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, arg_verity_data, arg_root_hash_sig_path, arg_root_hash_sig, arg_root_hash_sig_size, 0, &decrypted_image);
if (r < 0)
goto finish;