]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/nspawn/nspawn.c
nspawn: fix MS_SHARED mount propagation for userns containers
[thirdparty/systemd.git] / src / nspawn / nspawn.c
index 8e8881749aa54f56cf21936c8767a80556b65f9b..3b9493f232e26f4c15a0b7e236f5288e14afe8e2 100644 (file)
@@ -199,9 +199,13 @@ static bool arg_use_cgns = true;
 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
 static void *arg_root_hash = NULL;
+static char *arg_verity_data = NULL;
+static char *arg_root_hash_sig_path = NULL;
+static void *arg_root_hash_sig = NULL;
+static size_t arg_root_hash_sig_size = 0;
 static size_t arg_root_hash_size = 0;
-static char **arg_syscall_whitelist = NULL;
-static char **arg_syscall_blacklist = NULL;
+static char **arg_syscall_allow_list = NULL;
+static char **arg_syscall_deny_list = NULL;
 #if HAVE_SECCOMP
 static scmp_filter_ctx arg_seccomp = NULL;
 #endif
@@ -242,8 +246,11 @@ STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
 STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
 STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
 STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
-STATIC_DESTRUCTOR_REGISTER(arg_syscall_whitelist, strv_freep);
-STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_verity_data, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root_hash_sig_path, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_root_hash_sig, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
 #if HAVE_SECCOMP
 STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
 #endif
@@ -303,6 +310,11 @@ static int help(void) {
                "     --read-only            Mount the root directory read-only\n"
                "     --volatile[=MODE]      Run the system in volatile mode\n"
                "     --root-hash=HASH       Specify verity root hash for root disk image\n"
+               "     --root-hash-sig=SIG    Specify pkcs7 signature of root hash for verity\n"
+               "                            as a DER encoded PKCS7, either as a path to a file\n"
+               "                            or as an ASCII base64 encoded string prefixed by\n"
+               "                            'base64:'\n"
+               "     --verity-data=PATH     Specify hash device for verity\n"
                "     --pivot-root=PATH[:PATH]\n"
                "                            Pivot root to given directory in the container\n\n"
                "%3$sExecution:%4$s\n"
@@ -663,6 +675,8 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_PIPE,
                 ARG_OCI_BUNDLE,
                 ARG_NO_PAGER,
+                ARG_VERITY_DATA,
+                ARG_ROOT_HASH_SIG,
         };
 
         static const struct option options[] = {
@@ -728,6 +742,8 @@ static int parse_argv(int argc, char *argv[]) {
                 { "pipe",                   no_argument,       NULL, ARG_PIPE                   },
                 { "oci-bundle",             required_argument, NULL, ARG_OCI_BUNDLE             },
                 { "no-pager",               no_argument,       NULL, ARG_NO_PAGER               },
+                { "verity-data",            required_argument, NULL, ARG_VERITY_DATA            },
+                { "root-hash-sig",          required_argument, NULL, ARG_ROOT_HASH_SIG          },
                 {}
         };
 
@@ -1316,6 +1332,37 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
                 }
 
+                case ARG_VERITY_DATA:
+                        r = parse_path_argument_and_warn(optarg, false, &arg_verity_data);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_ROOT_HASH_SIG: {
+                        char *value;
+
+                        if ((value = startswith(optarg, "base64:"))) {
+                                void *p;
+                                size_t l;
+
+                                r = unbase64mem(value, strlen(value), &p, &l);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
+
+                                free_and_replace(arg_root_hash_sig, p);
+                                arg_root_hash_sig_size = l;
+                                arg_root_hash_sig_path = mfree(arg_root_hash_sig_path);
+                        } else {
+                                r = parse_path_argument_and_warn(optarg, false, &arg_root_hash_sig_path);
+                                if (r < 0)
+                                        return r;
+                                arg_root_hash_sig = mfree(arg_root_hash_sig);
+                                arg_root_hash_sig_size = 0;
+                        }
+
+                        break;
+                }
+
                 case ARG_SYSTEM_CALL_FILTER: {
                         bool negative;
                         const char *items;
@@ -1335,9 +1382,9 @@ static int parse_argv(int argc, char *argv[]) {
                                         return log_error_errno(r, "Failed to parse system call filter: %m");
 
                                 if (negative)
-                                        r = strv_extend(&arg_syscall_blacklist, word);
+                                        r = strv_extend(&arg_syscall_deny_list, word);
                                 else
-                                        r = strv_extend(&arg_syscall_whitelist, word);
+                                        r = strv_extend(&arg_syscall_allow_list, word);
                                 if (r < 0)
                                         return log_oom();
                         }
@@ -2164,10 +2211,11 @@ static int setup_dev_console(const char *console) {
 static int setup_keyring(void) {
         key_serial_t keyring;
 
-        /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
-         * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
-         * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
-         * these system calls let's make sure we don't leak anything into the container. */
+        /* Allocate a new session keyring for the container. This makes sure the keyring of the session
+         * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
+         * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
+         * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
+         * into the container. */
 
         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
         if (keyring == -1) {
@@ -2883,7 +2931,8 @@ static int inner_child(
                 int kmsg_socket,
                 int rtnl_socket,
                 int master_pty_socket,
-                FDSet *fds) {
+                FDSet *fds,
+                char **os_release_pairs) {
 
         _cleanup_free_ char *home = NULL;
         char as_uuid[ID128_UUID_STRING_MAX];
@@ -2928,13 +2977,20 @@ static int inner_child(
 
                 /* Wait until the parent wrote the UID map */
                 if (!barrier_place_and_sync(barrier)) /* #2 */
-                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
-                                               "Parent died too early");
-        }
+                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
 
-        r = reset_uid_gid();
-        if (r < 0)
-                return log_error_errno(r, "Couldn't become new root: %m");
+                /* Become the new root user inside our namespace */
+                r = reset_uid_gid();
+                if (r < 0)
+                        return log_error_errno(r, "Couldn't become new root: %m");
+
+                /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
+                 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
+                 * propagation, but simply create new peer groups for all our mounts). */
+                r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
+                if (r < 0)
+                        return r;
+        }
 
         r = mount_all(NULL,
                       arg_mount_settings | MOUNT_IN_USERNS,
@@ -3078,7 +3134,7 @@ static int inner_child(
         } else
 #endif
         {
-                r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
+                r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
                 if (r < 0)
                         return r;
         }
@@ -3142,7 +3198,7 @@ static int inner_child(
         if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
                 return log_oom();
 
-        env_use = strv_env_merge(2, envp, arg_setenv);
+        env_use = strv_env_merge(3, envp, os_release_pairs, arg_setenv);
         if (!env_use)
                 return log_oom();
 
@@ -3268,6 +3324,7 @@ static int outer_child(
                 FDSet *fds,
                 int netns_fd) {
 
+        _cleanup_strv_free_ char **os_release_pairs = NULL;
         _cleanup_close_ int fd = -1;
         const char *p;
         pid_t pid;
@@ -3289,6 +3346,10 @@ static int outer_child(
 
         log_debug("Outer child is initializing.");
 
+        r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
+        if (r < 0)
+                log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
+
         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
                 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
 
@@ -3296,9 +3357,8 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        /* Mark everything as slave, so that we still
-         * receive mounts from the real root, but don't
-         * propagate mounts to the real root. */
+        /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
+         * mounts to the real root. */
         r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
         if (r < 0)
                 return r;
@@ -3471,7 +3531,7 @@ static int outer_child(
 
         (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
 
-        p = prefix_roota(directory, "/run/systemd");
+        p = prefix_roota(directory, "/run");
         (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
 
         r = setup_pts(directory);
@@ -3544,9 +3604,8 @@ static int outer_child(
                 notify_socket = safe_close(notify_socket);
                 uid_shift_socket = safe_close(uid_shift_socket);
 
-                /* The inner child has all namespaces that are
-                 * requested, so that we all are owned by the user if
-                 * user namespaces are turned on. */
+                /* The inner child has all namespaces that are requested, so that we all are owned by the
+                 * user if user namespaces are turned on. */
 
                 if (arg_network_namespace_path) {
                         r = namespace_enter(-1, -1, netns_fd, -1, -1);
@@ -3554,7 +3613,7 @@ static int outer_child(
                                 return log_error_errno(r, "Failed to join network namespace: %m");
                 }
 
-                r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds);
+                r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
                 if (r < 0)
                         _exit(EXIT_FAILURE);
 
@@ -3943,11 +4002,11 @@ static int merge_settings(Settings *settings, const char *path) {
 
         if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
 
-                if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
+                if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
                         log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
                 else {
-                        strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
-                        strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
+                        strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
+                        strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
                 }
 
 #if HAVE_SECCOMP
@@ -4582,7 +4641,7 @@ static int run_container(
         if (!barrier_place_and_sync(&barrier)) /* #5 */
                 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
 
-        /* At this point we have made use of the UID we picked, and thus nss-mymachines
+        /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
          * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
         etc_passwd_lock = safe_close(etc_passwd_lock);
 
@@ -5080,6 +5139,7 @@ static int run(int argc, char *argv[]) {
                 }
 
         } else {
+                DissectImageFlags dissect_image_flags = DISSECT_IMAGE_REQUIRE_ROOT | DISSECT_IMAGE_RELAX_VAR_CHECK;
                 assert(arg_image);
                 assert(!arg_template);
 
@@ -5129,13 +5189,14 @@ static int run(int argc, char *argv[]) {
                                 goto finish;
                         }
 
-                        if (!arg_root_hash) {
-                                r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
-                                if (r < 0) {
-                                        log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
-                                        goto finish;
-                                }
+                        r = verity_metadata_load(arg_image, NULL, arg_root_hash ? NULL : &arg_root_hash, &arg_root_hash_size,
+                                        arg_verity_data ? NULL : &arg_verity_data,
+                                        arg_root_hash_sig_path || arg_root_hash_sig ? NULL : &arg_root_hash_sig_path);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
+                                goto finish;
                         }
+                        dissect_image_flags |= arg_verity_data ? DISSECT_IMAGE_NO_PARTITION_TABLE : 0;
                 }
 
                 if (!mkdtemp(tmprootdir)) {
@@ -5161,7 +5222,8 @@ static int run(int argc, char *argv[]) {
                                 loop->fd,
                                 arg_image,
                                 arg_root_hash, arg_root_hash_size,
-                                DISSECT_IMAGE_REQUIRE_ROOT|DISSECT_IMAGE_RELAX_VAR_CHECK,
+                                arg_verity_data,
+                                dissect_image_flags,
                                 &dissected_image);
                 if (r == -ENOPKG) {
                         /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
@@ -5179,7 +5241,7 @@ static int run(int argc, char *argv[]) {
                 if (!arg_root_hash && dissected_image->can_verity)
                         log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
 
-                r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
+                r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, arg_verity_data, arg_root_hash_sig_path, arg_root_hash_sig, arg_root_hash_sig_size, 0, &decrypted_image);
                 if (r < 0)
                         goto finish;