]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
vmspawn/nspawn: Always use a per-machine runtime subdirectory
authorKai Lüke <kai@amutable.com>
Mon, 15 Jun 2026 13:11:41 +0000 (15:11 +0200)
committerZbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
Thu, 2 Jul 2026 15:24:18 +0000 (17:24 +0200)
Some state files were prefixed with the machine name which protected
against collisions when running concurrent machines in a systemd unit
with RuntimeDirectory= set. However, the socket files tpm.sock and
control were not, which caused a race. Prefixing these socket files
does not work because the path gets too long when both the subdirectory
and the socket file within it get unique identifiers.

Instead of prefixing all files, we can rather always create a
subdirectory and use simple names within in. This makes paths shorter
in the normal case and protects against races with RuntimeDirectory=
were instead of directly reusing RUNTIME_DIRECTORY we also create the
normal per-machine subdirectory. Since this here is about runtime state
it should not impact any running VMs/containers. The ssh-proxy looks
only at the normal case and does not support RUNTIME_DIRECTORY, so no
impact there as well.

src/libsystemd/sd-path/path-lookup.c
src/libsystemd/sd-path/path-lookup.h
src/nspawn/nspawn.c
src/vmspawn/vmspawn.c

index b0f10e4b5af236fe98b4f422ebb6207959d8ebf9..ee97164bdc02abd8c010b98d9f2c787242a497c7 100644 (file)
@@ -102,40 +102,52 @@ int runtime_directory(RuntimeScope scope, const char *fallback_suffix, char **re
         return 1;
 }
 
-int runtime_directory_make(RuntimeScope scope, const char *subdir, char **ret_dir, char **ret_dir_destroy) {
-        _cleanup_free_ char *dir = NULL, *destroy = NULL;
+int runtime_directory_resolve(RuntimeScope scope, const char *suffix, const char *identifier, char **ret) {
+        _cleanup_free_ char *base = NULL, *dir = NULL;
         int r;
 
-        assert(subdir);
-        assert(ret_dir);
-        assert(ret_dir_destroy);
+        assert(suffix);
+        assert(identifier);
+        assert(ret);
+
+        /* This should not be able to escape the base directory, forbid ../ and similar */
+        if (!filename_is_valid(identifier))
+                return -EINVAL;
 
-        /* Use runtime_directory() (not _generic()) so that when we run in a systemd service
-         * with RuntimeDirectory= set, we pick up $RUNTIME_DIRECTORY and place our stuff into the
-         * directory the service manager prepared for us. When the env var is unset, we fall back
-         * to the provided subdirectory under /run (or the $XDG_RUNTIME_DIR equivalent in user scope)
-         * and take care of creation and destruction ourselves. */
-        r = runtime_directory(scope, subdir, &dir);
+        /* Resolve a runtime directory <base>/<identifier>. The base is $RUNTIME_DIRECTORY when we run as a
+         * systemd service with RuntimeDirectory= set (the service manager prepared it for us), otherwise the
+         * provided suffix under /run (or the $XDG_RUNTIME_DIR equivalent in user scope). The per-identifier
+         * subdirectory keeps concurrent instances from colliding even when they share a base directory, and
+         * lets them use short entry names within it. */
+        r = runtime_directory(scope, suffix, &base);
         if (r < 0)
                 return r;
 
-        if (r > 0) {
-                /* $RUNTIME_DIRECTORY was not set, so we got the fallback path and need to create and
-                 * clean up the directory ourselves. */
-                destroy = strdup(dir);
-                if (!destroy)
-                        return -ENOMEM;
+        dir = path_join(base, identifier);
+        if (!dir)
+                return -ENOMEM;
 
-                r = mkdir_p(dir, 0755);
-                if (r < 0)
-                        return r;
-        }
+        *ret = TAKE_PTR(dir);
+        return 0;
+}
 
-        /* When $RUNTIME_DIRECTORY is set the service manager created the directory for us and
-         * will destroy it (or preserve it, per RuntimeDirectoryPreserve=) when the service stops. */
+int runtime_directory_make(RuntimeScope scope, const char *suffix, const char *identifier, char **ret) {
+        _cleanup_free_ char *dir = NULL;
+        int r;
+
+        assert(ret);
+
+        r = runtime_directory_resolve(scope, suffix, identifier, &dir);
+        if (r < 0)
+                return r;
+
+        /* We always create and destroy the subdirectory ourselves, the service manager only owns the base
+         * it handed us. The caller is expected to remove the subdirectory. */
+        r = mkdir_p(dir, 0755);
+        if (r < 0)
+                return r;
 
-        *ret_dir = TAKE_PTR(dir);
-        *ret_dir_destroy = TAKE_PTR(destroy);
+        *ret = TAKE_PTR(dir);
 
         return 0;
 }
index 80e37a571c59c36d1705b4c027446366b183f0f2..7f5f1999a64404c4d779623535d86d49c2f8e81c 100644 (file)
@@ -60,7 +60,8 @@ void lookup_paths_done(LookupPaths *p);
 int config_directory_generic(RuntimeScope scope, const char *suffix, char **ret);
 int runtime_directory_generic(RuntimeScope scope, const char *suffix, char **ret);
 int runtime_directory(RuntimeScope scope, const char *fallback_suffix, char **ret);
-int runtime_directory_make(RuntimeScope scope, const char *subdir, char **ret_dir, char **ret_dir_destroy);
+int runtime_directory_resolve(RuntimeScope scope, const char *suffix, const char *identifier, char **ret);
+int runtime_directory_make(RuntimeScope scope, const char *suffix, const char *identifier, char **ret);
 int state_directory_generic(RuntimeScope scope, const char *suffix, char **ret);
 
 /* We don't treat /etc/xdg/systemd/ in these functions as the xdg base dir spec suggests because we assume
index bbd5f411d413c14a79e2775d8257daea0c5a118d..add7c9146b1c87af425eae938904f4a4ae600ba3 100644 (file)
@@ -6104,12 +6104,8 @@ static int do_cleanup(void) {
         if (r < 0)
                 return r;
 
-        _cleanup_free_ char *subdir = path_join("systemd/nspawn", arg_machine);
-        if (!subdir)
-                return log_oom();
-
         _cleanup_free_ char *runtime_dir = NULL;
-        r = runtime_directory(arg_runtime_scope, subdir, &runtime_dir);
+        r = runtime_directory_resolve(arg_runtime_scope, "systemd/nspawn", arg_machine, &runtime_dir);
         if (r < 0)
                 return r;
 
@@ -6133,8 +6129,7 @@ static int run(int argc, char *argv[]) {
         _cleanup_(sd_netlink_unrefp) sd_netlink *nfnl = NULL;
         _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
         _cleanup_(sd_varlink_unrefp) sd_varlink *nsresource_link = NULL, *mountfsd_link = NULL;
-        _cleanup_free_ char *runtime_dir = NULL, *subdir = NULL;
-        _cleanup_(rm_rf_physical_and_freep) char *runtime_dir_destroy = NULL;
+        _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL;
         _cleanup_(fork_notify_terminate) PidRef journal_remote_pidref = PIDREF_NULL;
 
         log_setup();
@@ -6629,17 +6624,11 @@ static int run(int argc, char *argv[]) {
         } else
                 assert_not_reached();
 
-        subdir = path_join("systemd/nspawn", arg_machine);
-        if (!subdir) {
-                r = log_oom();
-                goto finish;
-        }
-
         r = runtime_directory_make(
                         arg_runtime_scope,
-                        subdir,
-                        &runtime_dir,
-                        &runtime_dir_destroy);
+                        "systemd/nspawn",
+                        arg_machine,
+                        &runtime_dir);
         if (r < 0) {
                 log_error_errno(r, "Failed to create runtime directory: %m");
                 goto finish;
index 00df10874ca026d929cab5cd1da94504bee05a64..f11649945036c0c8772dd46032348982a5b806c1 100644 (file)
@@ -1563,7 +1563,6 @@ static int cmdline_add_smbios11(char ***cmdline, int smbios_dir_fd, const char *
 }
 
 static int start_tpm(
-                const char *scope,
                 const char *swtpm,
                 const char *runtime_dir,
                 const char *sd_socket_activate,
@@ -1572,30 +1571,26 @@ static int start_tpm(
 
         int r;
 
-        assert(scope);
         assert(swtpm);
         assert(runtime_dir);
         assert(sd_socket_activate);
 
-        _cleanup_free_ char *scope_prefix = NULL;
-        r = unit_name_to_prefix(scope, &scope_prefix);
-        if (r < 0)
-                return log_error_errno(r, "Failed to strip .scope suffix from scope: %m");
-
         _cleanup_free_ char *listen_address = path_join(runtime_dir, "tpm.sock");
         if (!listen_address)
                 return log_oom();
 
+        /* Validate socket path length up front instead of a failure in swtpm */
+        union sockaddr_union sa;
+        r = sockaddr_un_set_path(&sa.un, listen_address);
+        if (r < 0)
+                return log_error_errno(r, "TPM socket path '%s' is too long: %m", listen_address);
+
         _cleanup_free_ char *transient_state_dir = NULL;
         const char *state_dir;
         if (arg_tpm_state_path)
                 state_dir = arg_tpm_state_path;
         else {
-                _cleanup_free_ char *dirname = strjoin(scope_prefix, "-tpm");
-                if (!dirname)
-                        return log_oom();
-
-                transient_state_dir = path_join(runtime_dir, dirname);
+                transient_state_dir = path_join(runtime_dir, "tpm");
                 if (!transient_state_dir)
                         return log_oom();
 
@@ -1694,7 +1689,6 @@ static int find_virtiofsd(char **ret) {
 }
 
 static int start_virtiofsd(
-                const char *scope,
                 const char *directory,
                 uid_t source_uid,
                 uid_t target_uid,
@@ -1705,7 +1699,6 @@ static int start_virtiofsd(
 
         int r;
 
-        assert(scope);
         assert(directory);
         assert(runtime_dir);
 
@@ -1714,11 +1707,8 @@ static int start_virtiofsd(
         if (r < 0)
                 return r;
 
-        _cleanup_free_ char *scope_prefix = NULL;
-        r = unit_name_to_prefix(scope, &scope_prefix);
-        if (r < 0)
-                return log_error_errno(r, "Failed to strip .scope suffix from scope: %m");
-
+        /* A VM may run several virtiofsd instances (one per shared dir), so the random suffix keeps their
+         * sockets distinct within the machine's runtime directory. */
         _cleanup_free_ char *listen_address = NULL;
         if (asprintf(&listen_address, "%s/sock-%"PRIx64, runtime_dir, random_u64()) < 0)
                 return log_oom();
@@ -2660,14 +2650,9 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
 
         /* Create our runtime directory. We need this for the QMP varlink control socket, the QEMU
          * config file, TPM state, virtiofsd sockets, runtime mounts, and SSH key material. */
-        _cleanup_free_ char *runtime_dir = NULL, *runtime_dir_suffix = NULL;
-        _cleanup_(rm_rf_physical_and_freep) char *runtime_dir_destroy = NULL;
+        _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL;
 
-        runtime_dir_suffix = path_join("systemd/vmspawn", arg_machine);
-        if (!runtime_dir_suffix)
-                return log_oom();
-
-        r = runtime_directory_make(arg_runtime_scope, runtime_dir_suffix, &runtime_dir, &runtime_dir_destroy);
+        r = runtime_directory_make(arg_runtime_scope, "systemd/vmspawn", arg_machine, &runtime_dir);
         if (r < 0)
                 return log_error_errno(r, "Failed to create runtime directory: %m");
 
@@ -3227,7 +3212,6 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                 }
 
                 r = start_virtiofsd(
-                                unit,
                                 arg_directory,
                                 /* source_uid= */ arg_uid_shift,
                                 /* target_uid= */ 0,
@@ -3303,7 +3287,6 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                         return log_oom();
 
                 r = start_virtiofsd(
-                                unit,
                                 m->source,
                                 /* source_uid= */ m->source_uid,
                                 /* target_uid= */ m->target_uid,
@@ -3425,7 +3408,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                 if (!GREEDY_REALLOC(children, n_children + 1))
                         return log_oom();
 
-                r = start_tpm(unit, swtpm, runtime_dir, sd_socket_activate, &tpm_socket_address, &child);
+                r = start_tpm(swtpm, runtime_dir, sd_socket_activate, &tpm_socket_address, &child);
                 if (r < 0) {
                         /* only bail if the user asked for a tpm */
                         if (arg_tpm > 0)
@@ -3507,14 +3490,10 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
         }
 
         if (arg_pass_ssh_key) {
-                _cleanup_free_ char *scope_prefix = NULL, *privkey_path = NULL, *pubkey_path = NULL;
+                _cleanup_free_ char *privkey_path = NULL, *pubkey_path = NULL;
                 const char *key_type = arg_ssh_key_type ?: "ed25519";
 
-                r = unit_name_to_prefix(unit, &scope_prefix);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to strip .scope suffix from scope: %m");
-
-                privkey_path = strjoin(runtime_dir, "/", scope_prefix, "-", key_type);
+                privkey_path = path_join(runtime_dir, key_type);
                 if (!privkey_path)
                         return log_oom();
 
@@ -3531,7 +3510,7 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
         }
 
         if (ssh_public_key_path && ssh_private_key_path) {
-                _cleanup_free_ char *scope_prefix = NULL, *cred_path = NULL;
+                _cleanup_free_ char *cred_path = NULL;
 
                 cred_path = strjoin("ssh.ephemeral-authorized_keys-all:", ssh_public_key_path);
                 if (!cred_path)
@@ -3541,10 +3520,6 @@ static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
                 if (r < 0)
                         return log_error_errno(r, "Failed to load credential %s: %m", cred_path);
 
-                r = unit_name_to_prefix(unit, &scope_prefix);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to strip .scope suffix from scope: %m");
-
                 /* on distros that provide their own sshd@.service file we need to provide a dropin which
                  * picks up our public key credential */
                 /* sshd reads AuthorizedKeysFile after dropping to the authenticating user's UID, so the