git.ipfire.org Git - thirdparty/systemd.git/commitdiff
nspawn: reorganize scope allocation/registration logic
author: Lennart Poettering <lennart@poettering.net>
Fri, 23 May 2025 20:04:56 +0000 (22:04 +0200)
committer: Lennart Poettering <lennart@poettering.net>
Fri, 11 Jul 2025 16:15:12 +0000 (18:15 +0200)
This cleans up allocation of a scope unit for the container: when
invoked in user context we'll now allocate a scope through the per-user
service manager instead of the per-system manager. This makes a ton more
sense, since it's the user that invokes things after all. And given that
machined can now register containers in the user manager, there's nothing
stopping us from cleaning this up.

Note that this means we'll connect to two buses if run unprivileged: once
to the per-user bus to allocate the scope unit, and once to the per-system
bus to register it with machined.

src/nspawn/nspawn.c

index 4ccfab8a88c3e591bfff75b0a1e78f0f1d066ce0..805dd91389c21675af2720614738e586d2c69b67 100644 (file)
@@ -1667,11 +1667,6 @@ static int verify_arguments(void) {
         if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
                 arg_read_only = true;
 
-        if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
-                /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
-                 * The latter is not technically a user session, but we don't need to labour the point. */
-                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
-
         if (arg_directory && arg_image)
                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
 
@@ -5136,7 +5131,6 @@ static int run_container(
         _cleanup_(sd_event_unrefp) sd_event *event = NULL;
         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
         _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
-        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
         _cleanup_free_ uid_t *bind_user_uid = NULL;
         size_t n_bind_user_uid = 0;
         ContainerStatus container_status = 0;
@@ -5459,19 +5453,35 @@ static int run_container(
                         return r;
         }
 
-        if (arg_register || !arg_keep_unit) {
-                if (arg_privileged || arg_register)
-                        r = sd_bus_default_system(&bus);
-                else
-                        r = sd_bus_default_user(&bus);
+        /* Registration always happens on the system bus */
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *system_bus = NULL;
+        if (arg_register || arg_privileged) {
+                r = sd_bus_default_system(&system_bus);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open system bus: %m");
+
+                r = sd_bus_set_close_on_exit(system_bus, false);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
+
+                (void) sd_bus_set_allow_interactive_authorization(system_bus, arg_ask_password);
+        }
+
+        /* Scope allocation happens on the user bus if we are unpriv, otherwise system bus. */
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *user_bus = NULL;
+        _cleanup_(sd_bus_unrefp) sd_bus *runtime_bus = NULL;
+        if (arg_privileged)
+                runtime_bus = sd_bus_ref(system_bus);
+        else {
+                r = sd_bus_default_user(&user_bus);
                 if (r < 0)
-                        return log_error_errno(r, "Failed to open bus: %m");
+                        return log_error_errno(r, "Failed to open user bus: %m");
 
-                r = sd_bus_set_close_on_exit(bus, false);
+                r = sd_bus_set_close_on_exit(user_bus, false);
                 if (r < 0)
                         return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
 
-                (void) sd_bus_set_allow_interactive_authorization(bus, arg_ask_password);
+                runtime_bus = sd_bus_ref(user_bus);
         }
 
         if (!arg_keep_unit) {
@@ -5480,7 +5490,7 @@ static int run_container(
                  * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
 
                 r = sd_bus_match_signal_async(
-                                bus,
+                                runtime_bus,
                                 /* ret= */ NULL,
                                 "org.freedesktop.systemd1",
                                 /* path= */ NULL,
@@ -5493,54 +5503,78 @@ static int run_container(
                         return log_error_errno(r, "Failed to request RequestStop match: %m");
         }
 
-        if (arg_register) {
-                RegisterMachineFlags flags = 0;
-                SET_FLAG(flags, REGISTER_MACHINE_KEEP_UNIT, arg_keep_unit);
-                r = register_machine(
-                                bus,
+        if (arg_keep_unit) {
+                /* If we are not supposed to allocate a unit, then let's move the process now, so that we can
+                 * register things while being in the right cgroup location already. Otherwise, let's move
+                 * the process later, once we have unit and hence cgroup. */
+                r = create_subcgroup(
+                                pid,
+                                arg_keep_unit,
+                                arg_uid_shift,
+                                userns_fd,
+                                arg_userns_mode);
+                if (r < 0)
+                        return r;
+        }
+
+        bool scope_allocated = false;
+        if (!arg_keep_unit && (!arg_register || !arg_privileged)) {
+                AllocateScopeFlags flags = ALLOCATE_SCOPE_ALLOW_PIDFD;
+                r = allocate_scope(
+                                runtime_bus,
                                 arg_machine,
                                 pid,
-                                arg_directory,
-                                arg_uuid,
-                                ifi,
                                 arg_slice,
                                 arg_custom_mounts, arg_n_custom_mounts,
                                 arg_kill_signal,
                                 arg_property,
                                 arg_property_message,
-                                arg_container_service_name,
                                 arg_start_mode,
                                 flags);
                 if (r < 0)
                         return r;
 
-        } else if (!arg_keep_unit) {
-                AllocateScopeFlags flags = ALLOCATE_SCOPE_ALLOW_PIDFD;
-                r = allocate_scope(
-                                bus,
+                scope_allocated = true;
+        }
+
+        bool registered = false;
+        if (arg_register) {
+                RegisterMachineFlags flags = 0;
+                SET_FLAG(flags, REGISTER_MACHINE_KEEP_UNIT, arg_keep_unit || !arg_privileged);
+                r = register_machine(
+                                system_bus,
                                 arg_machine,
                                 pid,
+                                arg_directory,
+                                arg_uuid,
+                                ifi,
                                 arg_slice,
                                 arg_custom_mounts, arg_n_custom_mounts,
                                 arg_kill_signal,
                                 arg_property,
                                 arg_property_message,
+                                arg_container_service_name,
                                 arg_start_mode,
                                 flags);
                 if (r < 0)
                         return r;
 
-        } else if (arg_slice || arg_property)
+                registered = true;
+        }
+
+        if (arg_keep_unit && (arg_slice || arg_property))
                 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
 
-        r = create_subcgroup(
-                        pid,
-                        arg_keep_unit,
-                        arg_uid_shift,
-                        userns_fd,
-                        arg_userns_mode);
-        if (r < 0)
-                return r;
+        if (!arg_keep_unit) {
+                r = create_subcgroup(
+                                pid,
+                                arg_keep_unit,
+                                arg_uid_shift,
+                                userns_fd,
+                                arg_userns_mode);
+                if (r < 0)
+                        return r;
+        }
 
         /* Notify the child that the parent is ready with all its setup (including cgroup-ification), and
          * that the child can now hand over control to the code to run inside the container. */
@@ -5561,10 +5595,16 @@ static int run_container(
 
         (void) sd_event_set_watchdog(event, true);
 
-        if (bus) {
-                r = sd_bus_attach_event(bus, event, 0);
+        if (system_bus) {
+                r = sd_bus_attach_event(system_bus, event, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to attach system bus to event loop: %m");
+        }
+
+        if (user_bus) {
+                r = sd_bus_attach_event(user_bus, event, 0);
                 if (r < 0)
-                        return log_error_errno(r, "Failed to attach bus to event loop: %m");
+                        return log_error_errno(r, "Failed to attach user bus to event loop: %m");
         }
 
         r = setup_notify_parent(event, notify_socket, pid, &notify_event_source);
@@ -5710,8 +5750,8 @@ static int run_container(
                 return log_error_errno(r, "Failed to run event loop: %m");
 
         /* Kill if it is not dead yet anyway */
-        if (!arg_register && !arg_keep_unit && bus)
-                terminate_scope(bus, arg_machine);
+        if (scope_allocated)
+                terminate_scope(runtime_bus, arg_machine);
 
         /* Normally redundant, but better safe than sorry */
         (void) pidref_kill(pid, SIGKILL);
@@ -5731,8 +5771,8 @@ static int run_container(
         r = wait_for_container(pid, &container_status);
 
         /* Tell machined that we are gone. */
-        if (arg_register && bus)
-                (void) unregister_machine(bus, arg_machine);
+        if (registered)
+                (void) unregister_machine(system_bus, arg_machine);
 
         if (r < 0)
                 /* We failed to wait for the container, or the container exited abnormally. */