]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
pid1: add D-Bus API for removing delegated subcgroups
authorLennart Poettering <lennart@poettering.net>
Tue, 12 Nov 2024 10:59:40 +0000 (11:59 +0100)
committerLennart Poettering <lennart@poettering.net>
Wed, 8 Jan 2025 14:27:25 +0000 (15:27 +0100)
When running unprivileged containers, we run into a scenario where an
unpriv owned cgroup has a subcgroup delegated to another user (i.e. the
container's own UIDs). When the owner of that cgroup dies without
cleaning it up then the unpriv service manager might encounter a cgroup
it cannot delete anymore.

Let's address that: let's expose a method call on the service manager
(primarily in PID1) that can be used to delete a subcgroup of a unit one
owns. This would then allow the unpriv service manager to ask the priv
service manager to get rid of such a cgroup.

This commit only adds the method call, the next commit then adds the
code that makes use of this.

man/org.freedesktop.systemd1.xml
src/core/cgroup.c
src/core/cgroup.h
src/core/dbus-manager.c
src/core/dbus-unit.c
src/core/dbus-unit.h
src/core/org.freedesktop.systemd1.conf

index 1c5e7f2eb75eaf49f18630cadc217f7d5dd72277..b95f68b4a55ff5f7d6e628350b95ec84a2aec9b5 100644 (file)
@@ -147,6 +147,9 @@ node /org/freedesktop/systemd1 {
       AttachProcessesToUnit(in  s unit_name,
                             in  s subcgroup,
                             in  au pids);
+      RemoveSubgroupFromUnit(in  s unit_name,
+                             in  s subcgroup,
+                             in  t flags);
       AbandonScope(in  s name);
       GetJob(in  u id,
              out o job);
@@ -870,6 +873,8 @@ node /org/freedesktop/systemd1 {
 
     <variablelist class="dbus-method" generated="True" extra-ref="AttachProcessesToUnit()"/>
 
+    <variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroupFromUnit()"/>
+
     <variablelist class="dbus-method" generated="True" extra-ref="AbandonScope()"/>
 
     <variablelist class="dbus-method" generated="True" extra-ref="GetJob()"/>
@@ -1599,6 +1604,13 @@ node /org/freedesktop/systemd1 {
       parameters. The possible values are <literal>configuration</literal>, <literal>state</literal>,
       <literal>logs</literal>, <literal>cache</literal>, <literal>runtime</literal>,
       <literal>fdstore</literal>, and <literal>all</literal>.</para>
+
+      <para><function>RemoveSubgroupFromUnit()</function> removes a subcgroup belonging to a unit's
+      cgroup. Takes three arguments: the unit name (if empty defaults to the caller's unit), a cgroup path
+      (which must start with a slash <literal>/</literal>), which is taken relative to the unit's
+      cgroup, and a flags argument (which must be zero for now). This is primarily useful for unprivileged
+      service managers to ask the system service manager for removal of subcgroups it manages, in case one
+      was delegated to other UIDs.</para>
     </refsect2>
 
     <refsect2>
@@ -2704,6 +2716,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       GetProcesses(out a(sus) processes);
       AttachProcesses(in  s subcgroup,
                       in  au pids);
+      RemoveSubgroup(in  s subcgroup,
+                     in  t flags);
     properties:
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s Type = '...';
@@ -3398,6 +3412,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <!--method AttachProcesses is not documented!-->
 
+    <!--method RemoveSubgroup is not documented!-->
+
     <!--property Type is not documented!-->
 
     <!--property ExitType is not documented!-->
@@ -4006,6 +4022,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <variablelist class="dbus-method" generated="True" extra-ref="AttachProcesses()"/>
 
+    <variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroup()"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="Type"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="ExitType"/>
@@ -4901,6 +4919,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
       GetProcesses(out a(sus) processes);
       AttachProcesses(in  s subcgroup,
                       in  au pids);
+      RemoveSubgroup(in  s subcgroup,
+                     in  t flags);
     properties:
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s BindIPv6Only = '...';
@@ -5592,6 +5612,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <!--method AttachProcesses is not documented!-->
 
+    <!--method RemoveSubgroup is not documented!-->
+
     <!--property BindIPv6Only is not documented!-->
 
     <!--property Backlog is not documented!-->
@@ -6206,6 +6228,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <variablelist class="dbus-method" generated="True" extra-ref="AttachProcesses()"/>
 
+    <variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroup()"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="BindIPv6Only"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="Backlog"/>
@@ -7001,6 +7025,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
       GetProcesses(out a(sus) processes);
       AttachProcesses(in  s subcgroup,
                       in  au pids);
+      RemoveSubgroup(in  s subcgroup,
+                     in  t flags);
     properties:
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s Where = '...';
@@ -7601,6 +7627,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <!--method AttachProcesses is not documented!-->
 
+    <!--method RemoveSubgroup is not documented!-->
+
     <!--property Where is not documented!-->
 
     <!--property What is not documented!-->
@@ -8141,6 +8169,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <variablelist class="dbus-method" generated="True" extra-ref="AttachProcesses()"/>
 
+    <variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroup()"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="Where"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="What"/>
@@ -8991,6 +9021,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
       GetProcesses(out a(sus) processes);
       AttachProcesses(in  s subcgroup,
                       in  au pids);
+      RemoveSubgroup(in  s subcgroup,
+                     in  t flags);
     properties:
       readonly s What = '...';
       readonly i Priority = ...;
@@ -9577,6 +9609,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <!--method AttachProcesses is not documented!-->
 
+    <!--method RemoveSubgroup is not documented!-->
+
     <!--property What is not documented!-->
 
     <!--property Priority is not documented!-->
@@ -10103,6 +10137,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <variablelist class="dbus-method" generated="True" extra-ref="AttachProcesses()"/>
 
+    <variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroup()"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="What"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="Priority"/>
@@ -10805,6 +10841,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
       GetProcesses(out a(sus) processes);
       AttachProcesses(in  s subcgroup,
                       in  au pids);
+      RemoveSubgroup(in  s subcgroup,
+                     in  t flags);
     properties:
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly s Slice = '...';
@@ -11004,6 +11042,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
 
     <!--method AttachProcesses is not documented!-->
 
+    <!--method RemoveSubgroup is not documented!-->
+
     <!--property Slice is not documented!-->
 
     <!--property ControlGroupId is not documented!-->
@@ -11196,6 +11236,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
 
     <variablelist class="dbus-method" generated="True" extra-ref="AttachProcesses()"/>
 
+    <variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroup()"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="Slice"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="ControlGroup"/>
@@ -11411,6 +11453,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
       GetProcesses(out a(sus) processes);
       AttachProcesses(in  s subcgroup,
                       in  au pids);
+      RemoveSubgroup(in  s subcgroup,
+                     in  t flags);
     signals:
       RequestStop();
     properties:
@@ -11636,6 +11680,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
 
     <!--method AttachProcesses is not documented!-->
 
+    <!--method RemoveSubgroup is not documented!-->
+
     <!--property RuntimeMaxUSec is not documented!-->
 
     <!--property RuntimeRandomizedExtraUSec is not documented!-->
@@ -11850,6 +11896,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
 
     <variablelist class="dbus-method" generated="True" extra-ref="AttachProcesses()"/>
 
+    <variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroup()"/>
+
     <variablelist class="dbus-signal" generated="True" extra-ref="RequestStop()"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="Controller"/>
@@ -12254,6 +12302,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>ShutdownStartTimestamp</varname>,
       <varname>ShutdownStartTimestampMonotonic</varname>, and
       <varname>SoftRebootsCount</varname> were added in version 256.</para>
+      <para><function>RemoveSubgroupFromUnit()</function> was added in version 258.</para>
     </refsect2>
     <refsect2>
       <title>Unit Objects</title>
@@ -12320,7 +12369,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>ProtectControlGroupsEx</varname>,
       <varname>PrivateUsersEx</varname>, and
       <varname>PrivatePIDs</varname> were added in version 257.</para>
-      <para><varname>ProtectHostnameEx</varname> was added in version 258.</para>
+      <para><varname>ProtectHostnameEx</varname> and <function>RemoveSubgroup()</function> were added in version 258.</para>
     </refsect2>
     <refsect2>
       <title>Socket Unit Objects</title>
@@ -12364,7 +12413,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>ManagedOOMMemoryPressureDurationUSec</varname>,
       <varname>ProtectControlGroupsEx</varname>, and
       <varname>PrivatePIDs</varname> were added in version 257.</para>
-      <para><varname>ProtectHostnameEx</varname> was added in version 258.</para>
+      <para><varname>ProtectHostnameEx</varname> and <function>RemoveSubgroup()</function> were added in version 258.</para>
     </refsect2>
     <refsect2>
       <title>Mount Unit Objects</title>
@@ -12405,7 +12454,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>ManagedOOMMemoryPressureDurationUSec</varname>,
       <varname>ProtectControlGroupsEx</varname>, and
       <varname>PrivatePIDs</varname> were added in version 257.</para>
-      <para><varname>ProtectHostnameEx</varname> was added in version 258.</para>
+      <para><varname>ProtectHostnameEx</varname> and <function>RemoveSubgroup()</function> were added in version 258.</para>
     </refsect2>
     <refsect2>
       <title>Swap Unit Objects</title>
@@ -12446,7 +12495,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>ManagedOOMMemoryPressureDurationUSec</varname>,
       <varname>ProtectControlGroupsEx</varname>, and
       <varname>PrivatePIDs</varname> were added in version 257.</para>
-      <para><varname>ProtectHostnameEx</varname> was added in version 258.</para>
+      <para><varname>ProtectHostnameEx</varname> and <function>RemoveSubgroup()</function> were added in version 258.</para>
     </refsect2>
     <refsect2>
       <title>Slice Unit Objects</title>
@@ -12472,6 +12521,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>EffectiveTasksMax</varname>, and
       <varname>MemoryZSwapWriteback</varname> were added in version 256.</para>
       <para><varname>ManagedOOMMemoryPressureDurationUSec</varname> was added in version 257.</para>
+      <para><function>RemoveSubgroup()</function> was added in version 258.</para>
     </refsect2>
     <refsect2>
       <title>Scope Unit Objects</title>
@@ -12498,6 +12548,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>EffectiveTasksMax</varname>, and
       <varname>MemoryZSwapWriteback</varname> were added in version 256.</para>
       <para><varname>ManagedOOMMemoryPressureDurationUSec</varname> was added in version 257.</para>
+      <para><function>RemoveSubgroup()</function> was added in version 258.</para>
     </refsect2>
     <refsect2>
       <title>Job Objects</title>
index 6933aae54de4833a50964bf2e37cbe2e57323b03..345dc5cfbbdb098c666e585c7f86ea2c79de90ee 100644 (file)
@@ -3126,6 +3126,49 @@ int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
         return ret;
 }
 
+int unit_remove_subcgroup(Unit *u, const char *suffix_path) {
+        int r;
+
+        assert(u);
+        /* Hands the cgroup at 'suffix_path' below the cgroup of unit 'u' to cg_trim_everywhere(). With an
+         * empty or root 'suffix_path' the unit's own cgroup is the target, and deleting the target itself
+         * is not requested. Returns 0 on success, a negative errno-style value otherwise. */
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return -EINVAL;
+
+        if (!unit_cgroup_delegate(u)) /* only offered when unit_cgroup_delegate() holds for the unit */
+                return -ENOMEDIUM;
+
+        r = unit_pick_cgroup_path(u);
+        if (r < 0)
+                return r;
+
+        CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+        if (!crt || !crt->cgroup_path) /* no runtime object, or no cgroup path recorded in it */
+                return -EOWNERDEAD;
+
+        _cleanup_free_ char *j = NULL; /* backing storage for 'd' in the joined-path case below */
+        bool delete_root;
+        const char *d;
+        if (empty_or_root(suffix_path)) {
+                d = empty_to_root(crt->cgroup_path);
+                delete_root = false; /* Don't attempt to delete the main cgroup of this unit */
+        } else {
+                j = path_join(crt->cgroup_path, suffix_path);
+                if (!j)
+                        return -ENOMEM;
+
+                d = j;
+                delete_root = true; /* a real subcgroup was named, so ask for its removal too */
+        }
+
+        log_unit_debug(u, "Removing subcgroup '%s'...", d);
+
+        r = cg_trim_everywhere(u->manager->cgroup_supported, d, delete_root);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to fully %s cgroup '%s': %m", delete_root ? "remove" : "trim", d);
+
+        return 0;
+}
+
 static bool unit_has_mask_realized(
                 Unit *u,
                 CGroupMask target_mask,
index 1ed74831c8a6644b1628df1bec25c4e21b96a976..807e56c6210ee22aefe7855e9200d899bc0527b4 100644 (file)
@@ -456,6 +456,7 @@ int unit_check_oomd_kill(Unit *u);
 int unit_check_oom(Unit *u);
 
 int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);
+int unit_remove_subcgroup(Unit *u, const char *suffix_path);
 
 int manager_setup_cgroup(Manager *m);
 void manager_shutdown_cgroup(Manager *m, bool delete);
index ed1f1241826a4f344460db5f8e40c857791acad2..99e3ea12ac5f82efb40a14b9f973f9cea856cb90 100644 (file)
@@ -960,6 +960,12 @@ static int method_attach_processes_to_unit(sd_bus_message *message, void *userda
         return method_generic_unit_operation(message, userdata, error, bus_unit_method_attach_processes, GENERIC_UNIT_VALIDATE_LOADED);
 }
 
+static int method_remove_subgroup_from_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        /* Handler for the manager's RemoveSubgroupFromUnit() method: defers to
+         * bus_unit_method_remove_subgroup() by way of method_generic_unit_operation().
+         *
+         * Don't allow removal of subgroups from units that aren't loaded. But allow loading the unit, since
+         * this is clean-up work, that is OK to do when the unit is stopped already. */
+        return method_generic_unit_operation(message, userdata, error, bus_unit_method_remove_subgroup, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED);
+}
+
 static int transient_unit_from_message(
                 Manager *m,
                 sd_bus_message *message,
@@ -3246,6 +3252,11 @@ const sd_bus_vtable bus_manager_vtable[] = {
                                 SD_BUS_NO_RESULT,
                                 method_attach_processes_to_unit,
                                 SD_BUS_VTABLE_UNPRIVILEGED),
+        SD_BUS_METHOD_WITH_ARGS("RemoveSubgroupFromUnit",
+                                SD_BUS_ARGS("s", unit_name, "s", subcgroup, "t", flags),
+                                SD_BUS_NO_RESULT,
+                                method_remove_subgroup_from_unit,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
         SD_BUS_METHOD_WITH_ARGS("AbandonScope",
                                 SD_BUS_ARGS("s", name),
                                 SD_BUS_NO_RESULT,
index 5fa868b8ff81d40fe2239caa4169cd8153bc0658..68cfed9444f07deec7050c9bc86a70b14738b48a 100644 (file)
@@ -1594,6 +1594,59 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd
         return sd_bus_reply_method_return(message, NULL);
 }
 
+int bus_unit_method_remove_subgroup(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(message);
+
+        /* This removes a subcgroup of the unit, regardless of which user owns the subcgroup. This is useful
+         * when cgroup delegation is enabled for a unit, and the unit subdelegates the cgroup further.
+         *
+         * Reads two arguments from the message: the subcgroup path (string) and a flags word (uint64). The
+         * arguments are validated, the sender's effective UID is checked, and the actual work is left to
+         * unit_remove_subcgroup(). An empty method reply is sent on success. */
+
+        r = mac_selinux_unit_access_check(u, message, "stop", error);
+        if (r < 0)
+                return r;
+
+        const char *path;
+        uint64_t flags;
+        r = sd_bus_message_read(message, "st", &path, &flags);
+        if (r < 0)
+                return r;
+
+        /* No flags defined for now. Note that this failure is replied to directly, while the checks below
+         * report through 'error' instead. */
+        if (flags != 0)
+                return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS, "Invalid 'flags' parameter '%" PRIu64 "'", flags);
+
+        if (!unit_cgroup_delegate(u))
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Subcgroup removal not available on non-delegated units.");
+
+        if (!path_is_absolute(path)) /* the path is later joined onto the unit's cgroup path, but must still be absolute */
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not absolute: %s", path);
+
+        if (!path_is_normalized(path))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not normalized: %s", path);
+
+        _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+        r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds);
+        if (r < 0)
+                return r;
+
+        uid_t sender_uid;
+        r = sd_bus_creds_get_euid(creds, &sender_uid);
+        if (r < 0)
+                return r;
+
+        /* Allow this only if the client is privileged, is us, or is the user of the unit itself, i.e. the
+         * sender's effective UID is 0, matches our own getuid(), or matches u->ref_uid. */
+        if (sender_uid != 0 && sender_uid != getuid() && sender_uid != u->ref_uid)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Client is not permitted to alter cgroup.");
+
+        r = unit_remove_subcgroup(u, path);
+        if (r < 0)
+                return sd_bus_error_set_errnof(error, r, "Failed to remove subgroup %s: %m", path);
+
+        return sd_bus_reply_method_return(message, NULL);
+}
+
 const sd_bus_vtable bus_unit_cgroup_vtable[] = {
         SD_BUS_VTABLE_START(0),
         SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0),
@@ -1633,6 +1686,12 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = {
                                 bus_unit_method_attach_processes,
                                 SD_BUS_VTABLE_UNPRIVILEGED),
 
+        SD_BUS_METHOD_WITH_ARGS("RemoveSubgroup",
+                                SD_BUS_ARGS("s", subcgroup, "t", flags),
+                                SD_BUS_NO_RESULT,
+                                bus_unit_method_remove_subgroup,
+                                SD_BUS_VTABLE_UNPRIVILEGED),
+
         SD_BUS_VTABLE_END
 };
 
index 6b7828e4bae50e46518eef7521db7daf9496bed8..e9dd1ec31703954c48c717897d4500e465f708ec 100644 (file)
@@ -22,6 +22,7 @@ int bus_unit_set_properties(Unit *u, sd_bus_message *message, UnitWriteFlags fla
 int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error);
 int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
 int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_remove_subgroup(sd_bus_message *message, void *userdata, sd_bus_error *error);
 int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error);
 int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error);
 int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error);
index 52034e07e73280b576504b6468f1e1c9d224437e..2b978a1e770f528ae107b4804e157282a1558965 100644 (file)
                        send_interface="org.freedesktop.systemd1.Manager"
                        send_member="AttachProcessesToUnit"/>
 
+                <allow send_destination="org.freedesktop.systemd1"
+                       send_interface="org.freedesktop.systemd1.Manager"
+                       send_member="RemoveSubgroupFromUnit"/>
+
                 <allow send_destination="org.freedesktop.systemd1"
                        send_interface="org.freedesktop.systemd1.Manager"
                        send_member="CancelJob"/>
                        send_interface="org.freedesktop.systemd1.Service"
                        send_member="AttachProcesses"/>
 
+                <allow send_destination="org.freedesktop.systemd1"
+                       send_interface="org.freedesktop.systemd1.Service"
+                       send_member="RemoveSubgroup"/>
+
                 <allow send_destination="org.freedesktop.systemd1"
                        send_interface="org.freedesktop.systemd1.Service"
                        send_member="BindMount"/>
                        send_interface="org.freedesktop.systemd1.Scope"
                        send_member="AttachProcesses"/>
 
+                <allow send_destination="org.freedesktop.systemd1"
+                       send_interface="org.freedesktop.systemd1.Scope"
+                       send_member="RemoveSubgroup"/>
+
                 <allow receive_sender="org.freedesktop.systemd1"/>
         </policy>