]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
cgroup: Add support for ProtectControlGroups= private and strict 34806/head
authorRyan Wilson <ryantimwilson@meta.com>
Fri, 18 Oct 2024 18:41:09 +0000 (11:41 -0700)
committerRyan Wilson <ryantimwilson@meta.com>
Mon, 28 Oct 2024 15:37:36 +0000 (08:37 -0700)
This commit adds two settings private and strict to
the ProtectControlGroups= property. Private will unshare the cgroup
namespace and mount a read-write private cgroup2 filesystem at /sys/fs/cgroup.
Strict does the same except the mount is read-only. Since the unit is
running in a cgroup namespace, the new root of /sys/fs/cgroup is the unit's
own cgroup.

We also add a new dbus property ProtectControlGroupsEx which accepts strings
instead of boolean. This will allow users to use private/strict via dbus
and systemd-run in addition to service files.

Note private and strict fall back to no and yes respectively if the kernel
doesn't support cgroup2 or system is not using unified hierarchy.

Fixes: #34634
man/org.freedesktop.systemd1.xml
man/systemd.exec.xml
src/core/dbus-execute.c
src/core/exec-invoke.c
src/core/execute.c
src/core/execute.h
src/core/namespace.c
src/core/namespace.h
src/shared/bus-unit-util.c
test/units/TEST-07-PID1.protect-control-groups.sh [new file with mode: 0755]

index d86a5d9f3252f4f418dd16bc84beb6eadeb6cab9..f484f28a70b5429ad0f293de172bb19c331f930c 100644 (file)
@@ -3251,6 +3251,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b ProtectControlGroups = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly s ProtectControlGroupsEx = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateNetwork = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateUsers = ...;
@@ -3868,8 +3870,6 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <!--property ProtectKernelLogs is not documented!-->
 
-    <!--property ProtectControlGroups is not documented!-->
-
     <!--property PrivateNetwork is not documented!-->
 
     <!--property PrivateUsers is not documented!-->
@@ -4572,6 +4572,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@@ -4858,6 +4860,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       unit file setting <varname>ManagedOOMMemoryPressureDurationSec=</varname> listed in
       <citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
       Note the time unit is expressed in <literal>μs</literal>.</para>
+
+      <para><varname>ProtectControlGroupsEx</varname> implement the destination parameter of the
+      unit file setting <varname>ProtectControlGroups=</varname> listed in
+      <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
+      Unlike boolean <varname>ProtectControlGroups</varname>, <varname>ProtectControlGroupsEx</varname>
+      is a string type.</para>
     </refsect2>
   </refsect1>
 
@@ -5415,6 +5423,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b ProtectControlGroups = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly s ProtectControlGroupsEx = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateNetwork = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateUsers = ...;
@@ -6044,8 +6054,6 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <!--property ProtectKernelLogs is not documented!-->
 
-    <!--property ProtectControlGroups is not documented!-->
-
     <!--property PrivateNetwork is not documented!-->
 
     <!--property PrivateUsers is not documented!-->
@@ -6720,6 +6728,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@@ -7416,6 +7426,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b ProtectControlGroups = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly s ProtectControlGroupsEx = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateNetwork = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateUsers = ...;
@@ -7971,8 +7983,6 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <!--property ProtectKernelLogs is not documented!-->
 
-    <!--property ProtectControlGroups is not documented!-->
-
     <!--property PrivateNetwork is not documented!-->
 
     <!--property PrivateUsers is not documented!-->
@@ -8559,6 +8569,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@@ -9384,6 +9396,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b ProtectControlGroups = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly s ProtectControlGroupsEx = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateNetwork = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateUsers = ...;
@@ -9925,8 +9939,6 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <!--property ProtectKernelLogs is not documented!-->
 
-    <!--property ProtectControlGroups is not documented!-->
-
     <!--property PrivateNetwork is not documented!-->
 
     <!--property PrivateUsers is not documented!-->
@@ -10499,6 +10511,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@@ -12262,7 +12276,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>ImportCredentialEx</varname>,
       <varname>ExtraFileDescriptorNames</varname>,
       <varname>ManagedOOMMemoryPressureDurationUSec</varname>,
-      <varname>BindLogSockets</varname>, and
+      <varname>BindLogSockets</varname>,
+      <varname>ProtectControlGroupsEx</varname>, and
       <varname>PrivateUsersEx</varname> were added in version 257.</para>
     </refsect2>
     <refsect2>
@@ -12303,8 +12318,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>PrivateTmpEx</varname>,
       <varname>ImportCredentialEx</varname>,
       <varname>BindLogSockets</varname>,
-      <varname>PrivateUsersEx</varname>, and
-      <varname>ManagedOOMMemoryPressureDurationUSec</varname> were added in version 257.</para>
+      <varname>PrivateUsersEx</varname>,
+      <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
+      <varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
     </refsect2>
     <refsect2>
       <title>Mount Unit Objects</title>
@@ -12341,8 +12357,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>PrivateTmpEx</varname>,
       <varname>ImportCredentialEx</varname>,
       <varname>BindLogSockets</varname>,
-      <varname>PrivateUsersEx</varname>, and
-      <varname>ManagedOOMMemoryPressureDurationUSec</varname> were added in version 257.</para>
+      <varname>PrivateUsersEx</varname>,
+      <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
+      <varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
     </refsect2>
     <refsect2>
       <title>Swap Unit Objects</title>
@@ -12379,8 +12396,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>PrivateTmpEx</varname>,
       <varname>ImportCredentialEx</varname>,
       <varname>BindLogSockets</varname>,
-      <varname>PrivateUsersEx</varname>, and
-      <varname>ManagedOOMMemoryPressureDurationUSec</varname> were added in version 257.</para>
+      <varname>PrivateUsersEx</varname>,
+      <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
+      <varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
     </refsect2>
     <refsect2>
       <title>Slice Unit Objects</title>
index 6764f89b0209fc70db2d17a2aa7699aa5e94b17d..f84204d247e7cae96b71fff462585cb086054c0d 100644 (file)
@@ -2117,14 +2117,22 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
       <varlistentry>
         <term><varname>ProtectControlGroups=</varname></term>
 
-        <listitem><para>Takes a boolean argument. If true, the Linux Control Groups (<citerefentry
-        project='man-pages'><refentrytitle>cgroups</refentrytitle><manvolnum>7</manvolnum></citerefentry>) hierarchies
+        <listitem><para>Takes a boolean argument or the special values <literal>private</literal> or
+        <literal>strict</literal>. If true, the Linux Control Groups (<citerefentry project='man-pages'>
+        <refentrytitle>cgroups</refentrytitle><manvolnum>7</manvolnum></citerefentry>) hierarchies
         accessible through <filename>/sys/fs/cgroup/</filename> will be made read-only to all processes of the
-        unit. Except for container managers no services should require write access to the control groups hierarchies;
-        it is hence recommended to turn this on for most services. For this setting the same restrictions regarding
-        mount propagation and privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see
-        above. Defaults to off. If <varname>ProtectControlGroups=</varname> is set, <varname>MountAPIVFS=yes</varname>
-        is implied.</para>
+        unit. If set to <literal>private</literal>, the unit will run in a cgroup namespace with a private
+        writable mount of <filename>/sys/fs/cgroup/</filename>. If set to <literal>strict</literal>, the unit
+        will run in a cgroup namespace with a private read-only mount of <filename>/sys/fs/cgroup/</filename>.
+        Defaults to off. If <varname>ProtectControlGroups=</varname> is set, <varname>MountAPIVFS=yes</varname>
+        is implied. Note <literal>private</literal> and <literal>strict</literal> are downgraded to false and
+        true respectively unless the system is using the unified control group hierarchy and the kernel supports
+        cgroup namespaces.</para>
+
+        <para>Except for container managers no services should require write access to the control groups hierarchies;
+        it is hence recommended to set <varname>ProtectControlGroups=</varname> to true or <literal>strict</literal>
+        for most services. For this setting the same restrictions regarding mount propagation and privileges apply
+        as for <varname>ReadOnlyPaths=</varname> and related settings, see above.</para>
 
         <xi:include href="system-only.xml" xpointer="singular"/>
 
index 4ef57137ea656210cc01886e17a833039c4bee83..4f627a11e24c273f69142c652284ad1dd54a1021 100644 (file)
@@ -61,6 +61,7 @@ static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_priority, "i", ExecContext,
 static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL);
 static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_tmp_ex, "s", PrivateTmp, private_tmp_to_string);
 static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_users_ex, "s", PrivateUsers, private_users_to_string);
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_protect_control_groups_ex, "s", ProtectControlGroups, protect_control_groups_to_string);
 static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI);
 static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC);
 static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa);
@@ -1179,6 +1180,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectKernelLogs", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_logs), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectControlGroups", "b", property_get_protect_control_groups, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectControlGroupsEx", "s", property_get_protect_control_groups_ex, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateUsers", "b", property_get_private_users, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateUsersEx", "s", property_get_private_users_ex, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1939,6 +1941,27 @@ int bus_exec_context_set_transient_property(
                 return 1;
         }
 
+        if (streq(name, "ProtectControlGroupsEx")) {
+                const char *s;
+                ProtectControlGroups t;
+
+                r = sd_bus_message_read(message, "s", &s);
+                if (r < 0)
+                        return r;
+
+                t = protect_control_groups_from_string(s);
+                if (t < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s setting: %s", name, s);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->protect_control_groups = t;
+                        (void) unit_write_settingf(u, flags, name, "ProtectControlGroups=%s",
+                                                   protect_control_groups_to_string(c->protect_control_groups));
+                }
+
+                return 1;
+        }
+
         if (streq(name, "PrivateDevices"))
                 return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error);
 
index a0caf9bf893434a790bada17bcba783c2cdefb99..eda0aee7c2e565c4d38ca2015cdb52107952c8c9 100644 (file)
@@ -3098,7 +3098,7 @@ static int apply_mount_namespace(
 
         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
          * service will need to write to it in order to start the notifications. */
-        if (context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
+        if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
                 if (!read_write_paths_cleanup)
                         return -ENOMEM;
@@ -3242,7 +3242,7 @@ static int apply_mount_namespace(
                  * sandbox inside the mount namespace. */
                 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
 
-                .protect_control_groups = needs_sandboxing ? context->protect_control_groups : PROTECT_CONTROL_GROUPS_NO,
+                .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO,
                 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
                 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
                 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
@@ -3886,7 +3886,7 @@ static bool exec_context_need_unprivileged_private_users(
                context->protect_kernel_tunables ||
                context->protect_kernel_modules ||
                context->protect_kernel_logs ||
-               context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
+               exec_needs_cgroup_mount(context, params) ||
                context->protect_clock ||
                context->protect_hostname ||
                !strv_isempty(context->read_write_paths) ||
@@ -4580,6 +4580,10 @@ int exec_invoke(
                 }
         }
 
+        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
+         * from it. */
+        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+
         if (params->cgroup_path) {
                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
@@ -4623,6 +4627,18 @@ int exec_invoke(
                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
                                         memory_pressure_path = mfree(memory_pressure_path);
                                 }
+                                /* First we use the current cgroup path to chmod and chown the memory pressure path, then pass the path relative
+                                 * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
+                                 * pressure path environment variable or read-write mount to the unit. This is why we check if
+                                 * memory_pressure_path != NULL in the conditional below. */
+                                if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
+                                        memory_pressure_path = mfree(memory_pressure_path);
+                                        r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
+                                        if (r < 0) {
+                                                *exit_status = EXIT_MEMORY;
+                                                return log_oom();
+                                        }
+                                }
                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_NO) {
                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
                                 if (!memory_pressure_path) {
@@ -4709,10 +4725,6 @@ int exec_invoke(
                 return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
         }
 
-        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
-         * from it. */
-        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
-
         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
          * for it, and the kernel doesn't actually support ambient caps. */
         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
@@ -4853,6 +4865,14 @@ int exec_invoke(
                         log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
         }
 
+        if (needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
+                r = unshare(CLONE_NEWCGROUP);
+                if (r < 0) {
+                        *exit_status = EXIT_NAMESPACE;
+                        return log_exec_error_errno(context, params, r, "Failed to set up cgroup namespacing: %m");
+                }
+        }
+
         if (needs_mount_namespace) {
                 _cleanup_free_ char *error_path = NULL;
 
index c1acc992f5264f360048731651760b8f2579a296..1fda693344fd32b3338e4b4f9b57f5f695a7776b 100644 (file)
@@ -210,6 +210,50 @@ bool exec_needs_ipc_namespace(const ExecContext *context) {
         return context->private_ipc || context->ipc_namespace_path;
 }
 
+static bool can_apply_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
+        return cg_all_unified() > 0 && ns_type_supported(NAMESPACE_CGROUP);
+}
+
+static bool needs_cgroup_namespace(ProtectControlGroups i) {
+        return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT);
+}
+
+ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) {
+        assert(context);
+
+        /* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually
+         * use cgroup namespace, either from not having unified hierarchy or kernel support, we ignore the
+         * setting and do not unshare the namespace. ProtectControlGroups=private and strict get downgraded
+         * to no and yes respectively. This ensures that strict always gets a read-only mount of /sys/fs/cgroup.
+         *
+         * TODO: Remove fallback once cgroupv1 support is removed in v258. */
+        if (needs_cgroup_namespace(context->protect_control_groups) && !can_apply_cgroup_namespace(context, params)) {
+                if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_PRIVATE)
+                        return PROTECT_CONTROL_GROUPS_NO;
+                if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_STRICT)
+                        return PROTECT_CONTROL_GROUPS_YES;
+        }
+        return context->protect_control_groups;
+}
+
+bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
+        assert(context);
+
+        return needs_cgroup_namespace(exec_get_protect_control_groups(context, params));
+}
+
+bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) {
+        assert(context);
+
+        return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO;
+}
+
+bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) {
+        assert(context);
+
+        return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
+}
+
 bool exec_needs_mount_namespace(
                 const ExecContext *context,
                 const ExecParameters *params,
@@ -259,7 +303,7 @@ bool exec_needs_mount_namespace(
             context->protect_kernel_tunables ||
             context->protect_kernel_modules ||
             context->protect_kernel_logs ||
-            context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
+            exec_needs_cgroup_mount(context, params) ||
             context->protect_proc != PROTECT_PROC_DEFAULT ||
             context->proc_subset != PROC_SUBSET_ALL ||
             exec_needs_ipc_namespace(context))
index 5fbc196cb70cf8c374cfea704312caaf9ad7adab..1f9b3f8f142358928a2513d9aa6ddd0a88fb6e60 100644 (file)
@@ -616,6 +616,11 @@ bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters
 bool exec_needs_network_namespace(const ExecContext *context);
 bool exec_needs_ipc_namespace(const ExecContext *context);
 
+ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
+bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
+bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params);
+bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params);
+
 /* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters
  * instead of the unit object, so that it can be used in the sd-executor context (where the unit object is
  * not available). */
index 371d4a237ad8b51c53b3815ba646fdd280bd9540..a0d3dc0cbb9b966ba6a86111df502d0335537040 100644 (file)
@@ -65,6 +65,7 @@ typedef enum MountMode {
         MOUNT_PRIVATE_SYSFS,
         MOUNT_BIND_SYSFS,
         MOUNT_PROCFS,
+        MOUNT_PRIVATE_CGROUP2FS,
         MOUNT_READ_ONLY,
         MOUNT_READ_WRITE,
         MOUNT_NOEXEC,
@@ -204,6 +205,17 @@ static const MountEntry protect_control_groups_yes_table[] = {
         { "/sys/fs/cgroup",      MOUNT_READ_ONLY,         false  },
 };
 
+/* ProtectControlGroups=private table. Note mount_private_apivfs() always use MS_NOSUID|MS_NOEXEC|MS_NODEV so
+ * flags is not set here. nsdelegate has been supported since kernels >= 4.13 so it is safe to use. */
+static const MountEntry protect_control_groups_private_table[] = {
+        { "/sys/fs/cgroup",      MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false, .nosuid = true, .noexec = true, .options_const = "nsdelegate"  },
+};
+
+/* ProtectControlGroups=strict table */
+static const MountEntry protect_control_groups_strict_table[] = {
+        { "/sys/fs/cgroup",      MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true,  .nosuid = true, .noexec = true, .options_const = "nsdelegate"  },
+};
+
 /* ProtectSystem=yes table */
 static const MountEntry protect_system_yes_table[] = {
         { "/usr",                MOUNT_READ_ONLY,     false },
@@ -252,6 +264,7 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
         [MOUNT_EMPTY_DIR]             = "empty-dir",
         [MOUNT_PRIVATE_SYSFS]         = "private-sysfs",
         [MOUNT_BIND_SYSFS]            = "bind-sysfs",
+        [MOUNT_PRIVATE_CGROUP2FS]     = "private-cgroup2fs",
         [MOUNT_PROCFS]                = "procfs",
         [MOUNT_READ_ONLY]             = "read-only",
         [MOUNT_READ_WRITE]            = "read-write",
@@ -743,6 +756,12 @@ static int append_protect_control_groups(MountList *ml, ProtectControlGroups pro
         case PROTECT_CONTROL_GROUPS_YES:
                 return append_static_mounts(ml, protect_control_groups_yes_table, ELEMENTSOF(protect_control_groups_yes_table), ignore_protect);
 
+        case PROTECT_CONTROL_GROUPS_PRIVATE:
+                return append_static_mounts(ml, protect_control_groups_private_table, ELEMENTSOF(protect_control_groups_private_table), ignore_protect);
+
+        case PROTECT_CONTROL_GROUPS_STRICT:
+                return append_static_mounts(ml, protect_control_groups_strict_table, ELEMENTSOF(protect_control_groups_strict_table), ignore_protect);
+
         default:
                 assert_not_reached();
         }
@@ -1290,10 +1309,14 @@ static int mount_private_apivfs(
 
         r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
         if (r == -EINVAL && opts)
-                /* If this failed with EINVAL then this likely means the textual hidepid= stuff for procfs is
-                 * not supported by the kernel, and thus the per-instance hidepid= neither, which means we
-                 * really don't want to use it, since it would affect our host's /proc mount. Hence let's
-                 * gracefully fallback to a classic, unrestricted version. */
+                /* If this failed with EINVAL then this likely means either:
+                 * 1. the textual hidepid= stuff for procfs is not supported by the kernel, and thus the
+                 *    per-instance hidepid= neither, which means we really don't want to use it, since it
+                 *    would affect our host's /proc mount.
+                 * 2. nsdelegate for cgroup2 is not supported by the kernel even though CLONE_NEWCGROUP
+                 *    is supported.
+                 *
+                 * Hence let's gracefully fallback to a classic, unrestricted version. */
                 r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
         if (ERRNO_IS_NEG_PRIVILEGE(r)) {
                 /* When we do not have enough privileges to mount a new instance, fall back to use an
@@ -1339,6 +1362,39 @@ static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p
         return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL, p->runtime_scope);
 }
 
+static bool check_recursiveprot_supported(void) {
+        int r;
+
+        /* memory_recursiveprot is only supported for kernels >= 5.7. Note mount_option_supported uses fsopen()
+         * and fsconfig() which are supported for kernels >= 5.2. So if mount_option_supported() returns an
+         * error, we can assume memory_recursiveprot is not supported. */
+        r = mount_option_supported("cgroup2", "memory_recursiveprot", NULL);
+        if (r < 0)
+                log_debug_errno(r, "Failed to determine whether the 'memory_recursiveprot' mount option is supported, assuming not: %m");
+        else if (r == 0)
+                log_debug("This kernel version does not support 'memory_recursiveprot', not using mount option.");
+
+        return r > 0;
+}
+
+static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) {
+        _cleanup_free_ char *opts = NULL;
+
+        assert(m);
+        assert(p);
+
+        if (check_recursiveprot_supported()) {
+                opts = strdup(strempty(mount_entry_options(m)));
+                if (!opts)
+                        return -ENOMEM;
+
+                if (!strextend_with_separator(&opts, ",", "memory_recursiveprot"))
+                        return -ENOMEM;
+        }
+
+        return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", opts ?: mount_entry_options(m), p->runtime_scope);
+}
+
 static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
         _cleanup_free_ char *opts = NULL;
 
@@ -1784,6 +1840,9 @@ static int apply_one_mount(
         case MOUNT_PROCFS:
                 return mount_procfs(m, p);
 
+        case MOUNT_PRIVATE_CGROUP2FS:
+                return mount_private_cgroup2fs(m, p);
+
         case MOUNT_RUN:
                 return mount_run(m);
 
@@ -3212,6 +3271,8 @@ DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_S
 static const char *const protect_control_groups_table[_PROTECT_CONTROL_GROUPS_MAX] = {
         [PROTECT_CONTROL_GROUPS_NO]      = "no",
         [PROTECT_CONTROL_GROUPS_YES]     = "yes",
+        [PROTECT_CONTROL_GROUPS_PRIVATE] = "private",
+        [PROTECT_CONTROL_GROUPS_STRICT]  = "strict",
 };
 
 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_control_groups, ProtectControlGroups, PROTECT_CONTROL_GROUPS_YES);
index 17c0dd9e187c6087aa1ae323af085c83460a7258..fa4f7c7cf671b427d17de64b490f96123fe34a85 100644 (file)
@@ -72,6 +72,8 @@ typedef enum PrivateUsers {
 typedef enum ProtectControlGroups {
         PROTECT_CONTROL_GROUPS_NO,
         PROTECT_CONTROL_GROUPS_YES,
+        PROTECT_CONTROL_GROUPS_PRIVATE,
+        PROTECT_CONTROL_GROUPS_STRICT,
         _PROTECT_CONTROL_GROUPS_MAX,
         _PROTECT_CONTROL_GROUPS_INVALID = -EINVAL,
 } ProtectControlGroups;
index 59e49018788b466d6c24c800afde5849a8138722..2e17bae51afc677fb178f0aa130ea4bb47aa60a3 100644 (file)
@@ -1047,6 +1047,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
                               "ProtectHome",
                               "PrivateTmpEx",
                               "PrivateUsersEx",
+                              "ProtectControlGroupsEx",
                               "SELinuxContext",
                               "RootImage",
                               "RootVerity",
diff --git a/test/units/TEST-07-PID1.protect-control-groups.sh b/test/units/TEST-07-PID1.protect-control-groups.sh
new file mode 100755 (executable)
index 0000000..e7752ff
--- /dev/null
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# shellcheck disable=SC2016
+set -eux
+set -o pipefail
+
+# shellcheck source=test/units/test-control.sh
+. "$(dirname "$0")"/test-control.sh
+# shellcheck source=test/units/util.sh
+. "$(dirname "$0")"/util.sh
+
+SLICE="system.slice"
+UNIT_PREFIX="test-07-protect-control-groups"
+
+READ_ONLY_MOUNT_FLAG="ro"
+READ_WRITE_MOUNT_FLAG="rw"
+
+at_exit() {
+    set +e
+
+    systemctl stop "$UNIT_PREFIX*.service"
+    systemctl reset-failed
+}
+
+trap at_exit EXIT
+
+ROOT_CGROUP_NS=$(readlink /proc/self/ns/cgroup)
+
+ENABLE_MEM_PRESSURE_TEST=true
+
+# We do not just test if the file exists, but try to read from it, since if
+# CONFIG_PSI_DEFAULT_DISABLED is set in the kernel the file will exist and can
+# be opened, but any read()s will fail with EOPNOTSUPP, which we want to
+# detect.
+if ! cat /proc/pressure/memory >/dev/null ; then
+    echo "Kernel too old, has no PSI, not running ProtectControlGroups= with MemoryPressureWatch= test." >&2
+    ENABLE_MEM_PRESSURE_TEST=false
+fi
+
+if ! test -f "/sys/fs/cgroup/$(systemctl show TEST-07-PID1.service -P ControlGroup)/memory.pressure" ; then
+    echo "No memory accounting/PSI delegated via cgroup, not running ProtectControlGroups= with MemoryPressureWatch= test." >&2
+    ENABLE_MEM_PRESSURE_TEST=false
+fi
+
+test_basic() {
+    local protect_control_groups_ex="$1"
+    local protect_control_groups="$2"
+    local in_cgroup_ns="$3"
+    local mount_flag="$4"
+
+    if [[ $in_cgroup_ns == true ]]; then
+      local ns_cmp_op="!="
+      local unit_cgroup="0::/"
+      local memory_pressure_watch="/sys/fs/cgroup/memory.pressure"
+    else
+      local ns_cmp_op="=="
+      local unit_cgroup="0::/$SLICE/$UNIT_PREFIX-$protect_control_groups_ex-1.service"
+      local memory_pressure_watch="/sys/fs/cgroup/$SLICE/$UNIT_PREFIX-$protect_control_groups_ex-2.service/memory.pressure"
+    fi
+
+    # Compare cgroup namespace to root namespace
+    systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \
+            bash -xec "test \"\$(readlink /proc/self/ns/cgroup)\" $ns_cmp_op \"$ROOT_CGROUP_NS\""
+    # Verify unit cgroup
+    systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \
+            --unit "$UNIT_PREFIX-$protect_control_groups_ex-1" \
+            bash -xec "test \"\$(cat /proc/self/cgroup)\" == \"$unit_cgroup\""
+    # Verify memory pressure watch points to correct file
+    if [[ $ENABLE_MEM_PRESSURE_TEST == true ]]; then
+        systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" -p MemoryPressureWatch=yes --slice "$SLICE" --wait \
+                --unit "$UNIT_PREFIX-$protect_control_groups_ex-2" \
+                bash -xec "test \"\$MEMORY_PRESSURE_WATCH\" == \"$memory_pressure_watch\""
+    fi
+    # Verify /sys/fs/cgroup mount is read-only or read-write
+    systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \
+            bash -xec "[[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o FSTYPE)\" == cgroup2 ]];
+                       [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o FS-OPTIONS)\" =~ nsdelegate ]];
+                       [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ noexec ]];
+                       [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ nosuid ]];
+                       [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ nodev ]];
+                       [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ \"$mount_flag\" ]];"
+
+    # Verify dbus properties
+    systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --remain-after-exit \
+            --unit "$UNIT_PREFIX-$protect_control_groups_ex-3" true
+    assert_eq "$(systemctl show -P ProtectControlGroupsEx "$UNIT_PREFIX-$protect_control_groups_ex-3")" "$protect_control_groups_ex"
+    assert_eq "$(systemctl show -P ProtectControlGroups "$UNIT_PREFIX-$protect_control_groups_ex-3")" "$protect_control_groups"
+    systemctl stop "$UNIT_PREFIX-$protect_control_groups_ex-3"
+}
+
+testcase_basic_no() {
+    test_basic "no" "no" false "$READ_WRITE_MOUNT_FLAG"
+}
+
+testcase_basic_yes() {
+    test_basic "yes" "yes" false "$READ_ONLY_MOUNT_FLAG"
+}
+
+testcase_basic_private() {
+    test_basic "private" "yes" true "$READ_WRITE_MOUNT_FLAG"
+}
+
+testcase_basic_strict() {
+    test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG"
+}
+
+run_testcases