From: Ryan Wilson <ryantimwilson@meta.com>
Date: Fri, 18 Oct 2024 18:41:09 +0000 (-0700)
Subject: cgroup: Add support for ProtectControlGroups= private and strict
X-Git-Tag: v257-rc1~127^2
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=refs%2Fpull%2F34806%2Fhead;p=thirdparty%2Fsystemd.git

cgroup: Add support for ProtectControlGroups= private and strict

This commit adds two settings private and strict to
the ProtectControlGroups= property. Private will unshare the cgroup
namespace and mount a read-write private cgroup2 filesystem at /sys/fs/cgroup.
Strict does the same except the mount is read-only. Since the unit is
running in a cgroup namespace, the new root of /sys/fs/cgroup is the unit's
own cgroup.

We also add a new dbus property ProtectControlGroupsEx which accepts strings
instead of boolean. This will allow users to use private/strict via dbus
and systemd-run in addition to service files.

Note private and strict fall back to no and yes respectively if the kernel
doesn't support cgroup2 or system is not using unified hierarchy.

Fixes: #34634
---

diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml
index d86a5d9f325..f484f28a70b 100644
--- a/man/org.freedesktop.systemd1.xml
+++ b/man/org.freedesktop.systemd1.xml
@@ -3251,6 +3251,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b ProtectControlGroups = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly s ProtectControlGroupsEx = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateNetwork = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateUsers = ...;
@@ -3868,8 +3870,6 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <!--property ProtectKernelLogs is not documented!-->
 
-    <!--property ProtectControlGroups is not documented!-->
-
     <!--property PrivateNetwork is not documented!-->
 
     <!--property PrivateUsers is not documented!-->
@@ -4572,6 +4572,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@@ -4858,6 +4860,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       unit file setting <varname>ManagedOOMMemoryPressureDurationSec=</varname> listed in
       <citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
       Note the time unit is expressed in <literal>Î¼s</literal>.</para>
+
+      <para><varname>ProtectControlGroupsEx</varname> implement the destination parameter of the
+      unit file setting <varname>ProtectControlGroups=</varname> listed in
+      <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
+      Unlike boolean <varname>ProtectControlGroups</varname>, <varname>ProtectControlGroupsEx</varname>
+      is a string type.</para>
     </refsect2>
   </refsect1>
 
@@ -5415,6 +5423,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b ProtectControlGroups = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly s ProtectControlGroupsEx = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateNetwork = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateUsers = ...;
@@ -6044,8 +6054,6 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <!--property ProtectKernelLogs is not documented!-->
 
-    <!--property ProtectControlGroups is not documented!-->
-
     <!--property PrivateNetwork is not documented!-->
 
     <!--property PrivateUsers is not documented!-->
@@ -6720,6 +6728,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@@ -7416,6 +7426,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b ProtectControlGroups = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly s ProtectControlGroupsEx = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateNetwork = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateUsers = ...;
@@ -7971,8 +7983,6 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <!--property ProtectKernelLogs is not documented!-->
 
-    <!--property ProtectControlGroups is not documented!-->
-
     <!--property PrivateNetwork is not documented!-->
 
     <!--property PrivateUsers is not documented!-->
@@ -8559,6 +8569,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@@ -9384,6 +9396,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b ProtectControlGroups = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly s ProtectControlGroupsEx = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateNetwork = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b PrivateUsers = ...;
@@ -9925,8 +9939,6 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <!--property ProtectKernelLogs is not documented!-->
 
-    <!--property ProtectControlGroups is not documented!-->
-
     <!--property PrivateNetwork is not documented!-->
 
     <!--property PrivateUsers is not documented!-->
@@ -10499,6 +10511,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@@ -12262,7 +12276,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>ImportCredentialEx</varname>,
       <varname>ExtraFileDescriptorNames</varname>,
       <varname>ManagedOOMMemoryPressureDurationUSec</varname>,
-      <varname>BindLogSockets</varname>, and
+      <varname>BindLogSockets</varname>,
+      <varname>ProtectControlGroupsEx</varname>, and
       <varname>PrivateUsersEx</varname> were added in version 257.</para>
     </refsect2>
     <refsect2>
@@ -12303,8 +12318,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>PrivateTmpEx</varname>,
       <varname>ImportCredentialEx</varname>,
       <varname>BindLogSockets</varname>,
-      <varname>PrivateUsersEx</varname>, and
-      <varname>ManagedOOMMemoryPressureDurationUSec</varname> were added in version 257.</para>
+      <varname>PrivateUsersEx</varname>,
+      <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
+      <varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
     </refsect2>
     <refsect2>
       <title>Mount Unit Objects</title>
@@ -12341,8 +12357,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>PrivateTmpEx</varname>,
       <varname>ImportCredentialEx</varname>,
       <varname>BindLogSockets</varname>,
-      <varname>PrivateUsersEx</varname>, and
-      <varname>ManagedOOMMemoryPressureDurationUSec</varname> were added in version 257.</para>
+      <varname>PrivateUsersEx</varname>,
+      <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
+      <varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
     </refsect2>
     <refsect2>
       <title>Swap Unit Objects</title>
@@ -12379,8 +12396,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>PrivateTmpEx</varname>,
       <varname>ImportCredentialEx</varname>,
       <varname>BindLogSockets</varname>,
-      <varname>PrivateUsersEx</varname>, and
-      <varname>ManagedOOMMemoryPressureDurationUSec</varname> were added in version 257.</para>
+      <varname>PrivateUsersEx</varname>,
+      <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
+      <varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
     </refsect2>
     <refsect2>
       <title>Slice Unit Objects</title>
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 6764f89b020..f84204d247e 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -2117,14 +2117,22 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
       <varlistentry>
         <term><varname>ProtectControlGroups=</varname></term>
 
-        <listitem><para>Takes a boolean argument. If true, the Linux Control Groups (<citerefentry
-        project='man-pages'><refentrytitle>cgroups</refentrytitle><manvolnum>7</manvolnum></citerefentry>) hierarchies
+        <listitem><para>Takes a boolean argument or the special values <literal>private</literal> or
+        <literal>strict</literal>. If true, the Linux Control Groups (<citerefentry project='man-pages'>
+        <refentrytitle>cgroups</refentrytitle><manvolnum>7</manvolnum></citerefentry>) hierarchies
         accessible through <filename>/sys/fs/cgroup/</filename> will be made read-only to all processes of the
-        unit. Except for container managers no services should require write access to the control groups hierarchies;
-        it is hence recommended to turn this on for most services. For this setting the same restrictions regarding
-        mount propagation and privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see
-        above. Defaults to off. If <varname>ProtectControlGroups=</varname> is set, <varname>MountAPIVFS=yes</varname>
-        is implied.</para>
+        unit. If set to <literal>private</literal>, the unit will run in a cgroup namespace with a private
+        writable mount of <filename>/sys/fs/cgroup/</filename>. If set to <literal>strict</literal>, the unit
+        will run in a cgroup namespace with a private read-only mount of <filename>/sys/fs/cgroup/</filename>.
+        Defaults to off. If <varname>ProtectControlGroups=</varname> is set, <varname>MountAPIVFS=yes</varname>
+        is implied. Note <literal>private</literal> and <literal>strict</literal> are downgraded to false and
+        true respectively unless the system is using the unified control group hierarchy and the kernel supports
+        cgroup namespaces.</para>
+
+        <para>Except for container managers no services should require write access to the control groups hierarchies;
+        it is hence recommended to set <varname>ProtectControlGroups=</varname> to true or <literal>strict</literal>
+        for most services. For this setting the same restrictions regarding mount propagation and privileges apply
+        as for <varname>ReadOnlyPaths=</varname> and related settings, see above.</para>
 
         <xi:include href="system-only.xml" xpointer="singular"/>
 
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index 4ef57137ea6..4f627a11e24 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -61,6 +61,7 @@ static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_priority, "i", ExecContext,
 static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL);
 static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_tmp_ex, "s", PrivateTmp, private_tmp_to_string);
 static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_users_ex, "s", PrivateUsers, private_users_to_string);
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_protect_control_groups_ex, "s", ProtectControlGroups, protect_control_groups_to_string);
 static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI);
 static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC);
 static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa);
@@ -1179,6 +1180,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectKernelLogs", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_logs), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectControlGroups", "b", property_get_protect_control_groups, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectControlGroupsEx", "s", property_get_protect_control_groups_ex, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateUsers", "b", property_get_private_users, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateUsersEx", "s", property_get_private_users_ex, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1939,6 +1941,27 @@ int bus_exec_context_set_transient_property(
                 return 1;
         }
 
+        if (streq(name, "ProtectControlGroupsEx")) {
+                const char *s;
+                ProtectControlGroups t;
+
+                r = sd_bus_message_read(message, "s", &s);
+                if (r < 0)
+                        return r;
+
+                t = protect_control_groups_from_string(s);
+                if (t < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s setting: %s", name, s);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->protect_control_groups = t;
+                        (void) unit_write_settingf(u, flags, name, "ProtectControlGroups=%s",
+                                                   protect_control_groups_to_string(c->protect_control_groups));
+                }
+
+                return 1;
+        }
+
         if (streq(name, "PrivateDevices"))
                 return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error);
 
diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
index a0caf9bf893..eda0aee7c2e 100644
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -3098,7 +3098,7 @@ static int apply_mount_namespace(
 
         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
          * service will need to write to it in order to start the notifications. */
-        if (context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
+        if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
                 if (!read_write_paths_cleanup)
                         return -ENOMEM;
@@ -3242,7 +3242,7 @@ static int apply_mount_namespace(
                  * sandbox inside the mount namespace. */
                 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
 
-                .protect_control_groups = needs_sandboxing ? context->protect_control_groups : PROTECT_CONTROL_GROUPS_NO,
+                .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO,
                 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
                 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
                 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
@@ -3886,7 +3886,7 @@ static bool exec_context_need_unprivileged_private_users(
                context->protect_kernel_tunables ||
                context->protect_kernel_modules ||
                context->protect_kernel_logs ||
-               context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
+               exec_needs_cgroup_mount(context, params) ||
                context->protect_clock ||
                context->protect_hostname ||
                !strv_isempty(context->read_write_paths) ||
@@ -4580,6 +4580,10 @@ int exec_invoke(
                 }
         }
 
+        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
+         * from it. */
+        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+
         if (params->cgroup_path) {
                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
@@ -4623,6 +4627,18 @@ int exec_invoke(
                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
                                         memory_pressure_path = mfree(memory_pressure_path);
                                 }
+                                /* First we use the current cgroup path to chmod and chown the memory pressure path, then pass the path relative
+                                 * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
+                                 * pressure path environment variable or read-write mount to the unit. This is why we check if
+                                 * memory_pressure_path != NULL in the conditional below. */
+                                if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
+                                        memory_pressure_path = mfree(memory_pressure_path);
+                                        r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
+                                        if (r < 0) {
+                                                *exit_status = EXIT_MEMORY;
+                                                return log_oom();
+                                        }
+                                }
                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_NO) {
                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
                                 if (!memory_pressure_path) {
@@ -4709,10 +4725,6 @@ int exec_invoke(
                 return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
         }
 
-        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
-         * from it. */
-        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
-
         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
          * for it, and the kernel doesn't actually support ambient caps. */
         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
@@ -4853,6 +4865,14 @@ int exec_invoke(
                         log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
         }
 
+        if (needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
+                r = unshare(CLONE_NEWCGROUP);
+                if (r < 0) {
+                        *exit_status = EXIT_NAMESPACE;
+                        return log_exec_error_errno(context, params, r, "Failed to set up cgroup namespacing: %m");
+                }
+        }
+
         if (needs_mount_namespace) {
                 _cleanup_free_ char *error_path = NULL;
 
diff --git a/src/core/execute.c b/src/core/execute.c
index c1acc992f52..1fda693344f 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -210,6 +210,50 @@ bool exec_needs_ipc_namespace(const ExecContext *context) {
         return context->private_ipc || context->ipc_namespace_path;
 }
 
+static bool can_apply_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
+        return cg_all_unified() > 0 && ns_type_supported(NAMESPACE_CGROUP);
+}
+
+static bool needs_cgroup_namespace(ProtectControlGroups i) {
+        return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT);
+}
+
+ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) {
+        assert(context);
+
+        /* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually
+         * use cgroup namespace, either from not having unified hierarchy or kernel support, we ignore the
+         * setting and do not unshare the namespace. ProtectControlGroups=private and strict get downgraded
+         * to no and yes respectively. This ensures that strict always gets a read-only mount of /sys/fs/cgroup.
+         *
+         * TODO: Remove fallback once cgroupv1 support is removed in v258. */
+        if (needs_cgroup_namespace(context->protect_control_groups) && !can_apply_cgroup_namespace(context, params)) {
+                if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_PRIVATE)
+                        return PROTECT_CONTROL_GROUPS_NO;
+                if (context->protect_control_groups == PROTECT_CONTROL_GROUPS_STRICT)
+                        return PROTECT_CONTROL_GROUPS_YES;
+        }
+        return context->protect_control_groups;
+}
+
+bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
+        assert(context);
+
+        return needs_cgroup_namespace(exec_get_protect_control_groups(context, params));
+}
+
+bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) {
+        assert(context);
+
+        return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO;
+}
+
+bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) {
+        assert(context);
+
+        return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
+}
+
 bool exec_needs_mount_namespace(
                 const ExecContext *context,
                 const ExecParameters *params,
@@ -259,7 +303,7 @@ bool exec_needs_mount_namespace(
             context->protect_kernel_tunables ||
             context->protect_kernel_modules ||
             context->protect_kernel_logs ||
-            context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
+            exec_needs_cgroup_mount(context, params) ||
             context->protect_proc != PROTECT_PROC_DEFAULT ||
             context->proc_subset != PROC_SUBSET_ALL ||
             exec_needs_ipc_namespace(context))
diff --git a/src/core/execute.h b/src/core/execute.h
index 5fbc196cb70..1f9b3f8f142 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -616,6 +616,11 @@ bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters
 bool exec_needs_network_namespace(const ExecContext *context);
 bool exec_needs_ipc_namespace(const ExecContext *context);
 
+ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
+bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
+bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params);
+bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params);
+
 /* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters
  * instead of the unit object, so that it can be used in the sd-executor context (where the unit object is
  * not available). */
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 371d4a237ad..a0d3dc0cbb9 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -65,6 +65,7 @@ typedef enum MountMode {
         MOUNT_PRIVATE_SYSFS,
         MOUNT_BIND_SYSFS,
         MOUNT_PROCFS,
+        MOUNT_PRIVATE_CGROUP2FS,
         MOUNT_READ_ONLY,
         MOUNT_READ_WRITE,
         MOUNT_NOEXEC,
@@ -204,6 +205,17 @@ static const MountEntry protect_control_groups_yes_table[] = {
         { "/sys/fs/cgroup",      MOUNT_READ_ONLY,         false  },
 };
 
+/* ProtectControlGroups=private table. Note mount_private_apivfs() always use MS_NOSUID|MS_NOEXEC|MS_NODEV so
+ * flags is not set here. nsdelegate has been supported since kernels >= 4.13 so it is safe to use. */
+static const MountEntry protect_control_groups_private_table[] = {
+        { "/sys/fs/cgroup",      MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false, .nosuid = true, .noexec = true, .options_const = "nsdelegate"  },
+};
+
+/* ProtectControlGroups=strict table */
+static const MountEntry protect_control_groups_strict_table[] = {
+        { "/sys/fs/cgroup",      MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true,  .nosuid = true, .noexec = true, .options_const = "nsdelegate"  },
+};
+
 /* ProtectSystem=yes table */
 static const MountEntry protect_system_yes_table[] = {
         { "/usr",                MOUNT_READ_ONLY,     false },
@@ -252,6 +264,7 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
         [MOUNT_EMPTY_DIR]             = "empty-dir",
         [MOUNT_PRIVATE_SYSFS]         = "private-sysfs",
         [MOUNT_BIND_SYSFS]            = "bind-sysfs",
+        [MOUNT_PRIVATE_CGROUP2FS]     = "private-cgroup2fs",
         [MOUNT_PROCFS]                = "procfs",
         [MOUNT_READ_ONLY]             = "read-only",
         [MOUNT_READ_WRITE]            = "read-write",
@@ -743,6 +756,12 @@ static int append_protect_control_groups(MountList *ml, ProtectControlGroups pro
         case PROTECT_CONTROL_GROUPS_YES:
                 return append_static_mounts(ml, protect_control_groups_yes_table, ELEMENTSOF(protect_control_groups_yes_table), ignore_protect);
 
+        case PROTECT_CONTROL_GROUPS_PRIVATE:
+                return append_static_mounts(ml, protect_control_groups_private_table, ELEMENTSOF(protect_control_groups_private_table), ignore_protect);
+
+        case PROTECT_CONTROL_GROUPS_STRICT:
+                return append_static_mounts(ml, protect_control_groups_strict_table, ELEMENTSOF(protect_control_groups_strict_table), ignore_protect);
+
         default:
                 assert_not_reached();
         }
@@ -1290,10 +1309,14 @@ static int mount_private_apivfs(
 
         r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
         if (r == -EINVAL && opts)
-                /* If this failed with EINVAL then this likely means the textual hidepid= stuff for procfs is
-                 * not supported by the kernel, and thus the per-instance hidepid= neither, which means we
-                 * really don't want to use it, since it would affect our host's /proc mount. Hence let's
-                 * gracefully fallback to a classic, unrestricted version. */
+                /* If this failed with EINVAL then this likely means either:
+                 * 1. the textual hidepid= stuff for procfs is not supported by the kernel, and thus the
+                 *    per-instance hidepid= neither, which means we really don't want to use it, since it
+                 *    would affect our host's /proc mount.
+                 * 2. nsdelegate for cgroup2 is not supported by the kernel even though CLONE_NEWCGROUP
+                 *    is supported.
+                 *
+                 * Hence let's gracefully fallback to a classic, unrestricted version. */
                 r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
         if (ERRNO_IS_NEG_PRIVILEGE(r)) {
                 /* When we do not have enough privileges to mount a new instance, fall back to use an
@@ -1339,6 +1362,39 @@ static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p
         return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL, p->runtime_scope);
 }
 
+static bool check_recursiveprot_supported(void) {
+        int r;
+
+        /* memory_recursiveprot is only supported for kernels >= 5.7. Note mount_option_supported uses fsopen()
+         * and fsconfig() which are supported for kernels >= 5.2. So if mount_option_supported() returns an
+         * error, we can assume memory_recursiveprot is not supported. */
+        r = mount_option_supported("cgroup2", "memory_recursiveprot", NULL);
+        if (r < 0)
+                log_debug_errno(r, "Failed to determine whether the 'memory_recursiveprot' mount option is supported, assuming not: %m");
+        else if (r == 0)
+                log_debug("This kernel version does not support 'memory_recursiveprot', not using mount option.");
+
+        return r > 0;
+}
+
+static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) {
+        _cleanup_free_ char *opts = NULL;
+
+        assert(m);
+        assert(p);
+
+        if (check_recursiveprot_supported()) {
+                opts = strdup(strempty(mount_entry_options(m)));
+                if (!opts)
+                        return -ENOMEM;
+
+                if (!strextend_with_separator(&opts, ",", "memory_recursiveprot"))
+                        return -ENOMEM;
+        }
+
+        return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", opts ?: mount_entry_options(m), p->runtime_scope);
+}
+
 static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
         _cleanup_free_ char *opts = NULL;
 
@@ -1784,6 +1840,9 @@ static int apply_one_mount(
         case MOUNT_PROCFS:
                 return mount_procfs(m, p);
 
+        case MOUNT_PRIVATE_CGROUP2FS:
+                return mount_private_cgroup2fs(m, p);
+
         case MOUNT_RUN:
                 return mount_run(m);
 
@@ -3212,6 +3271,8 @@ DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_S
 static const char *const protect_control_groups_table[_PROTECT_CONTROL_GROUPS_MAX] = {
         [PROTECT_CONTROL_GROUPS_NO]      = "no",
         [PROTECT_CONTROL_GROUPS_YES]     = "yes",
+        [PROTECT_CONTROL_GROUPS_PRIVATE] = "private",
+        [PROTECT_CONTROL_GROUPS_STRICT]  = "strict",
 };
 
 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_control_groups, ProtectControlGroups, PROTECT_CONTROL_GROUPS_YES);
diff --git a/src/core/namespace.h b/src/core/namespace.h
index 17c0dd9e187..fa4f7c7cf67 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -72,6 +72,8 @@ typedef enum PrivateUsers {
 typedef enum ProtectControlGroups {
         PROTECT_CONTROL_GROUPS_NO,
         PROTECT_CONTROL_GROUPS_YES,
+        PROTECT_CONTROL_GROUPS_PRIVATE,
+        PROTECT_CONTROL_GROUPS_STRICT,
         _PROTECT_CONTROL_GROUPS_MAX,
         _PROTECT_CONTROL_GROUPS_INVALID = -EINVAL,
 } ProtectControlGroups;
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index 59e49018788..2e17bae51af 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -1047,6 +1047,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
                               "ProtectHome",
                               "PrivateTmpEx",
                               "PrivateUsersEx",
+                              "ProtectControlGroupsEx",
                               "SELinuxContext",
                               "RootImage",
                               "RootVerity",
diff --git a/test/units/TEST-07-PID1.protect-control-groups.sh b/test/units/TEST-07-PID1.protect-control-groups.sh
new file mode 100755
index 00000000000..e7752ffb4b1
--- /dev/null
+++ b/test/units/TEST-07-PID1.protect-control-groups.sh
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# shellcheck disable=SC2016
+set -eux
+set -o pipefail
+
+# shellcheck source=test/units/test-control.sh
+. "$(dirname "$0")"/test-control.sh
+# shellcheck source=test/units/util.sh
+. "$(dirname "$0")"/util.sh
+
+SLICE="system.slice"
+UNIT_PREFIX="test-07-protect-control-groups"
+
+READ_ONLY_MOUNT_FLAG="ro"
+READ_WRITE_MOUNT_FLAG="rw"
+
+at_exit() {
+    set +e
+
+    systemctl stop "$UNIT_PREFIX*.service"
+    systemctl reset-failed
+}
+
+trap at_exit EXIT
+
+ROOT_CGROUP_NS=$(readlink /proc/self/ns/cgroup)
+
+ENABLE_MEM_PRESSURE_TEST=true
+
+# We do not just test if the file exists, but try to read from it, since if
+# CONFIG_PSI_DEFAULT_DISABLED is set in the kernel the file will exist and can
+# be opened, but any read()s will fail with EOPNOTSUPP, which we want to
+# detect.
+if ! cat /proc/pressure/memory >/dev/null ; then
+    echo "Kernel too old, has no PSI, not running ProtectControlGroups= with MemoryPressureWatch= test." >&2
+    ENABLE_MEM_PRESSURE_TEST=false
+fi
+
+if ! test -f "/sys/fs/cgroup/$(systemctl show TEST-07-PID1.service -P ControlGroup)/memory.pressure" ; then
+    echo "No memory accounting/PSI delegated via cgroup, not running ProtectControlGroups= with MemoryPressureWatch= test." >&2
+    ENABLE_MEM_PRESSURE_TEST=false
+fi
+
+test_basic() {
+    local protect_control_groups_ex="$1"
+    local protect_control_groups="$2"
+    local in_cgroup_ns="$3"
+    local mount_flag="$4"
+
+    if [[ $in_cgroup_ns == true ]]; then
+      local ns_cmp_op="!="
+      local unit_cgroup="0::/"
+      local memory_pressure_watch="/sys/fs/cgroup/memory.pressure"
+    else
+      local ns_cmp_op="=="
+      local unit_cgroup="0::/$SLICE/$UNIT_PREFIX-$protect_control_groups_ex-1.service"
+      local memory_pressure_watch="/sys/fs/cgroup/$SLICE/$UNIT_PREFIX-$protect_control_groups_ex-2.service/memory.pressure"
+    fi
+
+    # Compare cgroup namespace to root namespace
+    systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \
+            bash -xec "test \"\$(readlink /proc/self/ns/cgroup)\" $ns_cmp_op \"$ROOT_CGROUP_NS\""
+    # Verify unit cgroup
+    systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \
+            --unit "$UNIT_PREFIX-$protect_control_groups_ex-1" \
+            bash -xec "test \"\$(cat /proc/self/cgroup)\" == \"$unit_cgroup\""
+    # Verify memory pressure watch points to correct file
+    if [[ $ENABLE_MEM_PRESSURE_TEST == true ]]; then
+        systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" -p MemoryPressureWatch=yes --slice "$SLICE" --wait \
+                --unit "$UNIT_PREFIX-$protect_control_groups_ex-2" \
+                bash -xec "test \"\$MEMORY_PRESSURE_WATCH\" == \"$memory_pressure_watch\""
+    fi
+    # Verify /sys/fs/cgroup mount is read-only or read-write
+    systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \
+            bash -xec "[[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o FSTYPE)\" == cgroup2 ]];
+                       [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o FS-OPTIONS)\" =~ nsdelegate ]];
+                       [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ noexec ]];
+                       [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ nosuid ]];
+                       [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ nodev ]];
+                       [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ \"$mount_flag\" ]];"
+
+    # Verify dbus properties
+    systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --remain-after-exit \
+            --unit "$UNIT_PREFIX-$protect_control_groups_ex-3" true
+    assert_eq "$(systemctl show -P ProtectControlGroupsEx "$UNIT_PREFIX-$protect_control_groups_ex-3")" "$protect_control_groups_ex"
+    assert_eq "$(systemctl show -P ProtectControlGroups "$UNIT_PREFIX-$protect_control_groups_ex-3")" "$protect_control_groups"
+    systemctl stop "$UNIT_PREFIX-$protect_control_groups_ex-3"
+}
+
+testcase_basic_no() {
+    test_basic "no" "no" false "$READ_WRITE_MOUNT_FLAG"
+}
+
+testcase_basic_yes() {
+    test_basic "yes" "yes" false "$READ_ONLY_MOUNT_FLAG"
+}
+
+testcase_basic_private() {
+    test_basic "private" "yes" true "$READ_WRITE_MOUNT_FLAG"
+}
+
+testcase_basic_strict() {
+    test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG"
+}
+
+run_testcases