@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b ProtectControlGroups = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s ProtectControlGroupsEx = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateNetwork = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateUsers = ...;
<!--property ProtectKernelLogs is not documented!-->
- <!--property ProtectControlGroups is not documented!-->
-
<!--property PrivateNetwork is not documented!-->
<!--property PrivateUsers is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
unit file setting <varname>ManagedOOMMemoryPressureDurationSec=</varname> listed in
<citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
Note the time unit is expressed in <literal>μs</literal>.</para>
+
+ <para><varname>ProtectControlGroupsEx</varname> implements the destination parameter of the
+ unit file setting <varname>ProtectControlGroups=</varname> listed in
+ <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
+ Unlike boolean <varname>ProtectControlGroups</varname>, <varname>ProtectControlGroupsEx</varname>
+ is a string type.</para>
</refsect2>
</refsect1>
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b ProtectControlGroups = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s ProtectControlGroupsEx = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateNetwork = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateUsers = ...;
<!--property ProtectKernelLogs is not documented!-->
- <!--property ProtectControlGroups is not documented!-->
-
<!--property PrivateNetwork is not documented!-->
<!--property PrivateUsers is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b ProtectControlGroups = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s ProtectControlGroupsEx = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateNetwork = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateUsers = ...;
<!--property ProtectKernelLogs is not documented!-->
- <!--property ProtectControlGroups is not documented!-->
-
<!--property PrivateNetwork is not documented!-->
<!--property PrivateUsers is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b ProtectControlGroups = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s ProtectControlGroupsEx = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateNetwork = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateUsers = ...;
<!--property ProtectKernelLogs is not documented!-->
- <!--property ProtectControlGroups is not documented!-->
-
<!--property PrivateNetwork is not documented!-->
<!--property PrivateUsers is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
<varname>ImportCredentialEx</varname>,
<varname>ExtraFileDescriptorNames</varname>,
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
- <varname>BindLogSockets</varname>, and
+ <varname>BindLogSockets</varname>,
+ <varname>ProtectControlGroupsEx</varname>, and
<varname>PrivateUsersEx</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<para><varname>PrivateTmpEx</varname>,
<varname>ImportCredentialEx</varname>,
<varname>BindLogSockets</varname>,
- <varname>PrivateUsersEx</varname>, and
- <varname>ManagedOOMMemoryPressureDurationUSec</varname> were added in version 257.</para>
+ <varname>PrivateUsersEx</varname>,
+ <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
+ <varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Mount Unit Objects</title>
<para><varname>PrivateTmpEx</varname>,
<varname>ImportCredentialEx</varname>,
<varname>BindLogSockets</varname>,
- <varname>PrivateUsersEx</varname>, and
- <varname>ManagedOOMMemoryPressureDurationUSec</varname> were added in version 257.</para>
+ <varname>PrivateUsersEx</varname>,
+ <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
+ <varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Swap Unit Objects</title>
<para><varname>PrivateTmpEx</varname>,
<varname>ImportCredentialEx</varname>,
<varname>BindLogSockets</varname>,
- <varname>PrivateUsersEx</varname>, and
- <varname>ManagedOOMMemoryPressureDurationUSec</varname> were added in version 257.</para>
+ <varname>PrivateUsersEx</varname>,
+ <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
+ <varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Slice Unit Objects</title>
<varlistentry>
<term><varname>ProtectControlGroups=</varname></term>
- <listitem><para>Takes a boolean argument. If true, the Linux Control Groups (<citerefentry
- project='man-pages'><refentrytitle>cgroups</refentrytitle><manvolnum>7</manvolnum></citerefentry>) hierarchies
+ <listitem><para>Takes a boolean argument or the special values <literal>private</literal> or
+ <literal>strict</literal>. If true, the Linux Control Groups (<citerefentry project='man-pages'>
+ <refentrytitle>cgroups</refentrytitle><manvolnum>7</manvolnum></citerefentry>) hierarchies
accessible through <filename>/sys/fs/cgroup/</filename> will be made read-only to all processes of the
- unit. Except for container managers no services should require write access to the control groups hierarchies;
- it is hence recommended to turn this on for most services. For this setting the same restrictions regarding
- mount propagation and privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see
- above. Defaults to off. If <varname>ProtectControlGroups=</varname> is set, <varname>MountAPIVFS=yes</varname>
- is implied.</para>
+ unit. If set to <literal>private</literal>, the unit will run in a cgroup namespace with a private
+ writable mount of <filename>/sys/fs/cgroup/</filename>. If set to <literal>strict</literal>, the unit
+ will run in a cgroup namespace with a private read-only mount of <filename>/sys/fs/cgroup/</filename>.
+ Defaults to off. If <varname>ProtectControlGroups=</varname> is set, <varname>MountAPIVFS=yes</varname>
+ is implied. Note that <literal>private</literal> and <literal>strict</literal> are downgraded to false and
+ true respectively unless the system is using the unified control group hierarchy and the kernel supports
+ cgroup namespaces.</para>
+
+ <para>Except for container managers no services should require write access to the control groups hierarchies;
+ it is hence recommended to set <varname>ProtectControlGroups=</varname> to true or <literal>strict</literal>
+ for most services. For this setting the same restrictions regarding mount propagation and privileges apply
+ as for <varname>ReadOnlyPaths=</varname> and related settings, see above.</para>
<xi:include href="system-only.xml" xpointer="singular"/>
static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_tmp_ex, "s", PrivateTmp, private_tmp_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_users_ex, "s", PrivateUsers, private_users_to_string);
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_protect_control_groups_ex, "s", ProtectControlGroups, protect_control_groups_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC);
static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa);
SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectKernelLogs", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_logs), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectControlGroups", "b", property_get_protect_control_groups, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectControlGroupsEx", "s", property_get_protect_control_groups_ex, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateUsers", "b", property_get_private_users, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateUsersEx", "s", property_get_private_users_ex, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
return 1;
}
+ /* ProtectControlGroupsEx carries the setting as a string: read the string from the message,
+  * reject anything protect_control_groups_from_string() does not accept with an invalid-args bus
+  * error, and, unless the write flags mark this call as a no-op, store the parsed value and
+  * write it out as a ProtectControlGroups= setting. */
+ if (streq(name, "ProtectControlGroupsEx")) {
+ const char *s;
+ ProtectControlGroups t;
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ t = protect_control_groups_from_string(s);
+ if (t < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s setting: %s", name, s);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->protect_control_groups = t;
+ (void) unit_write_settingf(u, flags, name, "ProtectControlGroups=%s",
+ protect_control_groups_to_string(c->protect_control_groups));
+ }
+
+ return 1;
+ }
+
if (streq(name, "PrivateDevices"))
return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error);
/* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
* service will need to write to it in order to start the notifications. */
- if (context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
+ if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
read_write_paths_cleanup = strv_copy(context->read_write_paths);
if (!read_write_paths_cleanup)
return -ENOMEM;
* sandbox inside the mount namespace. */
.ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
- .protect_control_groups = needs_sandboxing ? context->protect_control_groups : PROTECT_CONTROL_GROUPS_NO,
+ .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO,
.protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
.protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
.protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
- context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
+ exec_needs_cgroup_mount(context, params) ||
context->protect_clock ||
context->protect_hostname ||
!strv_isempty(context->read_write_paths) ||
}
}
+ /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
+ * from it. */
+ needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+
if (params->cgroup_path) {
/* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
* this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
"Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
memory_pressure_path = mfree(memory_pressure_path);
}
+ /* First we use the current cgroup path to chmod and chown the memory pressure path, then pass the path relative
+ * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
+ * pressure path environment variable or read-write mount to the unit. This is why we check if
+ * memory_pressure_path != NULL in the conditional below. */
+ if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
+ memory_pressure_path = mfree(memory_pressure_path);
+ r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
+ if (r < 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+ }
} else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_NO) {
memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
if (!memory_pressure_path) {
return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
}
- /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
- * from it. */
- needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
-
/* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
* for it, and the kernel doesn't actually support ambient caps. */
needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
}
+ /* Enter a new cgroup namespace if sandboxing applies to this command and the effective
+  * ProtectControlGroups= mode asks for one. */
+ if (needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
+         /* unshare() signals failure by returning -1 and setting errno; the return value is not a
+          * negative errno. Log errno, so that %m and the propagated error describe the real
+          * failure instead of always reading as EPERM (== 1). */
+         if (unshare(CLONE_NEWCGROUP) < 0) {
+                 *exit_status = EXIT_NAMESPACE;
+                 return log_exec_error_errno(context, params, errno, "Failed to set up cgroup namespacing: %m");
+         }
+ }
+
if (needs_mount_namespace) {
_cleanup_free_ char *error_path = NULL;
return context->private_ipc || context->ipc_namespace_path;
}
+/* Reports whether a cgroup namespace can be used: cg_all_unified() has to return a positive value and
+ * the cgroup namespace type has to be supported. The arguments are currently not consulted. */
+static bool can_apply_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
+        if (cg_all_unified() <= 0)
+                return false;
+
+        return ns_type_supported(NAMESPACE_CGROUP);
+}
+
+/* True for the two ProtectControlGroups= modes that ask for a cgroup namespace: private and strict. */
+static bool needs_cgroup_namespace(ProtectControlGroups i) {
+        return i == PROTECT_CONTROL_GROUPS_PRIVATE ||
+               i == PROTECT_CONTROL_GROUPS_STRICT;
+}
+
+/* Returns the ProtectControlGroups= mode that is actually going to be applied.
+ *
+ * The configured value is returned unchanged unless it asks for a cgroup namespace (private/strict)
+ * that cannot be set up here, i.e. can_apply_cgroup_namespace() says no. In that case no namespace is
+ * unshared and the mode is downgraded: private becomes no, strict becomes yes, so that strict still
+ * ends up with a read-only /sys/fs/cgroup.
+ *
+ * TODO: Remove fallback once cgroupv1 support is removed in v258. */
+ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) {
+        ProtectControlGroups configured;
+
+        assert(context);
+
+        configured = context->protect_control_groups;
+
+        /* Nothing to downgrade: either no namespace is requested, or we are able to provide one. */
+        if (!needs_cgroup_namespace(configured) || can_apply_cgroup_namespace(context, params))
+                return configured;
+
+        /* Only private and strict can reach this point. */
+        return configured == PROTECT_CONTROL_GROUPS_PRIVATE ? PROTECT_CONTROL_GROUPS_NO : PROTECT_CONTROL_GROUPS_YES;
+}
+
+/* Whether the effective (possibly downgraded) ProtectControlGroups= mode calls for a cgroup namespace. */
+bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
+        ProtectControlGroups effective;
+
+        assert(context);
+
+        effective = exec_get_protect_control_groups(context, params);
+        return needs_cgroup_namespace(effective);
+}
+
+/* Whether the effective ProtectControlGroups= mode is anything other than "no". */
+bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) {
+        ProtectControlGroups effective;
+
+        assert(context);
+
+        effective = exec_get_protect_control_groups(context, params);
+        return !(effective == PROTECT_CONTROL_GROUPS_NO);
+}
+
+/* Whether the effective ProtectControlGroups= mode is one of the read-only ones, i.e. yes or strict. */
+bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) {
+        ProtectControlGroups effective;
+
+        assert(context);
+
+        effective = exec_get_protect_control_groups(context, params);
+        return effective == PROTECT_CONTROL_GROUPS_YES ||
+               effective == PROTECT_CONTROL_GROUPS_STRICT;
+}
+
bool exec_needs_mount_namespace(
const ExecContext *context,
const ExecParameters *params,
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
- context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
+ exec_needs_cgroup_mount(context, params) ||
context->protect_proc != PROTECT_PROC_DEFAULT ||
context->proc_subset != PROC_SUBSET_ALL ||
exec_needs_ipc_namespace(context))
bool exec_needs_network_namespace(const ExecContext *context);
bool exec_needs_ipc_namespace(const ExecContext *context);
+ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
+bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
+bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params);
+bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params);
+
/* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters
* instead of the unit object, so that it can be used in the sd-executor context (where the unit object is
* not available). */
MOUNT_PRIVATE_SYSFS,
MOUNT_BIND_SYSFS,
MOUNT_PROCFS,
+ MOUNT_PRIVATE_CGROUP2FS,
MOUNT_READ_ONLY,
MOUNT_READ_WRITE,
MOUNT_NOEXEC,
{ "/sys/fs/cgroup", MOUNT_READ_ONLY, false },
};
+/* ProtectControlGroups=private table. Note that mount_private_apivfs() always uses MS_NOSUID|MS_NOEXEC|MS_NODEV so
+ * flags is not set here. nsdelegate has been supported since kernels >= 4.13 so it is safe to use. */
+static const MountEntry protect_control_groups_private_table[] = {
+ { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false, .nosuid = true, .noexec = true, .options_const = "nsdelegate" },
+};
+
+/* ProtectControlGroups=strict table. The single /sys/fs/cgroup entry uses the same mount mode and
+ * "nsdelegate" option as the ProtectControlGroups=private table; only .read_only differs (true here). */
+static const MountEntry protect_control_groups_strict_table[] = {
+ { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true, .nosuid = true, .noexec = true, .options_const = "nsdelegate" },
+};
+
/* ProtectSystem=yes table */
static const MountEntry protect_system_yes_table[] = {
{ "/usr", MOUNT_READ_ONLY, false },
[MOUNT_EMPTY_DIR] = "empty-dir",
[MOUNT_PRIVATE_SYSFS] = "private-sysfs",
[MOUNT_BIND_SYSFS] = "bind-sysfs",
+ [MOUNT_PRIVATE_CGROUP2FS] = "private-cgroup2fs",
[MOUNT_PROCFS] = "procfs",
[MOUNT_READ_ONLY] = "read-only",
[MOUNT_READ_WRITE] = "read-write",
case PROTECT_CONTROL_GROUPS_YES:
return append_static_mounts(ml, protect_control_groups_yes_table, ELEMENTSOF(protect_control_groups_yes_table), ignore_protect);
+ case PROTECT_CONTROL_GROUPS_PRIVATE:
+ return append_static_mounts(ml, protect_control_groups_private_table, ELEMENTSOF(protect_control_groups_private_table), ignore_protect);
+
+ case PROTECT_CONTROL_GROUPS_STRICT:
+ return append_static_mounts(ml, protect_control_groups_strict_table, ELEMENTSOF(protect_control_groups_strict_table), ignore_protect);
+
default:
assert_not_reached();
}
r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
if (r == -EINVAL && opts)
- /* If this failed with EINVAL then this likely means the textual hidepid= stuff for procfs is
- * not supported by the kernel, and thus the per-instance hidepid= neither, which means we
- * really don't want to use it, since it would affect our host's /proc mount. Hence let's
- * gracefully fallback to a classic, unrestricted version. */
+ /* If this failed with EINVAL then this likely means either:
+ * 1. the textual hidepid= stuff for procfs is not supported by the kernel, and thus the
+ * per-instance hidepid= neither, which means we really don't want to use it, since it
+ * would affect our host's /proc mount.
+ * 2. nsdelegate for cgroup2 is not supported by the kernel even though CLONE_NEWCGROUP
+ * is supported.
+ *
+ * Hence let's gracefully fallback to a classic, unrestricted version. */
r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
if (ERRNO_IS_NEG_PRIVILEGE(r)) {
/* When we do not have enough privileges to mount a new instance, fall back to use an
return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL, p->runtime_scope);
}
+/* Probes whether cgroup2 accepts the "memory_recursiveprot" mount option.
+ *
+ * memory_recursiveprot is only supported for kernels >= 5.7. mount_option_supported() relies on
+ * fsopen() and fsconfig(), which are supported for kernels >= 5.2, so a failure of the probe itself is
+ * treated the same way as "not supported". Both negative outcomes are logged at debug level. */
+static bool check_recursiveprot_supported(void) {
+        int r;
+
+        r = mount_option_supported("cgroup2", "memory_recursiveprot", NULL);
+        if (r > 0)
+                return true;
+
+        if (r == 0)
+                log_debug("This kernel version does not support 'memory_recursiveprot', not using mount option.");
+        else
+                log_debug_errno(r, "Failed to determine whether the 'memory_recursiveprot' mount option is supported, assuming not: %m");
+
+        return false;
+}
+
+/* Mounts a private cgroup2 instance for the given mount entry via mount_private_apivfs().
+ *
+ * The options configured on the entry are passed through; if the "memory_recursiveprot" option is
+ * detected as supported it is appended to them, comma-separated. Returns -ENOMEM if building the option
+ * string fails, otherwise whatever mount_private_apivfs() returns. */
+static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) {
+ _cleanup_free_ char *opts = NULL;
+
+ assert(m);
+ assert(p);
+
+ if (check_recursiveprot_supported()) {
+ /* Start from a copy of the entry's options (empty string if there are none) so that the
+  * extra option can be appended to it. */
+ opts = strdup(strempty(mount_entry_options(m)));
+ if (!opts)
+ return -ENOMEM;
+
+ if (!strextend_with_separator(&opts, ",", "memory_recursiveprot"))
+ return -ENOMEM;
+ }
+
+ /* opts is only set when the option was appended above; otherwise use the entry's options as is. */
+ return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", opts ?: mount_entry_options(m), p->runtime_scope);
+}
+
static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
_cleanup_free_ char *opts = NULL;
case MOUNT_PROCFS:
return mount_procfs(m, p);
+ case MOUNT_PRIVATE_CGROUP2FS:
+ return mount_private_cgroup2fs(m, p);
+
case MOUNT_RUN:
return mount_run(m);
static const char *const protect_control_groups_table[_PROTECT_CONTROL_GROUPS_MAX] = {
[PROTECT_CONTROL_GROUPS_NO] = "no",
[PROTECT_CONTROL_GROUPS_YES] = "yes",
+ [PROTECT_CONTROL_GROUPS_PRIVATE] = "private",
+ [PROTECT_CONTROL_GROUPS_STRICT] = "strict",
};
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_control_groups, ProtectControlGroups, PROTECT_CONTROL_GROUPS_YES);
typedef enum ProtectControlGroups {
PROTECT_CONTROL_GROUPS_NO,
PROTECT_CONTROL_GROUPS_YES,
+ PROTECT_CONTROL_GROUPS_PRIVATE,
+ PROTECT_CONTROL_GROUPS_STRICT,
_PROTECT_CONTROL_GROUPS_MAX,
_PROTECT_CONTROL_GROUPS_INVALID = -EINVAL,
} ProtectControlGroups;
"ProtectHome",
"PrivateTmpEx",
"PrivateUsersEx",
+ "ProtectControlGroupsEx",
"SELinuxContext",
"RootImage",
"RootVerity",
--- /dev/null
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# shellcheck disable=SC2016
+set -eux
+set -o pipefail
+
+# shellcheck source=test/units/test-control.sh
+. "$(dirname "$0")"/test-control.sh
+# shellcheck source=test/units/util.sh
+. "$(dirname "$0")"/util.sh
+
+SLICE="system.slice"
+UNIT_PREFIX="test-07-protect-control-groups"
+
+READ_ONLY_MOUNT_FLAG="ro"
+READ_WRITE_MOUNT_FLAG="rw"
+
+# Cleanup handler: stops every service matching the test's unit prefix and resets failed unit
+# state. Errors are tolerated here ("set +e").
+at_exit() {
+ set +e
+
+ systemctl stop "$UNIT_PREFIX*.service"
+ systemctl reset-failed
+}
+
+trap at_exit EXIT
+
+ROOT_CGROUP_NS=$(readlink /proc/self/ns/cgroup)
+
+ENABLE_MEM_PRESSURE_TEST=true
+
+# We do not just test if the file exists, but try to read from it, since if
+# CONFIG_PSI_DEFAULT_DISABLED is set in the kernel the file will exist and can
+# be opened, but any read()s will fail with EOPNOTSUPP, which we want to
+# detect.
+if ! cat /proc/pressure/memory >/dev/null ; then
+ echo "Kernel too old, has no PSI, not running ProtectControlGroups= with MemoryPressureWatch= test." >&2
+ ENABLE_MEM_PRESSURE_TEST=false
+fi
+
+if ! test -f "/sys/fs/cgroup/$(systemctl show TEST-07-PID1.service -P ControlGroup)/memory.pressure" ; then
+ echo "No memory accounting/PSI delegated via cgroup, not running ProtectControlGroups= with MemoryPressureWatch= test." >&2
+ ENABLE_MEM_PRESSURE_TEST=false
+fi
+
+# Runs transient units with ProtectControlGroupsEx=$1 and verifies the resulting environment.
+#   $1: value passed to ProtectControlGroupsEx= (also used as part of the unit names)
+#   $2: value the ProtectControlGroups property is expected to report
+#   $3: "true" if the unit is expected to run in a cgroup namespace other than the root one
+#   $4: mount flag expected among the VFS options of /sys/fs/cgroup
+test_basic() {
+ local protect_control_groups_ex="$1"
+ local protect_control_groups="$2"
+ local in_cgroup_ns="$3"
+ local mount_flag="$4"
+
+ # Inside a cgroup namespace the unit is expected to see itself at the cgroup root, and the memory
+ # pressure file directly below /sys/fs/cgroup; otherwise both carry the full slice/unit path.
+ if [[ $in_cgroup_ns == true ]]; then
+ local ns_cmp_op="!="
+ local unit_cgroup="0::/"
+ local memory_pressure_watch="/sys/fs/cgroup/memory.pressure"
+ else
+ local ns_cmp_op="=="
+ local unit_cgroup="0::/$SLICE/$UNIT_PREFIX-$protect_control_groups_ex-1.service"
+ local memory_pressure_watch="/sys/fs/cgroup/$SLICE/$UNIT_PREFIX-$protect_control_groups_ex-2.service/memory.pressure"
+ fi
+
+ # Compare cgroup namespace to root namespace
+ systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \
+ bash -xec "test \"\$(readlink /proc/self/ns/cgroup)\" $ns_cmp_op \"$ROOT_CGROUP_NS\""
+ # Verify unit cgroup
+ systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \
+ --unit "$UNIT_PREFIX-$protect_control_groups_ex-1" \
+ bash -xec "test \"\$(cat /proc/self/cgroup)\" == \"$unit_cgroup\""
+ # Verify memory pressure watch points to correct file
+ if [[ $ENABLE_MEM_PRESSURE_TEST == true ]]; then
+ systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" -p MemoryPressureWatch=yes --slice "$SLICE" --wait \
+ --unit "$UNIT_PREFIX-$protect_control_groups_ex-2" \
+ bash -xec "test \"\$MEMORY_PRESSURE_WATCH\" == \"$memory_pressure_watch\""
+ fi
+ # Verify /sys/fs/cgroup mount is read-only or read-write
+ systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --wait \
+ bash -xec "[[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o FSTYPE)\" == cgroup2 ]];
+ [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o FS-OPTIONS)\" =~ nsdelegate ]];
+ [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ noexec ]];
+ [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ nosuid ]];
+ [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ nodev ]];
+ [[ \"\$\$(findmnt --mountpoint /sys/fs/cgroup --noheadings -o VFS-OPTIONS)\" =~ \"$mount_flag\" ]];"
+
+ # Verify dbus properties
+ systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --remain-after-exit \
+ --unit "$UNIT_PREFIX-$protect_control_groups_ex-3" true
+ assert_eq "$(systemctl show -P ProtectControlGroupsEx "$UNIT_PREFIX-$protect_control_groups_ex-3")" "$protect_control_groups_ex"
+ assert_eq "$(systemctl show -P ProtectControlGroups "$UNIT_PREFIX-$protect_control_groups_ex-3")" "$protect_control_groups"
+ systemctl stop "$UNIT_PREFIX-$protect_control_groups_ex-3"
+}
+
+# ProtectControlGroupsEx=no: boolean property reports "no", no separate cgroup namespace, read-write mount.
+testcase_basic_no() {
+ test_basic "no" "no" false "$READ_WRITE_MOUNT_FLAG"
+}
+
+# ProtectControlGroupsEx=yes: boolean property reports "yes", no separate cgroup namespace, read-only mount.
+testcase_basic_yes() {
+ test_basic "yes" "yes" false "$READ_ONLY_MOUNT_FLAG"
+}
+
+# ProtectControlGroupsEx=private: boolean property reports "yes", separate cgroup namespace, read-write mount.
+testcase_basic_private() {
+ test_basic "private" "yes" true "$READ_WRITE_MOUNT_FLAG"
+}
+
+# ProtectControlGroupsEx=strict: boolean property reports "yes", separate cgroup namespace, read-only mount.
+testcase_basic_strict() {
+ test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG"
+}
+
+run_testcases