@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s MemoryTHP = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s UserNamespacePath = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
<!--property MemoryKSM is not documented!-->
+ <!--property MemoryTHP is not documented!-->
+
<!--property UserNamespacePath is not documented!-->
<!--property NetworkNamespacePath is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="MemoryTHP"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="UserNamespacePath"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s MemoryTHP = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s UserNamespacePath = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
<!--property MemoryKSM is not documented!-->
+ <!--property MemoryTHP is not documented!-->
+
<!--property UserNamespacePath is not documented!-->
<!--property NetworkNamespacePath is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="MemoryTHP"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="UserNamespacePath"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s MemoryTHP = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s UserNamespacePath = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
<!--property MemoryKSM is not documented!-->
+ <!--property MemoryTHP is not documented!-->
+
<!--property UserNamespacePath is not documented!-->
<!--property NetworkNamespacePath is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="MemoryTHP"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="UserNamespacePath"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s MemoryTHP = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s UserNamespacePath = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
<!--property MemoryKSM is not documented!-->
+ <!--property MemoryTHP is not documented!-->
+
<!--property UserNamespacePath is not documented!-->
<!--property NetworkNamespacePath is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="MemoryTHP"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="UserNamespacePath"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
<varname>ManagedOOMKills</varname>,
<varname>ExecReloadPost</varname>, and
<varname>ExecReloadPostEx</varname> were added in version 259.</para>
- <para><varname>BindNetworkInterface</varname> was added in version 260.</para>
+ <para><varname>BindNetworkInterface</varname>, and
+ <varname>MemoryTHP</varname> were added in version 260.</para>
</refsect2>
<refsect2>
<title>Socket Unit Objects</title>
<para><varname>UserNamespacePath</varname>,
<varname>OOMKills</varname>, and
<varname>ManagedOOMKills</varname> were added in 259.</para>
- <para><varname>BindNetworkInterface</varname> was added in version 260.</para>
+ <para><varname>BindNetworkInterface</varname>, and
+ <varname>MemoryTHP</varname> were added in version 260.</para>
</refsect2>
<refsect2>
<title>Mount Unit Objects</title>
<para><varname>UserNamespacePath</varname>,
<varname>OOMKills</varname>, and
<varname>ManagedOOMKills</varname> were added in 259.</para>
- <para><varname>BindNetworkInterface</varname> was added in version 260.</para>
+ <para><varname>BindNetworkInterface</varname>, and
+ <varname>MemoryTHP</varname> were added in version 260.</para>
</refsect2>
<refsect2>
<title>Swap Unit Objects</title>
<para><varname>UserNamespacePath</varname>,
<varname>OOMKills</varname>, and
<varname>ManagedOOMKills</varname> were added in 259.</para>
- <para><varname>BindNetworkInterface</varname> was added in version 260.</para>
+ <para><varname>BindNetworkInterface</varname>, and
+ <varname>MemoryTHP</varname> were added in version 260.</para>
</refsect2>
<refsect2>
<title>Slice Unit Objects</title>
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>MemoryTHP=</varname></term>
+
+ <listitem><para>Transparent Hugepages (THPs) is a Linux kernel feature that manages memory
+ using larger pages (2MB on x86, compared to the default 4KB). The main goal is to improve memory management
+ efficiency and system performance, especially for memory-intensive applications.
+ However, it can cause drawbacks in some scenarios, such as memory regression and latency spikes.
+ THP policy is governed for the entire system via <filename>/sys/kernel/mm/transparent_hugepage/enabled</filename>.
+ However, it can be overridden for individual workloads via
+ <citerefentry><refentrytitle>prctl</refentrytitle><manvolnum>2</manvolnum></citerefentry>.
+ <varname>MemoryTHP=</varname> may be used to disable THPs at process invocation time to stop providing
+ THPs for workloads where the drawbacks outweigh the advantages.
+ When <varname>MemoryTHP=</varname> is set to <literal>inherit</literal> or not set at all, systemd
+ inherits THP settings from the process that starts it and no
+ <citerefentry><refentrytitle>prctl</refentrytitle><manvolnum>2</manvolnum></citerefentry>
+ <constant>PR_SET_THP_DISABLE</constant> call is made.
+ When set to <literal>disable</literal>, <varname>MemoryTHP=</varname> disables THPs completely for the process,
+ irrespective of global THP controls.
+ When set to <literal>madvise</literal>, <varname>MemoryTHP=</varname> disables THPs for the process except when
+ specifically requested via <citerefentry><refentrytitle>madvise</refentrytitle><manvolnum>2</manvolnum></citerefentry>
+ by the process with <constant>MADV_HUGEPAGE</constant> or <constant>MADV_COLLAPSE</constant>.
+ When set to <literal>system</literal>, <varname>MemoryTHP=</varname> resets the THP policy to the system-wide policy.
+ This can be used when the process that starts systemd has already disabled THPs via
+ <constant>PR_SET_THP_DISABLE</constant>, and we want to restore the system default THP setting at
+ process invocation time. For details, see
+ <ulink url="https://docs.kernel.org/admin-guide/mm/transhuge.html">Transparent Hugepage Support</ulink>
+ in the kernel documentation.</para>
+ <para>Note that this functionality might not be available, for example if THP is disabled in the
+ kernel, or the kernel does not support controlling THP at the process level through
+ <citerefentry><refentrytitle>prctl</refentrytitle><manvolnum>2</manvolnum></citerefentry>.</para>
+
+ <xi:include href="version-info.xml" xpointer="v260"/>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><varname>PrivatePIDs=</varname></term>
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_memory_thp, memory_thp, MemoryTHP);
static BUS_DEFINE_PROPERTY_GET(property_get_ioprio, "i", ExecContext, exec_context_get_effective_ioprio);
static BUS_DEFINE_PROPERTY_GET(property_get_mount_apivfs, "b", ExecContext, exec_context_get_effective_mount_apivfs);
static BUS_DEFINE_PROPERTY_GET(property_get_bind_log_sockets, "b", ExecContext, exec_context_get_effective_bind_log_sockets);
SD_BUS_PROPERTY("BPFDelegatePrograms", "s", property_get_bpf_delegate_programs, offsetof(ExecContext, bpf_delegate_programs), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("BPFDelegateAttachments", "s", property_get_bpf_delegate_attachments, offsetof(ExecContext, bpf_delegate_attachments), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MemoryKSM", "b", bus_property_get_tristate, offsetof(ExecContext, memory_ksm), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MemoryTHP", "s", property_get_memory_thp, offsetof(ExecContext, memory_thp), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("UserNamespacePath", "s", NULL, offsetof(ExecContext, user_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("IPCNamespacePath", "s", NULL, offsetof(ExecContext, ipc_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(private_bpf, PrivateBPF, private_bpf_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(memory_thp, MemoryTHP, memory_thp_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(bpf_delegate_commands, uint64_t, bpf_delegate_commands_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(bpf_delegate_maps, uint64_t, bpf_delegate_maps_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(bpf_delegate_programs, uint64_t, bpf_delegate_programs_from_string);
if (streq(name, "MemoryKSM"))
return bus_set_transient_tristate(u, name, &c->memory_ksm, message, flags, reterr_error);
+ if (streq(name, "MemoryTHP"))
+ return bus_set_transient_memory_thp(u, name, &c->memory_thp, message, flags, reterr_error);
+
if (streq(name, "UtmpIdentifier"))
return bus_set_transient_string(u, name, &c->utmp_id, message, flags, reterr_error);
return 1;
}
+static int set_memory_thp(MemoryTHP thp) { /* Applies the requested THP mode via prctl(PR_SET_THP_DISABLE); returns 0 on success or a negative errno-style value */
+ switch (thp) {
+
+ case MEMORY_THP_INHERIT: /* Nothing to do: no prctl() call is made */
+ return 0;
+
+ case MEMORY_THP_DISABLE: /* arg2 = 1, no flags in arg3 */
+ if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0) < 0)
+ return errno == EINVAL ? -EOPNOTSUPP : -errno; /* EINVAL is reported as -EOPNOTSUPP, which the caller logs at debug level and ignores */
+ return 0;
+
+ case MEMORY_THP_MADVISE: /* arg2 = 1 with PR_THP_DISABLE_EXCEPT_ADVISED passed in arg3 */
+ if (prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, 0, 0) < 0)
+ return errno == EINVAL ? -EOPNOTSUPP : -errno; /* same EINVAL -> -EOPNOTSUPP mapping as above */
+ return 0;
+
+ case MEMORY_THP_SYSTEM: /* arg2 = 0 */
+ if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0) < 0)
+ return errno == EINVAL ? -EOPNOTSUPP : -errno; /* same EINVAL -> -EOPNOTSUPP mapping as above */
+ return 0;
+
+ default: /* any other value is treated as a programming error */
+ assert_not_reached();
+ }
+}
+
static int send_handoff_timestamp(
const ExecContext *c,
ExecParameters *p,
}
}
+ r = set_memory_thp(context->memory_thp);
+ if (r == -EOPNOTSUPP)
+ log_debug_errno(r, "Setting MemoryTHP=%s is not supported, ignoring: %m",
+ memory_thp_to_string(context->memory_thp));
+ else if (r < 0) {
+ *exit_status = EXIT_MEMORY_THP;
+ return log_error_errno(r, "Failed to set MemoryTHP=%s: %m",
+ memory_thp_to_string(context->memory_thp));
+ }
+
#if ENABLE_UTMP
if (context->utmp_id) {
_cleanup_free_ char *username_alloc = NULL;
if (r < 0)
return r;
+ r = serialize_item(f, "exec-context-memory-thp", memory_thp_to_string(c->memory_thp));
+ if (r < 0)
+ return r;
+
r = serialize_item(f, "exec-context-private-tmp", private_tmp_to_string(c->private_tmp));
if (r < 0)
return r;
r = safe_atoi(val, &c->memory_ksm);
if (r < 0)
return r;
+ } else if ((val = startswith(l, "exec-context-memory-thp="))) {
+ c->memory_thp = memory_thp_from_string(val);
+ if (c->memory_thp < 0)
+ return c->memory_thp;
} else if ((val = startswith(l, "exec-context-private-tmp="))) {
c->private_tmp = private_tmp_from_string(val);
if (c->private_tmp < 0)
"%sProtectHostname: %s%s%s\n"
"%sProtectProc: %s\n"
"%sProcSubset: %s\n"
- "%sPrivateBPF: %s\n",
+ "%sPrivateBPF: %s\n"
+ "%sMemoryTHP: %s\n",
prefix, c->umask,
prefix, empty_to_root(c->working_directory),
prefix, empty_to_root(c->root_directory),
prefix, protect_hostname_to_string(c->protect_hostname), c->private_hostname ? ":" : "", strempty(c->private_hostname),
prefix, protect_proc_to_string(c->protect_proc),
prefix, proc_subset_to_string(c->proc_subset),
- prefix, private_bpf_to_string(c->private_bpf));
+ prefix, private_bpf_to_string(c->private_bpf),
+ prefix, memory_thp_to_string(c->memory_thp));
if (c->private_bpf == PRIVATE_BPF_YES) {
_cleanup_free_ char
int mount_apivfs;
int bind_log_sockets;
int memory_ksm;
+ MemoryTHP memory_thp;
PrivateTmp private_tmp;
PrivateTmp private_var_tmp; /* This is not an independent parameter, but calculated from other
* parameters in unit_patch_contexts(). */
{% endif %}
{{type}}.ProtectHostname, config_parse_protect_hostname, 0, offsetof({{type}}, exec_context)
{{type}}.MemoryKSM, config_parse_tristate, 0, offsetof({{type}}, exec_context.memory_ksm)
+{{type}}.MemoryTHP, config_parse_memory_thp, 0, offsetof({{type}}, exec_context.memory_thp)
{%- endmacro -%}
{%- macro KILL_CONTEXT_CONFIG_ITEMS(type) -%}
DEFINE_CONFIG_PARSE_PTR(config_parse_bpf_delegate_maps, bpf_delegate_maps_from_string, uint64_t);
DEFINE_CONFIG_PARSE_PTR(config_parse_bpf_delegate_programs, bpf_delegate_programs_from_string, uint64_t);
DEFINE_CONFIG_PARSE_PTR(config_parse_bpf_delegate_attachments, bpf_delegate_attachments_from_string, uint64_t);
+DEFINE_CONFIG_PARSE_ENUM(config_parse_memory_thp, memory_thp, MemoryTHP);
bool contains_instance_specifier_superset(const char *s) {
const char *p, *q;
CONFIG_PARSER_PROTOTYPE(config_parse_mount_node);
CONFIG_PARSER_PROTOTYPE(config_parse_concurrency_max);
CONFIG_PARSER_PROTOTYPE(config_parse_bind_network_interface);
+CONFIG_PARSER_PROTOTYPE(config_parse_memory_thp);
/* gperf prototypes */
const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);
DEFINE_STRING_TABLE_LOOKUP(bpf_delegate_prog_type, uint64_t);
DEFINE_STRING_TABLE_LOOKUP(bpf_delegate_attach_type, uint64_t);
+static const char* const memory_thp_table[_MEMORY_THP_MAX] = {
+ [MEMORY_THP_INHERIT] = "inherit",
+ [MEMORY_THP_DISABLE] = "disable",
+ [MEMORY_THP_MADVISE] = "madvise",
+ [MEMORY_THP_SYSTEM] = "system",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(memory_thp, MemoryTHP);
+
char* bpf_delegate_to_string(uint64_t u, const char * (*parser)(uint64_t) _const_ ) {
assert(parser);
_PRIVATE_PIDS_INVALID = -EINVAL,
} PrivatePIDs;
+typedef enum MemoryTHP {
+ /*
+ * Inherit default from process that starts systemd, i.e. do not make
+ * any PR_SET_THP_DISABLE call.
+ */
+ MEMORY_THP_INHERIT,
+ MEMORY_THP_DISABLE, /* Disable THPs completely for the process */
+ MEMORY_THP_MADVISE, /* Disable THPs for the process except when madvised */
+ /*
+ * Use system default THP setting. This can be used when the process that
+ * starts systemd has already disabled THPs via PR_SET_THP_DISABLE, and we
+ * want to restore the system default THP setting at process invocation time.
+ */
+ MEMORY_THP_SYSTEM,
+ _MEMORY_THP_MAX,
+ _MEMORY_THP_INVALID = -EINVAL,
+} MemoryTHP;
+
typedef struct BindMount {
char *source;
char *destination;
DECLARE_STRING_TABLE_LOOKUP(private_bpf, PrivateBPF);
+DECLARE_STRING_TABLE_LOOKUP(memory_thp, MemoryTHP);
+
DECLARE_STRING_TABLE_LOOKUP(bpf_delegate_cmd, uint64_t);
DECLARE_STRING_TABLE_LOOKUP(bpf_delegate_map_type, uint64_t);
SD_JSON_BUILD_PAIR_BOOLEAN("RemoveIPC", c->remove_ipc),
JSON_BUILD_PAIR_TRISTATE_NON_NULL("PrivateMounts", c->private_mounts),
JSON_BUILD_PAIR_STRING_NON_EMPTY("MountFlags", mount_propagation_flag_to_string(c->mount_propagation_flag)),
+ SD_JSON_BUILD_PAIR_STRING("MemoryTHP", memory_thp_to_string(c->memory_thp)),
/* System Call Filtering */
JSON_BUILD_PAIR_CALLBACK_NON_NULL("SystemCallFilter", syscall_filter_build_json, c),
#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE 67
#endif
+
+#ifndef PR_THP_DISABLE_EXCEPT_ADVISED
+#define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)
+#endif
{ "LogRateLimitBurst", bus_append_safe_atou },
{ "TTYRows", bus_append_safe_atou },
{ "TTYColumns", bus_append_safe_atou },
+ { "MemoryTHP", bus_append_string },
{ "MountFlags", bus_append_mount_propagation_flag_from_string },
{ "Environment", bus_append_strv_cunescape },
{ "UnsetEnvironment", bus_append_strv_cunescape },
[EXIT_CREDENTIALS] = { "CREDENTIALS", EXIT_STATUS_SYSTEMD },
[EXIT_BPF] = { "BPF", EXIT_STATUS_SYSTEMD },
[EXIT_KSM] = { "KSM", EXIT_STATUS_SYSTEMD },
+ [EXIT_MEMORY_THP] = { "MEMORY_THP", EXIT_STATUS_SYSTEMD },
[EXIT_EXCEPTION] = { "EXCEPTION", EXIT_STATUS_SYSTEMD },
EXIT_CREDENTIALS,
EXIT_BPF,
EXIT_KSM,
+ EXIT_MEMORY_THP,
EXIT_EXCEPTION = 255, /* Whenever we want to propagate an abnormal/signal exit, in line with bash */
};
SD_VARLINK_DEFINE_FIELD(PrivateMounts, SD_VARLINK_BOOL, SD_VARLINK_NULLABLE),
SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man"PROJECT_VERSION_STR"systemd.exec.html#MountFlags="),
SD_VARLINK_DEFINE_FIELD(MountFlags, SD_VARLINK_STRING, SD_VARLINK_NULLABLE),
+ SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man"PROJECT_VERSION_STR"systemd.exec.html#MemoryTHP="),
+ SD_VARLINK_DEFINE_FIELD(MemoryTHP, SD_VARLINK_STRING, SD_VARLINK_NULLABLE),
/* System Call Filtering
* https://www.freedesktop.org/software/systemd/man/latest/systemd.exec.html#System%20Call%20Filtering */