git.ipfire.org Git - thirdparty/systemd.git/commitdiff
oomd: ruleset parsing
author Matteo Croce <teknoraver@meta.com>
Thu, 25 Sep 2025 19:17:26 +0000 (21:17 +0200)
committer Daan De Meyer <daan.j.demeyer@gmail.com>
Mon, 18 May 2026 19:42:56 +0000 (21:42 +0200)
Add to oomd the capability to define rulesets in /etc/systemd/oomd/rules.d/
and then reference them with a new config option OOMRules=

23 files changed:
man/oomd.conf.xml
man/org.freedesktop.systemd1.xml
man/systemd.resource-control.xml
src/core/cgroup.c
src/core/cgroup.h
src/core/dbus-cgroup.c
src/core/execute-serialize.c
src/core/load-fragment-gperf.gperf.in
src/core/load-fragment.c
src/core/load-fragment.h
src/core/unit.c
src/core/varlink-cgroup.c
src/core/varlink.c
src/oom/oomd-conf.c
src/oom/oomd-conf.h
src/oom/oomd-manager.c
src/oom/oomd-manager.h
src/oom/oomd-util.c
src/oom/oomd-util.h
src/shared/bus-unit-util.c
src/shared/varlink-io.systemd.Unit.c
src/shared/varlink-io.systemd.oom.c
test/units/TEST-55-OOMD.sh

index f8c3c0a173e15dc9b352c1daf66fc90cd685055e..1e33f3b3ded43eeb984d38bc6c12c04db5e49049 100644 (file)
 
   </refsect1>
 
+  <refsect1>
+    <title>OOM Rulesets</title>
+
+    <para><command>systemd-oomd</command> supports custom rulesets that define conditions and actions for
+    OOM handling on a per-unit basis. Ruleset files use the <filename>.oomrule</filename> extension and are
+    loaded from <filename>/etc/systemd/oomd/rules.d/</filename>,
+    <filename>/run/systemd/oomd/rules.d/</filename>,
+    <filename>/usr/local/lib/systemd/oomd/rules.d/</filename>, and
+    <filename>/usr/lib/systemd/oomd/rules.d/</filename>.
+    Units opt into rulesets via the <varname>OOMRules=</varname> setting in
+    <citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>,
+    which takes a space-separated list of ruleset names (the file name without the <filename>.oomrule</filename>
+    extension).</para>
+
+    <para>Each ruleset file contains a <literal>[Rule]</literal> section with the following options. At least
+    one of <varname>MemoryPressureAbove=</varname> or <varname>SwapUsageMax=</varname> must be configured;
+    rulesets with no conditions are ignored. If both are set, the conditions are combined with AND, i.e. the
+    action is only triggered when both thresholds are exceeded simultaneously.</para>
+
+    <variablelist class='config-directives'>
+      <varlistentry>
+        <term><varname>MemoryPressureAbove=</varname></term>
+
+        <listitem><para>Sets the memory pressure threshold above which the rule's action will be triggered.
+        The memory pressure represents the fraction of time in a 10 second window in which all tasks in the
+        control group were delayed (PSI <literal>full avg10</literal>). Takes a value specified in percent
+        (when suffixed with <literal>%</literal>), permille (<literal>‰</literal>) or permyriad
+        (<literal>‱</literal>), between 0% and 100%, inclusive. If unset, this condition is not
+        evaluated. A value of <literal>100%</literal> can never be exceeded and is
+        therefore rejected with a warning; a value of <literal>0%</literal> makes the condition true on any
+        observed pressure, which is usually not useful.</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>SwapUsageMax=</varname></term>
+
+        <listitem><para>Sets the system-wide swap usage threshold above which the rule's action will be
+        triggered. Takes a value specified in percent (when suffixed with <literal>%</literal>),
+        permille (<literal>‰</literal>) or permyriad (<literal>‱</literal>),
+        between 0% and 100%, inclusive. If unset, this condition is not evaluated. A value of
+        <literal>100%</literal> can never be exceeded and is therefore rejected with
+        a warning; a value of <literal>0%</literal> fires as soon as any swap is in use, which is usually
+        not useful.</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>Action=</varname></term>
+
+        <listitem><para>Specifies the action to take when the rule's conditions are met. Takes one of
+        <literal>kill-all</literal>, <literal>kill-by-pgscan</literal>, or
+        <literal>kill-by-swap</literal>. This setting is mandatory; rulesets without
+        <varname>Action=</varname> are ignored.</para>
+
+        <itemizedlist>
+          <listitem><para><literal>kill-all</literal> sends <constant>SIGKILL</constant> to every process
+          in the unit's cgroup hierarchy, including any descendant cgroups.</para></listitem>
+
+          <listitem><para><literal>kill-by-pgscan</literal> selects and kills the descendant cgroup with
+          the highest recent page scan (reclaim) rate.</para></listitem>
+
+          <listitem><para><literal>kill-by-swap</literal> selects and kills the descendant cgroup with the
+          highest swap usage.</para></listitem>
+        </itemizedlist>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>LastingSec=</varname></term>
+
+        <listitem><para>Sets the duration the conditions must be continuously met before the action is taken.
+        Takes a time span value, see
+        <citerefentry><refentrytitle>systemd.time</refentrytitle><manvolnum>7</manvolnum></citerefentry>
+        for details on the permitted syntax. Defaults to 0, i.e. the action is taken
+        immediately when the conditions are met.</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+    </variablelist>
+
+  </refsect1>
+
   <refsect1>
     <title>[OOM] Section Options</title>
 
index 7efc899dba25110bf71d9d45a61162b7733eeb4a..2d63050a68620980a7fb7ea3e9d9c0e9dda98852 100644 (file)
@@ -3098,6 +3098,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly s ManagedOOMPreference = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly as OOMRules = ['...', ...];
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(ss) BPFProgram = [...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(iiqq) SocketBindAllow = [...];
@@ -3777,6 +3779,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <!--property ManagedOOMPreference is not documented!-->
 
+    <!--property OOMRules is not documented!-->
+
     <!--property BPFProgram is not documented!-->
 
     <!--property SocketBindAllow is not documented!-->
@@ -4479,6 +4483,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="OOMRules"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="BPFProgram"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="SocketBindAllow"/>
@@ -5388,6 +5394,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly s ManagedOOMPreference = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly as OOMRules = ['...', ...];
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(ss) BPFProgram = [...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(iiqq) SocketBindAllow = [...];
@@ -6083,6 +6091,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <!--property ManagedOOMPreference is not documented!-->
 
+    <!--property OOMRules is not documented!-->
+
     <!--property BPFProgram is not documented!-->
 
     <!--property SocketBindAllow is not documented!-->
@@ -6759,6 +6769,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="OOMRules"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="BPFProgram"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="SocketBindAllow"/>
@@ -7491,6 +7503,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly s ManagedOOMPreference = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly as OOMRules = ['...', ...];
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(ss) BPFProgram = [...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(iiqq) SocketBindAllow = [...];
@@ -8110,6 +8124,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <!--property ManagedOOMPreference is not documented!-->
 
+    <!--property OOMRules is not documented!-->
+
     <!--property BPFProgram is not documented!-->
 
     <!--property SocketBindAllow is not documented!-->
@@ -8694,6 +8710,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="OOMRules"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="BPFProgram"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="SocketBindAllow"/>
@@ -9559,6 +9577,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly s ManagedOOMPreference = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly as OOMRules = ['...', ...];
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(ss) BPFProgram = [...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(iiqq) SocketBindAllow = [...];
@@ -10160,6 +10180,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <!--property ManagedOOMPreference is not documented!-->
 
+    <!--property OOMRules is not documented!-->
+
     <!--property BPFProgram is not documented!-->
 
     <!--property SocketBindAllow is not documented!-->
@@ -10726,6 +10748,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="OOMRules"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="BPFProgram"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="SocketBindAllow"/>
@@ -11444,6 +11468,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly s ManagedOOMPreference = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly as OOMRules = ['...', ...];
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(ss) BPFProgram = [...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(iiqq) SocketBindAllow = [...];
@@ -11627,6 +11653,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
 
     <!--property ManagedOOMPreference is not documented!-->
 
+    <!--property OOMRules is not documented!-->
+
     <!--property BPFProgram is not documented!-->
 
     <!--property SocketBindAllow is not documented!-->
@@ -11825,6 +11853,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="OOMRules"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="BPFProgram"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="SocketBindAllow"/>
@@ -12046,6 +12076,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly s ManagedOOMPreference = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly as OOMRules = ['...', ...];
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(ss) BPFProgram = [...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(iiqq) SocketBindAllow = [...];
@@ -12243,6 +12275,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
 
     <!--property ManagedOOMPreference is not documented!-->
 
+    <!--property OOMRules is not documented!-->
+
     <!--property BPFProgram is not documented!-->
 
     <!--property SocketBindAllow is not documented!-->
@@ -12465,6 +12499,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="OOMRules"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="BPFProgram"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="SocketBindAllow"/>
@@ -12801,8 +12837,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>CPUPressureThresholdUSec</varname>,
       <varname>CPUPressureWatch</varname>,
       <varname>IOPressureThresholdUSec</varname>,
-      <varname>IOPressureWatch</varname>, and
-      <varname>CPUSetPartition</varname> were added in version 261.</para>
+      <varname>IOPressureWatch</varname>,
+      <varname>CPUSetPartition</varname>, and
+      <varname>OOMRules</varname> were added in version 261.</para>
     </refsect2>
     <refsect2>
       <title>Socket Unit Objects</title>
@@ -12876,8 +12913,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>CPUPressureThresholdUSec</varname>,
       <varname>CPUPressureWatch</varname>,
       <varname>IOPressureThresholdUSec</varname>,
-      <varname>IOPressureWatch</varname>, and
-      <varname>CPUSetPartition</varname> were added in version 261.</para>
+      <varname>IOPressureWatch</varname>,
+      <varname>CPUSetPartition</varname>, and
+      <varname>OOMRules</varname> were added in version 261.</para>
     </refsect2>
     <refsect2>
       <title>Mount Unit Objects</title>
@@ -12946,8 +12984,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>CPUPressureThresholdUSec</varname>,
       <varname>CPUPressureWatch</varname>,
       <varname>IOPressureThresholdUSec</varname>,
-      <varname>IOPressureWatch</varname>, and
-      <varname>CPUSetPartition</varname> were added in version 261.</para>
+      <varname>IOPressureWatch</varname>,
+      <varname>CPUSetPartition</varname>, and
+      <varname>OOMRules</varname> were added in version 261.</para>
     </refsect2>
     <refsect2>
       <title>Swap Unit Objects</title>
@@ -13014,8 +13053,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>CPUPressureThresholdUSec</varname>,
       <varname>CPUPressureWatch</varname>,
       <varname>IOPressureThresholdUSec</varname>,
-      <varname>IOPressureWatch</varname>, and
-      <varname>CPUSetPartition</varname> were added in version 261.</para>
+      <varname>IOPressureWatch</varname>,
+      <varname>CPUSetPartition</varname>, and
+      <varname>OOMRules</varname> were added in version 261.</para>
     </refsect2>
     <refsect2>
       <title>Slice Unit Objects</title>
@@ -13052,8 +13092,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>CPUPressureThresholdUSec</varname>,
       <varname>CPUPressureWatch</varname>,
       <varname>IOPressureThresholdUSec</varname>,
-      <varname>IOPressureWatch</varname>, and
-      <varname>CPUSetPartition</varname> were added in version 261.</para>
+      <varname>IOPressureWatch</varname>,
+      <varname>CPUSetPartition</varname>, and
+      <varname>OOMRules</varname> were added in version 261.</para>
     </refsect2>
     <refsect2>
       <title>Scope Unit Objects</title>
@@ -13088,8 +13129,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>CPUPressureThresholdUSec</varname>,
       <varname>CPUPressureWatch</varname>,
       <varname>IOPressureThresholdUSec</varname>,
-      <varname>IOPressureWatch</varname>, and
-      <varname>CPUSetPartition</varname> were added in version 261.</para>
+      <varname>IOPressureWatch</varname>,
+      <varname>CPUSetPartition</varname>, and
+      <varname>OOMRules</varname> were added in version 261.</para>
     </refsect2>
     <refsect2>
       <title>Job Objects</title>
index fcad4b31839eacf0a28edf34ed9ee658bdd45221..58e923b618491c53672a246ebe4f952ed83b9197 100644 (file)
@@ -1630,6 +1630,35 @@ DeviceAllow=/dev/loop-control
         </listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>OOMRules=</varname></term>
+
+        <listitem>
+          <para>Takes a space-separated list of OOM ruleset names. The rulesets are defined in
+          <filename>.oomrule</filename> files placed in
+          <filename>/etc/systemd/oomd/rules.d/</filename>,
+          <filename>/run/systemd/oomd/rules.d/</filename>,
+          <filename>/usr/local/lib/systemd/oomd/rules.d/</filename>, or
+          <filename>/usr/lib/systemd/oomd/rules.d/</filename>. When set,
+          <citerefentry><refentrytitle>systemd-oomd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
+          will monitor this unit's cgroup and evaluate the specified rulesets against it.
+          Each ruleset defines conditions (such as memory pressure or swap usage thresholds) and an action
+          to take when those conditions are met. See
+          <citerefentry><refentrytitle>oomd.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry> for
+          details on the available ruleset options.</para>
+
+          <para>Setting this property will also result in <varname>After=</varname> and
+          <varname>Wants=</varname> dependencies on <filename>systemd-oomd.service</filename> unless
+          <varname>DefaultDependencies=no</varname>.</para>
+
+          <para>Defaults to an empty list, which means no rulesets are applied. Note that each monitored
+          cgroup incurs a per-interval walk of its descendant cgroup tree, so monitoring very large numbers of
+          cgroups via <varname>OOMRules=</varname> may have a measurable performance impact.</para>
+
+          <xi:include href="version-info.xml" xpointer="v261"/>
+        </listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><varname>MemoryPressureWatch=</varname></term>
 
index 48b7df0e00c08e8767b2c793fc30ed8f34edc771..543d1ac8e3c43a5af0861e2f1a038485663554eb 100644 (file)
@@ -284,6 +284,8 @@ void cgroup_context_done(CGroupContext *c) {
 
         c->delegate_subgroup = mfree(c->delegate_subgroup);
 
+        c->moom_rules = strv_free(c->moom_rules);
+
         nft_set_context_clear(&c->nft_set_context);
 }
 
@@ -670,6 +672,9 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
         FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets)
                 fprintf(f, "%sNFTSet: %s:%s:%s:%s\n", prefix, nft_set_source_to_string(nft_set->source),
                         nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set);
+
+        STRV_FOREACH(rule, c->moom_rules)
+                fprintf(f, "%sOOMRules: %s\n", prefix, *rule);
 }
 
 void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f) {
index b7213d8d59494bb5bab6ad4395af066178cf03a2..e3d33ad5e0910f23d526bf17f5e11873da236fc0 100644 (file)
@@ -203,6 +203,7 @@ typedef struct CGroupContext {
         uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */
         usec_t moom_mem_pressure_duration_usec;
         ManagedOOMPreference moom_preference;
+        char **moom_rules;
 
         /* Pressure logic */
         CGroupPressure pressure[_PRESSURE_RESOURCE_MAX];
index 6cecc8b9e74194b6070d4255b1e674973ac0b84d..168bdf10c13da78d4726ca648f42a4ffbd95ad80 100644 (file)
@@ -424,6 +424,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
         SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimit", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit), 0),
         SD_BUS_PROPERTY("ManagedOOMMemoryPressureDurationUSec", "t", bus_property_get_usec, offsetof(CGroupContext, moom_mem_pressure_duration_usec), 0),
         SD_BUS_PROPERTY("ManagedOOMPreference", "s", property_get_managed_oom_preference, offsetof(CGroupContext, moom_preference), 0),
+        SD_BUS_PROPERTY("OOMRules", "as", NULL, offsetof(CGroupContext, moom_rules), 0),
         SD_BUS_PROPERTY("BPFProgram", "a(ss)", property_get_bpf_foreign_program, 0, 0),
         SD_BUS_PROPERTY("SocketBindAllow", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_allow), 0),
         SD_BUS_PROPERTY("SocketBindDeny", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_deny), 0),
@@ -1796,6 +1797,38 @@ int bus_cgroup_set_property(
 
                 return 1;
         }
+
+        if (streq(name, "OOMRules")) {
+                _cleanup_strv_free_ char **oom_rules = NULL;
+
+                if (!UNIT_VTABLE(u)->can_set_managed_oom)
+                        return sd_bus_error_setf(reterr_error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name);
+
+                r = sd_bus_message_read_strv(message, &oom_rules);
+                if (r < 0)
+                        return r;
+
+                STRV_FOREACH(rule, oom_rules)
+                        if (!string_is_safe(*rule, STRING_FILENAME))
+                                return sd_bus_error_setf(reterr_error, SD_BUS_ERROR_INVALID_ARGS, "Invalid rule name: %s", *rule);
+
+                strv_uniq(oom_rules);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        _cleanup_free_ char *joined = strv_join(oom_rules, " ");
+                        if (!joined)
+                                return -ENOMEM;
+
+                        strv_free_and_replace(c->moom_rules, oom_rules);
+
+                        unit_write_settingf(u, flags, name, "OOMRules=\nOOMRules=%s", joined);
+
+                        (void) manager_varlink_send_managed_oom_update(u);
+                }
+
+                return 1;
+        }
+
         if (STR_IN_SET(name, "SocketBindAllow", "SocketBindDeny")) {
                 CGroupSocketBindItem **list;
                 uint16_t nr_ports, port_min;
index 5f205772fd81abae61eacf930c302a79abfc44cd..953b0d989484581bb71974be4df21275f998418e 100644 (file)
@@ -285,6 +285,10 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) {
         if (r < 0)
                 return r;
 
+        r = serialize_strv(f, "exec-cgroup-context-managed-oom-rules", c->moom_rules);
+        if (r < 0)
+                return r;
+
         r = serialize_item(f, "exec-cgroup-context-memory-pressure-watch", cgroup_pressure_watch_to_string(c->pressure[PRESSURE_MEMORY].watch));
         if (r < 0)
                 return r;
@@ -650,6 +654,10 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
                         r = deserialize_usec(val, &c->moom_mem_pressure_duration_usec);
                         if (r < 0)
                                 return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-rules="))) {
+                        r = deserialize_strv(val, &c->moom_rules);
+                        if (r < 0)
+                                return r;
                 } else if ((val = startswith(l, "exec-cgroup-context-memory-pressure-watch="))) {
                         c->pressure[PRESSURE_MEMORY].watch = cgroup_pressure_watch_from_string(val);
                         if (c->pressure[PRESSURE_MEMORY].watch < 0)
index b8d744c1f49598085e18ab5958226690abba0642..0e2d679f978052191f07d875af4216dc9334fd54 100644 (file)
 {{type}}.ManagedOOMMemoryPressureLimit,       config_parse_managed_oom_mem_pressure_limit,        0,                                  offsetof({{type}}, cgroup_context.moom_mem_pressure_limit)
 {{type}}.ManagedOOMMemoryPressureDurationSec, config_parse_managed_oom_mem_pressure_duration_sec, 0,                                  offsetof({{type}}, cgroup_context.moom_mem_pressure_duration_usec)
 {{type}}.ManagedOOMPreference,                config_parse_managed_oom_preference,                0,                                  offsetof({{type}}, cgroup_context.moom_preference)
+{{type}}.OOMRules,                            config_parse_managed_oom_rules,                     1,                                  offsetof({{type}}, cgroup_context.moom_rules)
 {{type}}.NetClass,                            config_parse_warn_compat,                           DISABLED_LEGACY,                    0
 {{type}}.BPFProgram,                          config_parse_bpf_foreign_program,                   0,                                  offsetof({{type}}, cgroup_context)
 {{type}}.SocketBindAllow,                     config_parse_cgroup_socket_bind,                    0,                                  offsetof({{type}}, cgroup_context.socket_bind_allow)
index 2a268d813b5bbd9816f466894997c3f3d3b7ddf8..9b2fa71be398f6ae16437da81b10a8a84d52090a 100644 (file)
@@ -4090,6 +4090,65 @@ int config_parse_managed_oom_mem_pressure_duration_sec(
         return 0;
 }
 
+int config_parse_managed_oom_rules(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,           /* forwarded below as the filter_duplicates argument */
+                const char *rvalue,
+                void *data,          /* char***: the string list that receives the parsed names */
+                void *userdata) {
+        /* Config parser for a list of OOM ruleset names: valid names found in rvalue are appended to *data. */
+        char ***sv = ASSERT_PTR(data);
+        UnitType t;
+        int r;
+
+        assert(rvalue);
+
+        t = unit_name_to_type(unit);
+        assert(t != _UNIT_TYPE_INVALID);
+
+        if (!unit_vtable[t]->can_set_managed_oom) /* setting is gated on the unit type's vtable flag */
+                return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue);
+
+        if (isempty(rvalue)) { /* empty assignment: release the list collected so far and stop */
+                *sv = strv_free(*sv);
+                return 0;
+        }
+
+        /* Tokenize once: validate each rule name (rulesets are loaded from .oomrule files)
+         * and accumulate into a local strv. Invalid rule names are skipped individually
+         * with a warning so the rest of the line still applies. */
+        _cleanup_strv_free_ char **strv = NULL;
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *word = NULL;
+
+                r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                if (r == 0) /* loop ends here; NOTE(review): presumably 0 means no word is left, confirm */
+                        break;
+                if (r < 0) /* extraction failure aborts the whole assignment */
+                        return log_syntax_parse_error(unit, filename, line, r, lvalue, rvalue);
+
+                if (!string_is_safe(word, STRING_FILENAME)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid rule name in %s=, ignoring: %s", lvalue, word);
+                        continue;
+                }
+
+                r = strv_consume(&strv, TAKE_PTR(word));
+                if (r < 0)
+                        return log_oom();
+        }
+
+        r = strv_extend_strv_consume(sv, TAKE_PTR(strv), /* filter_duplicates= */ ltype);
+        if (r < 0)
+                return log_oom();
+
+        return 0;
+}
+
 int config_parse_device_allow(
                 const char *unit,
                 const char *filename,
index fafb00402830e8223a1025f2e826121a7454cf48..99b53626203ec1e227845017f6658565284f6a41 100644 (file)
@@ -144,6 +144,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_pid_file);
 CONFIG_PARSER_PROTOTYPE(config_parse_exit_status);
 CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers);
 CONFIG_PARSER_PROTOTYPE(config_parse_oom_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_rules);
 CONFIG_PARSER_PROTOTYPE(config_parse_numa_policy);
 CONFIG_PARSER_PROTOTYPE(config_parse_numa_mask);
 CONFIG_PARSER_PROTOTYPE(config_parse_ip_filter_bpf_progs);
index 8ed74b080d144706c8605b7b52f9981c60c3059b..f81083a70f753cf06e400038a734547b27961a1e 100644 (file)
@@ -1626,7 +1626,7 @@ static int unit_add_oomd_dependencies(Unit *u) {
         if (!c)
                 return 0;
 
-        bool wants_oomd = c->moom_swap == MANAGED_OOM_KILL || c->moom_mem_pressure == MANAGED_OOM_KILL;
+        bool wants_oomd = c->moom_swap == MANAGED_OOM_KILL || c->moom_mem_pressure == MANAGED_OOM_KILL || !strv_isempty(c->moom_rules);
         if (!wants_oomd)
                 return 0;
 
index 9953707417d5d00802a4818a257f6dde85bed491..e031f00368bab20e7622ab92ee27a5b5edc96f0c 100644 (file)
@@ -326,6 +326,7 @@ int unit_cgroup_context_build_json(sd_json_variant **ret, const char *name, void
                         JSON_BUILD_PAIR_UNSIGNED_NON_ZERO("ManagedOOMMemoryPressureLimit", c->moom_mem_pressure_limit),
                         JSON_BUILD_PAIR_FINITE_USEC("ManagedOOMMemoryPressureDurationUSec", c->moom_mem_pressure_duration_usec),
                         JSON_BUILD_PAIR_ENUM("ManagedOOMPreference", managed_oom_preference_to_string(c->moom_preference)),
+                        JSON_BUILD_PAIR_STRV_NON_EMPTY("OOMRules", c->moom_rules),
                         JSON_BUILD_PAIR_ENUM("MemoryPressureWatch", cgroup_pressure_watch_to_string(c->pressure[PRESSURE_MEMORY].watch)),
                         JSON_BUILD_PAIR_FINITE_USEC("MemoryPressureThresholdUSec", c->pressure[PRESSURE_MEMORY].threshold_usec),
                         JSON_BUILD_PAIR_ENUM("CPUPressureWatch", cgroup_pressure_watch_to_string(c->pressure[PRESSURE_CPU].watch)),
index 09817b6dce2c46861a87c47d386636bafeba7842..f6b474c636307dd88d6e5a556472659b610c9949 100644 (file)
@@ -11,6 +11,7 @@
 #include "path-util.h"
 #include "pidref.h"
 #include "string-util.h"
+#include "strv.h"
 #include "unit.h"
 #include "varlink.h"
 #include "varlink-dynamic-user.h"
 static const char* const managed_oom_mode_properties[] = {
         "ManagedOOMSwap",
         "ManagedOOMMemoryPressure",
+        "OOMRules",
 };
 
 static int build_managed_oom_json_array_element(Unit *u, const char *property, sd_json_variant **ret_v) {
-        bool use_limit = false, use_duration = false;
+        bool use_limit = false, use_duration = false, use_rules = false;
         CGroupContext *c;
         const char *mode;
 
@@ -60,15 +62,25 @@ static int build_managed_oom_json_array_element(Unit *u, const char *property, s
                 mode = managed_oom_mode_to_string(c->moom_mem_pressure);
                 use_limit = c->moom_mem_pressure_limit > 0;
                 use_duration = c->moom_mem_pressure_duration_usec != USEC_INFINITY;
+        } else if (streq(property, "OOMRules")) {
+                if (strv_isempty(c->moom_rules))
+                        mode = managed_oom_mode_to_string(MANAGED_OOM_AUTO);
+                else {
+                        mode = managed_oom_mode_to_string(MANAGED_OOM_KILL);
+                        use_rules = true;
+                }
         } else
                 return -EINVAL;
 
+        assert(mode);
+
         return sd_json_buildo(ret_v,
                               JSON_BUILD_PAIR_ENUM("mode", mode),
                               SD_JSON_BUILD_PAIR_STRING("path", crt->cgroup_path),
                               SD_JSON_BUILD_PAIR_STRING("property", property),
                               SD_JSON_BUILD_PAIR_CONDITION(use_limit, "limit", SD_JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit)),
-                              SD_JSON_BUILD_PAIR_CONDITION(use_duration, "duration", SD_JSON_BUILD_UNSIGNED(c->moom_mem_pressure_duration_usec)));
+                              SD_JSON_BUILD_PAIR_CONDITION(use_duration, "duration", SD_JSON_BUILD_UNSIGNED(c->moom_mem_pressure_duration_usec)),
+                              SD_JSON_BUILD_PAIR_CONDITION(use_rules, "rules", SD_JSON_BUILD_STRV(c->moom_rules)));
 }
 
 static int build_managed_oom_cgroups_json(Manager *m, bool allow_empty, sd_json_variant **ret) {
@@ -109,7 +121,8 @@ static int build_managed_oom_cgroups_json(Manager *m, bool allow_empty, sd_json_
                                 /* For the initial varlink call we only care about units that enabled (i.e. mode is not
                                  * set to "auto") oomd properties. */
                                 if (!(streq(*i, "ManagedOOMSwap") && c->moom_swap == MANAGED_OOM_KILL) &&
-                                    !(streq(*i, "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL))
+                                    !(streq(*i, "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL) &&
+                                    !(streq(*i, "OOMRules") && !strv_isempty(c->moom_rules)))
                                         continue;
 
                                 r = build_managed_oom_json_array_element(u, *i, &e);
index f0091e27561c77bb2017bf919b8af4debf8566da..bd3d0003b07e572b8bd099940312f9dae0b51644 100644 (file)
@@ -1,11 +1,18 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 
+#include "conf-files.h"
 #include "conf-parser.h"
+#include "hashmap.h"
 #include "log.h"
 #include "oomd-conf.h"
 #include "oomd-manager.h"
 #include "parse-util.h"
+#include "path-util.h"
+#include "percent-util.h"
+#include "stat-util.h"
+#include "string-table.h"
 #include "string-util.h"
+#include "strv.h"
 #include "time-util.h"
 
 static int config_parse_duration(
@@ -66,7 +73,143 @@ void manager_set_defaults(Manager *m) {
                 log_warning_errno(r, "Failed to set default for default_mem_pressure_limit, ignoring: %m");
 }
 
+/* OOMD_ACTION_NONE is intentionally omitted — it's the "unset" sentinel. Rulesets with
+ * .action == OOMD_ACTION_NONE are rejected at load time, so oomd_action_to_string() must
+ * only be called on rulesets that have already passed ruleset_load_one's validation
+ * (otherwise it returns NULL). */
+static const char* const oomd_action_table[] = {
+        [OOMD_ACTION_KILL_ALL]       = "kill-all",
+        [OOMD_ACTION_KILL_BY_PGSCAN] = "kill-by-pgscan",
+        [OOMD_ACTION_KILL_BY_SWAP]   = "kill-by-swap",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(oomd_action, OomdAction);
+static DEFINE_CONFIG_PARSE_ENUM(config_parse_oomd_action, oomd_action, OomdAction);
+
+/* Releases a ruleset and the resources reachable from it: the name string and the start_times
+ * hashmap. Passing NULL is a no-op. */
+void oomd_ruleset_free(OomdRuleset *ruleset) {
+        if (ruleset) {
+                free(ruleset->name);
+                hashmap_free(ruleset->start_times);
+                free(ruleset);
+        }
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(OomdRuleset*, oomd_ruleset_free, NULL);
+DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(oomd_ruleset_hash_ops, char, string_hash_func, string_compare_func, OomdRuleset, oomd_ruleset_free);
+
+/* Loads one ".oomrule" file and, if it passes validation, registers the resulting ruleset in
+ * m->rulesets under the file name with the ".oomrule" suffix stripped.
+ *
+ * Returns 0 both when the ruleset was registered and when the file was skipped (empty file, unsafe
+ * name, or a ruleset rejected by the validation checks below). Returns the (negative) result of
+ * the log_*_errno()/log_oom() helpers when the file cannot be opened or stat'ed, the file name
+ * cannot be extracted, memory allocation fails, parsing fails, or the hashmap insertion fails. */
+static int ruleset_load_one(Manager *m, const char *filename) {
+        _cleanup_free_ char *name = NULL;
+        _cleanup_(oomd_ruleset_freep) OomdRuleset *ruleset = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        struct stat st;
+        int r;
+
+        assert(m);
+        assert(filename);
+
+        /* Pin the file via an fd so the empty-file check and the parse operate on the same
+         * inode (avoids TOCTOU between null_or_empty_path() and a subsequent open()). */
+        f = fopen(filename, "re");
+        if (!f)
+                return log_warning_errno(errno, "Failed to open '%s': %m", filename);
+
+        if (fstat(fileno(f), &st) < 0)
+                return log_warning_errno(errno, "Failed to stat '%s': %m", filename);
+
+        if (null_or_empty(&st)) {
+                log_debug("Skipping empty file: %s", filename);
+                return 0;
+        }
+
+        r = path_extract_filename(filename, &name);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract file name of '%s': %m", filename);
+
+        /* The suffix is asserted rather than checked: manager_parse_config_file() enumerates the
+         * files with the ".oomrule" suffix filter before calling us. Cut it off in place so that
+         * 'name' becomes the ruleset name. */
+        char *e = ASSERT_PTR(endswith(name, ".oomrule"));
+        *e = 0;
+
+        /* Apply the same validation the DBus setter and the config parser use, so that any
+         * ruleset we accept here is actually referenceable via OOMRules= from a unit. */
+        if (!string_is_safe(name, STRING_FILENAME)) {
+                log_warning("Invalid ruleset name '%s' derived from '%s', ignoring.", name, filename);
+                return 0;
+        }
+
+        ruleset = new(OomdRuleset, 1);
+        if (!ruleset)
+                return log_oom();
+
+        /* -1 marks a condition as "not configured"; the checks further down and the evaluation
+         * code test for '< 0' / '>= 0'. Fields not listed here are zero-initialized by the
+         * compound literal. Ownership of 'name' moves into the ruleset. */
+        *ruleset = (OomdRuleset) {
+                .name = TAKE_PTR(name),
+                .memory_pressure_above = -1,
+                .swap_above = -1,
+        };
+
+        /* The parsers write straight into the ruleset fields referenced here. */
+        const ConfigTableItem items[] = {
+                { "Rule", "MemoryPressureAbove", config_parse_permyriad,   0, &ruleset->memory_pressure_above },
+                { "Rule", "SwapUsageMax",        config_parse_permyriad,   0, &ruleset->swap_above            },
+                { "Rule", "Action",              config_parse_oomd_action, 0, &ruleset->action                },
+                { "Rule", "LastingSec",          config_parse_sec,         0, &ruleset->lasting_usec          },
+                {}
+        };
+
+        r = config_parse(
+                        /* unit= */ NULL,
+                        filename,
+                        f,
+                        "Rule\0",
+                        config_item_table_lookup,
+                        items,
+                        CONFIG_PARSE_WARN,
+                        /* userdata= */ NULL,
+                        /* ret_stat= */ NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse ruleset file '%s': %m", filename);
+
+        /* From here on every rejected ruleset is only logged and skipped (return 0); the
+         * _cleanup_ handler on 'ruleset' releases it. */
+        if (ruleset->memory_pressure_above < 0 && ruleset->swap_above < 0) {
+                log_warning("Ruleset '%s' has no conditions configured (MemoryPressureAbove= or SwapUsageMax=), ignoring.", ruleset->name);
+                return 0;
+        }
+
+        if (ruleset->action == OOMD_ACTION_NONE) {
+                log_warning("Ruleset '%s' has no Action= configured, ignoring.", ruleset->name);
+                return 0;
+        }
+
+        if (ruleset->lasting_usec == USEC_INFINITY) {
+                log_warning("Ruleset '%s' has LastingSec=infinity which can never be satisfied, ignoring.", ruleset->name);
+                return 0;
+        }
+
+        /* A threshold at the maximum can never be exceeded, so the condition would never fire.
+         * Report the normalized percent value so the warning matches regardless of whether the
+         * user wrote '100%', '1000‰' or '10000‱'. */
+        if (ruleset->memory_pressure_above == 10000) {
+                log_warning("Ruleset '%s' has MemoryPressureAbove=" PERMYRIAD_AS_PERCENT_FORMAT_STR " (the maximum) which can never be exceeded, ignoring.",
+                            ruleset->name, PERMYRIAD_AS_PERCENT_FORMAT_VAL(ruleset->memory_pressure_above));
+                return 0;
+        }
+
+        if (ruleset->swap_above == 10000) {
+                log_warning("Ruleset '%s' has SwapUsageMax=" PERMYRIAD_AS_PERCENT_FORMAT_STR " (the maximum) which can never be exceeded, ignoring.",
+                            ruleset->name, PERMYRIAD_AS_PERCENT_FORMAT_VAL(ruleset->swap_above));
+                return 0;
+        }
+
+        /* Duplicates cannot occur here: conf_files_list_strv deduplicates filenames across
+         * directories, and hashmap_clear is called before loading. The value destructor in
+         * oomd_ruleset_hash_ops handles cleanup during hashmap_clear/hashmap_free. */
+        r = hashmap_ensure_replace(&m->rulesets, &oomd_ruleset_hash_ops, ruleset->name, ruleset);
+        if (r < 0)
+                return log_error_errno(r, "Failed to register ruleset '%s': %m", ruleset->name);
+
+        /* The hashmap references the ruleset now; disarm the _cleanup_ handler. */
+        TAKE_PTR(ruleset);
+
+        return 0;
+}
+
 void manager_parse_config_file(Manager *m) {
+        _cleanup_strv_free_ char **files = NULL;
         int r;
 
         assert(m);
@@ -88,4 +231,37 @@ void manager_parse_config_file(Manager *m) {
                         /* userdata= */ m);
         if (r >= 0)
                 log_debug("Config file successfully parsed.");
+
+        r = conf_files_list_strv(&files, ".oomrule", /* root= */ NULL, CONF_FILES_WARN, RULESET_DIRS);
+        if (r < 0) {
+                /* On enumeration failure, keep the previously loaded rulesets rather than clearing them —
+                 * a transient I/O error shouldn't cause in-flight units to silently lose their OOM policy. */
+                log_error_errno(r, "Failed to enumerate ruleset files, keeping previously loaded rulesets: %m");
+                return;
+        }
+
+        /* Clear all rulesets and re-parse. This intentionally resets any accumulated
+         * start_times (LastingSec timers), since the ruleset definitions may have changed. */
+        hashmap_clear(m->rulesets);
+
+        STRV_FOREACH(f, files)
+                (void) ruleset_load_one(m, *f);
+
+        if (DEBUG_LOGGING) {
+                char *name;
+                OomdRuleset *ruleset;
+                HASHMAP_FOREACH_KEY(ruleset, name, m->rulesets) {
+                        log_debug("Registered ruleset: %s", name);
+                        if (ruleset->memory_pressure_above >= 0)
+                                log_debug("  MemoryPressureAbove=" PERMYRIAD_AS_PERCENT_FORMAT_STR, PERMYRIAD_AS_PERCENT_FORMAT_VAL(ruleset->memory_pressure_above));
+                        else
+                                log_debug("  MemoryPressureAbove=unset");
+                        if (ruleset->swap_above >= 0)
+                                log_debug("  SwapUsageMax=" PERMYRIAD_AS_PERCENT_FORMAT_STR, PERMYRIAD_AS_PERCENT_FORMAT_VAL(ruleset->swap_above));
+                        else
+                                log_debug("  SwapUsageMax=unset");
+                        log_debug("  Action=%s", oomd_action_to_string(ruleset->action));
+                        log_debug("  LastingSec=%s", FORMAT_TIMESPAN(ruleset->lasting_usec, USEC_PER_SEC));
+                }
+        }
 }
index 429b976b91be274045386d86289ef6a0c4284a11..8f715e81c6fe0ff432f6a0b711ec4caf2ff3ba64 100644 (file)
@@ -1,8 +1,16 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 #pragma once
 
+#include "string-table.h"      /* IWYU pragma: keep */
+
 typedef struct Manager Manager;
+typedef struct OomdRuleset OomdRuleset;
+typedef enum OomdAction OomdAction;
+
+void oomd_ruleset_free(OomdRuleset *ruleset);
 
 void manager_set_defaults(Manager *m);
 
 void manager_parse_config_file(Manager *m);
+
+DECLARE_STRING_TABLE_LOOKUP(oomd_action, OomdAction);
index 382a246c2dddbb78328584e353c85f89b1e687b6..7ec6684f6e2bd172628cb2a7759ae93ec943162f 100644 (file)
@@ -24,6 +24,7 @@
 #include "percent-util.h"
 #include "set.h"
 #include "string-util.h"
+#include "strv.h"
 #include "time-util.h"
 #include "varlink-io.systemd.oom.h"
 #include "varlink-io.systemd.service.h"
@@ -35,12 +36,14 @@ typedef struct ManagedOOMMessage {
         char *property;
         uint32_t limit;
         usec_t duration;
+        char **rules;
 } ManagedOOMMessage;
 
 static void managed_oom_message_destroy(ManagedOOMMessage *message) {
         assert(message);
         free(message->path);
         free(message->property);
+        strv_free(message->rules);
 }
 
 static JSON_DISPATCH_ENUM_DEFINE(dispatch_managed_oom_mode, ManagedOOMMode, managed_oom_mode_from_string);
@@ -55,6 +58,7 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p
                 { "property", SD_JSON_VARIANT_STRING,        sd_json_dispatch_string,   offsetof(ManagedOOMMessage, property), SD_JSON_MANDATORY },
                 { "limit",    _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32,   offsetof(ManagedOOMMessage, limit),    0                 },
                 { "duration", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint64,   offsetof(ManagedOOMMessage, duration), 0                 },
+                { "rules",    _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_strv,     offsetof(ManagedOOMMessage, rules),    0                 },
                 {},
         };
 
@@ -101,11 +105,31 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p
                                                        "(" UID_FMT " != " UID_FMT ")", uid, cg_uid);
                 }
 
-                monitor_hm = streq(message.property, "ManagedOOMSwap") ?
-                                m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;
+                if (streq(message.property, "ManagedOOMSwap"))
+                        monitor_hm = m->monitored_swap_cgroup_contexts;
+                else if (streq(message.property, "OOMRules"))
+                        monitor_hm = m->monitored_rules_cgroup_contexts;
+                else if (streq(message.property, "ManagedOOMMemoryPressure"))
+                        monitor_hm = m->monitored_mem_pressure_cgroup_contexts;
+                else {
+                        log_debug("Unknown property '%s', ignoring.", message.property);
+                        continue;
+                }
 
                 if (message.mode == MANAGED_OOM_AUTO) {
                         (void) oomd_cgroup_context_unref(hashmap_remove(monitor_hm, empty_to_root(message.path)));
+
+                        /* Clean up start_times entries for this cgroup across all rulesets
+                         * to prevent stale timers from causing premature action triggers
+                         * if the cgroup re-subscribes later. */
+                        if (streq(message.property, "OOMRules")) {
+                                OomdRuleset *ruleset;
+                                HASHMAP_FOREACH(ruleset, m->rulesets) {
+                                        _cleanup_free_ char *key = NULL;
+                                        free(hashmap_remove2(ruleset->start_times, empty_to_root(message.path), (void **) &key));
+                                }
+                        }
+
                         continue;
                 }
 
@@ -124,6 +148,57 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p
                 else
                         duration = m->default_mem_pressure_duration_usec;
 
+                /* For OOMRules, only insert/update if rules are actually provided */
+                if (streq(message.property, "OOMRules")) {
+                        if (strv_isempty(message.rules))
+                                continue;
+
+                        /* Avoid re-reading memory.pressure/pgscan/etc. on every OOMRules update for a
+                         * cgroup we already track — fetch the existing context first and only acquire
+                         * a fresh one if the cgroup is new. */
+                        ctx = hashmap_get(monitor_hm, empty_to_root(message.path));
+                        if (!ctx) {
+                                r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path);
+                                if (r == -ENOMEM)
+                                        return r;
+                                if (r < 0) {
+                                        log_debug_errno(r, "Failed to insert message, ignoring: %m");
+                                        continue;
+                                }
+                                ctx = hashmap_get(monitor_hm, empty_to_root(message.path));
+                        }
+
+                        if (ctx) {
+                                /* For each rule being dropped from this cgroup's subscription,
+                                 * remove its start_times entry so the timer doesn't linger. */
+                                STRV_FOREACH(old_rule, ctx->rules) {
+                                        if (strv_contains(message.rules, *old_rule))
+                                                continue;
+                                        OomdRuleset *dropped = hashmap_get(m->rulesets, *old_rule);
+                                        if (!dropped)
+                                                continue;
+                                        _cleanup_free_ char *key = NULL;
+                                        free(hashmap_remove2(dropped->start_times, empty_to_root(message.path), (void**) &key));
+                                }
+
+                                strv_free_and_replace(ctx->rules, message.rules);
+
+                                /* Defensively deduplicate: the DBus setter and config parser both
+                                 * dedupe, but another varlink client could in principle send
+                                 * duplicates, which would cause redundant per-interval evaluation. */
+                                strv_uniq(ctx->rules);
+
+                                /* Warn about any referenced rules that don't exist. Done here
+                                 * (once per subscription change) rather than per-interval to avoid
+                                 * log spam when a unit references a missing ruleset. */
+                                STRV_FOREACH(new_rule, ctx->rules)
+                                        if (!hashmap_contains(m->rulesets, *new_rule))
+                                                log_warning("Cgroup %s references undefined ruleset '%s', it will be ignored.",
+                                                            ctx->path, *new_rule);
+                        }
+                        continue;
+                }
+
                 r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path);
                 if (r == -ENOMEM)
                         return r;
@@ -145,6 +220,12 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p
         if (r < 0)
                 return log_error_errno(r, "Failed to toggle enabled state of swap context source: %m");
 
+        /* Toggle wake-ups for "OOMRules" if entries are present. */
+        r = sd_event_source_set_enabled(m->rules_context_event_source,
+                                        hashmap_isempty(m->monitored_rules_cgroup_contexts) ? SD_EVENT_OFF : SD_EVENT_ON);
+        if (r < 0)
+                return log_error_errno(r, "Failed to toggle enabled state of rules context source: %m");
+
         return 0;
 }
 
@@ -408,7 +489,7 @@ static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void
                         log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
 
                 threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
-                r = oomd_select_by_swap_usage(candidates, threshold, &selected);
+                r = oomd_select_by_swap_usage(candidates, /* prefix= */ NULL, threshold, &selected);
                 if (r < 0)
                         return log_error_errno(r, "Failed to select any cgroups based on swap: %m");
                 if (r == 0) {
@@ -584,6 +665,398 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
         return 0;
 }
 
+/* Carries out the action configured in 'ruleset' for the subscribed cgroup 'ctx', after the
+ * ruleset's conditions were found to be met.
+ *
+ * Returns 1 if oomd_cgroup_kill_mark() succeeded for the chosen cgroup, 0 if nothing was killed
+ * (no recent reclaim activity, no swap, no candidate selected, or a non-ENOMEM failure that was
+ * only logged), and the result of log_oom() on allocation failure. */
+static int ruleset_execute_action(
+                Manager *m,
+                OomdCGroupContext *ctx,
+                OomdRuleset *ruleset,
+                const char *rule_name,
+                usec_t usec_now) {
+
+        _cleanup_free_ char *kill_reason = NULL;
+        OomdCGroupContext *victim = NULL;
+        int r;
+
+        assert(m);
+        assert(ctx);
+        assert(ruleset);
+        assert(rule_name);
+
+        if (ruleset->lasting_usec == 0)
+                log_notice("Rule '%s' conditions met for cgroup %s, taking action %s",
+                           rule_name,
+                           ctx->path,
+                           oomd_action_to_string(ruleset->action));
+        else
+                log_notice("Rule '%s' conditions met for cgroup %s (lasting %s), taking action %s",
+                           rule_name,
+                           ctx->path,
+                           FORMAT_TIMESPAN(ruleset->lasting_usec, USEC_PER_SEC),
+                           oomd_action_to_string(ruleset->action));
+
+        kill_reason = strjoin("rule ", rule_name);
+        if (!kill_reason)
+                return log_oom();
+
+        switch (ruleset->action) {
+
+        case OOMD_ACTION_KILL_ALL:
+                /* Act on the subscribed cgroup itself; no candidate selection involved. */
+                r = oomd_cgroup_kill_mark(m, ctx, kill_reason);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_notice_errno(r, "Failed to kill all processes in %s: %m", ctx->path);
+                        return 0;
+                }
+                break;
+
+        case OOMD_ACTION_KILL_BY_PGSCAN:
+                /* Check if there was reclaim activity in the given interval. If there isn't any reclaim
+                 * pressure, killing won't help — well-behaved processes faulting in recently resident
+                 * pages will keep pressure high even after the offending cgroup is killed. */
+                if (usec_sub_unsigned(usec_now, ctx->last_had_mem_reclaim) > RECLAIM_DURATION_USEC) {
+                        log_debug("No reclaim activity for %s, skipping pgscan-based action", ctx->path);
+                        return 0;
+                }
+
+                r = oomd_select_by_pgscan_rate(m->monitored_rules_cgroup_contexts_candidates, ctx->path, &victim);
+                if (r < 0) {
+                        log_notice_errno(r, "Failed to select cgroup by pgscan rate for %s: %m", ctx->path);
+                        return 0;
+                }
+                if (r == 0) {
+                        log_debug("No cgroup candidates found for pgscan-based action for %s", ctx->path);
+                        return 0;
+                }
+
+                r = oomd_cgroup_kill_mark(m, victim, kill_reason);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_notice_errno(r, "Failed to kill processes in %s: %m", victim->path);
+                        return 0;
+                }
+                break;
+
+        case OOMD_ACTION_KILL_BY_SWAP: {
+                uint64_t swap_threshold;
+
+                if (m->system_context.swap_total == 0) {
+                        /* Warn only once per ruleset while swap stays unavailable. */
+                        if (!ruleset->warned_no_swap) {
+                                log_warning("Rule '%s' uses kill-by-swap action but no swap is configured, skipping (further occurrences suppressed)", rule_name);
+                                ruleset->warned_no_swap = true;
+                        }
+                        return 0;
+                }
+
+                /* Swap came back — clear the latch so re-disabling swap warns again. */
+                ruleset->warned_no_swap = false;
+
+                swap_threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
+                r = oomd_select_by_swap_usage(m->monitored_rules_cgroup_contexts_candidates, ctx->path, swap_threshold, &victim);
+                if (r < 0) {
+                        log_notice_errno(r, "Failed to select cgroup by swap usage for %s: %m", ctx->path);
+                        return 0;
+                }
+                if (r == 0) {
+                        log_debug("No cgroup candidates found for swap-based action for %s", ctx->path);
+                        return 0;
+                }
+
+                r = oomd_cgroup_kill_mark(m, victim, kill_reason);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_notice_errno(r, "Failed to kill processes in %s: %m", victim->path);
+                        return 0;
+                }
+                break;
+        }
+
+        default:
+                assert_not_reached();
+        }
+
+        return 1;
+}
+
+/* Evaluates the conditions of 'ruleset' for the cgroup 'ctx' and records in ruleset->start_times
+ * (keyed by cgroup path) since when they have been met without interruption. Memory pressure is
+ * read from the cgroup's avg10 value, swap usage from m->system_context.
+ *
+ * Returns true once all configured conditions are met and have been met for at least
+ * ruleset->lasting_usec; false if a condition is not met (any recorded start time is dropped) or
+ * the required duration has not elapsed yet; and the (negative) result of the log_*() helpers on
+ * failure. */
+static int ruleset_check_conditions(
+                Manager *m,
+                OomdCGroupContext *ctx,
+                OomdRuleset *ruleset,
+                const char *rule_name,
+                usec_t usec_now) {
+
+        int r;
+
+        assert(m);
+        assert(ctx);
+        assert(ruleset);
+        assert(rule_name);
+
+        /* Check memory pressure condition.
+         * memory_pressure_above is in permyriad (0-10000, i.e. 6050 = 60.50%).
+         * store_loadavg_fixed_point takes integer and decimal parts of a percentage,
+         * so divide/modulo by 100 to split permyriad into percent + centipercent. */
+        if (ruleset->memory_pressure_above >= 0) {
+                loadavg_t threshold;
+                r = store_loadavg_fixed_point(ruleset->memory_pressure_above / 100,
+                                              ruleset->memory_pressure_above % 100,
+                                              &threshold);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to convert pressure threshold for rule '%s': %m", rule_name);
+
+                /* "Above" is strict: pressure equal to the threshold does not count. */
+                if (ctx->memory_pressure.avg10 <= threshold)
+                        goto reset;
+        }
+
+        /* swap_above means take action when swap usage is above threshold.
+         * oomd_swap_free_below returns true when swap free is below threshold,
+         * so if swap_above is X%, check if swap free is below (100-X)%.
+         * When no swap is configured, the condition cannot be meaningfully evaluated. */
+        if (ruleset->swap_above >= 0) {
+                if (m->system_context.swap_total == 0 ||
+                    !oomd_swap_free_below(&m->system_context, 10000 - ruleset->swap_above))
+                        goto reset;
+        }
+
+        /* All conditions met, check if LastingSec requirement is satisfied */
+        usec_t *start_time = hashmap_get(ruleset->start_times, ctx->path);
+        if (!start_time) {
+                /* First time seeing this condition - record the start time */
+                _cleanup_free_ usec_t *new_start_time = new(usec_t, 1);
+                if (!new_start_time)
+                        return log_oom();
+
+                *new_start_time = usec_now;
+
+                _cleanup_free_ char *path_copy = strdup(ctx->path);
+                if (!path_copy)
+                        return log_oom();
+
+                r = hashmap_ensure_put(&ruleset->start_times, &string_hash_ops_free_free, path_copy, new_start_time);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to record start time for rule '%s' on %s: %m",
+                                               rule_name, ctx->path);
+                /* Key and value are referenced by the hashmap now; disarm the _cleanup_ handlers. */
+                TAKE_PTR(path_copy);
+                TAKE_PTR(new_start_time);
+
+                /* If lasting_usec is 0, take action immediately */
+                if (ruleset->lasting_usec == 0)
+                        return true;
+
+                log_debug("Rule '%s' conditions met for cgroup %s, waiting for %s",
+                          rule_name, ctx->path,
+                          FORMAT_TIMESPAN(ruleset->lasting_usec, USEC_PER_SEC));
+                return false;
+        }
+
+        /* Check if the condition has been true for long enough */
+        usec_t duration = usec_sub_unsigned(usec_now, *start_time);
+        if (duration >= ruleset->lasting_usec)
+                return true;
+
+        log_debug("Rule '%s' conditions met for cgroup %s for %s (need %s)",
+                  rule_name, ctx->path,
+                  FORMAT_TIMESPAN(duration, USEC_PER_SEC),
+                  FORMAT_TIMESPAN(ruleset->lasting_usec, USEC_PER_SEC));
+        return false;
+
+reset:
+        /* Conditions no longer met — remove start time if it exists. */
+        {
+                /* hashmap_remove2() hands back both the stored key and the value; the _cleanup_
+                 * handlers free them when this scope is left. */
+                _cleanup_free_ char *old_key = NULL;
+                _cleanup_free_ usec_t *old_start_time =
+                        hashmap_remove2(ruleset->start_times, ctx->path, (void**) &old_key);
+                if (old_start_time)
+                        log_debug("Rule '%s' conditions no longer met for cgroup %s, resetting timer",
+                                  rule_name, ctx->path);
+        }
+        return false;
+}
+
+/* After a reload, some cgroups may reference rulesets that no longer exist (or didn't exist yet
+ * when the cgroup subscribed). Each call logs one warning per (cgroup, rule) pair whose rule is
+ * not present in m->rulesets, so the operator sees the mismatch without the per-interval
+ * evaluation loop having to log it. */
+static void warn_missing_rulesets(Manager *m) {
+        OomdCGroupContext *cgroup;
+
+        assert(m);
+
+        HASHMAP_FOREACH(cgroup, m->monitored_rules_cgroup_contexts)
+                STRV_FOREACH(rule, cgroup->rules) {
+                        if (hashmap_contains(m->rulesets, *rule))
+                                continue;
+
+                        log_warning("Cgroup %s references undefined ruleset '%s', it will be ignored.",
+                                    cgroup->path, *rule);
+                }
+}
+
+/* Drops, from every ruleset, the start_times entries whose cgroup path is no longer a key of
+ * m->monitored_rules_cgroup_contexts. Cgroups can vanish silently (unit stops, cgroup destroyed)
+ * without an explicit unsubscribe message, so this reconciliation keeps start_times from growing
+ * without bound.
+ *
+ * Returns 0 on success, or the result of log_oom() if the list of stale paths cannot be built. */
+static int prune_stale_ruleset_start_times(Manager *m) {
+        OomdRuleset *rs;
+
+        assert(m);
+
+        HASHMAP_FOREACH(rs, m->rulesets) {
+                _cleanup_strv_free_ char **stale = NULL;
+                const char *cgroup_path;
+                void *unused;
+
+                /* First pass: only collect copies of the stale paths, leaving the map untouched
+                 * while it is being iterated. */
+                HASHMAP_FOREACH_KEY(unused, cgroup_path, rs->start_times) {
+                        if (hashmap_contains(m->monitored_rules_cgroup_contexts, cgroup_path))
+                                continue;
+
+                        if (strv_extend(&stale, cgroup_path) < 0)
+                                return log_oom();
+                }
+
+                /* Second pass: remove the collected entries, freeing both stored key and value. */
+                STRV_FOREACH(p, stale) {
+                        _cleanup_free_ char *stored_key = NULL;
+                        free(hashmap_remove2(rs->start_times, *p, (void**) &stored_key));
+                }
+        }
+
+        return 0;
+}
+
+/* Evaluates the rules the cgroup 'ctx' is subscribed to, in list order, and executes the action of
+ * the first rule whose conditions are satisfied and whose action actually kills something.
+ *
+ * Returns 0 in all non-fatal cases (including "no rule fired"), or the negative value returned by
+ * ruleset_execute_action(). Failures of ruleset_check_conditions() only skip the affected rule. */
+static int process_rules_cgroup_context(Manager *m, OomdCGroupContext *ctx, usec_t usec_now) {
+        assert(m);
+        assert(ctx);
+
+        /* An empty or NULL rule list simply yields zero iterations. */
+        STRV_FOREACH(rule_name, ctx->rules) {
+                OomdRuleset *ruleset;
+                int r;
+
+                ruleset = hashmap_get(m->rulesets, *rule_name);
+                if (!ruleset)
+                        /* Silently skip: already warned once when the subscription was attached or when
+                         * rulesets were loaded. Repeating here would fire every interval. */
+                        continue;
+
+                /* Both "conditions not (yet) satisfied" and evaluation errors move on to the next rule. */
+                if (ruleset_check_conditions(m, ctx, ruleset, *rule_name, usec_now) <= 0)
+                        continue;
+
+                r = ruleset_execute_action(m, ctx, ruleset, *rule_name, usec_now);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        /* Nothing was killed: keep the start time so the rule is retried on the next
+                         * interval without having to satisfy LastingSec from scratch. */
+                        continue;
+
+                /* The action killed something: drop the start time so that LastingSec must be
+                 * satisfied again before this rule can re-trigger for this cgroup. */
+                _cleanup_free_ char *stored_key = NULL;
+                free(hashmap_remove2(ruleset->start_times, ctx->path, (void **) &stored_key));
+
+                /* Global (not per-cgroup/per-ruleset) post-action delay: after any
+                 * successful ruleset kill we suppress *all* subsequent rule evaluations
+                 * until POST_ACTION_DELAY_USEC elapses. This is intentional — pressure
+                 * and swap metrics need time to reflect the effect of a kill before we
+                 * act again, otherwise a single overload could cascade into multiple
+                 * unrelated kills across sibling cgroups within the same interval. */
+                m->rules_post_action_delay_start = usec_now;
+                return 0;
+        }
+
+        return 0;
+}
+
+/* Timer callback that drives ruleset evaluation. On every tick it re-arms itself
+ * RULESETS_INTERVAL_USEC into the future, refreshes the monitored cgroup contexts and, unless
+ * the global post-action delay is still running, evaluates the rulesets of each monitored cgroup.
+ *
+ * s        - the timer event source being dispatched
+ * usec     - trigger time supplied by sd-event (unused; the current time is queried instead)
+ * userdata - the Manager
+ *
+ * Returns 0 on success or a negative errno-style value on failure. */
+static int monitor_rules_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        bool delay_active = false;
+        usec_t now_usec;
+        int r;
+
+        assert(s);
+
+        /* Fetch the current time and re-arm the timer for the next interval */
+        r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &now_usec);
+        if (r < 0)
+                return log_error_errno(r, "Failed to reset event timer: %m");
+
+        r = sd_event_source_set_time_relative(s, RULESETS_INTERVAL_USEC);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set relative time for timer: %m");
+
+        /* Re-establish the varlink connection if it went away */
+        if (!m->varlink_client) {
+                r = acquire_managed_oom_connect(m);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire varlink connection: %m");
+        }
+
+        /* Nothing subscribed to any ruleset: nothing to do */
+        if (hashmap_isempty(m->monitored_rules_cgroup_contexts))
+                return 0;
+
+        /* Work out up front whether the post-action delay window is still open, so that the
+         * expensive descendant walk further down can be skipped. An expired window is cleared. */
+        if (m->rules_post_action_delay_start > 0) {
+                delay_active = usec_add(m->rules_post_action_delay_start, POST_ACTION_DELAY_USEC) > now_usec;
+                if (!delay_active)
+                        m->rules_post_action_delay_start = 0;
+        }
+
+        /* The subscribed (parent) cgroup contexts are refreshed on every tick, delay or not, so
+         * that pgscan rate differentials stay accurate across intervals. Only the kill action
+         * itself is suppressed during the delay.
+         *
+         * update_monitored_cgroup_contexts() rebuilds the hashmap via
+         * oomd_insert_cgroup_context(), which carries the per-cgroup 'rules' strv over from the
+         * old context. This code relies on that: the rules attached to a cgroup context survive
+         * the refresh. */
+        r = update_monitored_cgroup_contexts(&m->monitored_rules_cgroup_contexts);
+        if (r < 0) {
+                if (r == -ENOMEM)
+                        return log_oom();
+
+                log_debug_errno(r, "Failed to update monitored rules cgroup contexts, ignoring: %m");
+        }
+
+        /* Refreshing the candidates recursively walks the descendants of every monitored cgroup.
+         * Candidates are only consumed by kill-by-pgscan / kill-by-swap, both of which are
+         * suppressed during the delay, so the walk is skipped while no action will be taken. */
+        if (!delay_active) {
+                r = update_monitored_cgroup_contexts_candidates(
+                                m->monitored_rules_cgroup_contexts, &m->monitored_rules_cgroup_contexts_candidates);
+                if (r < 0) {
+                        if (r == -ENOMEM)
+                                return log_oom();
+
+                        log_debug_errno(r, "Failed to update monitored rules cgroup candidates, ignoring: %m");
+                }
+        }
+
+        r = prune_stale_ruleset_start_times(m);
+        if (r < 0)
+                return r;
+
+        if (delay_active)
+                return 0;
+
+        /* /proc/meminfo is only read when at least one ruleset needs swap information, either
+         * for a SwapUsageMax= condition or for a kill-by-swap action (which uses swap_total to
+         * compute the per-cgroup selection threshold). */
+        bool need_meminfo = false;
+        OomdRuleset *rs;
+        HASHMAP_FOREACH(rs, m->rulesets)
+                if (rs->swap_above >= 0 || rs->action == OOMD_ACTION_KILL_BY_SWAP) {
+                        need_meminfo = true;
+                        break;
+                }
+
+        if (need_meminfo) {
+                r = oomd_system_context_acquire("/proc/meminfo", &m->system_context);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire system context: %m");
+        }
+
+        OomdCGroupContext *cgroup_ctx;
+        HASHMAP_FOREACH(cgroup_ctx, m->monitored_rules_cgroup_contexts) {
+                r = process_rules_cgroup_context(m, cgroup_ctx, now_usec);
+                if (r < 0)
+                        return r;
+
+                /* process_rules_cgroup_context() sets rules_post_action_delay_start once it has
+                 * queued a kill. The delay is honored *within the same tick* as well, otherwise a
+                 * single overload could cascade into kills across unrelated sibling cgroups
+                 * before the pressure metrics had a chance to reflect the first kill. */
+                if (m->rules_post_action_delay_start > 0)
+                        break;
+        }
+
+        return 0;
+}
+
 static int monitor_swap_contexts(Manager *m) {
         _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
         int r;
@@ -634,6 +1107,31 @@ static int monitor_memory_pressure_contexts(Manager *m) {
         return 0;
 }
 
+/* Creates the CLOCK_MONOTONIC timer event source that dispatches
+ * monitor_rules_contexts_handler() and stores it in m->rules_context_event_source.
+ *
+ * The source is created disabled (SD_EVENT_OFF), so it has to be enabled elsewhere before the
+ * handler runs. Exit-on-failure is turned on for the source.
+ *
+ * Returns 0 on success or the negative value returned by the failing sd-event call. */
+static int monitor_rules_contexts(Manager *m) {
+        _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
+        int r;
+
+        assert(m);
+        assert(m->event);
+
+        r = sd_event_add_time(m->event, &timer, CLOCK_MONOTONIC, 0, 0, monitor_rules_contexts_handler, m);
+        if (r < 0)
+                return r;
+
+        /* Configure the source; the second call is only attempted if the first one succeeded */
+        r = sd_event_source_set_exit_on_failure(timer, true);
+        if (r >= 0)
+                r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
+        if (r < 0)
+                return r;
+
+        /* The description is purely informational, hence a failure to set it is ignored */
+        (void) sd_event_source_set_description(timer, "oomd-rules-timer");
+
+        /* Hand ownership over to the manager so the cleanup handler does not drop the source */
+        m->rules_context_event_source = TAKE_PTR(timer);
+        return 0;
+}
+
 Manager* manager_free(Manager *m) {
         assert(m);
 
@@ -641,6 +1139,7 @@ Manager* manager_free(Manager *m) {
         sd_varlink_close_unref(m->varlink_client);
         sd_event_source_unref(m->swap_context_event_source);
         sd_event_source_unref(m->mem_pressure_context_event_source);
+        sd_event_source_unref(m->rules_context_event_source);
         sd_event_unref(m->event);
 
         hashmap_free(m->polkit_registry);
@@ -649,9 +1148,13 @@ Manager* manager_free(Manager *m) {
         hashmap_free(m->monitored_swap_cgroup_contexts);
         hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
         hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates);
+        hashmap_free(m->monitored_rules_cgroup_contexts);
+        hashmap_free(m->monitored_rules_cgroup_contexts_candidates);
 
         set_free(m->kill_states);
 
+        hashmap_free(m->rulesets);
+
         return mfree(m);
 }
 
@@ -662,6 +1165,7 @@ static int manager_dispatch_reload_signal(sd_event_source *s, const struct signa
 
         manager_set_defaults(m);
         manager_parse_config_file(m);
+        warn_missing_rulesets(m);
 
         (void) sd_notify(/* unset_environment= */ false, NOTIFY_READY_MESSAGE);
         return 0;
@@ -706,6 +1210,14 @@ int manager_new(Manager **ret) {
         if (!m->monitored_mem_pressure_cgroup_contexts_candidates)
                 return -ENOMEM;
 
+        m->monitored_rules_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
+        if (!m->monitored_rules_cgroup_contexts)
+                return -ENOMEM;
+
+        m->monitored_rules_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
+        if (!m->monitored_rules_cgroup_contexts_candidates)
+                return -ENOMEM;
+
         *ret = TAKE_PTR(m);
         return 0;
 }
@@ -815,6 +1327,10 @@ int manager_start(
         if (r < 0)
                 return r;
 
+        r = monitor_rules_contexts(m);
+        if (r < 0)
+                return r;
+
         return 0;
 }
 
index 8b9476232fb597ab000b2339b3c93b694aacca21..cc588461b086048d2e2d9c253edd4544edc13563 100644 (file)
@@ -2,15 +2,22 @@
 #pragma once
 
 #include "conf-parser-forward.h"
+#include "constants.h"
 #include "shared-forward.h"
 #include "oomd-conf.h"
 #include "oomd-util.h"
 
+#define RULESET_DIRS ((const char* const*) CONF_PATHS_STRV("systemd/oomd/rules.d"))
+
 /* Polling interval for monitoring stats */
 #define SWAP_INTERVAL_USEC 150000 /* 0.15 seconds */
 /* Pressure counters are lagging (~2 seconds) compared to swap so polling too frequently just wastes CPU */
 #define MEM_PRESSURE_INTERVAL_USEC (1 * USEC_PER_SEC)
 
+/* Rules evaluate both pressure and swap metrics; align on the slower-moving metric
+ * (pressure counters lag ~2s), so polling faster than 1s just wastes CPU. */
+#define RULESETS_INTERVAL_USEC MEM_PRESSURE_INTERVAL_USEC
+
 /* Take action if 10s of memory pressure > 60 for more than 30s. We use the "full" value from PSI so this is the
  * percentage of time all tasks were delayed (i.e. unproductive).
  * Generally 60 or higher might be acceptable for something like system.slice with no memory.high set; processes in
 #define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC)
 #define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC)
 
+/* Action carried by an OomdRuleset (see its 'action' field). */
+typedef enum OomdAction {
+        OOMD_ACTION_NONE,
+        OOMD_ACTION_KILL_ALL,               /* presumably what Action=kill-all in a .oomrule file maps to — confirm against the parser */
+        OOMD_ACTION_KILL_BY_PGSCAN,
+        OOMD_ACTION_KILL_BY_SWAP,           /* rulesets using this action make the rules timer read /proc/meminfo */
+        _OOMD_ACTION_MAX,                   /* one past the last valid action */
+        _OOMD_ACTION_INVALID = -EINVAL,     /* negative sentinel, distinct from every valid action */
+} OomdAction;
+
+/* One ruleset: optional pressure/swap thresholds plus the action to take, together with the
+ * per-cgroup bookkeeping in 'start_times'. */
+typedef struct OomdRuleset {
+        char *name;                /* presumably the ruleset name that units reference — confirm against the loader */
+        int memory_pressure_above; /* permyriad (0-10000), or -1 for unset */
+        int swap_above;            /* permyriad (0-10000), or -1 for unset */
+        OomdAction action;
+        usec_t lasting_usec;       /* presumably the LastingSec= duration the conditions must hold — confirm against ruleset_check_conditions() */
+        Hashmap *start_times; /* key: cgroup path (char*) -> value: heap-allocated timestamp (usec_t*) */
+        bool warned_no_swap;  /* latched once we've warned that kill-by-swap is misconfigured */
+} OomdRuleset;
+
 typedef struct Manager {
         sd_bus *bus;
         sd_event *event;
@@ -41,13 +67,17 @@ typedef struct Manager {
         Hashmap *monitored_swap_cgroup_contexts;
         Hashmap *monitored_mem_pressure_cgroup_contexts;
         Hashmap *monitored_mem_pressure_cgroup_contexts_candidates;
+        Hashmap *monitored_rules_cgroup_contexts;
+        Hashmap *monitored_rules_cgroup_contexts_candidates;
 
         OomdSystemContext system_context;
 
         usec_t mem_pressure_post_action_delay_start;
+        usec_t rules_post_action_delay_start;
 
         sd_event_source *swap_context_event_source;
         sd_event_source *mem_pressure_context_event_source;
+        sd_event_source *rules_context_event_source;
 
         /* This varlink object is used to manage the subscription from systemd-oomd to PID1 which it uses to
          * listen for changes in ManagedOOM settings (oomd client - systemd server). */
@@ -58,6 +88,7 @@ typedef struct Manager {
 
         usec_t prekill_timeout;
         Set *kill_states; /* currently ongoing OomdKillState operations */
+        Hashmap *rulesets;
 } Manager;
 
 Manager* manager_free(Manager *m);
index c0e04041a7e6a83ea0293efcba61901aa866850d..4128d315a635e027150e1ed44b3e8384457537c5 100644 (file)
 #include "sort-util.h"
 #include "stdio-util.h"
 #include "string-util.h"
+#include "strv.h"
 #include "time-util.h"
 #include "varlink-util.h"
 
 typedef struct OomdKillState {
         Manager *manager;
         OomdCGroupContext *ctx;
-        const char *reason;
+        char *reason;
         /* This holds sd_varlink references */
         Set *links;
 } OomdKillState;
@@ -80,6 +81,7 @@ static OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) {
                 return NULL;
 
         free(ctx->path);
+        strv_free(ctx->rules);
         return mfree(ctx);
 }
 
@@ -305,6 +307,7 @@ static void oomd_kill_state_free(OomdKillState *ks) {
 
         set_remove(ks->manager->kill_states, ks);
         oomd_cgroup_context_unref(ks->ctx);
+        free(ks->reason);
         free(ks);
 }
 
@@ -485,6 +488,10 @@ int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx, const char *reason
                 return 0;
         }
 
+        _cleanup_free_ char *reason_copy = strdup(reason);
+        if (!reason_copy)
+                return log_oom_debug();
+
         _cleanup_(oomd_kill_state_removep) OomdKillState *ks = new(OomdKillState, 1);
         if (!ks)
                 return log_oom_debug();
@@ -492,7 +499,7 @@ int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx, const char *reason
         *ks = (OomdKillState) {
                 .manager = m,
                 .ctx = oomd_cgroup_context_ref(ctx),
-                .reason = reason,
+                .reason = TAKE_PTR(reason_copy),
         };
 
         r = set_ensure_put(&m->kill_states, &oomd_kill_state_hash_ops, ks);
@@ -503,6 +510,7 @@ int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx, const char *reason
                  * cleanup path would remove by cgroup path key and could interfere with the existing queued
                  * kill state. */
                 oomd_cgroup_context_unref(ks->ctx);
+                free(ks->reason);
                 ks = mfree(ks);
                 return 0;
         }
@@ -585,14 +593,14 @@ int oomd_select_by_pgscan_rate(Hashmap *h, const char *prefix, OomdCGroupContext
         return ret;
 }
 
-int oomd_select_by_swap_usage(Hashmap *h, uint64_t threshold_usage, OomdCGroupContext **ret_selected) {
+int oomd_select_by_swap_usage(Hashmap *h, const char *prefix, uint64_t threshold_usage, OomdCGroupContext **ret_selected) {
         _cleanup_free_ OomdCGroupContext **sorted = NULL;
         int r, n, ret = 0;
 
         assert(h);
         assert(ret_selected);
 
-        n = oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted);
+        n = oomd_sort_cgroup_contexts(h, compare_swap_usage, prefix, &sorted);
         if (n < 0)
                 return n;
 
@@ -786,6 +794,9 @@ int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path)
                 curr_ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
                 curr_ctx->mem_pressure_duration_usec = old_ctx->mem_pressure_duration_usec;
                 curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
+                curr_ctx->rules = strv_copy(old_ctx->rules);
+                if (old_ctx->rules && !curr_ctx->rules)
+                        return -ENOMEM;
         }
 
         if (oomd_pgscan_rate(curr_ctx) > 0)
@@ -817,6 +828,9 @@ void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_
                 ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
                 ctx->mem_pressure_duration_usec = old_ctx->mem_pressure_duration_usec;
                 ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
+                /* Note: rules are intentionally not copied here. This function is only used on
+                 * candidate hashmaps (populated by recursively_get_cgroup_context for descendant
+                 * cgroups), which never carry rules. */
 
                 if (oomd_pgscan_rate(ctx) > 0)
                         ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
index d4e1a9207bd50ae1bf1e6201a5adad21cce4bff0..a76454f812393c524077834571dfe012663e319b 100644 (file)
@@ -40,6 +40,7 @@ struct OomdCGroupContext {
         usec_t mem_pressure_limit_hit_start;
         usec_t last_had_mem_reclaim;
         usec_t mem_pressure_duration_usec;
+        char **rules;
 };
 
 struct OomdSystemContext {
@@ -132,7 +133,7 @@ int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx, const char *reason
  * everything in `h` is a candidate.
  * Returns the killed cgroup in ret_selected. */
 int oomd_select_by_pgscan_rate(Hashmap *h, const char *prefix, OomdCGroupContext **ret_selected);
-int oomd_select_by_swap_usage(Hashmap *h, uint64_t threshold_usage, OomdCGroupContext **ret_selected);
+int oomd_select_by_swap_usage(Hashmap *h, const char *prefix, uint64_t threshold_usage, OomdCGroupContext **ret_selected);
 
 int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret);
 int oomd_system_context_acquire(const char *proc_meminfo_path, OomdSystemContext *ret);
index 9b69ebd1a9361524353e2590a99a118c0e5a4c33..e3a1fcd8934b2a94bea866ee678ce70edea2dfae 100644 (file)
@@ -2384,6 +2384,7 @@ static const BusProperty cgroup_properties[] = {
         { "ManagedOOMSwap",                        bus_append_string                             },
         { "ManagedOOMMemoryPressure",              bus_append_string                             },
         { "ManagedOOMPreference",                  bus_append_string                             },
+        { "OOMRules",                              bus_append_strv                               },
         { "MemoryPressureWatch",                   bus_append_string                             },
         { "CPUPressureWatch",                      bus_append_string                             },
         { "IOPressureWatch",                       bus_append_string                             },
index c9f2a59728c1f58a83e90816b4b9cebf696e69ef..2ed91121e48753dac97f40719e306c098f40312d 100644 (file)
@@ -409,6 +409,8 @@ static SD_VARLINK_DEFINE_STRUCT_TYPE(
                 SD_VARLINK_DEFINE_FIELD(ManagedOOMMemoryPressureDurationUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
                 SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#ManagedOOMPreference=none%7Cavoid%7Comit"),
                 SD_VARLINK_DEFINE_FIELD_BY_TYPE(ManagedOOMPreference, ManagedOOMPreference, 0),
+                SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#OOMRules="),
+                SD_VARLINK_DEFINE_FIELD(OOMRules, SD_VARLINK_STRING, SD_VARLINK_ARRAY|SD_VARLINK_NULLABLE),
                 SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#MemoryPressureWatch="),
                 SD_VARLINK_DEFINE_FIELD_BY_TYPE(MemoryPressureWatch, CGroupPressureWatch, 0),
                 SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#MemoryPressureThresholdSec="),
index 80fa50a73a92c5052ed01db0593cd62b9ee5b201..15e28b3e1b0c75d7ee6fb9eabd92ad82c2301258 100644 (file)
@@ -14,7 +14,8 @@ SD_VARLINK_DEFINE_STRUCT_TYPE(
                 SD_VARLINK_DEFINE_FIELD(path, SD_VARLINK_STRING, 0),
                 SD_VARLINK_DEFINE_FIELD(property, SD_VARLINK_STRING, 0),
                 SD_VARLINK_DEFINE_FIELD(limit, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
-                SD_VARLINK_DEFINE_FIELD(duration, SD_VARLINK_INT, SD_VARLINK_NULLABLE));
+                SD_VARLINK_DEFINE_FIELD(duration, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
+                SD_VARLINK_DEFINE_FIELD(rules, SD_VARLINK_STRING, SD_VARLINK_ARRAY|SD_VARLINK_NULLABLE));
 
 static SD_VARLINK_DEFINE_METHOD(
                 ReportManagedOOMCGroups,
index 96a15989c745a5456b891d3470692ac77b14c113..6689bbdd733c9abd9732aafa80acf4f5eda3a91c 100755 (executable)
@@ -353,6 +353,127 @@ EOF
     systemctl reset-failed
 }
 
+testcase_oom_rulesets() {
+    # End-to-end check: a unit that opts into a ruleset via OOMRules= gets killed by oomd once
+    # the ruleset's conditions are met.
+
+    # Create a ruleset that triggers on any memory pressure with no delay
+    mkdir -p /run/systemd/oomd/rules.d/
+    cat >/run/systemd/oomd/rules.d/testrule.oomrule <<'EOF'
+[Rule]
+MemoryPressureAbove=0%
+Action=kill-all
+LastingSec=0
+EOF
+
+    # Reload so that oomd picks up the new ruleset file
+    systemctl reload systemd-oomd.service
+
+    # Run a transient service with OOMRules=testrule that generates memory pressure.
+    # The negated subshell makes the test fail if systemd-run exits successfully, i.e. the
+    # workload is expected to be killed rather than to run to completion.
+    (! systemd-run --wait --unit=TEST-55-OOMD-testrules \
+        -p MemoryHigh=3M \
+        -p OOMRules=testrule \
+        stress-ng --timeout 3m --vm 10 --vm-bytes 50M --vm-keep)
+
+    # Verify in the journal that the rule triggered
+    journalctl --sync
+    journalctl -u systemd-oomd.service --since "-2min" | grep "Rule 'testrule' conditions met" >/dev/null
+
+    # clean up
+    rm -f /run/systemd/oomd/rules.d/testrule.oomrule
+    systemctl reload systemd-oomd.service
+}
+
+testcase_oom_rulesets_invalid_name() {
+    # Invalid rule names must be rejected at property-set time (filename_is_valid check).
+    # Two names are exercised: "foo/bar" (contains a slash) and "." (disallowed by
+    # filename_is_valid). For each one systemd-run has to fail and report "Invalid rule name".
+    #
+    # The scratch variables are declared local so they do not leak into other test cases.
+    local bad_name err rc n=0
+
+    for bad_name in 'foo/bar' '.'; do
+        # Keeps the unit names TEST-55-OOMD-badname1, TEST-55-OOMD-badname2. Plain arithmetic
+        # expansion is used because ((n++)) returns non-zero for n=0 and would trip errexit.
+        n=$((n + 1))
+
+        # errexit is switched off around the call so that the expected failure can be captured
+        # and inspected instead of aborting the test.
+        set +e
+        err=$(systemd-run --wait --unit="TEST-55-OOMD-badname$n" -p "OOMRules=$bad_name" true 2>&1)
+        rc=$?
+        set -e
+        [[ $rc -ne 0 ]]
+        echo "$err" | grep "Invalid rule name" >/dev/null
+    done
+}
+
+testcase_oom_rulesets_missing_warning() {
+    # A unit that references a ruleset which does not exist must produce a
+    # warn_missing_rulesets warning in oomd's journal. Only the presence of the warning is
+    # checked here, not how often it is logged.
+    mkdir -p /run/systemd/oomd/rules.d/
+    rm -f /run/systemd/oomd/rules.d/absentrule.oomrule
+    systemctl reload systemd-oomd.service
+
+    # Start a long-lived transient unit that references a ruleset that doesn't exist.
+    systemd-run --unit=TEST-55-OOMD-missing --remain-after-exit \
+        -p OOMRules=absentrule \
+        sleep infinity
+
+    # Give oomd up to 30s to receive the subscription, polling the journal once per second
+    # until the warning shows up.
+    timeout 30 bash -c '
+        until journalctl --sync && journalctl -u systemd-oomd.service --since "-1min" 2>/dev/null | grep "references undefined ruleset .absentrule." >/dev/null; do
+            sleep 1
+        done
+    '
+
+    # And when we now add the ruleset and reload, oomd must pick it up without
+    # the unit needing to restart. Verify by checking for the debug-log line that
+    # reports the ruleset was registered.
+    cat >/run/systemd/oomd/rules.d/absentrule.oomrule <<'EOF'
+[Rule]
+SwapUsageMax=99%
+Action=kill-all
+LastingSec=0
+EOF
+    systemctl reload systemd-oomd.service
+
+    journalctl --sync
+    journalctl -u systemd-oomd.service --since "-1min" | grep "Registered ruleset: absentrule" >/dev/null
+
+    # cleanup
+    systemctl stop TEST-55-OOMD-missing.service
+    rm -f /run/systemd/oomd/rules.d/absentrule.oomrule
+    systemctl reload systemd-oomd.service
+}
+
+testcase_oom_rulesets_lasting_sec() {
+    # A rule with LastingSec > 0 must NOT trigger during the waiting period.
+    # Baseline proof: with the same workload but LastingSec=0 (testcase_oom_rulesets
+    # above) oomd kills the unit within a couple of seconds, so an active unit after
+    # ~6 s demonstrates LastingSec is being respected.
+    mkdir -p /run/systemd/oomd/rules.d/
+    cat >/run/systemd/oomd/rules.d/slowrule.oomrule <<'EOF'
+[Rule]
+MemoryPressureAbove=0%
+Action=kill-all
+LastingSec=1h
+EOF
+
+    # Reload so that oomd picks up the new ruleset file
+    systemctl reload systemd-oomd.service
+
+    # Start the unit without --wait so we can check mid-run state. The
+    # stress-ng timeout bounds the test if anything goes wrong.
+    systemd-run --unit=TEST-55-OOMD-slowrule \
+        -p MemoryHigh=3M \
+        -p OOMRules=slowrule \
+        stress-ng --timeout 15s --vm 10 --vm-bytes 50M --vm-keep
+
+    # Wait long enough for oomd's 1s rule-check loop to evaluate the condition
+    # many times. With LastingSec=1h the kill must not fire.
+    sleep 6
+
+    # Unit must still be active — if it were killed, Result= would be oom-kill.
+    assert_eq "$(systemctl show TEST-55-OOMD-slowrule.service -P ActiveState)" "active"
+    assert_eq "$(systemctl show TEST-55-OOMD-slowrule.service -P Result)" "success"
+
+    # Best-effort stop: errors are discarded and the failure is ignored so that cleanup
+    # below still runs.
+    systemctl stop TEST-55-OOMD-slowrule.service 2>/dev/null || true
+
+    # cleanup
+    rm -f /run/systemd/oomd/rules.d/slowrule.oomrule
+    systemctl reload systemd-oomd.service
+}
+
 testcase_prekill_hook() {
     cat >/run/systemd/oomd.conf.d/99-oomd-prekill-test.conf <<'EOF'
 [OOM]