From: Matteo Croce Date: Thu, 25 Sep 2025 19:17:26 +0000 (+0200) Subject: oomd: ruleset parsing X-Git-Tag: v261-rc1~111 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=15783e4657bdc5b80f91cd20b1dc08b8e17b565c;p=thirdparty%2Fsystemd.git oomd: ruleset parsing Add to oomd the capability to define rulesets in /etc/systemd/oomd/rules.d/ and then reference them with a new config option OOMRules= --- diff --git a/man/oomd.conf.xml b/man/oomd.conf.xml index f8c3c0a173e..1e33f3b3ded 100644 --- a/man/oomd.conf.xml +++ b/man/oomd.conf.xml @@ -68,6 +68,92 @@ + + OOM Rulesets + + systemd-oomd supports custom rulesets that define conditions and actions for + OOM handling on a per-unit basis. Ruleset files use the .oomrule extension and are + loaded from /etc/systemd/oomd/rules.d/, + /run/systemd/oomd/rules.d/, + /usr/local/lib/systemd/oomd/rules.d/, and + /usr/lib/systemd/oomd/rules.d/. + Units opt into rulesets via the OOMRules= setting in + systemd.resource-control5, + which takes a space-separated list of ruleset names (the file name without the .oomrule + extension). + + Each ruleset file contains a [Rule] section with the following options. At least + one of MemoryPressureAbove= or SwapUsageMax= must be configured; + rulesets with no conditions are ignored. If both are set, the conditions are combined with AND, i.e. the + action is only triggered when both thresholds are exceeded simultaneously. + + + + MemoryPressureAbove= + + Sets the memory pressure threshold above which the rule's action will be triggered. + The memory pressure represents the fraction of time in a 10 second window in which all tasks in the + control group were delayed (PSI full avg10). Takes a value specified in percent + (when suffixed with %), permille (‰) or permyriad + (‱), between 0% and 100%, inclusive. If unset, this condition is not + evaluated. 
A value of 100% can never be exceeded and is + therefore rejected with a warning; a value of 0% makes the condition true on any + observed pressure, which is usually not useful. + + + + + + SwapUsageMax= + + Sets the system-wide swap usage threshold above which the rule's action will be + triggered. Takes a value specified in percent (when suffixed with %), + permille (‰) or permyriad (‱), + between 0% and 100%, inclusive. If unset, this condition is not evaluated. A value of + 100% can never be exceeded and is therefore rejected with + a warning; a value of 0% fires as soon as any swap is in use, which is usually + not useful. + + + + + + Action= + + Specifies the action to take when the rule's conditions are met. Takes one of + kill-all, kill-by-pgscan, or + kill-by-swap. This setting is mandatory; rulesets without + Action= are ignored. + + + kill-all sends SIGKILL to every process + in the unit's cgroup hierarchy, including any descendant cgroups. + + kill-by-pgscan selects and kills the descendant cgroup with + the highest recent page scan (reclaim) rate. + + kill-by-swap selects and kills the descendant cgroup with the + highest swap usage. + + + + + + + LastingSec= + + Sets the duration the conditions must be continuously met before the action is taken. + Takes a time span value, see + systemd.time7 + for details on the permitted syntax. Defaults to 0, i.e. the action is taken + immediately when the conditions are met. 
+ + + + + + + [OOM] Section Options diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index 7efc899dba2..2d63050a686 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -3098,6 +3098,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly as OOMRules = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(ss) BPFProgram = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiqq) SocketBindAllow = [...]; @@ -3777,6 +3779,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4479,6 +4483,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -5388,6 +5394,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly as OOMRules = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(ss) BPFProgram = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiqq) SocketBindAllow = [...]; @@ -6083,6 +6091,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -6759,6 +6769,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -7491,6 +7503,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly as OOMRules = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(ss) BPFProgram = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiqq) 
SocketBindAllow = [...]; @@ -8110,6 +8124,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -8694,6 +8710,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -9559,6 +9577,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly as OOMRules = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(ss) BPFProgram = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiqq) SocketBindAllow = [...]; @@ -10160,6 +10180,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -10726,6 +10748,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -11444,6 +11468,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly as OOMRules = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(ss) BPFProgram = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiqq) SocketBindAllow = [...]; @@ -11627,6 +11653,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + @@ -11825,6 +11853,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + @@ -12046,6 +12076,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly as OOMRules = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(ss) BPFProgram = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiqq) SocketBindAllow = [...]; @@ -12243,6 +12275,8 @@ node 
/org/freedesktop/systemd1/unit/session_2d1_2escope { + + @@ -12465,6 +12499,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + @@ -12801,8 +12837,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ CPUPressureThresholdUSec, CPUPressureWatch, IOPressureThresholdUSec, - IOPressureWatch, and - CPUSetPartition were added in version 261. + IOPressureWatch, + CPUSetPartition, and + OOMRules were added in version 261. Socket Unit Objects @@ -12876,8 +12913,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ CPUPressureThresholdUSec, CPUPressureWatch, IOPressureThresholdUSec, - IOPressureWatch, and - CPUSetPartition were added in version 261. + IOPressureWatch, + CPUSetPartition, and + OOMRules were added in version 261. Mount Unit Objects @@ -12946,8 +12984,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ CPUPressureThresholdUSec, CPUPressureWatch, IOPressureThresholdUSec, - IOPressureWatch, and - CPUSetPartition were added in version 261. + IOPressureWatch, + CPUSetPartition, and + OOMRules were added in version 261. Swap Unit Objects @@ -13014,8 +13053,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ CPUPressureThresholdUSec, CPUPressureWatch, IOPressureThresholdUSec, - IOPressureWatch, and - CPUSetPartition were added in version 261. + IOPressureWatch, + CPUSetPartition, and + OOMRules were added in version 261. Slice Unit Objects @@ -13052,8 +13092,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ CPUPressureThresholdUSec, CPUPressureWatch, IOPressureThresholdUSec, - IOPressureWatch, and - CPUSetPartition were added in version 261. + IOPressureWatch, + CPUSetPartition, and + OOMRules were added in version 261. Scope Unit Objects @@ -13088,8 +13129,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ CPUPressureThresholdUSec, CPUPressureWatch, IOPressureThresholdUSec, - IOPressureWatch, and - CPUSetPartition were added in version 261. 
+ IOPressureWatch, + CPUSetPartition, and + OOMRules were added in version 261. Job Objects diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index fcad4b31839..58e923b6184 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -1630,6 +1630,35 @@ DeviceAllow=/dev/loop-control + + OOMRules= + + + Takes a space-separated list of OOM ruleset names. The rulesets are defined in + .oomrule files placed in + /etc/systemd/oomd/rules.d/, + /run/systemd/oomd/rules.d/, + /usr/local/lib/systemd/oomd/rules.d/, or + /usr/lib/systemd/oomd/rules.d/. When set, + systemd-oomd.service8 + will monitor this unit's cgroup and evaluate the specified rulesets against it. + Each ruleset defines conditions (such as memory pressure or swap usage thresholds) and an action + to take when those conditions are met. See + oomd.conf5 for + details on the available ruleset options. + + Setting this property will also result in After= and + Wants= dependencies on systemd-oomd.service unless + DefaultDependencies=no. + + Defaults to an empty list, which means no rulesets are applied. Note that each monitored + cgroup incurs a per-interval walk of its descendant cgroup tree, so monitoring very large numbers of + cgroups via OOMRules= may have a measurable performance impact. 
+ + + + + MemoryPressureWatch= diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 48b7df0e00c..543d1ac8e3c 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -284,6 +284,8 @@ void cgroup_context_done(CGroupContext *c) { c->delegate_subgroup = mfree(c->delegate_subgroup); + c->moom_rules = strv_free(c->moom_rules); + nft_set_context_clear(&c->nft_set_context); } @@ -670,6 +672,9 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) fprintf(f, "%sNFTSet: %s:%s:%s:%s\n", prefix, nft_set_source_to_string(nft_set->source), nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set); + + STRV_FOREACH(rule, c->moom_rules) + fprintf(f, "%sOOMRules: %s\n", prefix, *rule); } void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f) { diff --git a/src/core/cgroup.h b/src/core/cgroup.h index b7213d8d594..e3d33ad5e09 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -203,6 +203,7 @@ typedef struct CGroupContext { uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */ usec_t moom_mem_pressure_duration_usec; ManagedOOMPreference moom_preference; + char **moom_rules; /* Pressure logic */ CGroupPressure pressure[_PRESSURE_RESOURCE_MAX]; diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 6cecc8b9e74..168bdf10c13 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -424,6 +424,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimit", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit), 0), SD_BUS_PROPERTY("ManagedOOMMemoryPressureDurationUSec", "t", bus_property_get_usec, offsetof(CGroupContext, moom_mem_pressure_duration_usec), 0), SD_BUS_PROPERTY("ManagedOOMPreference", "s", property_get_managed_oom_preference, offsetof(CGroupContext, moom_preference), 0), + SD_BUS_PROPERTY("OOMRules", "as", NULL, offsetof(CGroupContext, moom_rules), 0), 
SD_BUS_PROPERTY("BPFProgram", "a(ss)", property_get_bpf_foreign_program, 0, 0), SD_BUS_PROPERTY("SocketBindAllow", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_allow), 0), SD_BUS_PROPERTY("SocketBindDeny", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_deny), 0), @@ -1796,6 +1797,38 @@ int bus_cgroup_set_property( return 1; } + + if (streq(name, "OOMRules")) { + _cleanup_strv_free_ char **oom_rules = NULL; + + if (!UNIT_VTABLE(u)->can_set_managed_oom) + return sd_bus_error_setf(reterr_error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name); + + r = sd_bus_message_read_strv(message, &oom_rules); + if (r < 0) + return r; + + STRV_FOREACH(rule, oom_rules) + if (!string_is_safe(*rule, STRING_FILENAME)) + return sd_bus_error_setf(reterr_error, SD_BUS_ERROR_INVALID_ARGS, "Invalid rule name: %s", *rule); + + strv_uniq(oom_rules); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *joined = strv_join(oom_rules, " "); + if (!joined) + return -ENOMEM; + + strv_free_and_replace(c->moom_rules, oom_rules); + + unit_write_settingf(u, flags, name, "OOMRules=\nOOMRules=%s", joined); + + (void) manager_varlink_send_managed_oom_update(u); + } + + return 1; + } + if (STR_IN_SET(name, "SocketBindAllow", "SocketBindDeny")) { CGroupSocketBindItem **list; uint16_t nr_ports, port_min; diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index 5f205772fd8..953b0d98948 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -285,6 +285,10 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) { if (r < 0) return r; + r = serialize_strv(f, "exec-cgroup-context-managed-oom-rules", c->moom_rules); + if (r < 0) + return r; + r = serialize_item(f, "exec-cgroup-context-memory-pressure-watch", cgroup_pressure_watch_to_string(c->pressure[PRESSURE_MEMORY].watch)); if (r < 0) return r; @@ -650,6 +654,10 @@ static int 
exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { r = deserialize_usec(val, &c->moom_mem_pressure_duration_usec); if (r < 0) return r; + } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-rules="))) { + r = deserialize_strv(val, &c->moom_rules); + if (r < 0) + return r; } else if ((val = startswith(l, "exec-cgroup-context-memory-pressure-watch="))) { c->pressure[PRESSURE_MEMORY].watch = cgroup_pressure_watch_from_string(val); if (c->pressure[PRESSURE_MEMORY].watch < 0) diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index b8d744c1f49..0e2d679f978 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -272,6 +272,7 @@ {{type}}.ManagedOOMMemoryPressureLimit, config_parse_managed_oom_mem_pressure_limit, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure_limit) {{type}}.ManagedOOMMemoryPressureDurationSec, config_parse_managed_oom_mem_pressure_duration_sec, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure_duration_usec) {{type}}.ManagedOOMPreference, config_parse_managed_oom_preference, 0, offsetof({{type}}, cgroup_context.moom_preference) +{{type}}.OOMRules, config_parse_managed_oom_rules, 1, offsetof({{type}}, cgroup_context.moom_rules) {{type}}.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0 {{type}}.BPFProgram, config_parse_bpf_foreign_program, 0, offsetof({{type}}, cgroup_context) {{type}}.SocketBindAllow, config_parse_cgroup_socket_bind, 0, offsetof({{type}}, cgroup_context.socket_bind_allow) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 2a268d813b5..9b2fa71be39 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -4090,6 +4090,65 @@ int config_parse_managed_oom_mem_pressure_duration_sec( return 0; } +int config_parse_managed_oom_rules( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char 
*rvalue, + void *data, + void *userdata) { + + char ***sv = ASSERT_PTR(data); + UnitType t; + int r; + + assert(rvalue); + + t = unit_name_to_type(unit); + assert(t != _UNIT_TYPE_INVALID); + + if (!unit_vtable[t]->can_set_managed_oom) + return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue); + + if (isempty(rvalue)) { + *sv = strv_free(*sv); + return 0; + } + + /* Tokenize once: validate each rule name (rulesets are loaded from .oomrule files) + * and accumulate into a local strv. Invalid rule names are skipped individually + * with a warning so the rest of the line still applies. */ + _cleanup_strv_free_ char **strv = NULL; + for (const char *p = rvalue;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE); + if (r == 0) + break; + if (r < 0) + return log_syntax_parse_error(unit, filename, line, r, lvalue, rvalue); + + if (!string_is_safe(word, STRING_FILENAME)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid rule name in %s=, ignoring: %s", lvalue, word); + continue; + } + + r = strv_consume(&strv, TAKE_PTR(word)); + if (r < 0) + return log_oom(); + } + + r = strv_extend_strv_consume(sv, TAKE_PTR(strv), /* filter_duplicates= */ ltype); + if (r < 0) + return log_oom(); + + return 0; +} + int config_parse_device_allow( const char *unit, const char *filename, diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index fafb0040283..99b53626203 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -144,6 +144,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_pid_file); CONFIG_PARSER_PROTOTYPE(config_parse_exit_status); CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers); CONFIG_PARSER_PROTOTYPE(config_parse_oom_policy); +CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_rules); CONFIG_PARSER_PROTOTYPE(config_parse_numa_policy); CONFIG_PARSER_PROTOTYPE(config_parse_numa_mask); 
CONFIG_PARSER_PROTOTYPE(config_parse_ip_filter_bpf_progs); diff --git a/src/core/unit.c b/src/core/unit.c index 8ed74b080d1..f81083a70f7 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -1626,7 +1626,7 @@ static int unit_add_oomd_dependencies(Unit *u) { if (!c) return 0; - bool wants_oomd = c->moom_swap == MANAGED_OOM_KILL || c->moom_mem_pressure == MANAGED_OOM_KILL; + bool wants_oomd = c->moom_swap == MANAGED_OOM_KILL || c->moom_mem_pressure == MANAGED_OOM_KILL || !strv_isempty(c->moom_rules); if (!wants_oomd) return 0; diff --git a/src/core/varlink-cgroup.c b/src/core/varlink-cgroup.c index 9953707417d..e031f00368b 100644 --- a/src/core/varlink-cgroup.c +++ b/src/core/varlink-cgroup.c @@ -326,6 +326,7 @@ int unit_cgroup_context_build_json(sd_json_variant **ret, const char *name, void JSON_BUILD_PAIR_UNSIGNED_NON_ZERO("ManagedOOMMemoryPressureLimit", c->moom_mem_pressure_limit), JSON_BUILD_PAIR_FINITE_USEC("ManagedOOMMemoryPressureDurationUSec", c->moom_mem_pressure_duration_usec), JSON_BUILD_PAIR_ENUM("ManagedOOMPreference", managed_oom_preference_to_string(c->moom_preference)), + JSON_BUILD_PAIR_STRV_NON_EMPTY("OOMRules", c->moom_rules), JSON_BUILD_PAIR_ENUM("MemoryPressureWatch", cgroup_pressure_watch_to_string(c->pressure[PRESSURE_MEMORY].watch)), JSON_BUILD_PAIR_FINITE_USEC("MemoryPressureThresholdUSec", c->pressure[PRESSURE_MEMORY].threshold_usec), JSON_BUILD_PAIR_ENUM("CPUPressureWatch", cgroup_pressure_watch_to_string(c->pressure[PRESSURE_CPU].watch)), diff --git a/src/core/varlink.c b/src/core/varlink.c index 09817b6dce2..f6b474c6363 100644 --- a/src/core/varlink.c +++ b/src/core/varlink.c @@ -11,6 +11,7 @@ #include "path-util.h" #include "pidref.h" #include "string-util.h" +#include "strv.h" #include "unit.h" #include "varlink.h" #include "varlink-dynamic-user.h" @@ -28,10 +29,11 @@ static const char* const managed_oom_mode_properties[] = { "ManagedOOMSwap", "ManagedOOMMemoryPressure", + "OOMRules", }; static int 
build_managed_oom_json_array_element(Unit *u, const char *property, sd_json_variant **ret_v) { - bool use_limit = false, use_duration = false; + bool use_limit = false, use_duration = false, use_rules = false; CGroupContext *c; const char *mode; @@ -60,15 +62,25 @@ static int build_managed_oom_json_array_element(Unit *u, const char *property, s mode = managed_oom_mode_to_string(c->moom_mem_pressure); use_limit = c->moom_mem_pressure_limit > 0; use_duration = c->moom_mem_pressure_duration_usec != USEC_INFINITY; + } else if (streq(property, "OOMRules")) { + if (strv_isempty(c->moom_rules)) + mode = managed_oom_mode_to_string(MANAGED_OOM_AUTO); + else { + mode = managed_oom_mode_to_string(MANAGED_OOM_KILL); + use_rules = true; + } } else return -EINVAL; + assert(mode); + return sd_json_buildo(ret_v, JSON_BUILD_PAIR_ENUM("mode", mode), SD_JSON_BUILD_PAIR_STRING("path", crt->cgroup_path), SD_JSON_BUILD_PAIR_STRING("property", property), SD_JSON_BUILD_PAIR_CONDITION(use_limit, "limit", SD_JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit)), - SD_JSON_BUILD_PAIR_CONDITION(use_duration, "duration", SD_JSON_BUILD_UNSIGNED(c->moom_mem_pressure_duration_usec))); + SD_JSON_BUILD_PAIR_CONDITION(use_duration, "duration", SD_JSON_BUILD_UNSIGNED(c->moom_mem_pressure_duration_usec)), + SD_JSON_BUILD_PAIR_CONDITION(use_rules, "rules", SD_JSON_BUILD_STRV(c->moom_rules))); } static int build_managed_oom_cgroups_json(Manager *m, bool allow_empty, sd_json_variant **ret) { @@ -109,7 +121,8 @@ static int build_managed_oom_cgroups_json(Manager *m, bool allow_empty, sd_json_ /* For the initial varlink call we only care about units that enabled (i.e. mode is not * set to "auto") oomd properties. 
*/ if (!(streq(*i, "ManagedOOMSwap") && c->moom_swap == MANAGED_OOM_KILL) && - !(streq(*i, "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL)) + !(streq(*i, "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL) && + !(streq(*i, "OOMRules") && !strv_isempty(c->moom_rules))) continue; r = build_managed_oom_json_array_element(u, *i, &e); diff --git a/src/oom/oomd-conf.c b/src/oom/oomd-conf.c index f0091e27561..bd3d0003b07 100644 --- a/src/oom/oomd-conf.c +++ b/src/oom/oomd-conf.c @@ -1,11 +1,18 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ +#include "conf-files.h" #include "conf-parser.h" +#include "hashmap.h" #include "log.h" #include "oomd-conf.h" #include "oomd-manager.h" #include "parse-util.h" +#include "path-util.h" +#include "percent-util.h" +#include "stat-util.h" +#include "string-table.h" #include "string-util.h" +#include "strv.h" #include "time-util.h" static int config_parse_duration( @@ -66,7 +73,143 @@ void manager_set_defaults(Manager *m) { log_warning_errno(r, "Failed to set default for default_mem_pressure_limit, ignoring: %m"); } +/* OOMD_ACTION_NONE is intentionally omitted — it's the "unset" sentinel. Rulesets with + * .action == OOMD_ACTION_NONE are rejected at load time, so oomd_action_to_string() must + * only be called on rulesets that have already passed ruleset_load_one's validation + * (otherwise it returns NULL). 
*/ +static const char* const oomd_action_table[] = { + [OOMD_ACTION_KILL_ALL] = "kill-all", + [OOMD_ACTION_KILL_BY_PGSCAN] = "kill-by-pgscan", + [OOMD_ACTION_KILL_BY_SWAP] = "kill-by-swap", +}; + +DEFINE_STRING_TABLE_LOOKUP(oomd_action, OomdAction); +static DEFINE_CONFIG_PARSE_ENUM(config_parse_oomd_action, oomd_action, OomdAction); + +void oomd_ruleset_free(OomdRuleset *ruleset) { + if (!ruleset) + return; + hashmap_free(ruleset->start_times); + free(ruleset->name); + free(ruleset); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(OomdRuleset*, oomd_ruleset_free, NULL); +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(oomd_ruleset_hash_ops, char, string_hash_func, string_compare_func, OomdRuleset, oomd_ruleset_free); + +static int ruleset_load_one(Manager *m, const char *filename) { + _cleanup_free_ char *name = NULL; + _cleanup_(oomd_ruleset_freep) OomdRuleset *ruleset = NULL; + _cleanup_fclose_ FILE *f = NULL; + struct stat st; + int r; + + assert(m); + assert(filename); + + /* Pin the file via an fd so the empty-file check and the parse operate on the same + * inode (avoids TOCTOU between null_or_empty_path() and a subsequent open()). */ + f = fopen(filename, "re"); + if (!f) + return log_warning_errno(errno, "Failed to open '%s': %m", filename); + + if (fstat(fileno(f), &st) < 0) + return log_warning_errno(errno, "Failed to stat '%s': %m", filename); + + if (null_or_empty(&st)) { + log_debug("Skipping empty file: %s", filename); + return 0; + } + + r = path_extract_filename(filename, &name); + if (r < 0) + return log_error_errno(r, "Failed to extract file name of '%s': %m", filename); + + char *e = ASSERT_PTR(endswith(name, ".oomrule")); + *e = 0; + + /* Apply the same validation the DBus setter and the config parser use, so that any + * ruleset we accept here is actually referenceable via OOMRules= from a unit. 
*/ + if (!string_is_safe(name, STRING_FILENAME)) { + log_warning("Invalid ruleset name '%s' derived from '%s', ignoring.", name, filename); + return 0; + } + + ruleset = new(OomdRuleset, 1); + if (!ruleset) + return log_oom(); + + *ruleset = (OomdRuleset) { + .name = TAKE_PTR(name), + .memory_pressure_above = -1, + .swap_above = -1, + }; + + const ConfigTableItem items[] = { + { "Rule", "MemoryPressureAbove", config_parse_permyriad, 0, &ruleset->memory_pressure_above }, + { "Rule", "SwapUsageMax", config_parse_permyriad, 0, &ruleset->swap_above }, + { "Rule", "Action", config_parse_oomd_action, 0, &ruleset->action }, + { "Rule", "LastingSec", config_parse_sec, 0, &ruleset->lasting_usec }, + {} + }; + + r = config_parse( + /* unit= */ NULL, + filename, + f, + "Rule\0", + config_item_table_lookup, + items, + CONFIG_PARSE_WARN, + /* userdata= */ NULL, + /* ret_stat= */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to parse ruleset file '%s': %m", filename); + + if (ruleset->memory_pressure_above < 0 && ruleset->swap_above < 0) { + log_warning("Ruleset '%s' has no conditions configured (MemoryPressureAbove= or SwapUsageMax=), ignoring.", ruleset->name); + return 0; + } + + if (ruleset->action == OOMD_ACTION_NONE) { + log_warning("Ruleset '%s' has no Action= configured, ignoring.", ruleset->name); + return 0; + } + + if (ruleset->lasting_usec == USEC_INFINITY) { + log_warning("Ruleset '%s' has LastingSec=infinity which can never be satisfied, ignoring.", ruleset->name); + return 0; + } + + /* A threshold at the maximum can never be exceeded, so the condition would never fire. + * Report the normalized percent value so the warning matches regardless of whether the + * user wrote '100%', '1000‰' or '10000‱'. 
*/ + if (ruleset->memory_pressure_above == 10000) { + log_warning("Ruleset '%s' has MemoryPressureAbove=" PERMYRIAD_AS_PERCENT_FORMAT_STR " (the maximum) which can never be exceeded, ignoring.", + ruleset->name, PERMYRIAD_AS_PERCENT_FORMAT_VAL(ruleset->memory_pressure_above)); + return 0; + } + + if (ruleset->swap_above == 10000) { + log_warning("Ruleset '%s' has SwapUsageMax=" PERMYRIAD_AS_PERCENT_FORMAT_STR " (the maximum) which can never be exceeded, ignoring.", + ruleset->name, PERMYRIAD_AS_PERCENT_FORMAT_VAL(ruleset->swap_above)); + return 0; + } + + /* Duplicates cannot occur here: conf_files_list_strv deduplicates filenames across + * directories, and hashmap_clear is called before loading. The value destructor in + * oomd_ruleset_hash_ops handles cleanup during hashmap_clear/hashmap_free. */ + r = hashmap_ensure_replace(&m->rulesets, &oomd_ruleset_hash_ops, ruleset->name, ruleset); + if (r < 0) + return log_error_errno(r, "Failed to register ruleset '%s': %m", ruleset->name); + + TAKE_PTR(ruleset); + + return 0; +} + void manager_parse_config_file(Manager *m) { + _cleanup_strv_free_ char **files = NULL; int r; assert(m); @@ -88,4 +231,37 @@ void manager_parse_config_file(Manager *m) { /* userdata= */ m); if (r >= 0) log_debug("Config file successfully parsed."); + + r = conf_files_list_strv(&files, ".oomrule", /* root= */ NULL, CONF_FILES_WARN, RULESET_DIRS); + if (r < 0) { + /* On enumeration failure, keep the previously loaded rulesets rather than clearing them — + * a transient I/O error shouldn't cause in-flight units to silently lose their OOM policy. */ + log_error_errno(r, "Failed to enumerate ruleset files, keeping previously loaded rulesets: %m"); + return; + } + + /* Clear all rulesets and re-parse. This intentionally resets any accumulated + * start_times (LastingSec timers), since the ruleset definitions may have changed. 
*/ + hashmap_clear(m->rulesets); + + STRV_FOREACH(f, files) + (void) ruleset_load_one(m, *f); + + if (DEBUG_LOGGING) { + char *name; + OomdRuleset *ruleset; + HASHMAP_FOREACH_KEY(ruleset, name, m->rulesets) { + log_debug("Registered ruleset: %s", name); + if (ruleset->memory_pressure_above >= 0) + log_debug(" MemoryPressureAbove=" PERMYRIAD_AS_PERCENT_FORMAT_STR, PERMYRIAD_AS_PERCENT_FORMAT_VAL(ruleset->memory_pressure_above)); + else + log_debug(" MemoryPressureAbove=unset"); + if (ruleset->swap_above >= 0) + log_debug(" SwapUsageMax=" PERMYRIAD_AS_PERCENT_FORMAT_STR, PERMYRIAD_AS_PERCENT_FORMAT_VAL(ruleset->swap_above)); + else + log_debug(" SwapUsageMax=unset"); + log_debug(" Action=%s", oomd_action_to_string(ruleset->action)); + log_debug(" LastingSec=%s", FORMAT_TIMESPAN(ruleset->lasting_usec, USEC_PER_SEC)); + } + } } diff --git a/src/oom/oomd-conf.h b/src/oom/oomd-conf.h index 429b976b91b..8f715e81c6f 100644 --- a/src/oom/oomd-conf.h +++ b/src/oom/oomd-conf.h @@ -1,8 +1,16 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ #pragma once +#include "string-table.h" /* IWYU pragma: keep */ + typedef struct Manager Manager; +typedef struct OomdRuleset OomdRuleset; +typedef enum OomdAction OomdAction; + +void oomd_ruleset_free(OomdRuleset *ruleset); void manager_set_defaults(Manager *m); void manager_parse_config_file(Manager *m); + +DECLARE_STRING_TABLE_LOOKUP(oomd_action, OomdAction); diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c index 382a246c2dd..7ec6684f6e2 100644 --- a/src/oom/oomd-manager.c +++ b/src/oom/oomd-manager.c @@ -24,6 +24,7 @@ #include "percent-util.h" #include "set.h" #include "string-util.h" +#include "strv.h" #include "time-util.h" #include "varlink-io.systemd.oom.h" #include "varlink-io.systemd.service.h" @@ -35,12 +36,14 @@ typedef struct ManagedOOMMessage { char *property; uint32_t limit; usec_t duration; + char **rules; } ManagedOOMMessage; static void managed_oom_message_destroy(ManagedOOMMessage *message) { 
assert(message); free(message->path); free(message->property); + strv_free(message->rules); } static JSON_DISPATCH_ENUM_DEFINE(dispatch_managed_oom_mode, ManagedOOMMode, managed_oom_mode_from_string); @@ -55,6 +58,7 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p { "property", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(ManagedOOMMessage, property), SD_JSON_MANDATORY }, { "limit", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(ManagedOOMMessage, limit), 0 }, { "duration", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint64, offsetof(ManagedOOMMessage, duration), 0 }, + { "rules", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_strv, offsetof(ManagedOOMMessage, rules), 0 }, {}, }; @@ -101,11 +105,31 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p "(" UID_FMT " != " UID_FMT ")", uid, cg_uid); } - monitor_hm = streq(message.property, "ManagedOOMSwap") ? - m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts; + if (streq(message.property, "ManagedOOMSwap")) + monitor_hm = m->monitored_swap_cgroup_contexts; + else if (streq(message.property, "OOMRules")) + monitor_hm = m->monitored_rules_cgroup_contexts; + else if (streq(message.property, "ManagedOOMMemoryPressure")) + monitor_hm = m->monitored_mem_pressure_cgroup_contexts; + else { + log_debug("Unknown property '%s', ignoring.", message.property); + continue; + } if (message.mode == MANAGED_OOM_AUTO) { (void) oomd_cgroup_context_unref(hashmap_remove(monitor_hm, empty_to_root(message.path))); + + /* Clean up start_times entries for this cgroup across all rulesets + * to prevent stale timers from causing premature action triggers + * if the cgroup re-subscribes later. 
*/ + if (streq(message.property, "OOMRules")) { + OomdRuleset *ruleset; + HASHMAP_FOREACH(ruleset, m->rulesets) { + _cleanup_free_ char *key = NULL; + free(hashmap_remove2(ruleset->start_times, empty_to_root(message.path), (void **) &key)); + } + } + continue; } @@ -124,6 +148,57 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p else duration = m->default_mem_pressure_duration_usec; + /* For OOMRules, only insert/update if rules are actually provided */ + if (streq(message.property, "OOMRules")) { + if (strv_isempty(message.rules)) + continue; + + /* Avoid re-reading memory.pressure/pgscan/etc. on every OOMRules update for a + * cgroup we already track — fetch the existing context first and only acquire + * a fresh one if the cgroup is new. */ + ctx = hashmap_get(monitor_hm, empty_to_root(message.path)); + if (!ctx) { + r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path); + if (r == -ENOMEM) + return r; + if (r < 0) { + log_debug_errno(r, "Failed to insert message, ignoring: %m"); + continue; + } + ctx = hashmap_get(monitor_hm, empty_to_root(message.path)); + } + + if (ctx) { + /* For each rule being dropped from this cgroup's subscription, + * remove its start_times entry so the timer doesn't linger. */ + STRV_FOREACH(old_rule, ctx->rules) { + if (strv_contains(message.rules, *old_rule)) + continue; + OomdRuleset *dropped = hashmap_get(m->rulesets, *old_rule); + if (!dropped) + continue; + _cleanup_free_ char *key = NULL; + free(hashmap_remove2(dropped->start_times, empty_to_root(message.path), (void**) &key)); + } + + strv_free_and_replace(ctx->rules, message.rules); + + /* Defensively deduplicate: the DBus setter and config parser both + * dedupe, but another varlink client could in principle send + * duplicates, which would cause redundant per-interval evaluation. */ + strv_uniq(ctx->rules); + + /* Warn about any referenced rules that don't exist. 
Done here + * (once per subscription change) rather than per-interval to avoid + * log spam when a unit references a missing ruleset. */ + STRV_FOREACH(new_rule, ctx->rules) + if (!hashmap_contains(m->rulesets, *new_rule)) + log_warning("Cgroup %s references undefined ruleset '%s', it will be ignored.", + ctx->path, *new_rule); + } + continue; + } + r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path); if (r == -ENOMEM) return r; @@ -145,6 +220,12 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p if (r < 0) return log_error_errno(r, "Failed to toggle enabled state of swap context source: %m"); + /* Toggle wake-ups for "OOMRules" if entries are present. */ + r = sd_event_source_set_enabled(m->rules_context_event_source, + hashmap_isempty(m->monitored_rules_cgroup_contexts) ? SD_EVENT_OFF : SD_EVENT_ON); + if (r < 0) + return log_error_errno(r, "Failed to toggle enabled state of rules context source: %m"); + return 0; } @@ -408,7 +489,7 @@ static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m"); threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100; - r = oomd_select_by_swap_usage(candidates, threshold, &selected); + r = oomd_select_by_swap_usage(candidates, /* prefix= */ NULL, threshold, &selected); if (r < 0) return log_error_errno(r, "Failed to select any cgroups based on swap: %m"); if (r == 0) { @@ -584,6 +665,398 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t return 0; } +static int ruleset_execute_action( + Manager *m, + OomdCGroupContext *ctx, + OomdRuleset *ruleset, + const char *rule_name, + usec_t usec_now) { + + _cleanup_free_ char *reason = NULL; + int r; + + assert(m); + assert(ctx); + assert(ruleset); + assert(rule_name); + + if (ruleset->lasting_usec > 0) + log_notice("Rule '%s' conditions met for cgroup %s (lasting %s), taking action %s", + 
rule_name, + ctx->path, + FORMAT_TIMESPAN(ruleset->lasting_usec, USEC_PER_SEC), + oomd_action_to_string(ruleset->action)); + else + log_notice("Rule '%s' conditions met for cgroup %s, taking action %s", + rule_name, + ctx->path, + oomd_action_to_string(ruleset->action)); + + reason = strjoin("rule ", rule_name); + if (!reason) + return log_oom(); + + if (ruleset->action == OOMD_ACTION_KILL_ALL) { + r = oomd_cgroup_kill_mark(m, ctx, reason); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_notice_errno(r, "Failed to kill all processes in %s: %m", ctx->path); + return 0; + } + } else if (ruleset->action == OOMD_ACTION_KILL_BY_PGSCAN) { + OomdCGroupContext *selected = NULL; + + /* Check if there was reclaim activity in the given interval. If there isn't any reclaim + * pressure, killing won't help — well-behaved processes faulting in recently resident + * pages will keep pressure high even after the offending cgroup is killed. */ + if (usec_sub_unsigned(usec_now, ctx->last_had_mem_reclaim) > RECLAIM_DURATION_USEC) { + log_debug("No reclaim activity for %s, skipping pgscan-based action", ctx->path); + return 0; + } + + r = oomd_select_by_pgscan_rate(m->monitored_rules_cgroup_contexts_candidates, ctx->path, &selected); + if (r < 0) { + log_notice_errno(r, "Failed to select cgroup by pgscan rate for %s: %m", ctx->path); + return 0; + } + if (r == 0) { + log_debug("No cgroup candidates found for pgscan-based action for %s", ctx->path); + return 0; + } + + r = oomd_cgroup_kill_mark(m, selected, reason); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_notice_errno(r, "Failed to kill processes in %s: %m", selected->path); + return 0; + } + } else if (ruleset->action == OOMD_ACTION_KILL_BY_SWAP) { + OomdCGroupContext *selected = NULL; + uint64_t threshold; + + if (m->system_context.swap_total == 0) { + if (!ruleset->warned_no_swap) { + log_warning("Rule '%s' uses kill-by-swap action but no swap is configured, skipping (further occurrences 
suppressed)", rule_name); + ruleset->warned_no_swap = true; + } + return 0; + } + + /* Swap came back — clear the latch so re-disabling swap warns again. */ + ruleset->warned_no_swap = false; + + threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100; + r = oomd_select_by_swap_usage(m->monitored_rules_cgroup_contexts_candidates, ctx->path, threshold, &selected); + if (r < 0) { + log_notice_errno(r, "Failed to select cgroup by swap usage for %s: %m", ctx->path); + return 0; + } + if (r == 0) { + log_debug("No cgroup candidates found for swap-based action for %s", ctx->path); + return 0; + } + + r = oomd_cgroup_kill_mark(m, selected, reason); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_notice_errno(r, "Failed to kill processes in %s: %m", selected->path); + return 0; + } + } else + assert_not_reached(); + + return 1; +} + +static int ruleset_check_conditions( + Manager *m, + OomdCGroupContext *ctx, + OomdRuleset *ruleset, + const char *rule_name, + usec_t usec_now) { + + int r; + + assert(m); + assert(ctx); + assert(ruleset); + assert(rule_name); + + /* Check memory pressure condition. + * memory_pressure_above is in permyriad (0-10000, i.e. 6050 = 60.50%). + * store_loadavg_fixed_point takes integer and decimal parts of a percentage, + * so divide/modulo by 100 to split permyriad into percent + centipercent. */ + if (ruleset->memory_pressure_above >= 0) { + loadavg_t threshold; + r = store_loadavg_fixed_point(ruleset->memory_pressure_above / 100, + ruleset->memory_pressure_above % 100, + &threshold); + if (r < 0) + return log_debug_errno(r, "Failed to convert pressure threshold for rule '%s': %m", rule_name); + + if (ctx->memory_pressure.avg10 <= threshold) + goto reset; + } + + /* swap_above means take action when swap usage is above threshold. + * oomd_swap_free_below returns true when swap free is below threshold, + * so if swap_above is X%, check if swap free is below (100-X)%. 
+ * When no swap is configured, the condition cannot be meaningfully evaluated. */ + if (ruleset->swap_above >= 0) { + if (m->system_context.swap_total == 0 || + !oomd_swap_free_below(&m->system_context, 10000 - ruleset->swap_above)) + goto reset; + } + + /* All conditions met, check if LastingSec requirement is satisfied */ + usec_t *start_time = hashmap_get(ruleset->start_times, ctx->path); + if (!start_time) { + /* First time seeing this condition - record the start time */ + _cleanup_free_ usec_t *new_start_time = new(usec_t, 1); + if (!new_start_time) + return log_oom(); + + *new_start_time = usec_now; + + _cleanup_free_ char *path_copy = strdup(ctx->path); + if (!path_copy) + return log_oom(); + + r = hashmap_ensure_put(&ruleset->start_times, &string_hash_ops_free_free, path_copy, new_start_time); + if (r < 0) + return log_error_errno(r, "Failed to record start time for rule '%s' on %s: %m", + rule_name, ctx->path); + TAKE_PTR(path_copy); + TAKE_PTR(new_start_time); + + /* If lasting_usec is 0, take action immediately */ + if (ruleset->lasting_usec == 0) + return true; + + log_debug("Rule '%s' conditions met for cgroup %s, waiting for %s", + rule_name, ctx->path, + FORMAT_TIMESPAN(ruleset->lasting_usec, USEC_PER_SEC)); + return false; + } + + /* Check if the condition has been true for long enough */ + usec_t duration = usec_sub_unsigned(usec_now, *start_time); + if (duration >= ruleset->lasting_usec) + return true; + + log_debug("Rule '%s' conditions met for cgroup %s for %s (need %s)", + rule_name, ctx->path, + FORMAT_TIMESPAN(duration, USEC_PER_SEC), + FORMAT_TIMESPAN(ruleset->lasting_usec, USEC_PER_SEC)); + return false; + +reset: + /* Conditions no longer met — remove start time if it exists. 
*/ + { + _cleanup_free_ char *old_key = NULL; + _cleanup_free_ usec_t *old_start_time = + hashmap_remove2(ruleset->start_times, ctx->path, (void**) &old_key); + if (old_start_time) + log_debug("Rule '%s' conditions no longer met for cgroup %s, resetting timer", + rule_name, ctx->path); + } + return false; +} + +/* After a reload, some cgroups may reference rulesets that no longer exist (or didn't exist yet + * when the cgroup subscribed). Warn once per (cgroup, rule) pair so the operator sees the mismatch, + * without spamming the per-interval evaluation loop. */ +static void warn_missing_rulesets(Manager *m) { + OomdCGroupContext *ctx; + + assert(m); + + HASHMAP_FOREACH(ctx, m->monitored_rules_cgroup_contexts) + STRV_FOREACH(rule, ctx->rules) + if (!hashmap_contains(m->rulesets, *rule)) + log_warning("Cgroup %s references undefined ruleset '%s', it will be ignored.", + ctx->path, *rule); +} + +/* Remove start_times entries for cgroups that are no longer in monitored_rules_cgroup_contexts. + * Cgroups can vanish silently (unit stops, cgroup destroyed) without an explicit unsubscribe + * message, so we periodically reconcile to prevent unbounded growth of start_times. 
*/ +static int prune_stale_ruleset_start_times(Manager *m) { + OomdRuleset *ruleset; + int r; + + assert(m); + + HASHMAP_FOREACH(ruleset, m->rulesets) { + _cleanup_strv_free_ char **to_remove = NULL; + const char *path; + void *v; + + HASHMAP_FOREACH_KEY(v, path, ruleset->start_times) + if (!hashmap_contains(m->monitored_rules_cgroup_contexts, path)) { + r = strv_extend(&to_remove, path); + if (r < 0) + return log_oom(); + } + + STRV_FOREACH(p, to_remove) { + _cleanup_free_ char *key = NULL; + free(hashmap_remove2(ruleset->start_times, *p, (void**) &key)); + } + } + + return 0; +} + +static int process_rules_cgroup_context(Manager *m, OomdCGroupContext *ctx, usec_t usec_now) { + int r; + + assert(m); + assert(ctx); + + if (strv_isempty(ctx->rules)) + return 0; + + STRV_FOREACH(rule_name, ctx->rules) { + OomdRuleset *ruleset = hashmap_get(m->rulesets, *rule_name); + if (!ruleset) + /* Silently skip: already warned once when the subscription was attached or when + * rulesets were loaded. Repeating here would fire every interval. */ + continue; + + r = ruleset_check_conditions(m, ctx, ruleset, *rule_name, usec_now); + if (r < 0) + continue; + if (r == 0) + continue; + + r = ruleset_execute_action(m, ctx, ruleset, *rule_name, usec_now); + if (r < 0) + return r; + + /* Only remove start time if the action actually killed something, so that + * LastingSec must be satisfied again before re-triggering. If the action + * failed to kill, keep the timer running to retry on the next interval. */ + if (r > 0) { + _cleanup_free_ char *action_key = NULL; + free(hashmap_remove2(ruleset->start_times, ctx->path, (void **) &action_key)); + + /* Global (not per-cgroup/per-ruleset) post-action delay: after any + * successful ruleset kill we suppress *all* subsequent rule evaluations + * until POST_ACTION_DELAY_USEC elapses. 
This is intentional — pressure + * and swap metrics need time to reflect the effect of a kill before we + * act again, otherwise a single overload could cascade into multiple + * unrelated kills across sibling cgroups within the same interval. */ + m->rules_post_action_delay_start = usec_now; + return 0; + } + } + + return 0; +} + +static int monitor_rules_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + bool in_post_action_delay = false; + usec_t usec_now; + int r; + + assert(s); + + /* Reset timer */ + r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now); + if (r < 0) + return log_error_errno(r, "Failed to reset event timer: %m"); + + r = sd_event_source_set_time_relative(s, RULESETS_INTERVAL_USEC); + if (r < 0) + return log_error_errno(r, "Failed to set relative time for timer: %m"); + + /* Reconnect if our connection dropped */ + if (!m->varlink_client) { + r = acquire_managed_oom_connect(m); + if (r < 0) + return log_error_errno(r, "Failed to acquire varlink connection: %m"); + } + + /* Return early if no rules are set */ + if (hashmap_isempty(m->monitored_rules_cgroup_contexts)) + return 0; + + /* Determine whether we're still inside the post-action delay window before doing any + * heavy lifting, so we can short-circuit the expensive descendant walk below. */ + if (m->rules_post_action_delay_start > 0) { + if (usec_add(m->rules_post_action_delay_start, POST_ACTION_DELAY_USEC) > usec_now) + in_post_action_delay = true; + else + m->rules_post_action_delay_start = 0; + } + + /* Always keep the subscribed (parent) cgroup contexts fresh so pgscan rate differentials + * stay accurate across intervals, even during the post-action delay. Only suppress the + * kill action itself. + * + * Note: update_monitored_cgroup_contexts() rebuilds the hashmap by calling + * oomd_insert_cgroup_context(), which also carries over the per-cgroup 'rules' strv + * from the old context. 
We rely on that implicit rule propagation here — the + * rules attached to each cgroup context persist across refreshes. */ + r = update_monitored_cgroup_contexts(&m->monitored_rules_cgroup_contexts); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to update monitored rules cgroup contexts, ignoring: %m"); + + /* The candidate refresh is the expensive part — it recursively walks descendants of every + * monitored cgroup. Since candidates are only consumed by kill-by-pgscan / kill-by-swap + * (both suppressed during the delay), skip the walk while we're not going to act. */ + if (!in_post_action_delay) { + r = update_monitored_cgroup_contexts_candidates( + m->monitored_rules_cgroup_contexts, &m->monitored_rules_cgroup_contexts_candidates); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to update monitored rules cgroup candidates, ignoring: %m"); + } + + r = prune_stale_ruleset_start_times(m); + if (r < 0) + return r; + + if (in_post_action_delay) + return 0; + + /* Only read /proc/meminfo if at least one ruleset actually needs swap info — either as + * a SwapUsageMax= condition or as a kill-by-swap action (which uses swap_total to + * compute the per-cgroup selection threshold). */ + OomdRuleset *ruleset; + HASHMAP_FOREACH(ruleset, m->rulesets) + if (ruleset->swap_above >= 0 || ruleset->action == OOMD_ACTION_KILL_BY_SWAP) { + r = oomd_system_context_acquire("/proc/meminfo", &m->system_context); + if (r < 0) + return log_error_errno(r, "Failed to acquire system context: %m"); + break; + } + + OomdCGroupContext *ctx; + HASHMAP_FOREACH(ctx, m->monitored_rules_cgroup_contexts) { + r = process_rules_cgroup_context(m, ctx, usec_now); + if (r < 0) + return r; + + /* process_rules_cgroup_context() sets rules_post_action_delay_start when it queues + * a kill. 
Honor the delay *within the same tick* too: otherwise a single overload + * could cascade into kills across unrelated sibling cgroups before pressure metrics + * have a chance to reflect the first kill. */ + if (m->rules_post_action_delay_start > 0) + break; + } + + return 0; +} + static int monitor_swap_contexts(Manager *m) { _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; int r; @@ -634,6 +1107,31 @@ static int monitor_memory_pressure_contexts(Manager *m) { return 0; } +static int monitor_rules_contexts(Manager *m) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + int r; + + assert(m); + assert(m->event); + + r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_rules_contexts_handler, m); + if (r < 0) + return r; + + r = sd_event_source_set_exit_on_failure(s, true); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(s, SD_EVENT_OFF); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s, "oomd-rules-timer"); + + m->rules_context_event_source = TAKE_PTR(s); + return 0; +} + Manager* manager_free(Manager *m) { assert(m); @@ -641,6 +1139,7 @@ Manager* manager_free(Manager *m) { sd_varlink_close_unref(m->varlink_client); sd_event_source_unref(m->swap_context_event_source); sd_event_source_unref(m->mem_pressure_context_event_source); + sd_event_source_unref(m->rules_context_event_source); sd_event_unref(m->event); hashmap_free(m->polkit_registry); @@ -649,9 +1148,13 @@ Manager* manager_free(Manager *m) { hashmap_free(m->monitored_swap_cgroup_contexts); hashmap_free(m->monitored_mem_pressure_cgroup_contexts); hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates); + hashmap_free(m->monitored_rules_cgroup_contexts); + hashmap_free(m->monitored_rules_cgroup_contexts_candidates); set_free(m->kill_states); + hashmap_free(m->rulesets); + return mfree(m); } @@ -662,6 +1165,7 @@ static int manager_dispatch_reload_signal(sd_event_source *s, const struct signa manager_set_defaults(m); 
manager_parse_config_file(m); + warn_missing_rulesets(m); (void) sd_notify(/* unset_environment= */ false, NOTIFY_READY_MESSAGE); return 0; @@ -706,6 +1210,14 @@ int manager_new(Manager **ret) { if (!m->monitored_mem_pressure_cgroup_contexts_candidates) return -ENOMEM; + m->monitored_rules_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!m->monitored_rules_cgroup_contexts) + return -ENOMEM; + + m->monitored_rules_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!m->monitored_rules_cgroup_contexts_candidates) + return -ENOMEM; + *ret = TAKE_PTR(m); return 0; } @@ -815,6 +1327,10 @@ int manager_start( if (r < 0) return r; + r = monitor_rules_contexts(m); + if (r < 0) + return r; + return 0; } diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h index 8b9476232fb..cc588461b08 100644 --- a/src/oom/oomd-manager.h +++ b/src/oom/oomd-manager.h @@ -2,15 +2,22 @@ #pragma once #include "conf-parser-forward.h" +#include "constants.h" #include "shared-forward.h" #include "oomd-conf.h" #include "oomd-util.h" +#define RULESET_DIRS ((const char* const*) CONF_PATHS_STRV("systemd/oomd/rules.d")) + /* Polling interval for monitoring stats */ #define SWAP_INTERVAL_USEC 150000 /* 0.15 seconds */ /* Pressure counters are lagging (~2 seconds) compared to swap so polling too frequently just wastes CPU */ #define MEM_PRESSURE_INTERVAL_USEC (1 * USEC_PER_SEC) +/* Rules evaluate both pressure and swap metrics; align on the slower-moving metric + * (pressure counters lag ~2s), so polling faster than 1s just wastes CPU. */ +#define RULESETS_INTERVAL_USEC MEM_PRESSURE_INTERVAL_USEC + /* Take action if 10s of memory pressure > 60 for more than 30s. We use the "full" value from PSI so this is the * percentage of time all tasks were delayed (i.e. unproductive). 
* Generally 60 or higher might be acceptable for something like system.slice with no memory.high set; processes in @@ -25,6 +32,25 @@ #define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC) #define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC) +typedef enum OomdAction { + OOMD_ACTION_NONE, + OOMD_ACTION_KILL_ALL, + OOMD_ACTION_KILL_BY_PGSCAN, + OOMD_ACTION_KILL_BY_SWAP, + _OOMD_ACTION_MAX, + _OOMD_ACTION_INVALID = -EINVAL, +} OomdAction; + +typedef struct OomdRuleset { + char *name; + int memory_pressure_above; /* permyriad (0-10000), or -1 for unset */ + int swap_above; /* permyriad (0-10000), or -1 for unset */ + OomdAction action; + usec_t lasting_usec; + Hashmap *start_times; /* key: cgroup path (char*) -> value: heap-allocated timestamp (usec_t*) */ + bool warned_no_swap; /* latched once we've warned that kill-by-swap is misconfigured */ +} OomdRuleset; + typedef struct Manager { sd_bus *bus; sd_event *event; @@ -41,13 +67,17 @@ typedef struct Manager { Hashmap *monitored_swap_cgroup_contexts; Hashmap *monitored_mem_pressure_cgroup_contexts; Hashmap *monitored_mem_pressure_cgroup_contexts_candidates; + Hashmap *monitored_rules_cgroup_contexts; + Hashmap *monitored_rules_cgroup_contexts_candidates; OomdSystemContext system_context; usec_t mem_pressure_post_action_delay_start; + usec_t rules_post_action_delay_start; sd_event_source *swap_context_event_source; sd_event_source *mem_pressure_context_event_source; + sd_event_source *rules_context_event_source; /* This varlink object is used to manage the subscription from systemd-oomd to PID1 which it uses to * listen for changes in ManagedOOM settings (oomd client - systemd server). 
*/ @@ -58,6 +88,7 @@ typedef struct Manager { usec_t prekill_timeout; Set *kill_states; /* currently ongoing OomdKillState operations */ + Hashmap *rulesets; } Manager; Manager* manager_free(Manager *m); diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c index c0e04041a7e..4128d315a63 100644 --- a/src/oom/oomd-util.c +++ b/src/oom/oomd-util.c @@ -21,13 +21,14 @@ #include "sort-util.h" #include "stdio-util.h" #include "string-util.h" +#include "strv.h" #include "time-util.h" #include "varlink-util.h" typedef struct OomdKillState { Manager *manager; OomdCGroupContext *ctx; - const char *reason; + char *reason; /* This holds sd_varlink references */ Set *links; } OomdKillState; @@ -80,6 +81,7 @@ static OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) { return NULL; free(ctx->path); + strv_free(ctx->rules); return mfree(ctx); } @@ -305,6 +307,7 @@ static void oomd_kill_state_free(OomdKillState *ks) { set_remove(ks->manager->kill_states, ks); oomd_cgroup_context_unref(ks->ctx); + free(ks->reason); free(ks); } @@ -485,6 +488,10 @@ int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx, const char *reason return 0; } + _cleanup_free_ char *reason_copy = strdup(reason); + if (!reason_copy) + return log_oom_debug(); + _cleanup_(oomd_kill_state_removep) OomdKillState *ks = new(OomdKillState, 1); if (!ks) return log_oom_debug(); @@ -492,7 +499,7 @@ int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx, const char *reason *ks = (OomdKillState) { .manager = m, .ctx = oomd_cgroup_context_ref(ctx), - .reason = reason, + .reason = TAKE_PTR(reason_copy), }; r = set_ensure_put(&m->kill_states, &oomd_kill_state_hash_ops, ks); @@ -503,6 +510,7 @@ int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx, const char *reason * cleanup path would remove by cgroup path key and could interfere with the existing queued * kill state. 
*/ oomd_cgroup_context_unref(ks->ctx); + free(ks->reason); ks = mfree(ks); return 0; } @@ -585,14 +593,14 @@ int oomd_select_by_pgscan_rate(Hashmap *h, const char *prefix, OomdCGroupContext return ret; } -int oomd_select_by_swap_usage(Hashmap *h, uint64_t threshold_usage, OomdCGroupContext **ret_selected) { +int oomd_select_by_swap_usage(Hashmap *h, const char *prefix, uint64_t threshold_usage, OomdCGroupContext **ret_selected) { _cleanup_free_ OomdCGroupContext **sorted = NULL; int r, n, ret = 0; assert(h); assert(ret_selected); - n = oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted); + n = oomd_sort_cgroup_contexts(h, compare_swap_usage, prefix, &sorted); if (n < 0) return n; @@ -786,6 +794,9 @@ int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) curr_ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start; curr_ctx->mem_pressure_duration_usec = old_ctx->mem_pressure_duration_usec; curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim; + curr_ctx->rules = strv_copy(old_ctx->rules); + if (old_ctx->rules && !curr_ctx->rules) + return -ENOMEM; } if (oomd_pgscan_rate(curr_ctx) > 0) @@ -817,6 +828,9 @@ void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_ ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start; ctx->mem_pressure_duration_usec = old_ctx->mem_pressure_duration_usec; ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim; + /* Note: rules are intentionally not copied here. This function is only used on + * candidate hashmaps (populated by recursively_get_cgroup_context for descendant + * cgroups), which never carry rules. 
*/ if (oomd_pgscan_rate(ctx) > 0) ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC); diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h index d4e1a9207bd..a76454f8123 100644 --- a/src/oom/oomd-util.h +++ b/src/oom/oomd-util.h @@ -40,6 +40,7 @@ struct OomdCGroupContext { usec_t mem_pressure_limit_hit_start; usec_t last_had_mem_reclaim; usec_t mem_pressure_duration_usec; + char **rules; }; struct OomdSystemContext { @@ -132,7 +133,7 @@ int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx, const char *reason * everything in `h` is a candidate. * Returns the killed cgroup in ret_selected. */ int oomd_select_by_pgscan_rate(Hashmap *h, const char *prefix, OomdCGroupContext **ret_selected); -int oomd_select_by_swap_usage(Hashmap *h, uint64_t threshold_usage, OomdCGroupContext **ret_selected); +int oomd_select_by_swap_usage(Hashmap *h, const char *prefix, uint64_t threshold_usage, OomdCGroupContext **ret_selected); int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret); int oomd_system_context_acquire(const char *proc_meminfo_path, OomdSystemContext *ret); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 9b69ebd1a93..e3a1fcd8934 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -2384,6 +2384,7 @@ static const BusProperty cgroup_properties[] = { { "ManagedOOMSwap", bus_append_string }, { "ManagedOOMMemoryPressure", bus_append_string }, { "ManagedOOMPreference", bus_append_string }, + { "OOMRules", bus_append_strv }, { "MemoryPressureWatch", bus_append_string }, { "CPUPressureWatch", bus_append_string }, { "IOPressureWatch", bus_append_string }, diff --git a/src/shared/varlink-io.systemd.Unit.c b/src/shared/varlink-io.systemd.Unit.c index c9f2a59728c..2ed91121e48 100644 --- a/src/shared/varlink-io.systemd.Unit.c +++ b/src/shared/varlink-io.systemd.Unit.c @@ -409,6 +409,8 @@ static SD_VARLINK_DEFINE_STRUCT_TYPE( SD_VARLINK_DEFINE_FIELD(ManagedOOMMemoryPressureDurationUSec, SD_VARLINK_INT, 
SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#ManagedOOMPreference=none%7Cavoid%7Comit"), SD_VARLINK_DEFINE_FIELD_BY_TYPE(ManagedOOMPreference, ManagedOOMPreference, 0), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#OOMRules="), + SD_VARLINK_DEFINE_FIELD(OOMRules, SD_VARLINK_STRING, SD_VARLINK_ARRAY|SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#MemoryPressureWatch="), SD_VARLINK_DEFINE_FIELD_BY_TYPE(MemoryPressureWatch, CGroupPressureWatch, 0), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#MemoryPressureThresholdSec="), diff --git a/src/shared/varlink-io.systemd.oom.c b/src/shared/varlink-io.systemd.oom.c index 80fa50a73a9..15e28b3e1b0 100644 --- a/src/shared/varlink-io.systemd.oom.c +++ b/src/shared/varlink-io.systemd.oom.c @@ -14,7 +14,8 @@ SD_VARLINK_DEFINE_STRUCT_TYPE( SD_VARLINK_DEFINE_FIELD(path, SD_VARLINK_STRING, 0), SD_VARLINK_DEFINE_FIELD(property, SD_VARLINK_STRING, 0), SD_VARLINK_DEFINE_FIELD(limit, SD_VARLINK_INT, SD_VARLINK_NULLABLE), - SD_VARLINK_DEFINE_FIELD(duration, SD_VARLINK_INT, SD_VARLINK_NULLABLE)); + SD_VARLINK_DEFINE_FIELD(duration, SD_VARLINK_INT, SD_VARLINK_NULLABLE), + SD_VARLINK_DEFINE_FIELD(rules, SD_VARLINK_STRING, SD_VARLINK_ARRAY|SD_VARLINK_NULLABLE)); static SD_VARLINK_DEFINE_METHOD( ReportManagedOOMCGroups, diff --git a/test/units/TEST-55-OOMD.sh b/test/units/TEST-55-OOMD.sh index 96a15989c74..6689bbdd733 100755 --- a/test/units/TEST-55-OOMD.sh +++ b/test/units/TEST-55-OOMD.sh @@ -353,6 +353,127 @@ EOF systemctl reset-failed } +testcase_oom_rulesets() { + # Create a ruleset that triggers on any memory pressure with no delay + mkdir -p /run/systemd/oomd/rules.d/ + cat 
>/run/systemd/oomd/rules.d/testrule.oomrule <<'EOF' +[Rule] +MemoryPressureAbove=0% +Action=kill-all +LastingSec=0 +EOF + + systemctl reload systemd-oomd.service + + # Run a transient service with OOMRules=testrule that generates memory pressure + (! systemd-run --wait --unit=TEST-55-OOMD-testrules \ + -p MemoryHigh=3M \ + -p OOMRules=testrule \ + stress-ng --timeout 3m --vm 10 --vm-bytes 50M --vm-keep) + + # Verify in the journal that the rule triggered + journalctl --sync + journalctl -u systemd-oomd.service --since "-2min" | grep "Rule 'testrule' conditions met" >/dev/null + + # clean up + rm -f /run/systemd/oomd/rules.d/testrule.oomrule + systemctl reload systemd-oomd.service +} + +testcase_oom_rulesets_invalid_name() { + # Invalid rule names must be rejected at property-set time (filename_is_valid check). + # "foo/bar" contains a slash and "." and ".." are disallowed by filename_is_valid. + set +e + err=$(systemd-run --wait --unit=TEST-55-OOMD-badname1 -p 'OOMRules=foo/bar' true 2>&1) + rc=$? + set -e + [[ $rc -ne 0 ]] + echo "$err" | grep "Invalid rule name" >/dev/null + + set +e + err=$(systemd-run --wait --unit=TEST-55-OOMD-badname2 -p 'OOMRules=.' true 2>&1) + rc=$? + set -e + [[ $rc -ne 0 ]] + echo "$err" | grep "Invalid rule name" >/dev/null +} + +testcase_oom_rulesets_missing_warning() { + # A unit that references a ruleset which does not exist must produce a + # warn_missing_rulesets warning in oomd's journal (once, at subscription time). + mkdir -p /run/systemd/oomd/rules.d/ + rm -f /run/systemd/oomd/rules.d/absentrule.oomrule + systemctl reload systemd-oomd.service + + # Start a long-lived transient unit that references a ruleset that doesn't exist. + systemd-run --unit=TEST-55-OOMD-missing --remain-after-exit \ + -p OOMRules=absentrule \ + sleep infinity + + # Give oomd a moment to receive the subscription, then verify the warning fires once. 
+ timeout 30 bash -c ' + until journalctl --sync && journalctl -u systemd-oomd.service --since "-1min" 2>/dev/null | grep "references undefined ruleset .absentrule." >/dev/null; do + sleep 1 + done + ' + + # And when we now add the ruleset and reload, oomd must pick it up without + # the unit needing to restart. Verify by checking for the debug-log line that + # reports the ruleset was registered. + cat >/run/systemd/oomd/rules.d/absentrule.oomrule <<'EOF' +[Rule] +SwapUsageMax=99% +Action=kill-all +LastingSec=0 +EOF + systemctl reload systemd-oomd.service + + journalctl --sync + journalctl -u systemd-oomd.service --since "-1min" | grep "Registered ruleset: absentrule" >/dev/null + + # cleanup + systemctl stop TEST-55-OOMD-missing.service + rm -f /run/systemd/oomd/rules.d/absentrule.oomrule + systemctl reload systemd-oomd.service +} + +testcase_oom_rulesets_lasting_sec() { + # A rule with LastingSec > 0 must NOT trigger during the waiting period. + # Baseline proof: with the same workload but LastingSec=0 (testcase_oom_rulesets + # above) oomd kills the unit within a couple of seconds, so an active unit after + # ~6 s demonstrates LastingSec is being respected. + mkdir -p /run/systemd/oomd/rules.d/ + cat >/run/systemd/oomd/rules.d/slowrule.oomrule <<'EOF' +[Rule] +MemoryPressureAbove=0% +Action=kill-all +LastingSec=1h +EOF + + systemctl reload systemd-oomd.service + + # Start the unit without --wait so we can check mid-run state. The + # stress-ng timeout bounds the test if anything goes wrong. + systemd-run --unit=TEST-55-OOMD-slowrule \ + -p MemoryHigh=3M \ + -p OOMRules=slowrule \ + stress-ng --timeout 15s --vm 10 --vm-bytes 50M --vm-keep + + # Wait long enough for oomd's 1s rule-check loop to evaluate the condition + # many times. With LastingSec=1h the kill must not fire. + sleep 6 + + # Unit must still be active — if it were killed, Result= would be oom-kill. 
+ assert_eq "$(systemctl show TEST-55-OOMD-slowrule.service -P ActiveState)" "active" + assert_eq "$(systemctl show TEST-55-OOMD-slowrule.service -P Result)" "success" + + systemctl stop TEST-55-OOMD-slowrule.service 2>/dev/null || true + + # cleanup + rm -f /run/systemd/oomd/rules.d/slowrule.oomrule + systemctl reload systemd-oomd.service +} + testcase_prekill_hook() { cat >/run/systemd/oomd.conf.d/99-oomd-prekill-test.conf <<'EOF' [OOM]