From: Ryan Wilson Date: Tue, 15 Oct 2024 03:49:54 +0000 (-0700) Subject: cgroup: Add ManagedOOMMemoryPressureDurationSec= override setting for units X-Git-Tag: v257-rc1~196^2~1 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=63d4c4271ca529f8357a84cbc075170fffdb3de8;p=thirdparty%2Fsystemd.git cgroup: Add ManagedOOMMemoryPressureDurationSec= override setting for units This will allow units (scopes/slices/services) to override the default systemd-oomd setting DefaultMemoryPressureDurationSec=. The semantics of ManagedOOMMemoryPressureDurationSec= are: - If >= 1 second, overrides DefaultMemoryPressureDurationSec= from oomd.conf - If empty, uses DefaultMemoryPressureDurationSec= from oomd.conf - Ignored if ManagedOOMMemoryPressure= is not "kill" - Disallowed if < 1 second Note the corresponding dbus property is ManagedOOMMemoryPressureDurationUSec which is in microseconds. This is consistent with other time-based dbus properties. --- diff --git a/docs/TRANSIENT-SETTINGS.md b/docs/TRANSIENT-SETTINGS.md index e219131ce61..ebb8ba536a3 100644 --- a/docs/TRANSIENT-SETTINGS.md +++ b/docs/TRANSIENT-SETTINGS.md @@ -281,6 +281,7 @@ All cgroup/resource control settings are available for transient units ✓ ManagedOOMSwap= ✓ ManagedOOMMemoryPressure= ✓ ManagedOOMMemoryPressureLimit= +✓ ManagedOOMMemoryPressureDurationSec= ✓ ManagedOOMPreference= ✓ CoredumpReceive= ``` diff --git a/man/oomd.conf.xml b/man/oomd.conf.xml index 582fb27de16..13f1f22e53a 100644 --- a/man/oomd.conf.xml +++ b/man/oomd.conf.xml @@ -90,7 +90,8 @@ DefaultMemoryPressureDurationSec= Sets the amount of time a unit's control group needs to have exceeded memory pressure - limits before systemd-oomd will take action. Memory pressure limits are defined by + limits before systemd-oomd will take action. A unit can override this value with + ManagedOOMMemoryPressureDurationSec=. Memory pressure limits are defined by DefaultMemoryPressureLimit= and ManagedOOMMemoryPressureLimit=.
Must be set to 0, or at least 1 second. Defaults to 30 seconds when unset or 0. diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index 1e34ddbc857..25905de8c85 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -2993,6 +2993,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly u ManagedOOMMemoryPressureLimit = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t ManagedOOMMemoryPressureDurationUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(ss) BPFProgram = [...]; @@ -4312,6 +4314,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4849,6 +4853,11 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { method. See sd_listen_fds3 for more details on how to retrieve these file descriptors. Unlike the ExtraFileDescriptors input property, ExtraFileDescriptorNames only contains names and not the file descriptors. + + ManagedOOMMemoryPressureDurationUSec implement the destination parameter of the + unit file setting ManagedOOMMemoryPressureDurationSec= listed in + systemd.resource-control5. + Note the time unit is expressed in μs. 
@@ -5148,6 +5157,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly u ManagedOOMMemoryPressureLimit = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t ManagedOOMMemoryPressureDurationUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(ss) BPFProgram = [...]; @@ -6451,6 +6462,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -7145,6 +7158,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly u ManagedOOMMemoryPressureLimit = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t ManagedOOMMemoryPressureDurationUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(ss) BPFProgram = [...]; @@ -8286,6 +8301,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -9109,6 +9126,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly u ManagedOOMMemoryPressureLimit = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t ManagedOOMMemoryPressureDurationUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(ss) BPFProgram = [...]; @@ -10222,6 +10241,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -10898,6 +10919,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly u ManagedOOMMemoryPressureLimit = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t 
ManagedOOMMemoryPressureDurationUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(ss) BPFProgram = [...]; @@ -11285,6 +11308,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + @@ -11309,6 +11334,11 @@ node /org/freedesktop/systemd1/unit/system_2eslice { Properties Most properties correspond directly with the matching settings in slice unit files. + + ManagedOOMMemoryPressureDurationUSec implement the destination parameter of the + unit file setting ManagedOOMMemoryPressureDurationSec= listed in + systemd.resource-control5. + Note the time unit is expressed in μs. @@ -11507,6 +11537,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly u ManagedOOMMemoryPressureLimit = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t ManagedOOMMemoryPressureDurationUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(ss) BPFProgram = [...]; @@ -11944,6 +11976,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + @@ -12004,6 +12038,11 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { the scope unit is to be shut down via a RequestStop() signal (see below). This is set when the scope is created. If not set, the scope's processes will terminated with SIGTERM directly. + + ManagedOOMMemoryPressureDurationUSec implement the destination parameter of the + unit file setting ManagedOOMMemoryPressureDurationSec= listed in + systemd.resource-control5. + Note the time unit is expressed in μs. 
@@ -12222,6 +12261,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ PrivateTmpEx, ImportCredentialEx, ExtraFileDescriptorNames, + ManagedOOMMemoryPressureDurationUSec, BindLogSockets, and PrivateUsersEx were added in version 257. @@ -12362,6 +12402,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ EffectiveMemoryMax, EffectiveTasksMax, and MemoryZSwapWriteback were added in version 256. + ManagedOOMMemoryPressureDurationUSec was added in version 257. Scope Unit Objects @@ -12387,6 +12428,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ EffectiveMemoryMax, EffectiveTasksMax, and MemoryZSwapWriteback were added in version 256. + ManagedOOMMemoryPressureDurationUSec was added in version 257. Job Objects diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index 2ffc279a35f..1f16052a335 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -1535,16 +1535,35 @@ DeviceAllow=/dev/loop-control Overrides the default memory pressure limit set by oomd.conf5 for - this unit (cgroup). Takes a percentage value between 0% and 100%, inclusive. This property is - ignored unless ManagedOOMMemoryPressure=. Defaults to 0%, + the cgroup of this unit. Takes a percentage value between 0% and 100%, inclusive. Defaults to 0%, which means to use the default set by oomd.conf5. + This property is ignored unless ManagedOOMMemoryPressure=. + + ManagedOOMMemoryPressureDurationSec= + + + Overrides the default memory pressure duration set by + oomd.conf5 for + the cgroup of this unit. The specified value supports a time unit such as ms or + μs, see + systemd.time7 + for details on the permitted syntax. Must be set to either empty or a value of at least 1s. Defaults + to empty, which means to use the default set by + oomd.conf5. + This property is ignored unless ManagedOOMMemoryPressure=. 
+ + + + + + ManagedOOMPreference=none|avoid|omit diff --git a/src/core/cgroup.c b/src/core/cgroup.c index fb89a22d2e1..47a771d51e0 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -194,6 +194,9 @@ void cgroup_context_init(CGroupContext *c) { .moom_swap = MANAGED_OOM_AUTO, .moom_mem_pressure = MANAGED_OOM_AUTO, .moom_preference = MANAGED_OOM_PREFERENCE_NONE, + /* The default duration value in oomd.conf will be used when + * moom_mem_pressure_duration_usec is set to infinity. */ + .moom_mem_pressure_duration_usec = USEC_INFINITY, .memory_pressure_watch = _CGROUP_PRESSURE_WATCH_INVALID, .memory_pressure_threshold_usec = USEC_INFINITY, @@ -947,6 +950,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { fprintf(f, "%sMemoryPressureThresholdSec: %s\n", prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1)); + if (c->moom_mem_pressure_duration_usec != USEC_INFINITY) + fprintf(f, "%sManagedOOMMemoryPressureDurationSec: %s\n", + prefix, FORMAT_TIMESPAN(c->moom_mem_pressure_duration_usec, 1)); + LIST_FOREACH(device_allow, a, c->device_allow) /* strna() below should be redundant, for avoiding -Werror=format-overflow= error. See #30223. 
*/ fprintf(f, diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 7525da728e5..550c1ea88fa 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -236,6 +236,7 @@ struct CGroupContext { ManagedOOMMode moom_swap; ManagedOOMMode moom_mem_pressure; uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */ + usec_t moom_mem_pressure_duration_usec; ManagedOOMPreference moom_preference; /* Memory pressure logic */ diff --git a/src/core/core-varlink.c b/src/core/core-varlink.c index 0ecc8e23f15..352fd28b0db 100644 --- a/src/core/core-varlink.c +++ b/src/core/core-varlink.c @@ -57,7 +57,7 @@ static bool user_match_lookup_parameters(LookupParameters *p, const char *name, } static int build_managed_oom_json_array_element(Unit *u, const char *property, sd_json_variant **ret_v) { - bool use_limit = false; + bool use_limit = false, use_duration = false; CGroupContext *c; const char *mode; @@ -84,7 +84,8 @@ static int build_managed_oom_json_array_element(Unit *u, const char *property, s mode = managed_oom_mode_to_string(c->moom_swap); else if (streq(property, "ManagedOOMMemoryPressure")) { mode = managed_oom_mode_to_string(c->moom_mem_pressure); - use_limit = true; + use_limit = c->moom_mem_pressure_limit > 0; + use_duration = c->moom_mem_pressure_duration_usec != USEC_INFINITY; } else return -EINVAL; @@ -92,7 +93,8 @@ static int build_managed_oom_json_array_element(Unit *u, const char *property, s SD_JSON_BUILD_PAIR("mode", SD_JSON_BUILD_STRING(mode)), SD_JSON_BUILD_PAIR("path", SD_JSON_BUILD_STRING(crt->cgroup_path)), SD_JSON_BUILD_PAIR("property", SD_JSON_BUILD_STRING(property)), - SD_JSON_BUILD_PAIR_CONDITION(use_limit, "limit", SD_JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit))); + SD_JSON_BUILD_PAIR_CONDITION(use_limit, "limit", SD_JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit)), + SD_JSON_BUILD_PAIR_CONDITION(use_duration, "duration", SD_JSON_BUILD_UNSIGNED(c->moom_mem_pressure_duration_usec))); } static int build_managed_oom_cgroups_json(Manager *m, 
sd_json_variant **ret) { diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 459fa6f774c..445132a659c 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -502,6 +502,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0), SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0), SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimit", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit), 0), + SD_BUS_PROPERTY("ManagedOOMMemoryPressureDurationUSec", "t", bus_property_get_usec, offsetof(CGroupContext, moom_mem_pressure_duration_usec), 0), SD_BUS_PROPERTY("ManagedOOMPreference", "s", property_get_managed_oom_preference, offsetof(CGroupContext, moom_preference), 0), SD_BUS_PROPERTY("BPFProgram", "a(ss)", property_get_bpf_foreign_program, 0, 0), SD_BUS_PROPERTY("SocketBindAllow", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_allow), 0), @@ -2053,6 +2054,36 @@ int bus_cgroup_set_property( return 1; } + if (streq(name, "ManagedOOMMemoryPressureDurationUSec")) { + uint64_t t; + + if (!UNIT_VTABLE(u)->can_set_managed_oom) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name); + + r = sd_bus_message_read(message, "t", &t); + if (r < 0) + return r; + + if (t < 1 * USEC_PER_SEC) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= must be at least 1s, got %s", name, + FORMAT_TIMESPAN(t, USEC_PER_SEC)); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->memory_pressure_threshold_usec = t; + if (c->memory_pressure_threshold_usec == USEC_INFINITY) + unit_write_setting(u, flags, name, "ManagedOOMMemoryPressureDurationSec="); + else + unit_write_settingf(u, flags, name, + "ManagedOOMMemoryPressureDurationSec=%s", + FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1)); + } + + if 
(c->moom_mem_pressure == MANAGED_OOM_KILL) + (void) manager_varlink_send_managed_oom_update(u); + + return 1; + } + if (streq(name, "ManagedOOMPreference")) { ManagedOOMPreference p; const char *pref; diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index 13e7078b1a9..1b44c49238c 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -328,6 +328,10 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) { if (r < 0) return r; + r = serialize_usec(f, "exec-cgroup-context-managed-oom-memory-pressure-duration-usec", c->moom_mem_pressure_duration_usec); + if (r < 0) + return r; + r = serialize_item(f, "exec-cgroup-context-managed-oom-preference", managed_oom_preference_to_string(c->moom_preference)); if (r < 0) return r; @@ -781,6 +785,10 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { c->moom_preference = managed_oom_preference_from_string(val); if (c->moom_preference < 0) return -EINVAL; + } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-memory-pressure-duration-usec="))) { + r = deserialize_usec(val, &c->moom_mem_pressure_duration_usec); + if (r < 0) + return r; } else if ((val = startswith(l, "exec-cgroup-context-memory-pressure-watch="))) { c->memory_pressure_watch = cgroup_pressure_watch_from_string(val); if (c->memory_pressure_watch < 0) diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index e94b518a9d8..df49633cee9 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -253,6 +253,7 @@ {{type}}.ManagedOOMSwap, config_parse_managed_oom_mode, 0, offsetof({{type}}, cgroup_context.moom_swap) {{type}}.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure) {{type}}.ManagedOOMMemoryPressureLimit, config_parse_managed_oom_mem_pressure_limit, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure_limit) 
+{{type}}.ManagedOOMMemoryPressureDurationSec, config_parse_managed_oom_mem_pressure_duration_sec, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure_duration_usec) {{type}}.ManagedOOMPreference, config_parse_managed_oom_preference, 0, offsetof({{type}}, cgroup_context.moom_preference) {{type}}.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0 {{type}}.BPFProgram, config_parse_bpf_foreign_program, 0, offsetof({{type}}, cgroup_context) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index ba6aad2f2b4..4b702038e6c 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -4121,6 +4121,44 @@ int config_parse_managed_oom_mem_pressure_limit( return 0; } +int config_parse_managed_oom_mem_pressure_duration_sec( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { /* Parses ManagedOOMMemoryPressureDurationSec= into the usec_t that 'data' points to. */ + + usec_t usec, *duration = ASSERT_PTR(data); + UnitType t; + int r; + + t = unit_name_to_type(unit); + assert(t != _UNIT_TYPE_INVALID); + + if (!unit_vtable[t]->can_set_managed_oom) /* setting is only accepted for unit types that allow managed OOM */ + return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue); + + if (isempty(rvalue)) { + *duration = USEC_INFINITY; /* empty value resets to the "not set" marker, i.e. the oomd.conf default is used */ + return 0; + } + + r = parse_sec(rvalue, &usec); + if (r < 0) + return log_syntax_parse_error(unit, filename, line, r, lvalue, rvalue); + + if (usec < 1 * USEC_PER_SEC || usec == USEC_INFINITY) /* infinity is reserved as the "not set" marker, so it is rejected as an explicit value */ + return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= must be at least 1s and less than infinity, ignoring: %s", lvalue, rvalue); + + *duration = usec; + return 0; +} + int config_parse_device_allow( const char *unit, const char *filename, diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index c7301cec52e..e8b2eaee52c 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -88,6 +88,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_delegate);
CONFIG_PARSER_PROTOTYPE(config_parse_delegate_subgroup); CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode); CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit); +CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_duration_sec); CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference); CONFIG_PARSER_PROTOTYPE(config_parse_device_policy); CONFIG_PARSER_PROTOTYPE(config_parse_device_allow); diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c index 6d1b4f024b2..7437a6e889b 100644 --- a/src/oom/oomd-manager.c +++ b/src/oom/oomd-manager.c @@ -24,6 +24,7 @@ typedef struct ManagedOOMMessage { char *path; char *property; uint32_t limit; + usec_t duration; } ManagedOOMMessage; static void managed_oom_message_destroy(ManagedOOMMessage *message) { @@ -43,6 +44,7 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p { "path", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(ManagedOOMMessage, path), SD_JSON_MANDATORY }, { "property", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(ManagedOOMMessage, property), SD_JSON_MANDATORY }, { "limit", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(ManagedOOMMessage, limit), 0 }, + { "duration", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint64, offsetof(ManagedOOMMessage, duration), 0 }, {}, }; @@ -55,10 +57,13 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p /* Skip malformed elements and keep processing in case the others are good */ JSON_VARIANT_ARRAY_FOREACH(c, cgroups) { - _cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = {}; + _cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = { + .duration = USEC_INFINITY, + }; OomdCGroupContext *ctx; Hashmap *monitor_hm; loadavg_t limit; + usec_t duration; if (!sd_json_variant_is_object(c)) continue; @@ -104,6 +109,11 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p continue; } + 
if (streq(message.property, "ManagedOOMMemoryPressure") && message.duration != USEC_INFINITY) + duration = message.duration; + else + duration = m->default_mem_pressure_duration_usec; + r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path); if (r == -ENOMEM) return r; @@ -113,8 +123,10 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p /* Always update the limit in case it was changed. For non-memory pressure detection the value is * ignored so always updating it here is not a problem. */ ctx = hashmap_get(monitor_hm, empty_to_root(message.path)); - if (ctx) + if (ctx) { ctx->mem_pressure_limit = limit; + ctx->mem_pressure_duration_usec = duration; + } } /* Toggle wake-ups for "ManagedOOMSwap" if entries are present. */ @@ -472,7 +484,7 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t m->mem_pressure_post_action_delay_start = 0; } - r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets); + r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, &targets); if (r == -ENOMEM) return log_oom(); if (r < 0) @@ -494,7 +506,7 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t t->path, LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10), LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit), - FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); + FORMAT_TIMESPAN(t->mem_pressure_duration_usec, USEC_PER_SEC)); r = update_monitored_cgroup_contexts_candidates( m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); @@ -526,7 +538,7 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t selected, t->path, LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10), LOADAVG_INT_SIDE(t->mem_pressure_limit), 
LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit), - FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); + FORMAT_TIMESPAN(t->mem_pressure_duration_usec, USEC_PER_SEC)); /* send dbus signal */ (void) sd_bus_emit_signal(m->bus, diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c index 6307c2783e0..b9967870390 100644 --- a/src/oom/oomd-util.c +++ b/src/oom/oomd-util.c @@ -69,7 +69,7 @@ OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) { return mfree(ctx); } -int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) { +int oomd_pressure_above(Hashmap *h, Set **ret) { _cleanup_set_free_ Set *targets = NULL; OomdCGroupContext *ctx; char *key; @@ -90,7 +90,7 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) { ctx->mem_pressure_limit_hit_start = now(CLOCK_MONOTONIC); diff = now(CLOCK_MONOTONIC) - ctx->mem_pressure_limit_hit_start; - if (diff >= duration) { + if (diff >= ctx->mem_pressure_duration_usec) { r = set_put(targets, ctx); if (r < 0) return -ENOMEM; @@ -564,6 +564,7 @@ int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) curr_ctx->last_pgscan = old_ctx->pgscan; curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit; curr_ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start; + curr_ctx->mem_pressure_duration_usec = old_ctx->mem_pressure_duration_usec; curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim; } @@ -594,6 +595,7 @@ void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_ ctx->last_pgscan = old_ctx->pgscan; ctx->mem_pressure_limit = old_ctx->mem_pressure_limit; ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start; + ctx->mem_pressure_duration_usec = old_ctx->mem_pressure_duration_usec; ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim; if (oomd_pgscan_rate(ctx) > 0) @@ -626,10 +628,12 @@ void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE fprintf(f, 
"%sPath: %s\n" "%s\tMemory Pressure Limit: %lu.%02lu%%\n" + "%s\tMemory Pressure Duration: %s\n" "%s\tPressure: Avg10: %lu.%02lu, Avg60: %lu.%02lu, Avg300: %lu.%02lu, Total: %s\n" "%s\tCurrent Memory Usage: %s\n", strempty(prefix), ctx->path, strempty(prefix), LOADAVG_INT_SIDE(ctx->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(ctx->mem_pressure_limit), + strempty(prefix), FORMAT_TIMESPAN(ctx->mem_pressure_duration_usec, USEC_PER_SEC), strempty(prefix), LOADAVG_INT_SIDE(ctx->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(ctx->memory_pressure.avg10), LOADAVG_INT_SIDE(ctx->memory_pressure.avg60), LOADAVG_DECIMAL_SIDE(ctx->memory_pressure.avg60), diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h index 95a236f48f4..14fe5c5ebab 100644 --- a/src/oom/oomd-util.h +++ b/src/oom/oomd-util.h @@ -37,6 +37,7 @@ struct OomdCGroupContext { loadavg_t mem_pressure_limit; usec_t mem_pressure_limit_hit_start; usec_t last_had_mem_reclaim; + usec_t mem_pressure_duration_usec; }; struct OomdSystemContext { @@ -53,12 +54,12 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free); * key: cgroup paths -> value: OomdCGroupContext. */ /* Scans all the OomdCGroupContexts in `h` and returns 1 and a set of pointers to those OomdCGroupContexts in `ret` - * if any of them have exceeded their supplied memory pressure limits for the `duration` length of time. + * if any of them have exceeded their supplied memory pressure limits for the `ctx->mem_pressure_duration_usec` length of time. * `mem_pressure_limit_hit_start` is updated accordingly for the first time the limit is exceeded, and when it returns * below the limit. - * Returns 0 and sets `ret` to an empty set if no entries exceeded limits for `duration`. + * Returns 0 and sets `ret` to an empty set if no entries exceeded limits for `ctx->mem_pressure_duration_usec`. * Returns -ENOMEM for allocation errors. 
*/ -int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret); +int oomd_pressure_above(Hashmap *h, Set **ret); /* Returns true if the amount of memory available (see proc(5)) is below the permyriad of memory specified by `threshold_permyriad`. */ bool oomd_mem_available_below(const OomdSystemContext *ctx, int threshold_permyriad); diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c index 1aef6039e12..53109c160d3 100644 --- a/src/oom/test-oomd-util.c +++ b/src/oom/test-oomd-util.c @@ -138,6 +138,7 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) { c1->pgscan = UINT64_MAX; c1->mem_pressure_limit = 6789; c1->mem_pressure_limit_hit_start = 42; + c1->mem_pressure_duration_usec = 1234; c1->last_had_mem_reclaim = 888; assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops)); assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0); @@ -149,6 +150,7 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) { assert_se(c2->last_pgscan == UINT64_MAX); assert_se(c2->mem_pressure_limit == 6789); assert_se(c2->mem_pressure_limit_hit_start == 42); + assert_se(c2->mem_pressure_duration_usec == 1234); assert_se(c2->last_had_mem_reclaim == 888); /* assumes the live pgscan is less than UINT64_MAX */ } @@ -162,11 +164,13 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) { { .path = paths[0], .mem_pressure_limit = 5, .mem_pressure_limit_hit_start = 777, + .mem_pressure_duration_usec = 111, .last_had_mem_reclaim = 888, .pgscan = 57 }, { .path = paths[1], .mem_pressure_limit = 6, .mem_pressure_limit_hit_start = 888, + .mem_pressure_duration_usec = 222, .last_had_mem_reclaim = 888, .pgscan = 42 }, }; @@ -193,6 +197,7 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) { assert_se(c_old->pgscan == c_new->last_pgscan); assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit); assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start); + 
assert_se(c_old->mem_pressure_duration_usec == c_new->mem_pressure_duration_usec); assert_se(c_old->last_had_mem_reclaim == c_new->last_had_mem_reclaim); assert_se(c_old = hashmap_get(h_old, "/1.slice")); @@ -200,6 +205,7 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) { assert_se(c_old->pgscan == c_new->last_pgscan); assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit); assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start); + assert_se(c_old->mem_pressure_duration_usec == c_new->mem_pressure_duration_usec); assert_se(c_new->last_had_mem_reclaim > c_old->last_had_mem_reclaim); } @@ -255,17 +261,21 @@ static void test_oomd_pressure_above(void) { assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg60)) == 0); assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg300)) == 0); ctx[0].mem_pressure_limit = threshold; + /* Set memory pressure duration to 0 since we use the real system monotonic clock + * in oomd_pressure_above() and we want to avoid this test depending on timing. 
*/ + ctx[0].mem_pressure_duration_usec = 0; /* /derp.slice */ assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg10)) == 0); assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg60)) == 0); assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg300)) == 0); ctx[1].mem_pressure_limit = threshold; + ctx[1].mem_pressure_duration_usec = 0; /* High memory pressure */ assert_se(h1 = hashmap_new(&string_hash_ops)); assert_se(hashmap_put(h1, "/herp.slice", &ctx[0]) >= 0); - assert_se(oomd_pressure_above(h1, 0 /* duration */, &t1) == 1); + assert_se(oomd_pressure_above(h1, &t1) == 1); assert_se(set_contains(t1, &ctx[0])); assert_se(c = hashmap_get(h1, "/herp.slice")); assert_se(c->mem_pressure_limit_hit_start > 0); @@ -273,14 +283,14 @@ static void test_oomd_pressure_above(void) { /* Low memory pressure */ assert_se(h2 = hashmap_new(&string_hash_ops)); assert_se(hashmap_put(h2, "/derp.slice", &ctx[1]) >= 0); - assert_se(oomd_pressure_above(h2, 0 /* duration */, &t2) == 0); + assert_se(oomd_pressure_above(h2, &t2) == 0); assert_se(!t2); assert_se(c = hashmap_get(h2, "/derp.slice")); assert_se(c->mem_pressure_limit_hit_start == 0); /* High memory pressure w/ multiple cgroups */ assert_se(hashmap_put(h1, "/derp.slice", &ctx[1]) >= 0); - assert_se(oomd_pressure_above(h1, 0 /* duration */, &t3) == 1); + assert_se(oomd_pressure_above(h1, &t3) == 1); assert_se(set_contains(t3, &ctx[0])); assert_se(set_size(t3) == 1); assert_se(c = hashmap_get(h1, "/herp.slice")); diff --git a/src/shared/bus-print-properties.c b/src/shared/bus-print-properties.c index 7da8cb1b126..5857fde5ad2 100644 --- a/src/shared/bus-print-properties.c +++ b/src/shared/bus-print-properties.c @@ -109,6 +109,12 @@ static int bus_print_property(const char *name, const char *expected_value, sd_b bus_print_property_value(name, expected_value, flags, FORMAT_TIMESTAMP(u)); + /* Managed OOM pressure default implies "unset" and use the default set in oomd.conf. 
Without + * this condition, we will print "infinity" which implies there is no limit on memory + * pressure duration and is incorrect. */ + else if (streq(name, "ManagedOOMMemoryPressureDurationUSec") && u == USEC_INFINITY) + bus_print_property_value(name, expected_value, flags, "[not set]"); + else if (strstr(name, "USec")) bus_print_property_value(name, expected_value, flags, FORMAT_TIMESPAN(u, 0)); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index b1518269208..59e49018788 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -1008,6 +1008,11 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons if (streq(field, "NFTSet")) return bus_append_nft_set(m, field, eq); + if (streq(field, "ManagedOOMMemoryPressureDurationSec")) + /* While infinity is disallowed in unit file, infinity is allowed in D-Bus API which + * means use the default memory pressure duration from oomd.conf. */ + return bus_append_parse_sec_rename(m, field, isempty(eq) ? 
"infinity" : eq); + return 0; } diff --git a/src/shared/varlink-io.systemd.oom.c b/src/shared/varlink-io.systemd.oom.c index 67beb6b7805..350b933d03d 100644 --- a/src/shared/varlink-io.systemd.oom.c +++ b/src/shared/varlink-io.systemd.oom.c @@ -12,7 +12,8 @@ SD_VARLINK_DEFINE_STRUCT_TYPE( SD_VARLINK_DEFINE_FIELD(mode, SD_VARLINK_STRING, 0), SD_VARLINK_DEFINE_FIELD(path, SD_VARLINK_STRING, 0), SD_VARLINK_DEFINE_FIELD(property, SD_VARLINK_STRING, 0), - SD_VARLINK_DEFINE_FIELD(limit, SD_VARLINK_INT, SD_VARLINK_NULLABLE)); + SD_VARLINK_DEFINE_FIELD(limit, SD_VARLINK_INT, SD_VARLINK_NULLABLE), + SD_VARLINK_DEFINE_FIELD(duration, SD_VARLINK_INT, SD_VARLINK_NULLABLE)); static SD_VARLINK_DEFINE_METHOD( ReportManagedOOMCGroups, diff --git a/test/fuzz/fuzz-unit-file/directives-all.service b/test/fuzz/fuzz-unit-file/directives-all.service index 1cb212bcadd..a0883d0ebe9 100644 --- a/test/fuzz/fuzz-unit-file/directives-all.service +++ b/test/fuzz/fuzz-unit-file/directives-all.service @@ -154,6 +154,7 @@ MaxConnectionsPerSource= ManagedOOMSwap= ManagedOOMMemoryPressure= ManagedOOMMemoryPressureLimitPercent= +ManagedOOMMemoryPressureDurationSec= ManagedOOMPreference= MemoryAccounting= MemoryHigh= diff --git a/test/units/TEST-55-OOMD.sh b/test/units/TEST-55-OOMD.sh index c615e7a4b22..10b3777df6d 100755 --- a/test/units/TEST-55-OOMD.sh +++ b/test/units/TEST-55-OOMD.sh @@ -106,7 +106,7 @@ test_basic() { # Verify systemd-oomd is monitoring the expected units. 
timeout 1m bash -xec "until oomctl | grep -q -F 'Path: $cgroup_path'; do sleep 1; done" assert_in 'Memory Pressure Limit: 20.00%' \ - "$(oomctl | tac | sed -e '/Memory Pressure Monitored CGroups:/q' | tac | grep -A7 "Path: $cgroup_path")" + "$(oomctl | tac | sed -e '/Memory Pressure Monitored CGroups:/q' | tac | grep -A8 "Path: $cgroup_path")" systemctl "$@" start TEST-55-OOMD-testbloat.service @@ -181,6 +181,86 @@ EOF systemctl stop TEST-55-OOMD-testmunch.service systemctl stop TEST-55-OOMD-testchill.service systemctl stop TEST-55-OOMD-workload.slice + + # clean up overrides since test cases can be run in any order + # and overrides shouldn't affect other tests + rm -rf /run/systemd/system/TEST-55-OOMD-testbloat.service.d + systemctl daemon-reload +} + +testcase_duration_analyze() { + # Verify memory pressure duration is valid if >= 1 second + cat </tmp/TEST-55-OOMD-valid-duration.service +[Service] +ExecStart=echo hello +ManagedOOMMemoryPressureDurationSec=1s +EOF + + # Verify memory pressure duration is invalid if < 1 second + cat </tmp/TEST-55-OOMD-invalid-duration.service +[Service] +ExecStart=echo hello +ManagedOOMMemoryPressureDurationSec=0 +EOF + + systemd-analyze --recursive-errors=no verify /tmp/TEST-55-OOMD-valid-duration.service + (! systemd-analyze --recursive-errors=no verify /tmp/TEST-55-OOMD-invalid-duration.service) + + rm -f /tmp/TEST-55-OOMD-valid-duration.service + rm -f /tmp/TEST-55-OOMD-invalid-duration.service +} + +testcase_duration_override() { + # Verify memory pressure duration can be overriden to non-zero values + mkdir -p /run/systemd/system/TEST-55-OOMD-testmunch.service.d/ + cat >/run/systemd/system/TEST-55-OOMD-testmunch.service.d/99-duration-test.conf </run/systemd/system/TEST-55-OOMD-testchill.service.d/99-duration-test.conf <