From: Matteo Croce Date: Mon, 25 Aug 2025 15:13:00 +0000 (+0200) Subject: oomd: implement a prekill varlink event X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c1bf450fa06e1d120a5d4ca782e82a68b8662b09;p=thirdparty%2Fsystemd.git oomd: implement a prekill varlink event When a cgroup is selected for termination, send varlink messages to hooks registered in `/run/systemd/oomd.prekill-hooks/`. oomd waits up to `PreKillHookTimeoutSec=` seconds for response before proceeding with the kill. --- diff --git a/man/oomd.conf.xml b/man/oomd.conf.xml index 13f1f22e53a..a4be5e1274f 100644 --- a/man/oomd.conf.xml +++ b/man/oomd.conf.xml @@ -48,6 +48,26 @@ + + Prekill event + + systemd-oomd supports notifying external components before killing a control + group. + This is done by sending a notification over varlink to all sockets found in + /run/systemd/oomd.prekill.hook/ folder. Each socket should implement the + io.systemd.oom.Prekill interface. The notification contains the control group path + to allow the hook to identify which control group is being killed. This allows external components to + perform any necessary cleanup or logging before the control group is terminated. The hook is not intended + as a way to avoid the kill, but rather as a notification mechanism. + Note that this is a privileged option as, even if it has a timeout, is synchronous and delays the kill, + so use with care. + The typically preferable mechanism to process memory pressure is to do what + MEMORY_PRESSURE describes which is unprivileged, + asynchronous and does not delay the kill. + + + + [OOM] Section Options @@ -98,6 +118,22 @@ + + PrekillHookTimeoutSec= + + Sets the amount of time systemd-oomd will wait for pre-kill hooks + to complete, before proceeding with the control group termination. Pre-kill hooks work by placing + varlink socket to /run/systemd/oomd.prekill.hook/ folder. Each socket should + implement interface for notification to work. 
systemd-oomd sends a notification + before killing a control group for each discovered socket. The timeout is intended to be global and + not per hook. If all hooks return earlier, the kill is performed as soon as possible. The timeout + must be at least 1s. + Defaults to 0, which means systemd-oomd will not wait and no notifications + will be sent. + + + + diff --git a/src/basic/constants.h b/src/basic/constants.h index ab1ae594379..3dbffdee263 100644 --- a/src/basic/constants.h +++ b/src/basic/constants.h @@ -66,6 +66,8 @@ #define VARLINK_PATH_MACHINED_USERDB "/run/systemd/userdb/io.systemd.Machine" /* Path where systemd-machined listens to resolve.hook varlink queries */ #define VARLINK_PATH_MACHINED_RESOLVE_HOOK "/run/systemd/resolve.hook/io.systemd.Machine" +/* Path where to connect to send varlink prekill events */ +#define VARLINK_DIR_OOMD_PREKILL_HOOK "/run/systemd/oomd.prekill.hook/" /* Recommended baseline - see README for details */ #define KERNEL_BASELINE_VERSION "5.14" diff --git a/src/oom/oomd-conf.c b/src/oom/oomd-conf.c index 299d2afe96c..f0091e27561 100644 --- a/src/oom/oomd-conf.c +++ b/src/oom/oomd-conf.c @@ -75,6 +75,7 @@ void manager_parse_config_file(Manager *m) { { "OOM", "SwapUsedLimit", config_parse_permyriad, 0, &m->swap_used_limit_permyriad }, { "OOM", "DefaultMemoryPressureLimit", config_parse_loadavg, 0, &m->default_mem_pressure_limit }, { "OOM", "DefaultMemoryPressureDurationSec", config_parse_duration, 0, &m->default_mem_pressure_duration_usec }, + { "OOM", "PrekillHookTimeoutSec", config_parse_sec, 0, &m->prekill_timeout }, {} }; diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c index a228aa1ae05..3898ab707c3 100644 --- a/src/oom/oomd-manager.c +++ b/src/oom/oomd-manager.c @@ -392,7 +392,7 @@ static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void if (oomd_mem_available_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) && oomd_swap_free_below(&m->system_context, 10000 - 
m->swap_used_limit_permyriad)) { _cleanup_hashmap_free_ Hashmap *candidates = NULL; - _cleanup_free_ char *selected = NULL; + OomdCGroupContext *selected = NULL; uint64_t threshold; log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and " @@ -408,29 +408,28 @@ static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m"); threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100; - r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected); + r = oomd_select_by_swap_usage(candidates, threshold, &selected); + if (r < 0) + return log_error_errno(r, "Failed to select any cgroups based on swap: %m"); + if (r == 0) { + log_debug("No cgroup candidates found for swap-based OOM action"); + return 0; + } + + r = oomd_cgroup_kill_mark(m, selected); if (r == -ENOMEM) return log_oom(); if (r < 0) - log_notice_errno(r, "Failed to kill any cgroups based on swap: %m"); + log_error_errno(r, "Failed to select any cgroups based on swap: %m"); else { if (selected && r > 0) { - log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and " + log_notice("Marked %s for killing due to memory used (%"PRIu64") / total (%"PRIu64") and " "swap used (%"PRIu64") / total (%"PRIu64") being more than " PERMYRIAD_AS_PERCENT_FORMAT_STR, - selected, + selected->path, m->system_context.mem_used, m->system_context.mem_total, m->system_context.swap_used, m->system_context.swap_total, PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad)); - - /* send dbus signal */ - (void) sd_bus_emit_signal(m->bus, - "/org/freedesktop/oom1", - "org.freedesktop.oom1.Manager", - "Killed", - "ss", - selected, - "memory-used"); } return 0; } @@ -500,7 +499,7 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t else if (r == 1 && !in_post_action_delay) { OomdCGroupContext *t; SET_FOREACH(t, targets) { - _cleanup_free_ char 
*selected = NULL; + OomdCGroupContext *selected = NULL; /* Check if there was reclaim activity in the given interval. The concern is the following case: * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending @@ -525,14 +524,21 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t else clear_candidates = NULL; - r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, - /* prefix= */ t->path, - /* dry_run= */ m->dry_run, - &selected); + r = oomd_select_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, + /* prefix= */ t->path, + &selected); + if (r < 0) + return log_error_errno(r, "Failed to select any cgroups based on memory pressure: %m"); + if (r == 0) { + log_debug("No cgroup candidates found for memory pressure-based OOM action for %s", t->path); + return 0; + } + + r = oomd_cgroup_kill_mark(m, selected); if (r == -ENOMEM) return log_oom(); if (r < 0) - log_notice_errno(r, "Failed to kill any cgroups under %s based on pressure: %m", t->path); + log_error_errno(r, "Failed to select any cgroups under %s based on pressure, ignoring: %m", t->path); else { /* Don't act on all the high pressure cgroups at once; return as soon as we kill one. * If r == 0 then it means there were not eligible candidates, the candidate cgroup * pressure is still high. 
*/ m->mem_pressure_post_action_delay_start = usec_now; if (selected && r > 0) { - log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%" + log_notice("Marked %s for killing due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%" " for > %s with reclaim activity", - selected, t->path, + selected->path, t->path, LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10), LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit), FORMAT_TIMESPAN(t->mem_pressure_duration_usec, USEC_PER_SEC)); - - /* send dbus signal */ - (void) sd_bus_emit_signal(m->bus, - "/org/freedesktop/oom1", - "org.freedesktop.oom1.Manager", - "Killed", - "ss", - selected, - "memory-pressure"); } return 0; } @@ -653,6 +650,8 @@ Manager* manager_free(Manager *m) { hashmap_free(m->monitored_mem_pressure_cgroup_contexts); hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates); + set_free(m->kill_states); + return mfree(m); } diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h index 016460a7191..8b9476232fb 100644 --- a/src/oom/oomd-manager.h +++ b/src/oom/oomd-manager.h @@ -3,6 +3,7 @@ #include "conf-parser-forward.h" #include "shared-forward.h" +#include "oomd-conf.h" #include "oomd-util.h" /* Polling interval for monitoring stats */ @@ -54,6 +55,9 @@ typedef struct Manager { /* This varlink server object is used to manage systemd-oomd's varlink server which is used by user * managers to report changes in ManagedOOM settings (oomd server - systemd client). 
*/ sd_varlink_server *varlink_server; + + usec_t prekill_timeout; + Set *kill_states; /* currently ongoing OomdKillState operations */ } Manager; Manager* manager_free(Manager *m); diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c index e8d44f23fee..340d2bb60e7 100644 --- a/src/oom/oomd-util.c +++ b/src/oom/oomd-util.c @@ -1,23 +1,35 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ #include "alloc-util.h" +#include "constants.h" +#include "dirent-util.h" #include "errno-util.h" #include "fd-util.h" #include "fileio.h" #include "format-util.h" #include "log.h" #include "memstream-util.h" +#include "oomd-manager.h" #include "oomd-util.h" #include "parse-util.h" #include "path-util.h" #include "pidref.h" #include "procfs-util.h" +#include "sd-bus.h" #include "set.h" #include "signal-util.h" #include "sort-util.h" #include "stdio-util.h" #include "string-util.h" #include "time-util.h" +#include "varlink-util.h" + +typedef struct OomdKillState { + Manager *manager; + OomdCGroupContext *ctx; + /* This holds sd_varlink references */ + Set *links; +} OomdKillState; DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( oomd_cgroup_ctx_hash_ops, @@ -233,55 +245,263 @@ int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const cha return (int) k; } -int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) { +int oomd_cgroup_kill(Manager *m, OomdCGroupContext *ctx, bool recurse) { _cleanup_set_free_ Set *pids_killed = NULL; int r; - assert(path); - - if (dry_run) { - _cleanup_free_ char *cg_path = NULL; - - r = cg_get_path(path, /* suffix= */ NULL, &cg_path); - if (r < 0) - return r; - - log_info("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path, true_false(recurse)); - return 0; - } + assert(ctx); pids_killed = set_new(NULL); if (!pids_killed) return -ENOMEM; - r = increment_oomd_xattr(path, "user.oomd_ooms", 1); + r = increment_oomd_xattr(ctx->path, "user.oomd_ooms", 1); if (r < 0) log_debug_errno(r, "Failed to set user.oomd_ooms 
before kill: %m"); if (recurse) - r = cg_kill_recursive(path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL); + r = cg_kill_recursive(ctx->path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL); else - r = cg_kill(path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL); + r = cg_kill(ctx->path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL); /* The cgroup could have been cleaned up after we have sent SIGKILL to all of the processes, but before * we could do one last iteration of cgroup.procs to check. Or the service unit could have exited and * was removed between picking candidates and coming into this function. In either case, let's log * about it let the caller decide what to do once they know how many PIDs were killed. */ if (IN_SET(r, -ENOENT, -ENODEV)) - log_debug_errno(r, "Error when sending SIGKILL to processes in cgroup path %s, ignoring: %m", path); + log_debug_errno(r, "Error when sending SIGKILL to processes in cgroup path %s, ignoring: %m", ctx->path); else if (r < 0) return r; if (set_isempty(pids_killed)) - log_debug("Nothing killed when attempting to kill %s", path); + log_debug("Nothing killed when attempting to kill %s", ctx->path); - r = increment_oomd_xattr(path, "user.oomd_kill", set_size(pids_killed)); + r = increment_oomd_xattr(ctx->path, "user.oomd_kill", set_size(pids_killed)); if (r < 0) log_debug_errno(r, "Failed to set user.oomd_kill on kill: %m"); + /* send dbus signal */ + if (m) + (void) sd_bus_emit_signal(m->bus, + "/org/freedesktop/oom1", + "org.freedesktop.oom1.Manager", + "Killed", + "ss", + ctx->path, + "oom"); + return !set_isempty(pids_killed); } +static void oomd_kill_state_free(OomdKillState *ks) { + if (!ks) + return; + + assert(ks->manager); + + set_free(ks->links); + + set_remove(ks->manager->kill_states, ks); + oomd_cgroup_context_unref(ks->ctx); + free(ks); +} + +static int oomd_kill_state_compare(const OomdKillState *a, const OomdKillState *b) { + return path_compare(a->ctx->path, 
b->ctx->path); +} + +static void oomd_kill_state_hash_func(const OomdKillState *ks, struct siphash *state) { + path_hash_func(ks->ctx->path, state); +} + +DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR( + oomd_kill_state_hash_ops, + OomdKillState, + oomd_kill_state_hash_func, + oomd_kill_state_compare, + oomd_kill_state_free); + +/* oomd_kill_state_remove() is called N+1 times where N is the number of prekill hooks found. + * The extra call is just after creating the kill state, so to have at least a call if no + * prekill hooks are found. Each call removes one link from the kill state, and when the set + * is empty, it performs the actual cgroup kill. */ +static void oomd_kill_state_remove(OomdKillState *ks) { + int r; + + assert(ks); + assert(ks->ctx); + + if (!set_isempty(ks->links)) + return; + + r = oomd_cgroup_kill(ks->manager, ks->ctx, /* recurse= */ true); + if (r < 0) + log_debug_errno(r, "Failed to kill cgroup '%s', ignoring: %m", ks->ctx->path); + oomd_kill_state_free(ks); +} + +static int prekill_callback( + sd_varlink *link, + sd_json_variant *parameters, + const char *error_id, + sd_varlink_reply_flags_t flags, + void *userdata) { + + OomdKillState *ks = ASSERT_PTR(userdata); + + assert(ks); + assert(ks->ctx); + + if (error_id) + log_warning("oomd prekill hook for %s returned error: %s", ks->ctx->path, error_id); + else + log_info("oomd prekill hook finished for cgroup %s", ks->ctx->path); + + assert_se(set_remove(ks->links, link) == link); + oomd_kill_state_remove(ks); + sd_varlink_unref(link); + + return 0; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(OomdKillState *, oomd_kill_state_remove, NULL); + +static int send_prekill_message( + const char *basename, + sd_json_variant *cparams, + OomdKillState *ks, + sd_event *e) { + + _cleanup_(sd_varlink_close_unrefp) sd_varlink *link = NULL; + _cleanup_free_ char *hook_path = NULL; + int r; + + assert(basename); + assert(cparams); + assert(e); + assert(ks); + assert(ks->ctx); + assert(ks->manager); + + 
log_info("Invoking oomd prekill hook %s for cgroup %s", basename, ks->ctx->path); + + hook_path = path_join(VARLINK_DIR_OOMD_PREKILL_HOOK, basename); + if (!hook_path) + return log_oom_debug(); + + r = sd_varlink_connect_address(&link, hook_path); + if (r < 0) { + log_debug_errno(r, "Socket '%s' is not connectible, probably stale, ignoring: %m", hook_path); + return 0; + } + + (void) sd_varlink_set_userdata(link, ks); + r = sd_varlink_set_description(link, "oomd prekill hook"); + if (r < 0) + return log_debug_errno(r, "Failed to set varlink description: %m"); + (void) sd_varlink_set_relative_timeout(link, ks->manager->prekill_timeout); + + r = sd_varlink_attach_event(link, e, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_debug_errno(r, "Failed to attach varlink to event loop: %m"); + + r = sd_varlink_bind_reply(link, prekill_callback); + if (r < 0) + return log_debug_errno(r, "Failed to bind reply callback: %m"); + + r = sd_varlink_invoke(link, "io.systemd.oom.Prekill.Notify", cparams); + if (r < 0) + return log_debug_errno(r, "Failed to call varlink method io.systemd.oom.Prekill.Notify: %m"); + + r = set_ensure_consume(&ks->links, &varlink_hash_ops, TAKE_PTR(link)); + if (r < 0) + return log_oom_debug(); + + return 0; +} + +/* oomd_prekill_hook() sets the prekill hooks up by sending varlink messages to all sockets found + * in VARLINK_DIR_OOMD_PREKILL_HOOK directory. It returns immediately if no prekill hooks are configured + * or PrekillHookTimeoutSec= is not set. In that case, the actual killing is done immediately by + * the callback set up by the cleanup handler in oomd_cgroup_kill_mark(). 
*/ +static int oomd_prekill_hook(Manager *m, OomdKillState *ks) { + _cleanup_closedir_ DIR *d = NULL; + int r; + + assert(m); + assert(ks); + assert(ks->ctx); + + if (m->prekill_timeout == 0) { + log_debug("Zero oomd prekill timeout configured, skipping prekill hooks."); + return 0; + } + + d = opendir(VARLINK_DIR_OOMD_PREKILL_HOOK); + if (!d) { + if (errno == ENOENT) { + log_debug("No prekill varlink socket directory %s, ignoring.", VARLINK_DIR_OOMD_PREKILL_HOOK); + return 0; + } + return log_debug_errno(errno, "Failed to open prekill varlink socket directory %s: %m", + VARLINK_DIR_OOMD_PREKILL_HOOK); + } + + _cleanup_(sd_json_variant_unrefp) sd_json_variant *cparams = NULL; + r = sd_json_buildo(&cparams, SD_JSON_BUILD_PAIR_STRING("cgroup", ks->ctx->path)); + if (r < 0) + return log_oom_debug(); + + FOREACH_DIRENT(de, d, return -errno) { + if (!IN_SET(de->d_type, DT_SOCK, DT_UNKNOWN)) + continue; + + r = send_prekill_message(de->d_name, cparams, ks, m->event); + if (r < 0) + log_warning_errno(r, "Failed to send oomd prekill message to %s for cgroup %s, ignoring: %m", + de->d_name, ks->ctx->path); + } + + return 0; +} + +int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx) { + int r; + + assert(ctx); + assert(m); + + if (m->dry_run) { + _cleanup_free_ char *cg_path = NULL; + + r = cg_get_path(ctx->path, /* suffix= */ NULL, &cg_path); + if (r < 0) + return r; + + log_info("oomd dry-run: Would have tried to kill %s and all its descendants", cg_path); + return 0; + } + + _cleanup_(oomd_kill_state_removep) OomdKillState *ks = new(OomdKillState, 1); + if (!ks) + return log_oom_debug(); + + *ks = (OomdKillState) { + .manager = m, + .ctx = oomd_cgroup_context_ref(ctx), + }; + + r = set_ensure_put(&m->kill_states, &oomd_kill_state_hash_ops, ks); + if (r < 0) + return log_oom_debug(); + + r = oomd_prekill_hook(m, ks); + if (r < 0) + log_warning_errno(r, "oomd prekill hook failed for %s, ignoring: %m", ctx->path); + + return 0; +} + typedef void 
(*dump_candidate_func)(const OomdCGroupContext *ctx, FILE *f, const char *prefix); static int dump_kill_candidates( @@ -319,10 +539,9 @@ static int dump_kill_candidates( return memstream_dump(LOG_INFO, &m); } -int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected) { +int oomd_select_by_pgscan_rate(Hashmap *h, const char *prefix, OomdCGroupContext **ret_selected) { _cleanup_free_ OomdCGroupContext **sorted = NULL; - const OomdCGroupContext *killed = NULL; - int n, r, ret = 0; + int r, n, ret = 0; assert(h); assert(ret_selected); @@ -332,7 +551,7 @@ int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char return n; FOREACH_ARRAY(i, sorted, n) { - const OomdCGroupContext *c = *i; + OomdCGroupContext *c = *i; /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure. * Continue since there might be "avoid" cgroups at the end. */ @@ -345,31 +564,18 @@ int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char if (r < 0) continue; - r = oomd_cgroup_kill(c->path, /* recurse= */ true, /* dry_run= */ dry_run); - if (r == -ENOMEM) - return r; /* Treat oom as a hard error */ - if (r < 0) { - RET_GATHER(ret, r); - continue; /* Try to find something else to kill */ - } - - ret = r; - r = strdup_to(ret_selected, c->path); - if (r < 0) - return r; - - killed = c; + ret = 1; + *ret_selected = c; break; } - (void) dump_kill_candidates(sorted, n, killed, oomd_dump_memory_pressure_cgroup_context); + (void) dump_kill_candidates(sorted, n, *ret_selected, oomd_dump_memory_pressure_cgroup_context); return ret; } -int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected) { +int oomd_select_by_swap_usage(Hashmap *h, uint64_t threshold_usage, OomdCGroupContext **ret_selected) { _cleanup_free_ OomdCGroupContext **sorted = NULL; - const OomdCGroupContext *killed = NULL; - int n, r, ret = 0; + int r, n, ret = 0; assert(h); assert(ret_selected); @@ 
-382,7 +588,7 @@ int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, * no swap usage. Threshold killing only cgroups with more than threshold swap usage. */ FOREACH_ARRAY(i, sorted, n) { - const OomdCGroupContext *c = *i; + OomdCGroupContext *c = *i; /* Skip over cgroups with not enough swap usage. Don't break since there might be "avoid" * cgroups at the end. */ @@ -395,24 +601,12 @@ int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, if (r < 0) continue; - r = oomd_cgroup_kill(c->path, /* recurse= */ true, /* dry_run= */ dry_run); - if (r == -ENOMEM) - return r; /* Treat oom as a hard error */ - if (r < 0) { - RET_GATHER(ret, r); - continue; /* Try to find something else to kill */ - } - - ret = r; - r = strdup_to(ret_selected, c->path); - if (r < 0) - return r; - - killed = c; + ret = 1; + *ret_selected = c; break; } - (void) dump_kill_candidates(sorted, n, killed, oomd_dump_swap_cgroup_context); + (void) dump_kill_candidates(sorted, n, *ret_selected, oomd_dump_swap_cgroup_context); return ret; } diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h index 2632ebdec89..cf80c6f57b5 100644 --- a/src/oom/oomd-util.h +++ b/src/oom/oomd-util.h @@ -10,8 +10,11 @@ extern const struct hash_ops oomd_cgroup_ctx_hash_ops; +struct Manager; + typedef struct OomdCGroupContext OomdCGroupContext; typedef struct OomdSystemContext OomdSystemContext; +typedef struct Manager Manager; typedef int (oomd_compare_t)(OomdCGroupContext * const *, OomdCGroupContext * const *); @@ -121,14 +124,15 @@ int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const cha int oomd_fetch_cgroup_oom_preference(OomdCGroupContext *ctx, const char *prefix); /* Returns a negative value on error, 0 if no processes were killed, or 1 if processes were killed. 
*/ -int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run); +int oomd_cgroup_kill(Manager *m, OomdCGroupContext *ctx, bool recurse); +int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx); /* The following oomd_kill_by_* functions return 1 if processes were killed, or negative otherwise. */ /* If `prefix` is supplied, only cgroups whose paths start with `prefix` are eligible candidates. Otherwise, * everything in `h` is a candidate. * Returns the killed cgroup in ret_selected. */ -int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected); -int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected); +int oomd_select_by_pgscan_rate(Hashmap *h, const char *prefix, OomdCGroupContext **ret_selected); +int oomd_select_by_swap_usage(Hashmap *h, uint64_t threshold_usage, OomdCGroupContext **ret_selected); int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret); int oomd_system_context_acquire(const char *proc_meminfo_path, OomdSystemContext *ret); diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c index 259bd9f6ae5..76ff43cb272 100644 --- a/src/oom/test-oomd-util.c +++ b/src/oom/test-oomd-util.c @@ -88,7 +88,7 @@ TEST(oomd_cgroup_kill) { ASSERT_OK(fork_and_sleep(5, &two)); ASSERT_OK(cg_attach(subcgroup, two.pid)); - ASSERT_OK_POSITIVE(oomd_cgroup_kill(subcgroup, false /* recurse */, false /* dry run */)); + ASSERT_OK_POSITIVE(oomd_cgroup_kill(NULL /* manager */, &(OomdCGroupContext){ .path = subcgroup }, false /* recurse */)); ASSERT_OK(cg_get_xattr(subcgroup, "user.oomd_ooms", &v, /* ret_size= */ NULL)); ASSERT_STREQ(v, i == 0 ? 
"1" : "2"); diff --git a/src/shared/meson.build b/src/shared/meson.build index e878b5e65a2..fea5e22109f 100644 --- a/src/shared/meson.build +++ b/src/shared/meson.build @@ -224,6 +224,7 @@ shared_sources = files( 'varlink-io.systemd.Unit.c', 'varlink-io.systemd.UserDatabase.c', 'varlink-io.systemd.oom.c', + 'varlink-io.systemd.oom.Prekill.c', 'varlink-io.systemd.service.c', 'varlink-io.systemd.sysext.c', 'varlink-serialize.c', diff --git a/src/shared/varlink-io.systemd.oom.Prekill.c b/src/shared/varlink-io.systemd.oom.Prekill.c new file mode 100644 index 00000000000..41658fe4ce5 --- /dev/null +++ b/src/shared/varlink-io.systemd.oom.Prekill.c @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "varlink-io.systemd.oom.Prekill.h" + +/* This is a new Varlink interface for pre-kill notifications from oomd. + * It will be available through /run/systemd/oomd.prekill.hook/ */ + +static SD_VARLINK_DEFINE_METHOD( + Notify, + SD_VARLINK_FIELD_COMMENT("The cgroup which is going to be killed"), + SD_VARLINK_DEFINE_INPUT(cgroup, SD_VARLINK_STRING, 0)); + +SD_VARLINK_DEFINE_INTERFACE( + io_systemd_oom_Prekill, + "io.systemd.oom.Prekill", + SD_VARLINK_INTERFACE_COMMENT("Prekill notifications from oomd"), + SD_VARLINK_SYMBOL_COMMENT("Notify about an imminent OOM kill"), + &vl_method_Notify); diff --git a/src/shared/varlink-io.systemd.oom.Prekill.h b/src/shared/varlink-io.systemd.oom.Prekill.h new file mode 100644 index 00000000000..52bf5eb3070 --- /dev/null +++ b/src/shared/varlink-io.systemd.oom.Prekill.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-varlink-idl.h" + +extern const sd_varlink_interface vl_interface_io_systemd_oom_Prekill; diff --git a/src/test/test-varlink-idl.c b/src/test/test-varlink-idl.c index 714189bbd12..69c9f0a7d63 100644 --- a/src/test/test-varlink-idl.c +++ b/src/test/test-varlink-idl.c @@ -43,6 +43,7 @@ #include "varlink-io.systemd.Unit.h" #include 
"varlink-io.systemd.UserDatabase.h" #include "varlink-io.systemd.oom.h" +#include "varlink-io.systemd.oom.Prekill.h" #include "varlink-io.systemd.service.h" #include "varlink-io.systemd.sysext.h" #include "varlink-org.varlink.service.h" @@ -206,6 +207,7 @@ TEST(parse_format) { &vl_interface_io_systemd_Unit, &vl_interface_io_systemd_UserDatabase, &vl_interface_io_systemd_oom, + &vl_interface_io_systemd_oom_Prekill, &vl_interface_io_systemd_service, &vl_interface_io_systemd_sysext, &vl_interface_org_varlink_service, diff --git a/test/units/TEST-55-OOMD.sh b/test/units/TEST-55-OOMD.sh index 56a32926a04..96a15989c74 100755 --- a/test/units/TEST-55-OOMD.sh +++ b/test/units/TEST-55-OOMD.sh @@ -353,6 +353,35 @@ EOF systemctl reset-failed } +testcase_prekill_hook() { + cat >/run/systemd/oomd.conf.d/99-oomd-prekill-test.conf <<'EOF' +[OOM] +PrekillHookTimeoutSec=3s +EOF + + # no hooks + systemctl reload systemd-oomd.service + ! systemctl start --wait TEST-55-OOMD-testbloat.service || exit 1 + + # one hook + mkdir -p /run/systemd/oomd.prekill.hook/ + ncat --recv-only -kUl /run/systemd/oomd.prekill.hook/althook >/tmp/oomd_event.json & + ! systemctl start --wait TEST-55-OOMD-testbloat.service || exit 1 + [[ $(jq -r .method </tmp/oomd_event.json) = 'io.systemd.oom.Prekill.Notify' ]] + + # multiple hooks + for i in 1 2 3; do + ncat --recv-only -kUl "/run/systemd/oomd.prekill.hook/hook$i" >"/tmp/oomd_event$i.json" & + done + + ! systemctl start --wait TEST-55-OOMD-testbloat.service || exit 1 + for j in /tmp/oomd_event*.json; do + [[ $(jq -r .method <"$j") = 'io.systemd.oom.Prekill.Notify' ]] + done +} + run_testcases touch /testok