]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
oomd: implement a prekill varlink event 38584/head
authorMatteo Croce <teknoraver@meta.com>
Mon, 25 Aug 2025 15:13:00 +0000 (17:13 +0200)
committerMatteo Croce <teknoraver@meta.com>
Mon, 9 Feb 2026 01:05:57 +0000 (02:05 +0100)
When a cgroup is selected for termination, send varlink messages
to hooks registered in `/run/systemd/oomd.prekill.hook/`.
oomd waits up to `PrekillHookTimeoutSec=` for the hooks to respond
before proceeding with the kill.

13 files changed:
man/oomd.conf.xml
src/basic/constants.h
src/oom/oomd-conf.c
src/oom/oomd-manager.c
src/oom/oomd-manager.h
src/oom/oomd-util.c
src/oom/oomd-util.h
src/oom/test-oomd-util.c
src/shared/meson.build
src/shared/varlink-io.systemd.oom.Prekill.c [new file with mode: 0644]
src/shared/varlink-io.systemd.oom.Prekill.h [new file with mode: 0644]
src/test/test-varlink-idl.c
test/units/TEST-55-OOMD.sh

index 13f1f22e53a457a225b40147dddbc77bc1e59b26..a4be5e1274ff9b1b8e3b9d511c55abde4ce6e901 100644 (file)
 
   <xi:include href="standard-conf.xml" xpointer="main-conf" />
 
+  <refsect1>
+    <title>Prekill event</title>
+
+    <para><command>systemd-oomd</command> supports notifying external components before killing a control
+    group.
+    This is done by sending a notification over varlink to all sockets found in the
+    <filename>/run/systemd/oomd.prekill.hook/</filename> directory. Each socket should implement the
+    <constant>io.systemd.oom.Prekill</constant> interface. The notification contains the control group path
+    to allow the hook to identify which control group is being killed. This allows external components to
+    perform any necessary cleanup or logging before the control group is terminated. The hook is not intended
+    as a way to avoid the kill, but rather as a notification mechanism.
+    Note that this is a privileged option: even though it is bounded by a timeout, it is synchronous and
+    delays the kill, so use it with care.
+    The typically preferable mechanism to process memory pressure is the one described in
+    <ulink url="https://systemd.io/MEMORY_PRESSURE/">MEMORY_PRESSURE</ulink>, which is unprivileged,
+    asynchronous and does not delay the kill.
+    </para>
+
+  </refsect1>
+
   <refsect1>
     <title>[OOM] Section Options</title>
 
         <xi:include href="version-info.xml" xpointer="v248"/></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>PrekillHookTimeoutSec=</varname></term>
+
+        <listitem><para>Sets the amount of time <command>systemd-oomd</command> will wait for pre-kill hooks
+        to complete, before proceeding with the control group termination. Pre-kill hooks work by placing
+        a varlink socket in the <filename>/run/systemd/oomd.prekill.hook/</filename> directory. Each socket
+        should implement the <constant>io.systemd.oom.Prekill</constant> interface for the notification to
+        work. Before killing a control group, <command>systemd-oomd</command> sends a notification to each
+        discovered socket. The timeout is intended to be global and
+        not per hook. If all hooks return earlier, the kill is performed as soon as possible. The timeout
+        must be at least 1s.
+        Defaults to 0, which means <command>systemd-oomd</command> will not wait and no notifications
+        will be sent.</para>
+
+        <xi:include href="version-info.xml" xpointer="v260"/></listitem>
+      </varlistentry>
+
     </variablelist>
   </refsect1>
 
index ab1ae59437937006f06cca2c137f7af7d4f60b11..3dbffdee2633510bddb0483cef941f13a2031d75 100644 (file)
@@ -66,6 +66,8 @@
 #define VARLINK_PATH_MACHINED_USERDB "/run/systemd/userdb/io.systemd.Machine"
 /* Path where systemd-machined listens to resolve.hook varlink queries */
 #define VARLINK_PATH_MACHINED_RESOLVE_HOOK "/run/systemd/resolve.hook/io.systemd.Machine"
+/* Path where to connect to send varlink prekill events */
+#define VARLINK_DIR_OOMD_PREKILL_HOOK "/run/systemd/oomd.prekill.hook/"
 
 /* Recommended baseline - see README for details */
 #define KERNEL_BASELINE_VERSION "5.14"
index 299d2afe96c749a0af90be4c9467f2a3b36233d8..f0091e27561c77bb2017bf919b8af4debf8566da 100644 (file)
@@ -75,6 +75,7 @@ void manager_parse_config_file(Manager *m) {
                 { "OOM", "SwapUsedLimit",                    config_parse_permyriad, 0, &m->swap_used_limit_permyriad          },
                 { "OOM", "DefaultMemoryPressureLimit",       config_parse_loadavg,   0, &m->default_mem_pressure_limit         },
                 { "OOM", "DefaultMemoryPressureDurationSec", config_parse_duration,  0, &m->default_mem_pressure_duration_usec },
+                { "OOM", "PrekillHookTimeoutSec",            config_parse_sec,       0, &m->prekill_timeout                    },
                 {}
         };
 
index a228aa1ae0539e8715714d671a233b52b0fb3017..3898ab707c3e1ba8b0e055cb7cc2f5d6bd645853 100644 (file)
@@ -392,7 +392,7 @@ static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void
         if (oomd_mem_available_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) &&
                         oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
                 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
-                _cleanup_free_ char *selected = NULL;
+                OomdCGroupContext *selected = NULL;
                 uint64_t threshold;
 
                 log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and "
@@ -408,29 +408,28 @@ static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void
                         log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
 
                 threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
-                r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected);
+                r = oomd_select_by_swap_usage(candidates, threshold, &selected);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to select any cgroups based on swap: %m");
+                if (r == 0) {
+                        log_debug("No cgroup candidates found for swap-based OOM action");
+                        return 0;
+                }
+
+                r = oomd_cgroup_kill_mark(m, selected);
                 if (r == -ENOMEM)
                         return log_oom();
                 if (r < 0)
-                        log_notice_errno(r, "Failed to kill any cgroups based on swap: %m");
+                        log_error_errno(r, "Failed to select any cgroups based on swap: %m");
                 else {
                         if (selected && r > 0) {
-                                log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and "
+                                log_notice("Marked %s for killing due to memory used (%"PRIu64") / total (%"PRIu64") and "
                                            "swap used (%"PRIu64") / total (%"PRIu64") being more than "
                                            PERMYRIAD_AS_PERCENT_FORMAT_STR,
-                                           selected,
+                                           selected->path,
                                            m->system_context.mem_used, m->system_context.mem_total,
                                            m->system_context.swap_used, m->system_context.swap_total,
                                            PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
-
-                                /* send dbus signal */
-                                (void) sd_bus_emit_signal(m->bus,
-                                                          "/org/freedesktop/oom1",
-                                                          "org.freedesktop.oom1.Manager",
-                                                          "Killed",
-                                                          "ss",
-                                                          selected,
-                                                          "memory-used");
                         }
                         return 0;
                 }
@@ -500,7 +499,7 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
         else if (r == 1 && !in_post_action_delay) {
                 OomdCGroupContext *t;
                 SET_FOREACH(t, targets) {
-                        _cleanup_free_ char *selected = NULL;
+                        OomdCGroupContext *selected = NULL;
 
                         /* Check if there was reclaim activity in the given interval. The concern is the following case:
                          * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
@@ -525,14 +524,21 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
                         else
                                 clear_candidates = NULL;
 
-                        r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates,
-                                                     /* prefix= */ t->path,
-                                                     /* dry_run= */ m->dry_run,
-                                                     &selected);
+                        r = oomd_select_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates,
+                                                       /* prefix= */ t->path,
+                                                       &selected);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to select any cgroups based on swap, ignoring: %m");
+                        if (r == 0) {
+                                log_debug("No cgroup candidates found for memory pressure-based OOM action for %s", t->path);
+                                return 0;
+                        }
+
+                        r = oomd_cgroup_kill_mark(m, selected);
                         if (r == -ENOMEM)
                                 return log_oom();
                         if (r < 0)
-                                log_notice_errno(r, "Failed to kill any cgroups under %s based on pressure: %m", t->path);
+                                log_error_errno(r, "Failed to select any cgroups under %s based on pressure, ignoring: %m", t->path);
                         else {
                                 /* Don't act on all the high pressure cgroups at once; return as soon as we kill one.
                                  * If r == 0 then it means there were not eligible candidates, the candidate cgroup
@@ -541,21 +547,12 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
                                  * pressure is still high. */
                                 m->mem_pressure_post_action_delay_start = usec_now;
                                 if (selected && r > 0) {
-                                        log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
+                                        log_notice("Marked %s for killing due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
                                                    " for > %s with reclaim activity",
-                                                   selected, t->path,
+                                                   selected->path, t->path,
                                                    LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10),
                                                    LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit),
                                                    FORMAT_TIMESPAN(t->mem_pressure_duration_usec, USEC_PER_SEC));
-
-                                        /* send dbus signal */
-                                        (void) sd_bus_emit_signal(m->bus,
-                                                                  "/org/freedesktop/oom1",
-                                                                  "org.freedesktop.oom1.Manager",
-                                                                  "Killed",
-                                                                  "ss",
-                                                                  selected,
-                                                                  "memory-pressure");
                                 }
                                 return 0;
                         }
@@ -653,6 +650,8 @@ Manager* manager_free(Manager *m) {
         hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
         hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates);
 
+        set_free(m->kill_states);
+
         return mfree(m);
 }
 
index 016460a7191ad73cb2b75dc0df8f3364259bc53a..8b9476232fb597ab000b2339b3c93b694aacca21 100644 (file)
@@ -3,6 +3,7 @@
 
 #include "conf-parser-forward.h"
 #include "shared-forward.h"
+#include "oomd-conf.h"
 #include "oomd-util.h"
 
 /* Polling interval for monitoring stats */
@@ -54,6 +55,9 @@ typedef struct Manager {
         /* This varlink server object is used to manage systemd-oomd's varlink server which is used by user
          * managers to report changes in ManagedOOM settings (oomd server - systemd client). */
         sd_varlink_server *varlink_server;
+
+        usec_t prekill_timeout;
+        Set *kill_states; /* currently ongoing OomdKillState operations */
 } Manager;
 
 Manager* manager_free(Manager *m);
index e8d44f23fee61eb3879fc4ec171d544fe7b90844..340d2bb60e74ba242c5a8de6bc3e760df89f9ba7 100644 (file)
@@ -1,23 +1,35 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 
 #include "alloc-util.h"
+#include "constants.h"
+#include "dirent-util.h"
 #include "errno-util.h"
 #include "fd-util.h"
 #include "fileio.h"
 #include "format-util.h"
 #include "log.h"
 #include "memstream-util.h"
+#include "oomd-manager.h"
 #include "oomd-util.h"
 #include "parse-util.h"
 #include "path-util.h"
 #include "pidref.h"
 #include "procfs-util.h"
+#include "sd-bus.h"
 #include "set.h"
 #include "signal-util.h"
 #include "sort-util.h"
 #include "stdio-util.h"
 #include "string-util.h"
 #include "time-util.h"
+#include "varlink-util.h"
+
+/* State of one cgroup kill that has been scheduled but not yet carried out, because replies from
+ * prekill hooks may still be outstanding. */
+typedef struct OomdKillState {
+        /* Manager this kill belongs to; the state is registered in manager->kill_states */
+        Manager *manager;
+        /* The cgroup that is going to be killed (a reference is held, see oomd_cgroup_kill_mark()) */
+        OomdCGroupContext *ctx;
+        /* This holds sd_varlink references */
+        Set *links;
+} OomdKillState;
 
 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
                 oomd_cgroup_ctx_hash_ops,
@@ -233,55 +245,263 @@ int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const cha
         return (int) k;
 }
 
-int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) {
+int oomd_cgroup_kill(Manager *m, OomdCGroupContext *ctx, bool recurse) {
         _cleanup_set_free_ Set *pids_killed = NULL;
         int r;
 
-        assert(path);
-
-        if (dry_run) {
-                _cleanup_free_ char *cg_path = NULL;
-
-                r = cg_get_path(path, /* suffix= */ NULL, &cg_path);
-                if (r < 0)
-                        return r;
-
-                log_info("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path, true_false(recurse));
-                return 0;
-        }
+        assert(ctx);
 
         pids_killed = set_new(NULL);
         if (!pids_killed)
                 return -ENOMEM;
 
-        r = increment_oomd_xattr(path, "user.oomd_ooms", 1);
+        r = increment_oomd_xattr(ctx->path, "user.oomd_ooms", 1);
         if (r < 0)
                 log_debug_errno(r, "Failed to set user.oomd_ooms before kill: %m");
 
         if (recurse)
-                r = cg_kill_recursive(path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
+                r = cg_kill_recursive(ctx->path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
         else
-                r = cg_kill(path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
+                r = cg_kill(ctx->path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
 
         /* The cgroup could have been cleaned up after we have sent SIGKILL to all of the processes, but before
          * we could do one last iteration of cgroup.procs to check. Or the service unit could have exited and
          * was removed between picking candidates and coming into this function. In either case, let's log
          * about it let the caller decide what to do once they know how many PIDs were killed. */
         if (IN_SET(r, -ENOENT, -ENODEV))
-                log_debug_errno(r, "Error when sending SIGKILL to processes in cgroup path %s, ignoring: %m", path);
+                log_debug_errno(r, "Error when sending SIGKILL to processes in cgroup path %s, ignoring: %m", ctx->path);
         else if (r < 0)
                 return r;
 
         if (set_isempty(pids_killed))
-                log_debug("Nothing killed when attempting to kill %s", path);
+                log_debug("Nothing killed when attempting to kill %s", ctx->path);
 
-        r = increment_oomd_xattr(path, "user.oomd_kill", set_size(pids_killed));
+        r = increment_oomd_xattr(ctx->path, "user.oomd_kill", set_size(pids_killed));
         if (r < 0)
                 log_debug_errno(r, "Failed to set user.oomd_kill on kill: %m");
 
+        /* send dbus signal */
+        if (m)
+                (void) sd_bus_emit_signal(m->bus,
+                                          "/org/freedesktop/oom1",
+                                          "org.freedesktop.oom1.Manager",
+                                          "Killed",
+                                          "ss",
+                                          ctx,
+                                          "oom");
+
         return !set_isempty(pids_killed);
 }
 
+/* Releases a kill state: drops the varlink connections it still holds, unregisters it from the
+ * manager's kill_states set, releases the cgroup context reference and frees the object.
+ * Accepts NULL. Also used as the key destructor of oomd_kill_state_hash_ops. */
+static void oomd_kill_state_free(OomdKillState *ks) {
+        if (!ks)
+                return;
+
+        assert(ks->manager);
+
+        set_free(ks->links);
+
+        /* The set compares entries by cgroup path, not by pointer. Only unregister if the registered
+         * entry really is this object, so that freeing a state that never made it into the set cannot
+         * evict a different pending state for the same cgroup. */
+        if (set_get(ks->manager->kill_states, ks) == ks)
+                set_remove(ks->manager->kill_states, ks);
+        oomd_cgroup_context_unref(ks->ctx);
+        free(ks);
+}
+
+/* Orders two kill states by the path of the cgroup each of them targets. */
+static int oomd_kill_state_compare(const OomdKillState *a, const OomdKillState *b) {
+        const char *lhs = a->ctx->path, *rhs = b->ctx->path;
+
+        return path_compare(lhs, rhs);
+}
+
+/* Hashes a kill state by the path of the cgroup it targets. */
+static void oomd_kill_state_hash_func(const OomdKillState *ks, struct siphash *state) {
+        const char *cgroup_path = ks->ctx->path;
+
+        path_hash_func(cgroup_path, state);
+}
+
+/* Hash operations for sets of OomdKillState objects: entries are hashed and compared by the path of
+ * their cgroup, and oomd_kill_state_free() is registered as the key destructor. */
+DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR(
+                oomd_kill_state_hash_ops,
+                OomdKillState,
+                oomd_kill_state_hash_func,
+                oomd_kill_state_compare,
+                oomd_kill_state_free);
+
+/* Completion check for a pending kill. This is invoked once from every prekill hook reply (after the
+ * caller has taken the hook's link out of ks->links), and once more through the cleanup handler in
+ * oomd_cgroup_kill_mark(), so that the kill also happens when no hook was contacted at all.
+ * While links are still outstanding this does nothing; once the set is empty the cgroup is killed
+ * recursively and the kill state is freed, i.e. 'ks' must not be used afterwards. */
+static void oomd_kill_state_remove(OomdKillState *ks) {
+        int r;
+
+        assert(ks);
+        assert(ks->ctx);
+
+        if (set_isempty(ks->links)) {
+                r = oomd_cgroup_kill(ks->manager, ks->ctx, /* recurse= */ true);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to kill cgroup '%s', ignoring: %m", ks->ctx->path);
+
+                oomd_kill_state_free(ks);
+        }
+}
+
+/* Reply handler bound to every prekill hook connection; 'userdata' is the OomdKillState the
+ * connection belongs to. Logs the outcome, takes the connection out of the kill state and lets
+ * oomd_kill_state_remove() decide whether the kill can go ahead. Always returns 0. */
+static int prekill_callback(
+                sd_varlink *link,
+                sd_json_variant *parameters,
+                const char *error_id,
+                sd_varlink_reply_flags_t flags,
+                void *userdata) {
+
+        OomdKillState *ks = ASSERT_PTR(userdata);
+
+        assert(ks->ctx);
+
+        if (!error_id)
+                log_info("oomd prekill hook finished for cgroup %s", ks->ctx->path);
+        else
+                log_warning("oomd prekill hook for %s returned error: %s", ks->ctx->path, error_id);
+
+        /* The set owned a reference to the link; after removal that reference is ours to drop. */
+        assert_se(set_remove(ks->links, link) == link);
+
+        /* This may free 'ks', do not touch it past this point. */
+        oomd_kill_state_remove(ks);
+
+        sd_varlink_unref(link);
+        return 0;
+}
+
+/* Defines oomd_kill_state_removep(), used with _cleanup_() in oomd_cgroup_kill_mark() so that
+ * oomd_kill_state_remove() runs on scope exit for any non-NULL kill state. */
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(OomdKillState *, oomd_kill_state_remove, NULL);
+
+/* Connects to the prekill hook socket 'basename' inside VARLINK_DIR_OOMD_PREKILL_HOOK and queues an
+ * asynchronous io.systemd.oom.Prekill.Notify call with 'cparams' on it. The reply is delivered to
+ * prekill_callback() through the event loop 'e'. On success the connection is stored in ks->links.
+ * A socket that cannot be connected to is logged and skipped (returns 0); any later failure returns
+ * a negative errno, and the connection is closed by the cleanup handler. */
+static int send_prekill_message(
+                const char *basename,
+                sd_json_variant *cparams,
+                OomdKillState *ks,
+                sd_event *e) {
+
+        _cleanup_(sd_varlink_close_unrefp) sd_varlink *vl = NULL;
+        _cleanup_free_ char *socket_path = NULL;
+        int r;
+
+        assert(basename);
+        assert(cparams);
+        assert(e);
+        assert(ks);
+        assert(ks->ctx);
+        assert(ks->manager);
+
+        log_info("Invoking oomd prekill hook %s for cgroup %s", basename, ks->ctx->path);
+
+        socket_path = path_join(VARLINK_DIR_OOMD_PREKILL_HOOK, basename);
+        if (!socket_path)
+                return log_oom_debug();
+
+        r = sd_varlink_connect_address(&vl, socket_path);
+        if (r < 0) {
+                /* Not fatal, the remaining hooks are still notified. */
+                log_debug_errno(r, "Socket '%s' is not connectible, probably stale, ignoring: %m", socket_path);
+                return 0;
+        }
+
+        /* The kill state is handed to prekill_callback() as userdata. */
+        (void) sd_varlink_set_userdata(vl, ks);
+
+        r = sd_varlink_set_description(vl, "oomd prekill hook");
+        if (r < 0)
+                return log_debug_errno(r, "Failed to set varlink description: %m");
+
+        (void) sd_varlink_set_relative_timeout(vl, ks->manager->prekill_timeout);
+
+        r = sd_varlink_attach_event(vl, e, SD_EVENT_PRIORITY_NORMAL);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to attach varlink to event loop: %m");
+
+        r = sd_varlink_bind_reply(vl, prekill_callback);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to bind reply callback: %m");
+
+        r = sd_varlink_invoke(vl, "io.systemd.oom.Prekill.Notify", cparams);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to call varlink method io.systemd.oom.Prekill.Notify: %m");
+
+        /* From here on the kill state keeps the connection, until its reply callback is dispatched. */
+        r = set_ensure_consume(&ks->links, &varlink_hash_ops, TAKE_PTR(vl));
+        if (r < 0)
+                return log_oom_debug();
+
+        return 0;
+}
+
+/* Notifies every prekill hook about the upcoming kill of ks->ctx by sending a varlink message to each
+ * socket found in VARLINK_DIR_OOMD_PREKILL_HOOK. If PrekillHookTimeoutSec= is zero or the directory
+ * does not exist this returns 0 right away without contacting anything; the kill is then carried out
+ * by the cleanup handler installed in oomd_cgroup_kill_mark(). Failures of individual hooks are
+ * logged and ignored. Returns a negative errno if the directory cannot be opened or read, or if the
+ * message parameters cannot be built. */
+static int oomd_prekill_hook(Manager *m, OomdKillState *ks) {
+        _cleanup_(sd_json_variant_unrefp) sd_json_variant *params = NULL;
+        _cleanup_closedir_ DIR *d = NULL;
+        int r;
+
+        assert(m);
+        assert(ks);
+        assert(ks->ctx);
+
+        if (m->prekill_timeout == 0) {
+                log_debug("Zero oomd prekill timeout configured, skipping prekill hooks.");
+                return 0;
+        }
+
+        d = opendir(VARLINK_DIR_OOMD_PREKILL_HOOK);
+        if (!d) {
+                if (errno != ENOENT)
+                        return log_debug_errno(errno, "Failed to open prekill varlink socket directory %s: %m",
+                                               VARLINK_DIR_OOMD_PREKILL_HOOK);
+
+                log_debug("No prekill varlink socket directory %s, ignoring.", VARLINK_DIR_OOMD_PREKILL_HOOK);
+                return 0;
+        }
+
+        /* The same parameter object is sent to all hooks. */
+        r = sd_json_buildo(&params, SD_JSON_BUILD_PAIR_STRING("cgroup", ks->ctx->path));
+        if (r < 0)
+                return log_oom_debug();
+
+        FOREACH_DIRENT(de, d, return -errno) {
+                /* Only sockets are of interest; entries of unknown type are tried as well. */
+                if (IN_SET(de->d_type, DT_SOCK, DT_UNKNOWN)) {
+                        r = send_prekill_message(de->d_name, params, ks, m->event);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to send oomd prekill message to %s for cgroup %s, ignoring: %m",
+                                                  de->d_name, ks->ctx->path);
+                }
+        }
+
+        return 0;
+}
+
+/* Schedules the cgroup 'ctx' for killing: registers a kill state in m->kill_states, notifies the
+ * prekill hooks, and kills the cgroup once all contacted hooks have replied (or right away, on return
+ * from this function, if no hook was contacted).
+ * Returns a negative errno on failure, 0 if nothing was scheduled (dry-run mode, or a kill of the same
+ * cgroup is already pending), and 1 if the cgroup was marked for killing. */
+int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx) {
+        int r;
+
+        assert(ctx);
+        assert(m);
+
+        if (m->dry_run) {
+                _cleanup_free_ char *cg_path = NULL;
+
+                r = cg_get_path(ctx->path, /* suffix= */ NULL, &cg_path);
+                if (r < 0)
+                        return r;
+
+                log_info("oomd dry-run: Would have tried to kill %s and all its descendants", cg_path);
+                return 0;
+        }
+
+        /* The cleanup handler calls oomd_kill_state_remove() when we leave this function: if no hook
+         * connection is pending by then, the cgroup is killed and the state freed immediately. */
+        _cleanup_(oomd_kill_state_removep) OomdKillState *ks = new(OomdKillState, 1);
+        if (!ks)
+                return log_oom_debug();
+
+        *ks = (OomdKillState) {
+                .manager = m,
+                .ctx = oomd_cgroup_context_ref(ctx),
+        };
+
+        r = set_ensure_put(&m->kill_states, &oomd_kill_state_hash_ops, ks);
+        if (r < 0)
+                return log_oom_debug();
+        if (r == 0) {
+                /* The set is keyed by cgroup path, hence an equal entry means a kill of this cgroup is
+                 * already waiting for its hooks. Don't notify the hooks a second time. Release the
+                 * unregistered state by hand, since the regular teardown unregisters by path and would
+                 * hit the pending entry instead. */
+                OomdKillState *unused = TAKE_PTR(ks);
+
+                log_debug("Kill of cgroup %s is already pending, not marking it again.", ctx->path);
+                oomd_cgroup_context_unref(unused->ctx);
+                free(unused);
+                return 0;
+        }
+
+        r = oomd_prekill_hook(m, ks);
+        if (r < 0)
+                log_warning_errno(r, "oomd prekill hook failed for %s, ignoring: %m", ctx->path);
+
+        /* Report to the caller that the cgroup was actually marked, so that it can log about it. */
+        return 1;
+}
+
 typedef void (*dump_candidate_func)(const OomdCGroupContext *ctx, FILE *f, const char *prefix);
 
 static int dump_kill_candidates(
@@ -319,10 +539,9 @@ static int dump_kill_candidates(
         return memstream_dump(LOG_INFO, &m);
 }
 
-int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected) {
+int oomd_select_by_pgscan_rate(Hashmap *h, const char *prefix, OomdCGroupContext **ret_selected) {
         _cleanup_free_ OomdCGroupContext **sorted = NULL;
-        const OomdCGroupContext *killed = NULL;
-        int n, r, ret = 0;
+        int r, n, ret = 0;
 
         assert(h);
         assert(ret_selected);
@@ -332,7 +551,7 @@ int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char
                 return n;
 
         FOREACH_ARRAY(i, sorted, n) {
-                const OomdCGroupContext *c = *i;
+                OomdCGroupContext *c = *i;
 
                 /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure.
                  * Continue since there might be "avoid" cgroups at the end. */
@@ -345,31 +564,18 @@ int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char
                 if (r < 0)
                         continue;
 
-                r = oomd_cgroup_kill(c->path, /* recurse= */ true, /* dry_run= */ dry_run);
-                if (r == -ENOMEM)
-                        return r; /* Treat oom as a hard error */
-                if (r < 0) {
-                        RET_GATHER(ret, r);
-                        continue; /* Try to find something else to kill */
-                }
-
-                ret = r;
-                r = strdup_to(ret_selected, c->path);
-                if (r < 0)
-                        return r;
-
-                killed = c;
+                ret = 1;
+                *ret_selected = c;
                 break;
         }
 
-        (void) dump_kill_candidates(sorted, n, killed, oomd_dump_memory_pressure_cgroup_context);
+        (void) dump_kill_candidates(sorted, n, *ret_selected, oomd_dump_memory_pressure_cgroup_context);
         return ret;
 }
 
-int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected) {
+int oomd_select_by_swap_usage(Hashmap *h, uint64_t threshold_usage, OomdCGroupContext **ret_selected) {
         _cleanup_free_ OomdCGroupContext **sorted = NULL;
-        const OomdCGroupContext *killed = NULL;
-        int n, r, ret = 0;
+        int r, n, ret = 0;
 
         assert(h);
         assert(ret_selected);
@@ -382,7 +588,7 @@ int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run,
          * no swap usage. Threshold killing only cgroups with more than threshold swap usage. */
 
         FOREACH_ARRAY(i, sorted, n) {
-                const OomdCGroupContext *c = *i;
+                OomdCGroupContext *c = *i;
 
                 /* Skip over cgroups with not enough swap usage. Don't break since there might be "avoid"
                  * cgroups at the end. */
@@ -395,24 +601,12 @@ int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run,
                 if (r < 0)
                         continue;
 
-                r = oomd_cgroup_kill(c->path, /* recurse= */ true, /* dry_run= */ dry_run);
-                if (r == -ENOMEM)
-                        return r; /* Treat oom as a hard error */
-                if (r < 0) {
-                        RET_GATHER(ret, r);
-                        continue; /* Try to find something else to kill */
-                }
-
-                ret = r;
-                r = strdup_to(ret_selected, c->path);
-                if (r < 0)
-                        return r;
-
-                killed = c;
+                ret = 1;
+                *ret_selected = c;
                 break;
         }
 
-        (void) dump_kill_candidates(sorted, n, killed, oomd_dump_swap_cgroup_context);
+        (void) dump_kill_candidates(sorted, n, *ret_selected, oomd_dump_swap_cgroup_context);
         return ret;
 }
 
index 2632ebdec89ca231d08337e00a774da9fb15e6e4..cf80c6f57b5818ff2de18ac5f05f5bb62ff08d81 100644 (file)
 
 extern const struct hash_ops oomd_cgroup_ctx_hash_ops;
 
+struct Manager;
+
 typedef struct OomdCGroupContext OomdCGroupContext;
 typedef struct OomdSystemContext OomdSystemContext;
+typedef struct Manager Manager;
 
 typedef int (oomd_compare_t)(OomdCGroupContext * const *, OomdCGroupContext * const *);
 
@@ -121,14 +124,15 @@ int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const cha
 int oomd_fetch_cgroup_oom_preference(OomdCGroupContext *ctx, const char *prefix);
 
 /* Returns a negative value on error, 0 if no processes were killed, or 1 if processes were killed. */
-int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run);
+int oomd_cgroup_kill(Manager *m, OomdCGroupContext *ctx, bool recurse);
+int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx);
 
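-/* The following oomd_kill_by_* functions return 1 if processes were killed, or negative otherwise. */
+/* The following oomd_select_by_* functions return 1 if a cgroup was selected, 0 if none, or negative on error. */
 /* If `prefix` is supplied, only cgroups whose paths start with `prefix` are eligible candidates. Otherwise,
  * everything in `h` is a candidate.
- * Returns the killed cgroup in ret_selected. */
+ * Returns the selected cgroup in ret_selected. */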
-int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected);
-int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected);
+int oomd_select_by_pgscan_rate(Hashmap *h, const char *prefix, OomdCGroupContext **ret_selected);
+int oomd_select_by_swap_usage(Hashmap *h, uint64_t threshold_usage, OomdCGroupContext **ret_selected);
 
 int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret);
 int oomd_system_context_acquire(const char *proc_meminfo_path, OomdSystemContext *ret);
index 259bd9f6ae508a8c0bc9d2525cca5eb58d03b0af..76ff43cb27210d2cb565240373a88bfc5310796f 100644 (file)
@@ -88,7 +88,7 @@ TEST(oomd_cgroup_kill) {
                 ASSERT_OK(fork_and_sleep(5, &two));
                 ASSERT_OK(cg_attach(subcgroup, two.pid));
 
-                ASSERT_OK_POSITIVE(oomd_cgroup_kill(subcgroup, false /* recurse */, false /* dry run */));
+                ASSERT_OK_POSITIVE(oomd_cgroup_kill(NULL /* manager */, &(OomdCGroupContext){ .path = subcgroup }, false /* recurse */));
 
                 ASSERT_OK(cg_get_xattr(subcgroup, "user.oomd_ooms", &v, /* ret_size= */ NULL));
                 ASSERT_STREQ(v, i == 0 ? "1" : "2");
index e878b5e65a2e8f065d1e3239b6f141d231be03ce..fea5e22109f8f23b012f9bc16538f1f601deeaaa 100644 (file)
@@ -224,6 +224,7 @@ shared_sources = files(
         'varlink-io.systemd.Unit.c',
         'varlink-io.systemd.UserDatabase.c',
         'varlink-io.systemd.oom.c',
+        'varlink-io.systemd.oom.Prekill.c',
         'varlink-io.systemd.service.c',
         'varlink-io.systemd.sysext.c',
         'varlink-serialize.c',
diff --git a/src/shared/varlink-io.systemd.oom.Prekill.c b/src/shared/varlink-io.systemd.oom.Prekill.c
new file mode 100644 (file)
index 0000000..41658fe
--- /dev/null
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "varlink-io.systemd.oom.Prekill.h"
+
+/* This is a new Varlink interface for pre-kill notifications from oomd.
+ * It will be available through /run/systemd/oomd.prekill.hook/ */
+
+/* The Notify method: it declares a single input field, the string "cgroup", and no output fields. */
+static SD_VARLINK_DEFINE_METHOD(
+                Notify,
+                SD_VARLINK_FIELD_COMMENT("The cgroup which is going to be killed"),
+                SD_VARLINK_DEFINE_INPUT(cgroup, SD_VARLINK_STRING, 0));
+
+/* The io.systemd.oom.Prekill interface definition; Notify is its only symbol. */
+SD_VARLINK_DEFINE_INTERFACE(
+                io_systemd_oom_Prekill,
+                "io.systemd.oom.Prekill",
+                SD_VARLINK_INTERFACE_COMMENT("Prekill notifications from oomd"),
+                SD_VARLINK_SYMBOL_COMMENT("Notify about an imminent OOM kill"),
+                &vl_method_Notify);
diff --git a/src/shared/varlink-io.systemd.oom.Prekill.h b/src/shared/varlink-io.systemd.oom.Prekill.h
new file mode 100644 (file)
index 0000000..52bf5eb
--- /dev/null
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-varlink-idl.h"
+
+extern const sd_varlink_interface vl_interface_io_systemd_oom_Prekill;
index 714189bbd12e77bcc1ea218a116692081dbc6702..69c9f0a7d63e023b839edcf2691cf0110f270cf5 100644 (file)
@@ -43,6 +43,7 @@
 #include "varlink-io.systemd.Unit.h"
 #include "varlink-io.systemd.UserDatabase.h"
 #include "varlink-io.systemd.oom.h"
+#include "varlink-io.systemd.oom.Prekill.h"
 #include "varlink-io.systemd.service.h"
 #include "varlink-io.systemd.sysext.h"
 #include "varlink-org.varlink.service.h"
@@ -206,6 +207,7 @@ TEST(parse_format) {
                 &vl_interface_io_systemd_Unit,
                 &vl_interface_io_systemd_UserDatabase,
                 &vl_interface_io_systemd_oom,
+                &vl_interface_io_systemd_oom_Prekill,
                 &vl_interface_io_systemd_service,
                 &vl_interface_io_systemd_sysext,
                 &vl_interface_org_varlink_service,
index 56a32926a04b98b41d3c68da20d47f5415231ebf..96a15989c745a5456b891d3470692ac77b14c113 100755 (executable)
@@ -353,6 +353,35 @@ EOF
     systemctl reset-failed
 }
 
+# Checks the prekill hook support of systemd-oomd with zero, one and several hook sockets placed in
+# /run/systemd/oomd.prekill.hook/.
+testcase_prekill_hook() {
+    # A non-zero PrekillHookTimeoutSec= is required, with the default of 0 no hooks are notified.
+    cat >/run/systemd/oomd.conf.d/99-oomd-prekill-test.conf <<'EOF'
+[OOM]
+PrekillHookTimeoutSec=3s
+EOF
+
+    # no hooks
+    systemctl reload systemd-oomd.service
+    # The test service is expected to fail (presumably because oomd kills it); bail out if the
+    # start unexpectedly succeeds.
+    ! systemctl start --wait TEST-55-OOMD-testbloat.service || exit 1
+
+    # one hook
+    mkdir -p /run/systemd/oomd.prekill.hook/
+    # ncat listens on the hook socket in the background and records whatever it receives.
+    ncat --recv-only -kUl /run/systemd/oomd.prekill.hook/althook >/tmp/oomd_event.json &
+    ! systemctl start --wait TEST-55-OOMD-testbloat.service || exit 1
+    # The recorded message must be a call of the prekill notification method.
+    [[ $(jq -r .method </tmp/oomd_event.json) = 'io.systemd.oom.Prekill.Notify' ]]
+
+    rm -f /run/systemd/oomd.prekill.hook/* /tmp/oomd_event.json
+
+    # many hooks
+    for i in {1..4}; do
+        ncat --recv-only -kUl "/run/systemd/oomd.prekill.hook/althook$i" >"/tmp/oomd_event$i.json" &
+    done
+
+    ! systemctl start --wait TEST-55-OOMD-testbloat.service || exit 1
+    # Every one of the hooks must have been notified.
+    for j in /tmp/oomd_event*.json; do
+        [[ $(jq -r .method <"$j") = 'io.systemd.oom.Prekill.Notify' ]]
+    done
+}
+
 run_testcases
 
 touch /testok