]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
oomd: rework memory reclaim detection logic
authorAnita Zhang <the.anitazha@gmail.com>
Fri, 26 Mar 2021 07:39:25 +0000 (00:39 -0700)
committerAnita Zhang <the.anitazha@gmail.com>
Fri, 2 Apr 2021 02:51:54 +0000 (19:51 -0700)
systemd-oomd only monitors and kills within a selected cgroup subtree
For memory pressure kills, this means it's unnecessary to get the
pgscan rate across all the monitored memory pressure cgroups.
The increase will show up whether we do a total sum or not, but since
we only care about the increase in the subtree we're about to target
for a kill, we can simplify the code a bit by not doing this total sum.

src/oom/oomd-manager.c
src/oom/oomd-manager.h
src/oom/oomd-util.c
src/oom/oomd-util.h
src/oom/test-oomd-util.c

index 2a683d5469e43ae0493d5ee54cb9ce281d27b3d4..73555d2594d5e67c11e13162c44f187c12ea43c6 100644 (file)
@@ -419,9 +419,6 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
         if (r < 0)
                 log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
 
-        if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts))
-                m->last_reclaim_at = usec_now;
-
         /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
          * values and go on a kill storm. */
         if (m->mem_pressure_post_action_delay_start > 0) {
@@ -437,45 +434,45 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
         if (r < 0)
                 log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
         else if (r == 1) {
-                /* Check if there was reclaim activity in the given interval. The concern is the following case:
-                 * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
-                 * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
-                 * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
-                 * to kill something (it won't help anyways). */
-                if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) {
-                        OomdCGroupContext *t;
-
-                        SET_FOREACH(t, targets) {
-                                _cleanup_free_ char *selected = NULL;
-                                char ts[FORMAT_TIMESPAN_MAX];
-
-                                log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
-                                          t->path,
-                                          LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
-                                          LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
-                                          format_timespan(ts, sizeof ts,
-                                                          m->default_mem_pressure_duration_usec,
-                                                          USEC_PER_SEC));
-
-                                r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
-                                if (r == -ENOMEM)
-                                        return log_oom();
-                                if (r < 0)
-                                        log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
-                                else {
-                                        /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
-                                        m->mem_pressure_post_action_delay_start = usec_now;
-                                        if (selected)
-                                                log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
-                                                           " for > %s with reclaim activity",
-                                                           selected, t->path,
-                                                           LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
-                                                           LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
-                                                           format_timespan(ts, sizeof ts,
-                                                                           m->default_mem_pressure_duration_usec,
-                                                                           USEC_PER_SEC));
-                                        return 0;
-                                }
+                OomdCGroupContext *t;
+                SET_FOREACH(t, targets) {
+                        _cleanup_free_ char *selected = NULL;
+                        char ts[FORMAT_TIMESPAN_MAX];
+
+                        /* Check if there was reclaim activity in the given interval. The concern is the following case:
+                         * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
+                         * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
+                         * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
+                         * to kill something (it won't help anyways). */
+                        if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC)
+                                continue;
+
+                        log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
+                                  t->path,
+                                  LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
+                                  LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
+                                  format_timespan(ts, sizeof ts,
+                                                  m->default_mem_pressure_duration_usec,
+                                                  USEC_PER_SEC));
+
+                        r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0)
+                                log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
+                        else {
+                                /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
+                                m->mem_pressure_post_action_delay_start = usec_now;
+                                if (selected)
+                                        log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
+                                                   " for > %s with reclaim activity",
+                                                   selected, t->path,
+                                                   LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
+                                                   LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
+                                                   format_timespan(ts, sizeof ts,
+                                                                   m->default_mem_pressure_duration_usec,
+                                                                   USEC_PER_SEC));
+                                return 0;
                         }
                 }
         }
index 3df7ce94d7eba0b28ffc514ae6382f0f13518864..d5407003d30552ba018c40f9636038e4ed499f38 100644 (file)
@@ -46,7 +46,6 @@ struct Manager {
 
         OomdSystemContext system_context;
 
-        usec_t last_reclaim_at;
         usec_t mem_pressure_post_action_delay_start;
 
         sd_event_source *swap_context_event_source;
index 5133b1c060c0985812c50e7ed676de18de468b8a..fcab31830cd8f192d3b63cb405505781a785b66f 100644 (file)
@@ -104,36 +104,6 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
         return 0;
 }
 
-bool oomd_memory_reclaim(Hashmap *h) {
-        uint64_t pgscan = 0, pgscan_of = 0, last_pgscan = 0, last_pgscan_of = 0;
-        OomdCGroupContext *ctx;
-
-        assert(h);
-
-        /* If sum of all the current pgscan values are greater than the sum of all the last_pgscan values,
-         * there was reclaim activity. Used along with pressure checks to decide whether to take action. */
-
-        HASHMAP_FOREACH(ctx, h) {
-                uint64_t sum;
-
-                sum = pgscan + ctx->pgscan;
-                if (sum < pgscan || sum < ctx->pgscan)
-                        pgscan_of++; /* count overflows */
-                pgscan = sum;
-
-                sum = last_pgscan + ctx->last_pgscan;
-                if (sum < last_pgscan || sum < ctx->last_pgscan)
-                        last_pgscan_of++; /* count overflows */
-                last_pgscan = sum;
-        }
-
-        /* overflow counts are the same, return sums comparison */
-        if (last_pgscan_of == pgscan_of)
-                return pgscan > last_pgscan;
-
-        return pgscan_of > last_pgscan_of;
-}
-
 uint64_t oomd_pgscan_rate(const OomdCGroupContext *c) {
         uint64_t last_pgscan;
 
@@ -448,8 +418,12 @@ int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path)
                 curr_ctx->last_pgscan = old_ctx->pgscan;
                 curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
                 curr_ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
+                curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
         }
 
+        if (oomd_pgscan_rate(curr_ctx) > 0)
+                curr_ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
+
         r = hashmap_put(new_h, curr_ctx->path, curr_ctx);
         if (r < 0)
                 return r;
@@ -474,6 +448,10 @@ void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_
                 ctx->last_pgscan = old_ctx->pgscan;
                 ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
                 ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
+                ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
+
+                if (oomd_pgscan_rate(ctx) > 0)
+                        ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
         }
 }
 
index af08504c5bb9a9e0036ffa2dfcf75cb275821866..b3b637ba8b86eea947aea471ad149f4c4349bde1 100644 (file)
@@ -32,10 +32,11 @@ struct OomdCGroupContext {
 
         ManagedOOMPreference preference;
 
-        /* These are only used by oomd_pressure_above for acting on high memory pressure. */
+        /* These are only used for acting on high memory pressure. */
         loadavg_t mem_pressure_limit;
         usec_t mem_pressure_duration_usec;
         usec_t last_hit_mem_pressure_limit;
+        usec_t last_had_mem_reclaim;
 };
 
 struct OomdSystemContext {
@@ -57,10 +58,6 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free);
  * Returns -ENOMEM for allocation errors. */
 int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret);
 
-/* Sum up current OomdCGroupContexts' pgscan values and last interval's pgscan values in `h`. Returns true if the
- * current sum is higher than the last interval's sum (there was some reclaim activity). */
-bool oomd_memory_reclaim(Hashmap *h);
-
 /* Returns true if the amount of swap free is below the permyriad of swap specified by `threshold_permyriad`. */
 bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad);
 
index bd1c574ca7fa02cabb24d155edcf9f541d190625..92248aa063e5b6e6e3572375975ef0f8681afc86 100644 (file)
@@ -153,9 +153,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
         assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == -EEXIST);
 
          /* make sure certain values from h1 get updated in h2 */
-        c1->pgscan = 5555;
+        c1->pgscan = UINT64_MAX;
         c1->mem_pressure_limit = 6789;
         c1->last_hit_mem_pressure_limit = 42;
+        c1->last_had_mem_reclaim = 888;
         assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
         assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0);
         c1 = hashmap_get(h1, cgroup);
@@ -163,9 +164,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
         assert_se(c1);
         assert_se(c2);
         assert_se(c1 != c2);
-        assert_se(c2->last_pgscan == 5555);
+        assert_se(c2->last_pgscan == UINT64_MAX);
         assert_se(c2->mem_pressure_limit == 6789);
         assert_se(c2->last_hit_mem_pressure_limit == 42);
+        assert_se(c2->last_had_mem_reclaim == 888); /* assumes the live pgscan is less than UINT64_MAX */
 
         /* Assert that avoid/omit are not set if the cgroup is not owned by root */
         if (test_xattrs) {
@@ -182,20 +184,22 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
         char **paths = STRV_MAKE("/0.slice",
                                  "/1.slice");
 
-        OomdCGroupContext ctx_old[3] = {
+        OomdCGroupContext ctx_old[2] = {
                 { .path = paths[0],
                   .mem_pressure_limit = 5,
                   .last_hit_mem_pressure_limit = 777,
+                  .last_had_mem_reclaim = 888,
                   .pgscan = 57 },
                 { .path = paths[1],
                   .mem_pressure_limit = 6,
                   .last_hit_mem_pressure_limit = 888,
+                  .last_had_mem_reclaim = 888,
                   .pgscan = 42 },
         };
 
-        OomdCGroupContext ctx_new[3] = {
+        OomdCGroupContext ctx_new[2] = {
                 { .path = paths[0],
-                  .pgscan = 100 },
+                  .pgscan = 57 },
                 { .path = paths[1],
                   .pgscan = 101 },
         };
@@ -215,12 +219,14 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
         assert_se(c_old->pgscan == c_new->last_pgscan);
         assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
         assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
+        assert_se(c_old->last_had_mem_reclaim == c_new->last_had_mem_reclaim);
 
         assert_se(c_old = hashmap_get(h_old, "/1.slice"));
         assert_se(c_new = hashmap_get(h_new, "/1.slice"));
         assert_se(c_old->pgscan == c_new->last_pgscan);
         assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
         assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
+        assert_se(c_new->last_had_mem_reclaim > c_old->last_had_mem_reclaim);
 }
 
 static void test_oomd_system_context_acquire(void) {
@@ -304,47 +310,6 @@ static void test_oomd_pressure_above(void) {
         assert_se(c->last_hit_mem_pressure_limit == 0);
 }
 
-static void test_oomd_memory_reclaim(void) {
-        _cleanup_hashmap_free_ Hashmap *h1 = NULL;
-        char **paths = STRV_MAKE("/0.slice",
-                                 "/1.slice",
-                                 "/2.slice",
-                                 "/3.slice",
-                                 "/4.slice");
-
-        OomdCGroupContext ctx[5] = {
-                { .path = paths[0],
-                  .last_pgscan = 100,
-                  .pgscan = 100 },
-                { .path = paths[1],
-                  .last_pgscan = 100,
-                  .pgscan = 100 },
-                { .path = paths[2],
-                  .last_pgscan = 77,
-                  .pgscan = 33 },
-                { .path = paths[3],
-                  .last_pgscan = UINT64_MAX,
-                  .pgscan = 100 },
-                { .path = paths[4],
-                  .last_pgscan = 100,
-                  .pgscan = UINT64_MAX },
-        };
-
-        assert_se(h1 = hashmap_new(&string_hash_ops));
-        assert_se(hashmap_put(h1, paths[0], &ctx[0]) >= 0);
-        assert_se(hashmap_put(h1, paths[1], &ctx[1]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == false);
-
-        assert_se(hashmap_put(h1, paths[2], &ctx[2]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == false);
-
-        assert_se(hashmap_put(h1, paths[4], &ctx[4]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == true);
-
-        assert_se(hashmap_put(h1, paths[3], &ctx[3]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == false);
-}
-
 static void test_oomd_swap_free_below(void) {
         OomdSystemContext ctx = (OomdSystemContext) {
                 .swap_total = 20971512 * 1024U,
@@ -461,7 +426,6 @@ int main(void) {
         test_oomd_update_cgroup_contexts_between_hashmaps();
         test_oomd_system_context_acquire();
         test_oomd_pressure_above();
-        test_oomd_memory_reclaim();
         test_oomd_swap_free_below();
         test_oomd_sort_cgroups();