oomd: clean up error handling

author Anita Zhang <the.anitazha@gmail.com>

Wed, 17 Mar 2021 00:57:50 +0000 (17:57 -0700)

committer Anita Zhang <the.anitazha@gmail.com>

Wed, 17 Mar 2021 20:07:11 +0000 (13:07 -0700)
author Anita Zhang <the.anitazha@gmail.com>
Wed, 17 Mar 2021 00:57:50 +0000 (17:57 -0700)
committer Anita Zhang <the.anitazha@gmail.com>
Wed, 17 Mar 2021 20:07:11 +0000 (13:07 -0700)
diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c

index 72781f3e88a82031547a6ec2b0864f154b2c3ce9..345f8a77cfad62bec36637486575c21d58d342d5 100644 (file)
--- a/src/oom/oomd-manager.c
+++ b/src/oom/oomd-manager.c
@@ -117,6 +117,8 @@ static int process_managed_oom_reply(
                          r = ret;
                          goto finish;
                  }
+                if (ret < 0 && ret != -EEXIST)
+                        log_debug_errno(ret, "Failed to insert reply, ignoring: %m");
  
                  /* Always update the limit in case it was changed. For non-memory pressure detection the value is
                   * ignored so always updating it here is not a problem. */
@@ -156,7 +158,11 @@ static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
                  return r;
          else if (r == 0) { /* No subgroups? We're a leaf node */
                  r = oomd_insert_cgroup_context(NULL, new_h, path);
-                return (r == -ENOMEM) ? r : 0;
+                if (r == -ENOMEM)
+                        return r;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", path);
+                return 0;
          }
  
          do {
@@ -171,8 +177,12 @@ static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
  
                  r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group);
                  /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
-                if (r < 0)
-                        return (r == -ENOMEM) ? r : 0;
+                if (r == -ENOMEM)
+                        return r;
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path);
+                        return 0;
+                }
  
                  if (oom_group)
                          r = oomd_insert_cgroup_context(NULL, new_h, cg_path);
@@ -180,6 +190,8 @@ static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
                          r = recursively_get_cgroup_context(new_h, cg_path);
                  if (r == -ENOMEM)
                          return r;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to insert or recursively get from %s, ignoring: %m", cg_path);
          } while ((r = cg_read_subgroup(d, &subpath)) > 0);
  
          return 0;
@@ -201,6 +213,8 @@ static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) {
                  r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path);
                  if (r == -ENOMEM)
                          return r;
+                if (r < 0 && !IN_SET(r, -EEXIST, -ENOENT))
+                        log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", ctx->path);
          }
  
          hashmap_free(*monitored_cgroups);
@@ -225,6 +239,8 @@ static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups,
                  r = recursively_get_cgroup_context(candidates, ctx->path);
                  if (r == -ENOMEM)
                          return r;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to recursively get contexts for %s, ignoring: %m", ctx->path);
          }
  
          *ret_candidates = TAKE_PTR(candidates);
@@ -295,38 +311,44 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
          /* Reset timer */
          r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
          if (r < 0)
-                return log_error_errno(r, "Failed to reset event timer");
+                return log_error_errno(r, "Failed to reset event timer: %m");
  
          r = sd_event_source_set_time_relative(s, INTERVAL_USEC);
          if (r < 0)
-                return log_error_errno(r, "Failed to set relative time for timer");
+                return log_error_errno(r, "Failed to set relative time for timer: %m");
  
          /* Reconnect if our connection dropped */
          if (!m->varlink) {
                  r = acquire_managed_oom_connect(m);
                  if (r < 0)
-                        return log_error_errno(r, "Failed to acquire varlink connection");
+                        return log_error_errno(r, "Failed to acquire varlink connection: %m");
          }
  
          /* Update the cgroups used for detection/action */
          r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts);
          if (r == -ENOMEM)
-                return log_error_errno(r, "Failed to update monitored swap cgroup contexts");
+                return log_oom();
+        if (r < 0)
+                log_debug_errno(r, "Failed to update monitored swap cgroup contexts, ignoring: %m");
  
          r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
          if (r == -ENOMEM)
-                return log_error_errno(r, "Failed to update monitored memory pressure cgroup contexts");
+                return log_oom();
+        if (r < 0)
+                log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m");
  
          r = update_monitored_cgroup_contexts_candidates(
                          m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
          if (r == -ENOMEM)
-                return log_error_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts");
+                return log_oom();
+        if (r < 0)
+                log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
  
          r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
          /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
           * Allow ENOENT in the event that swap is disabled on the system. */
          if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
-                return log_error_errno(r, "Failed to acquire system context");
+                return log_error_errno(r, "Failed to acquire system context: %m");
          else if (r == -ENOENT)
                  zero(m->system_context);
  
@@ -343,7 +365,9 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
  
          r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
          if (r == -ENOMEM)
-                return log_error_errno(r, "Failed to check if memory pressure exceeded limits");
+                return log_oom();
+        if (r < 0)
+                log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
          else if (r == 1) {
                  /* Check if there was reclaim activity in the given interval. The concern is the following case:
                   * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
@@ -359,9 +383,9 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
  
                                  r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run);
                                  if (r == -ENOMEM)
-                                        return log_error_errno(r, "Failed to kill cgroup processes by pgscan rate: %m");
+                                        return log_oom();
                                  if (r < 0)
-                                        log_info("Failed to kill any cgroup(s) under %s based on pressure", t->path);
+                                        log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
                                  else {
                                          /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
                                          m->post_action_delay_start = usec_now;
@@ -379,13 +403,15 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
  
                  r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
                  if (r == -ENOMEM)
-                        return log_error_errno(r, "Failed to get monitored swap cgroup candidates");
+                        return log_oom();
+                if (r < 0)
+                        log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
  
                  r = oomd_kill_by_swap_usage(candidates, m->dry_run);
                  if (r == -ENOMEM)
-                        return log_error_errno(r, "Failed to kill cgroup processes by swap usage");
+                        return log_oom();
                  if (r < 0)
-                        log_info("Failed to kill any cgroup(s) based on swap");
+                        log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
                  else {
                          m->post_action_delay_start = usec_now;
                          return 0;
author	Anita Zhang <the.anitazha@gmail.com>
	Wed, 17 Mar 2021 00:57:50 +0000 (17:57 -0700)
committer	Anita Zhang <the.anitazha@gmail.com>
	Wed, 17 Mar 2021 20:07:11 +0000 (13:07 -0700)