Merge pull request #12753 from jrouleau/fix/hibernate-resume-timeout

[thirdparty/systemd.git] / src / core / cgroup.c
diff --git a/src/core/cgroup.c b/src/core/cgroup.c

index ceb7ee21892d2debdeb170fcb71c0464a4deb9f9..0c885d57440f414ede597d78bf846fa7c8384b2f 100644 (file)
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -202,6 +202,7 @@ void cgroup_context_done(CGroupContext *c) {
  }
  
  void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
+        _cleanup_free_ char *disable_controllers_str = NULL;
          CGroupIODeviceLimit *il;
          CGroupIODeviceWeight *iw;
          CGroupIODeviceLatency *l;
@@ -217,6 +218,8 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
  
          prefix = strempty(prefix);
  
+        (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
+
          fprintf(f,
                  "%sCPUAccounting=%s\n"
                  "%sIOAccounting=%s\n"
@@ -234,6 +237,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                  "%sStartupIOWeight=%" PRIu64 "\n"
                  "%sBlockIOWeight=%" PRIu64 "\n"
                  "%sStartupBlockIOWeight=%" PRIu64 "\n"
+                "%sDefaultMemoryMin=%" PRIu64 "\n"
                  "%sDefaultMemoryLow=%" PRIu64 "\n"
                  "%sMemoryMin=%" PRIu64 "\n"
                  "%sMemoryLow=%" PRIu64 "\n"
@@ -243,6 +247,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                  "%sMemoryLimit=%" PRIu64 "\n"
                  "%sTasksMax=%" PRIu64 "\n"
                  "%sDevicePolicy=%s\n"
+                "%sDisableControllers=%s\n"
                  "%sDelegate=%s\n",
                  prefix, yes_no(c->cpu_accounting),
                  prefix, yes_no(c->io_accounting),
@@ -260,6 +265,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                  prefix, c->startup_io_weight,
                  prefix, c->blockio_weight,
                  prefix, c->startup_blockio_weight,
+                prefix, c->default_memory_min,
                  prefix, c->default_memory_low,
                  prefix, c->memory_min,
                  prefix, c->memory_low,
@@ -269,6 +275,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                  prefix, c->memory_limit,
                  prefix, c->tasks_max,
                  prefix, cgroup_device_policy_to_string(c->device_policy),
+                prefix, strnull(disable_controllers_str),
                  prefix, yes_no(c->delegate));
  
          if (c->delegate) {
@@ -384,31 +391,34 @@ int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode)
          return 0;
  }
  
-uint64_t unit_get_ancestor_memory_low(Unit *u) {
-        CGroupContext *c;
-
-        /* 1. Is MemoryLow set in this unit? If so, use that.
-         * 2. Is DefaultMemoryLow set in any ancestor? If so, use that.
-         * 3. Otherwise, return CGROUP_LIMIT_MIN. */
-
-        assert(u);
-
-        c = unit_get_cgroup_context(u);
-
-        if (c->memory_low_set)
-                return c->memory_low;
-
-        while (UNIT_ISSET(u->slice)) {
-                u = UNIT_DEREF(u->slice);
-                c = unit_get_cgroup_context(u);
-
-                if (c->default_memory_low_set)
-                        return c->default_memory_low;
-        }
-
-        /* We've reached the root, but nobody had DefaultMemoryLow set, so set it to the kernel default. */
-        return CGROUP_LIMIT_MIN;
-}
+/* Generates a function unit_get_ancestor_<entry>(Unit *u). The generated function returns the unit's own
+ * <entry> value if its <entry>_set flag is on; otherwise it walks up through u->slice and returns the
+ * default_<entry> of the first slice that has default_<entry>_set; otherwise it returns CGROUP_LIMIT_MIN.
+ * Units or slices for which unit_get_cgroup_context() yields NULL are skipped rather than dereferenced.
+ * Instantiated below for memory_low and memory_min. */
+#define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry)                       \
+        uint64_t unit_get_ancestor_##entry(Unit *u) {                   \
+                CGroupContext *c;                                       \
+                                                                        \
+                /* 1. Is entry set in this unit? If so, use that.       \
+                 * 2. Is the default for this entry set in any          \
+                 *    ancestor? If so, use that.                        \
+                 * 3. Otherwise, return CGROUP_LIMIT_MIN. */            \
+                                                                        \
+                assert(u);                                              \
+                                                                        \
+                c = unit_get_cgroup_context(u);                         \
+                if (c && c->entry##_set)                                \
+                        return c->entry;                                \
+                                                                        \
+                while ((u = UNIT_DEREF(u->slice))) {                    \
+                        c = unit_get_cgroup_context(u);                 \
+                        if (c && c->default_##entry##_set)              \
+                                return c->default_##entry;              \
+                }                                                       \
+                                                                        \
+                /* We've reached the root, but nobody had default for   \
+                 * this entry set, so set it to the kernel default. */  \
+                return CGROUP_LIMIT_MIN;                                \
+}
+
+UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
+UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
  
  static void cgroup_xattr_apply(Unit *u) {
          char ids[SD_ID128_STRING_MAX];
@@ -1134,7 +1144,7 @@ static void cgroup_context_apply(
                          }
                  }
  
-                /* The bandwith limits are something that make sense to be applied to the host's root but not container
+                /* The bandwidth limits are something that make sense to be applied to the host's root but not container
                   * roots, as there we want the container manager to handle it */
                  if (is_host_root || !is_local_root) {
                          if (has_io) {
@@ -1300,7 +1310,7 @@ static void cgroup_context_apply(
                           * it also counts. But if the user never set a limit through us (i.e. we are the default of
                           * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
                           * the first time we set a limit. Note that this boolean is flushed out on manager reload,
-                         * which is desirable so that there's an offical way to release control of the sysctl from
+                         * which is desirable so that there's an official way to release control of the sysctl from
                           * systemd: set the limit to unbounded and reload. */
  
                          if (c->tasks_max != CGROUP_LIMIT_MAX) {
@@ -1368,6 +1378,8 @@ static CGroupMask unit_get_cgroup_mask(Unit *u) {
  
          c = unit_get_cgroup_context(u);
  
+        assert(c);
+
          /* Figure out which controllers we need, based on the cgroup context object */
  
          if (c->cpu_accounting)
@@ -1530,6 +1542,10 @@ CGroupMask unit_get_target_mask(Unit *u) {
           * hierarchy that shall be enabled for it. */
  
          mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
+
+        if (mask & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported)
+                emit_bpf_firewall_warning(u);
+
          mask &= u->manager->cgroup_supported;
          mask &= ~unit_get_ancestor_disable_mask(u);
  
@@ -2597,7 +2613,7 @@ void unit_add_to_cgroup_empty_queue(Unit *u) {
                  log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
  }
  
-static int unit_check_oom(Unit *u) {
+int unit_check_oom(Unit *u) {
          _cleanup_free_ char *oom_kill = NULL;
          bool increased;
          uint64_t c;
@@ -3220,21 +3236,140 @@ int unit_get_ip_accounting(
          return r;
  }
  
+/* Sums up the per-device counters found in the unit's "io.stat" cgroup attribute.
+ *
+ * On success returns 0 and stores one total per CGroupIOAccountingMetric in ret[]: the "rbytes=",
+ * "wbytes=", "rios=" and "wios=" fields, each added up over all lines of the file. ret[] is written only
+ * on success.
+ *
+ * Returns -ENODATA if the unit has no cgroup path, unit_has_host_root_cgroup() is true for it, the cgroup
+ * hierarchy is not fully unified, or CGROUP_MASK_IO is not part of the unit's realized mask. Any other
+ * negative value is an error passed up from path lookup, fopen(), line reading, word extraction or number
+ * parsing. */
+static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
+        static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+                [CGROUP_IO_READ_BYTES]       = "rbytes=",
+                [CGROUP_IO_WRITE_BYTES]      = "wbytes=",
+                [CGROUP_IO_READ_OPERATIONS]  = "rios=",
+                [CGROUP_IO_WRITE_OPERATIONS] = "wios=",
+        };
+        uint64_t totals[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {};
+        _cleanup_free_ char *stat_path = NULL;
+        _cleanup_fclose_ FILE *stat_file = NULL;
+        int r;
+
+        assert(u);
+
+        if (!u->cgroup_path)
+                return -ENODATA;
+
+        if (unit_has_host_root_cgroup(u))
+                return -ENODATA; /* TODO: return useful data for the top-level cgroup */
+
+        r = cg_all_unified();
+        if (r <= 0)
+                return r < 0 ? r : -ENODATA; /* TODO: support cgroupv1 */
+
+        if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO))
+                return -ENODATA;
+
+        r = cg_get_path("io", u->cgroup_path, "io.stat", &stat_path);
+        if (r < 0)
+                return r;
+
+        stat_file = fopen(stat_path, "re");
+        if (!stat_file)
+                return -errno;
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                const char *cursor;
+
+                r = read_line(stat_file, LONG_LINE_MAX, &line);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                /* Skip over the device major/minor and the whitespace that follows it */
+                cursor = line + strcspn(line, WHITESPACE);
+                cursor += strspn(cursor, WHITESPACE);
+
+                /* Walk the remaining "key=value" words of this line */
+                for (;;) {
+                        _cleanup_free_ char *token = NULL;
+
+                        r = extract_first_word(&cursor, &token, NULL, EXTRACT_RETAIN_ESCAPE);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        for (CGroupIOAccountingMetric m = 0; m < _CGROUP_IO_ACCOUNTING_METRIC_MAX; m++) {
+                                const char *value = startswith(token, field_names[m]);
+                                uint64_t parsed;
+
+                                if (!value)
+                                        continue;
+
+                                r = safe_atou64(value, &parsed);
+                                if (r < 0)
+                                        return r;
+
+                                /* Sum up the stats of all devices; a word matches at most one field */
+                                totals[m] += parsed;
+                                break;
+                        }
+                }
+        }
+
+        memcpy(ret, totals, sizeof(totals));
+        return 0;
+}
+
+/* Retrieves one IO accounting metric of the unit, relative to the base stored in u->io_accounting_base[]
+ * (raw counter minus base, clamped at 0).
+ *
+ * If allow_cache is true and u->io_accounting_last[metric] already holds a value (anything other than
+ * UINT64_MAX), that value is returned without reading the cgroup. Otherwise all metrics are re-read and
+ * u->io_accounting_last[] is refreshed. If the re-read fails with -ENODATA but a stored value exists, the
+ * stored value is returned instead.
+ *
+ * ret may be NULL, in which case nothing is stored for the caller. Returns 0 on success, -ENODATA if IO
+ * accounting is off for the unit or no data is available, or another negative error from the raw read. */
+int unit_get_io_accounting(
+                Unit *u,
+                CGroupIOAccountingMetric metric,
+                bool allow_cache,
+                uint64_t *ret) {
+
+        uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
+        int r;
+
+        /* u is dereferenced and metric indexes fixed-size arrays below, so catch programming errors early */
+        assert(u);
+        assert(metric >= 0);
+        assert(metric < _CGROUP_IO_ACCOUNTING_METRIC_MAX);
+
+        /* Retrieve an IO account parameter. This will subtract the counter when the unit was started. */
+
+        if (!UNIT_CGROUP_BOOL(u, io_accounting))
+                return -ENODATA;
+
+        if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX)
+                goto done;
+
+        r = unit_get_io_accounting_raw(u, raw);
+        if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX)
+                goto done; /* No fresh data, but we still have an older value to hand out */
+        if (r < 0)
+                return r;
+
+        for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
+                /* Saturated subtraction */
+                if (raw[i] > u->io_accounting_base[i])
+                        u->io_accounting_last[i] = raw[i] - u->io_accounting_base[i];
+                else
+                        u->io_accounting_last[i] = 0;
+        }
+
+done:
+        if (ret)
+                *ret = u->io_accounting_last[metric];
+
+        return 0;
+}
+
  int unit_reset_cpu_accounting(Unit *u) {
-        nsec_t ns;
          int r;
  
          assert(u);
  
          u->cpu_usage_last = NSEC_INFINITY;
  
-        r = unit_get_cpu_usage_raw(u, &ns);
+        r = unit_get_cpu_usage_raw(u, &u->cpu_usage_base); /* result goes straight into the base; zeroed below on error */
          if (r < 0) {
                  u->cpu_usage_base = 0;
                  return r;
          }
  
-        u->cpu_usage_base = ns;
          return 0;
  }
  
@@ -3254,6 +3389,35 @@ int unit_reset_ip_accounting(Unit *u) {
          return r < 0 ? r : q;
  }
  
+/* Restarts IO accounting for the unit: drops every stored per-metric value and records the current raw
+ * counters as the new base. Returns 0 on success. If the raw counters cannot be read, the base is zeroed
+ * and the negative error is returned. */
+int unit_reset_io_accounting(Unit *u) {
+        int r;
+
+        assert(u);
+
+        /* UINT64_MAX is the "nothing stored" marker that unit_get_io_accounting() checks for */
+        for (CGroupIOAccountingMetric m = 0; m < _CGROUP_IO_ACCOUNTING_METRIC_MAX; m++)
+                u->io_accounting_last[m] = UINT64_MAX;
+
+        r = unit_get_io_accounting_raw(u, u->io_accounting_base);
+        if (r >= 0)
+                return 0;
+
+        zero(u->io_accounting_base);
+        return r;
+}
+
+/* Resets CPU, IO and IP accounting of the unit. All three resets are always attempted, even if an earlier
+ * one fails. Returns the first negative result in the order CPU, IO, IP; if CPU and IO both succeeded, the
+ * result of the IP reset is returned as is. */
+int unit_reset_accounting(Unit *u) {
+        int cpu_result, io_result, ip_result;
+
+        assert(u);
+
+        cpu_result = unit_reset_cpu_accounting(u);
+        io_result = unit_reset_io_accounting(u);
+        ip_result = unit_reset_ip_accounting(u);
+
+        if (cpu_result < 0)
+                return cpu_result;
+        if (io_result < 0)
+                return io_result;
+
+        return ip_result;
+}
+
  void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
          assert(u);