core: check for redundant operation before doing allocation

[thirdparty/systemd.git] / src / core / cgroup.c
diff --git a/src/core/cgroup.c b/src/core/cgroup.c

index 18d470b6d675eead53b8a0281ec597d282cf8280..01d72a390817389ad6feb7cb22eca65b4c9d22c7 100644 (file)
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -5,15 +5,16 @@
  
  #include "alloc-util.h"
  #include "blockdev-util.h"
+#include "bpf-devices.h"
  #include "bpf-firewall.h"
  #include "btrfs-util.h"
-#include "bpf-devices.h"
  #include "bus-error.h"
  #include "cgroup-util.h"
  #include "cgroup.h"
  #include "fd-util.h"
  #include "fileio.h"
  #include "fs-util.h"
+#include "nulstr-util.h"
  #include "parse-util.h"
  #include "path-util.h"
  #include "process-util.h"
@@ -25,7 +26,7 @@
  #include "string-util.h"
  #include "virt.h"
  
-#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
+#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
  
  /* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
   * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask
@@ -98,6 +99,7 @@ void cgroup_context_init(CGroupContext *c) {
                  .cpu_weight = CGROUP_WEIGHT_INVALID,
                  .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
                  .cpu_quota_per_sec_usec = USEC_INFINITY,
+                .cpu_quota_period_usec = USEC_INFINITY,
  
                  .cpu_shares = CGROUP_CPU_SHARES_INVALID,
                  .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID,
@@ -206,6 +208,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
          CGroupDeviceAllow *a;
          IPAddressAccessItem *iaai;
          char u[FORMAT_TIMESPAN_MAX];
+        char v[FORMAT_TIMESPAN_MAX];
  
          assert(c);
          assert(f);
@@ -224,6 +227,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                  "%sCPUShares=%" PRIu64 "\n"
                  "%sStartupCPUShares=%" PRIu64 "\n"
                  "%sCPUQuotaPerSecSec=%s\n"
+                "%sCPUQuotaPeriodSec=%s\n"
                  "%sIOWeight=%" PRIu64 "\n"
                  "%sStartupIOWeight=%" PRIu64 "\n"
                  "%sBlockIOWeight=%" PRIu64 "\n"
@@ -248,6 +252,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                  prefix, c->cpu_shares,
                  prefix, c->startup_cpu_shares,
                  prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
+                prefix, format_timespan(v, sizeof(v), c->cpu_quota_period_usec, 1),
                  prefix, c->io_weight,
                  prefix, c->startup_io_weight,
                  prefix, c->blockio_weight,
@@ -656,6 +661,40 @@ static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state)
                  return CGROUP_CPU_SHARES_DEFAULT;
  }
  
+usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
+        /* The kernel uses a minimum resolution of 1ms, so both the period and the resulting runtime
+         * (quota * period / USEC_PER_SEC) need to be at least 'resolution'. quota is specified in
+         * USecPerSec. The result is capped at max_period last, so that upper bound always wins. */
+        assert(quota > 0); /* quota is the divisor below, hence must not be zero */
+
+        return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
+}
+
+static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
+        usec_t new_period; /* the period after clamping; this is the value returned to the caller */
+
+        if (quota == USEC_INFINITY)
+                /* No quota is set: ignore the requested period and always use the default one. */
+                return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
+
+        if (period == USEC_INFINITY)
+                /* USEC_INFINITY means the default period was requested. */
+                period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
+
+        /* Clamp to interval [1ms, 1s], also keeping quota * period / USEC_PER_SEC at 1ms or more. */
+        new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);
+
+        if (new_period != period) {
+                char v[FORMAT_TIMESPAN_MAX];
+                log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING, 0,
+                              "Clamping CPU interval for cpu.max: period is now %s", /* NOTE(review): also reached from the legacy cpu.cfs_period_us path */
+                              format_timespan(v, sizeof(v), new_period, 1));
+                u->warned_clamping_cpu_quota_period = true; /* later clamping of this unit is logged at LOG_DEBUG only */
+        }
+
+        return new_period;
+}
+
  static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
          char buf[DECIMAL_STR_MAX(uint64_t) + 2];
  
@@ -663,14 +702,15 @@ static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
          (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf);
  }
  
-static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota) {
+static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
          char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];
  
+        period = cgroup_cpu_adjust_period_and_log(u, period, quota); /* default period for infinite quota, else clamped */
          if (quota != USEC_INFINITY)
                  xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
-                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
+                         MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period); /* never write a quota below 1ms */
          else
-                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
+                xsprintf(buf, "max " USEC_FMT "\n", period);
          (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
  }
  
@@ -681,14 +721,16 @@ static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) {
          (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf);
  }
  
-static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota) {
+static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) {
          char buf[DECIMAL_STR_MAX(usec_t) + 2];
  
-        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
+        period = cgroup_cpu_adjust_period_and_log(u, period, quota);
+
+        xsprintf(buf, USEC_FMT "\n", period);
          (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf);
  
          if (quota != USEC_INFINITY) {
-                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
+                xsprintf(buf, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC));
                  (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf);
          } else
                  (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n");
@@ -911,7 +953,7 @@ static void cgroup_context_apply(
                                  weight = CGROUP_WEIGHT_DEFAULT;
  
                          cgroup_apply_unified_cpu_weight(u, weight);
-                        cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec);
+                        cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
  
                  } else {
                          uint64_t shares;
@@ -930,7 +972,7 @@ static void cgroup_context_apply(
                                  shares = CGROUP_CPU_SHARES_DEFAULT;
  
                          cgroup_apply_legacy_cpu_shares(u, shares);
-                        cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec);
+                        cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
                  }
          }
  
@@ -1526,15 +1568,14 @@ int unit_set_cgroup_path(Unit *u, const char *path) {
  
          assert(u);
  
+        if (streq_ptr(u->cgroup_path, path))
+                return 0;
+
          if (path) {
                  p = strdup(path);
                  if (!p)
                          return -ENOMEM;
-        } else
-                p = NULL;
-
-        if (streq_ptr(u->cgroup_path, p))
-                return 0;
+        }
  
          if (p) {
                  r = hashmap_put(u->manager->cgroup_unit, p, u);
@@ -1543,7 +1584,6 @@ int unit_set_cgroup_path(Unit *u, const char *path) {
          }
  
          unit_release_cgroup(u);
-
          u->cgroup_path = TAKE_PTR(p);
  
          return 1;
@@ -2227,7 +2267,7 @@ void unit_prune_cgroup(Unit *u) {
  
  int unit_search_main_pid(Unit *u, pid_t *ret) {
          _cleanup_fclose_ FILE *f = NULL;
-        pid_t pid = 0, npid, mypid;
+        pid_t pid = 0, npid;
          int r;
  
          assert(u);
@@ -2240,15 +2280,12 @@ int unit_search_main_pid(Unit *u, pid_t *ret) {
          if (r < 0)
                  return r;
  
-        mypid = getpid_cached();
          while (cg_read_pid(f, &npid) > 0)  {
-                pid_t ppid;
  
                  if (npid == pid)
                          continue;
  
-                /* Ignore processes that aren't our kids */
-                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
+                if (pid_is_my_child(npid) == 0)
                          continue;
  
                  if (pid != 0)
@@ -2280,7 +2317,7 @@ static int unit_watch_pids_in_path(Unit *u, const char *path) {
                  pid_t pid;
  
                  while ((r = cg_read_pid(f, &pid)) > 0) {
-                        r = unit_watch_pid(u, pid);
+                        r = unit_watch_pid(u, pid, false);
                          if (r < 0 && ret >= 0)
                                  ret = r;
                  }
@@ -2567,6 +2604,9 @@ int manager_setup_cgroup(Manager *m) {
          if (r < 0)
                  return log_error_errno(r, "Failed to create cgroup empty event source: %m");
  
+        /* Schedule cgroup empty checks early, but after having processed service notification messages or
+         * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
+         * notification, and we have first collected the metadata that notifications and SIGCHLD offer. */
          r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
          if (r < 0)
                  return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
@@ -2593,9 +2633,10 @@ int manager_setup_cgroup(Manager *m) {
                  if (r < 0)
                          return log_error_errno(r, "Failed to watch control group inotify object: %m");
  
-                /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
-                 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
-                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
+                /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
+                 * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
+                 * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
+                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
                  if (r < 0)
                          return log_error_errno(r, "Failed to set priority of inotify event source: %m");