X-Git-Url: http://git.ipfire.org/?a=blobdiff_plain;f=src%2Fcore%2Fcgroup.c;h=8884981b29d362172e42fc522278da18fcebf1d5;hb=6264b85e92aeddb74b8d8808a08c9eae8390a6a5;hp=ed2f331b33eb1563df24ce94086ef80b55fde60c;hpb=9ef36967c8cf985313f26d16722ea059fa668ccb;p=thirdparty%2Fsystemd.git

diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index ed2f331b33e..8884981b29d 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -3,17 +3,20 @@
 #include <fcntl.h>
 #include <fnmatch.h>
 
+#include "sd-messages.h"
+
 #include "alloc-util.h"
 #include "blockdev-util.h"
+#include "bpf-devices.h"
 #include "bpf-firewall.h"
 #include "btrfs-util.h"
-#include "bpf-devices.h"
 #include "bus-error.h"
 #include "cgroup-util.h"
 #include "cgroup.h"
 #include "fd-util.h"
 #include "fileio.h"
 #include "fs-util.h"
+#include "nulstr-util.h"
 #include "parse-util.h"
 #include "path-util.h"
 #include "process-util.h"
@@ -25,7 +28,7 @@
 #include "string-util.h"
 #include "virt.h"
 
-#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
+#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
 
 /* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
  * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask
@@ -98,6 +101,7 @@ void cgroup_context_init(CGroupContext *c) {
                 .cpu_weight = CGROUP_WEIGHT_INVALID,
                 .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
                 .cpu_quota_per_sec_usec = USEC_INFINITY,
+                .cpu_quota_period_usec = USEC_INFINITY,
 
                 .cpu_shares = CGROUP_CPU_SHARES_INVALID,
                 .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID,
@@ -206,6 +210,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
         CGroupDeviceAllow *a;
         IPAddressAccessItem *iaai;
         char u[FORMAT_TIMESPAN_MAX];
+        char v[FORMAT_TIMESPAN_MAX];
 
         assert(c);
         assert(f);
@@ -224,10 +229,12 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                 "%sCPUShares=%" PRIu64 "\n"
                 "%sStartupCPUShares=%" PRIu64 "\n"
                 "%sCPUQuotaPerSecSec=%s\n"
+                "%sCPUQuotaPeriodSec=%s\n"
                 "%sIOWeight=%" PRIu64 "\n"
                 "%sStartupIOWeight=%" PRIu64 "\n"
                 "%sBlockIOWeight=%" PRIu64 "\n"
                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
+                "%sDefaultMemoryLow=%" PRIu64 "\n"
                 "%sMemoryMin=%" PRIu64 "\n"
                 "%sMemoryLow=%" PRIu64 "\n"
                 "%sMemoryHigh=%" PRIu64 "\n"
@@ -248,10 +255,12 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                 prefix, c->cpu_shares,
                 prefix, c->startup_cpu_shares,
                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
+                prefix, format_timespan(v, sizeof(v), c->cpu_quota_period_usec, 1),
                 prefix, c->io_weight,
                 prefix, c->startup_io_weight,
                 prefix, c->blockio_weight,
                 prefix, c->startup_blockio_weight,
+                prefix, c->default_memory_low,
                 prefix, c->memory_min,
                 prefix, c->memory_low,
                 prefix, c->memory_high,
@@ -375,6 +384,37 @@ int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode)
         return 0;
 }
 
+#define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry)                       \
+        uint64_t unit_get_ancestor_##entry(Unit *u) {                   \
+                CGroupContext *c;                                       \
+                                                                        \
+                /* 1. Is entry set in this unit? If so, use that.       \
+                 * 2. Is the default for this entry set in any          \
+                 *    ancestor? If so, use that.                        \
+                 * 3. Otherwise, return CGROUP_LIMIT_MIN. */            \
+                                                                        \
+                assert(u);                                              \
+                                                                        \
+                c = unit_get_cgroup_context(u);                         \
+                                                                        \
+                if (c->entry##_set)                                     \
+                        return c->entry;                                \
+                                                                        \
+                while (UNIT_ISSET(u->slice)) {                          \
+                        u = UNIT_DEREF(u->slice);                       \
+                        c = unit_get_cgroup_context(u);                 \
+                                                                        \
+                        if (c->default_##entry##_set)                   \
+                                return c->default_##entry;              \
+                }                                                       \
+                                                                        \
+                /* We've reached the root, but nobody had default for  \
+                 * this entry set, so set it to the kernel default. */ \
+                return CGROUP_LIMIT_MIN;                                \
+}
+
+UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
+
 static void cgroup_xattr_apply(Unit *u) {
         char ids[SD_ID128_STRING_MAX];
         int r;
@@ -656,6 +696,40 @@ static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state)
         return CGROUP_CPU_SHARES_DEFAULT;
 }
 
+usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
+        /* kernel uses a minimum resolution of 1ms, so both period and (quota * period)
+         * need to be higher than that boundary. quota is specified in USecPerSec.
+         * Additionally, period must be at most max_period. */
+        assert(quota > 0);
+
+        return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
+}
+
+static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
+        usec_t new_period;
+
+        if (quota == USEC_INFINITY)
+                /* Always use default period for infinity quota. */
+                return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
+
+        if (period == USEC_INFINITY)
+                /* Default period was requested. */
+                period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
+
+        /* Clamp to interval [1ms, 1s] */
+        new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);
+
+        if (new_period != period) {
+                char v[FORMAT_TIMESPAN_MAX];
+                log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING, 0,
+                              "Clamping CPU interval for cpu.max: period is now %s",
+                              format_timespan(v, sizeof(v), new_period, 1));
+                u->warned_clamping_cpu_quota_period = true;
+        }
+
+        return new_period;
+}
+
 static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
         char buf[DECIMAL_STR_MAX(uint64_t) + 2];
 
@@ -663,14 +737,15 @@ static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
         (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf);
 }
 
-static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota) {
+static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
         char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];
 
+        period = cgroup_cpu_adjust_period_and_log(u, period, quota);
+
         if (quota != USEC_INFINITY)
                 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
-                         quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC);
+                         MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
         else
-                xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
+                xsprintf(buf, "max " USEC_FMT "\n", period);
 
         (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
 }
@@ -681,14 +756,16 @@ static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) {
         (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf);
 }
 
-static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota) {
+static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) {
         char buf[DECIMAL_STR_MAX(usec_t) + 2];
 
-        xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
+        period = cgroup_cpu_adjust_period_and_log(u, period, quota);
+
+        xsprintf(buf, USEC_FMT "\n", period);
         (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf);
 
         if (quota != USEC_INFINITY) {
-                xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
+                xsprintf(buf, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC));
                 (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf);
         } else
                 (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n");
@@ -833,8 +910,17 @@ static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint6
         (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf);
 }
 
-static bool cgroup_context_has_unified_memory_config(CGroupContext *c) {
-        return c->memory_min > 0 || c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX || c->memory_swap_max != CGROUP_LIMIT_MAX;
+static bool unit_has_unified_memory_config(Unit *u) {
+        CGroupContext *c;
+
+        assert(u);
+
+        c = unit_get_cgroup_context(u);
+        assert(c);
+
+        return c->memory_min > 0 || unit_get_ancestor_memory_low(u) > 0 ||
+               c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX ||
+               c->memory_swap_max != CGROUP_LIMIT_MAX;
 }
 
 static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
@@ -911,7 +997,7 @@ static void cgroup_context_apply(
                         weight = CGROUP_WEIGHT_DEFAULT;
 
                 cgroup_apply_unified_cpu_weight(u, weight);
-                cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec);
+                cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
 
         } else {
                 uint64_t shares;
@@ -930,7 +1016,7 @@ static void cgroup_context_apply(
                         shares = CGROUP_CPU_SHARES_DEFAULT;
 
                 cgroup_apply_legacy_cpu_shares(u, shares);
-                cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec);
+                cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
         }
 }
 
@@ -1083,7 +1169,7 @@ static void cgroup_context_apply(
         if (cg_all_unified() > 0) {
                 uint64_t max, swap_max = CGROUP_LIMIT_MAX;
 
-                if (cgroup_context_has_unified_memory_config(c)) {
+                if (unit_has_unified_memory_config(u)) {
                         max = c->memory_max;
                         swap_max = c->memory_swap_max;
                 } else {
@@ -1094,16 +1180,18 @@ static void cgroup_context_apply(
                 }
 
                 cgroup_apply_unified_memory_limit(u, "memory.min", c->memory_min);
-                cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low);
+                cgroup_apply_unified_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
                 cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
                 cgroup_apply_unified_memory_limit(u, "memory.max", max);
                 cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
+
+                (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
+
         } else {
                 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
                 uint64_t val;
 
-                if (cgroup_context_has_unified_memory_config(c)) {
+                if (unit_has_unified_memory_config(u)) {
                         val = c->memory_max;
                         log_cgroup_compat(u, "Applying MemoryMax=%" PRIi64 " as MemoryLimit=", val);
                 } else
@@ -1277,8 +1365,13 @@ static bool unit_get_needs_bpf_firewall(Unit *u) {
         return false;
 }
 
-static CGroupMask cgroup_context_get_mask(CGroupContext *c) {
+static CGroupMask unit_get_cgroup_mask(Unit *u) {
         CGroupMask mask = 0;
+        CGroupContext *c;
+
+        assert(u);
+
+        c = unit_get_cgroup_context(u);
 
         /* Figure out which controllers we need, based on the cgroup context object */
 
@@ -1295,7 +1388,7 @@ static CGroupMask cgroup_context_get_mask(CGroupContext *c) {
 
         if (c->memory_accounting ||
             c->memory_limit != CGROUP_LIMIT_MAX ||
-            cgroup_context_has_unified_memory_config(c))
+            unit_has_unified_memory_config(u))
                 mask |= CGROUP_MASK_MEMORY;
 
         if (c->device_allow ||
@@ -1334,7 +1427,7 @@ CGroupMask unit_get_own_mask(Unit *u) {
         if (!c)
                 return 0;
 
-        return (cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u)) & ~unit_get_ancestor_disable_mask(u);
+        return (unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u)) & ~unit_get_ancestor_disable_mask(u);
 }
 
 CGroupMask unit_get_delegate_mask(Unit *u) {
@@ -1526,15 +1619,14 @@ int unit_set_cgroup_path(Unit *u, const char *path) {
 
         assert(u);
 
+        if (streq_ptr(u->cgroup_path, path))
+                return 0;
+
         if (path) {
                 p = strdup(path);
                 if (!p)
                         return -ENOMEM;
-        } else
-                p = NULL;
-
-        if (streq_ptr(u->cgroup_path, p))
-                return 0;
+        }
 
         if (p) {
                 r = hashmap_put(u->manager->cgroup_unit, p, u);
@@ -1543,7 +1635,6 @@ int unit_set_cgroup_path(Unit *u, const char *path) {
         }
 
         unit_release_cgroup(u);
-
         u->cgroup_path = TAKE_PTR(p);
 
         return 1;
@@ -1555,10 +1646,13 @@ int unit_watch_cgroup(Unit *u) {
 
         assert(u);
 
+        /* Watches the "cgroups.events" attribute of this unit's cgroup for "empty" events, but only if
+         * cgroupv2 is available. */
+
         if (!u->cgroup_path)
                 return 0;
 
-        if (u->cgroup_inotify_wd >= 0)
+        if (u->cgroup_control_inotify_wd >= 0)
                 return 0;
 
         /* Only applies to the unified hierarchy */
@@ -1568,11 +1662,11 @@ int unit_watch_cgroup(Unit *u) {
         if (r == 0)
                 return 0;
 
-        /* Don't watch the root slice, it's pointless. */
+        /* No point in watching the top-level slice, it's never going to run empty. */
         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                 return 0;
 
-        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
+        r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
         if (r < 0)
                 return log_oom();
 
@@ -1580,20 +1674,82 @@ int unit_watch_cgroup(Unit *u) {
         if (r < 0)
                 return log_oom();
 
-        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
-        if (u->cgroup_inotify_wd < 0) {
+        u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
+        if (u->cgroup_control_inotify_wd < 0) {
+
+                if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
+                                      * is not an error */
+                        return 0;
+
+                return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", u->cgroup_path);
+        }
+
+        r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor to hash map: %m");
+
+        return 0;
+}
+
+int unit_watch_cgroup_memory(Unit *u) {
+        _cleanup_free_ char *events = NULL;
+        CGroupContext *c;
+        int r;
+
+        assert(u);
+
+        /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
+         * cgroupv2 is available. */
+
+        if (!u->cgroup_path)
+                return 0;
+
+        c = unit_get_cgroup_context(u);
+        if (!c)
+                return 0;
+
+        /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie
+         * this to memory accounting, in a way watching for OOM kills is a form of memory accounting after
+         * all. */
+        if (!c->memory_accounting)
+                return 0;
+
+        /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and
+         * we also don't want to generate a log message for each parent cgroup of a process. */
+        if (u->type == UNIT_SLICE)
+                return 0;
+
+        if (u->cgroup_memory_inotify_wd >= 0)
+                return 0;
+
+        /* Only applies to the unified hierarchy */
+        r = cg_all_unified();
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m");
+        if (r == 0)
+                return 0;
+
+        r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
+        if (r < 0)
+                return log_oom();
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events);
+        if (r < 0)
+                return log_oom();
 
-                if (errno == ENOENT) /* If the directory is already
-                                      * gone we don't need to track
-                                      * it, so this is not an error */
+        u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
+        if (u->cgroup_memory_inotify_wd < 0) {
+
+                if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
+                                      * is not an error */
                         return 0;
 
-                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
+                return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", u->cgroup_path);
         }
 
-        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
+        r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u);
         if (r < 0)
-                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
+                return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor to hash map: %m");
 
         return 0;
 }
@@ -1650,6 +1806,7 @@ static int unit_create_cgroup(
 
         /* Start watching it */
         (void) unit_watch_cgroup(u);
+        (void) unit_watch_cgroup_memory(u);
 
         /* Preserve enabled controllers in delegated units, adjust others. */
         if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) {
@@ -2183,12 +2340,20 @@ void unit_release_cgroup(Unit *u) {
                 u->cgroup_path = mfree(u->cgroup_path);
         }
 
-        if (u->cgroup_inotify_wd >= 0) {
-                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
-                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id);
+        if (u->cgroup_control_inotify_wd >= 0) {
+                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0)
+                        log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id);
+
+                (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd));
+                u->cgroup_control_inotify_wd = -1;
+        }
+
+        if (u->cgroup_memory_inotify_wd >= 0) {
+                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0)
+                        log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id);
 
-                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
-                u->cgroup_inotify_wd = -1;
+                (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd));
+                u->cgroup_memory_inotify_wd = -1;
         }
 }
 
@@ -2227,7 +2392,7 @@ void unit_prune_cgroup(Unit *u) {
 
 int unit_search_main_pid(Unit *u, pid_t *ret) {
         _cleanup_fclose_ FILE *f = NULL;
-        pid_t pid = 0, npid, mypid;
+        pid_t pid = 0, npid;
         int r;
 
         assert(u);
@@ -2240,15 +2405,12 @@ int unit_search_main_pid(Unit *u, pid_t *ret) {
         if (r < 0)
                 return r;
 
-        mypid = getpid_cached();
         while (cg_read_pid(f, &npid) > 0) {
-                pid_t ppid;
 
                 if (npid == pid)
                         continue;
 
-                /* Ignore processes that aren't our kids */
-                if (get_process_ppid(npid, &ppid) >= 0 && ppid != mypid)
+                if (pid_is_my_child(npid) == 0)
                         continue;
 
                 if (pid != 0)
@@ -2280,7 +2442,7 @@ static int unit_watch_pids_in_path(Unit *u, const char *path) {
                 pid_t pid;
 
                 while ((r = cg_read_pid(f, &pid)) > 0) {
-                        r = unit_watch_pid(u, pid);
+                        r = unit_watch_pid(u, pid, false);
                        if (r < 0 && ret >= 0)
                                ret = r;
                 }
@@ -2440,6 +2602,106 @@ void unit_add_to_cgroup_empty_queue(Unit *u) {
                 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
 }
 
+static int unit_check_oom(Unit *u) {
+        _cleanup_free_ char *oom_kill = NULL;
+        bool increased;
+        uint64_t c;
+        int r;
+
+        if (!u->cgroup_path)
+                return 0;
+
+        r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
+
+        r = safe_atou64(oom_kill, &c);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
+
+        increased = c > u->oom_kill_last;
+        u->oom_kill_last = c;
+
+        if (!increased)
+                return 0;
+
+        log_struct(LOG_NOTICE,
+                   "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR,
+                   LOG_UNIT_ID(u),
+                   LOG_UNIT_INVOCATION_ID(u),
+                   LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));
+
+        if (UNIT_VTABLE(u)->notify_cgroup_oom)
+                UNIT_VTABLE(u)->notify_cgroup_oom(u);
+
+        return 1;
+}
+
+static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
+        Manager *m = userdata;
+        Unit *u;
+        int r;
+
+        assert(s);
+        assert(m);
+
+        u = m->cgroup_oom_queue;
+        if (!u)
+                return 0;
+
+        assert(u->in_cgroup_oom_queue);
+        u->in_cgroup_oom_queue = false;
+        LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u);
+
+        if (m->cgroup_oom_queue) {
+                /* More stuff queued, let's make sure we remain enabled */
+                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
+        }
+
+        (void) unit_check_oom(u);
+        return 0;
+}
+
+static void unit_add_to_cgroup_oom_queue(Unit *u) {
+        int r;
+
+        assert(u);
+
+        if (u->in_cgroup_oom_queue)
+                return;
+        if (!u->cgroup_path)
+                return;
+
+        LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
+        u->in_cgroup_oom_queue = true;
+
+        /* Trigger the defer event */
+        if (!u->manager->cgroup_oom_event_source) {
+                _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+
+                r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to create cgroup oom event source: %m");
+                        return;
+                }
+
+                r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
+                        return;
+                }
+
+                (void) sd_event_source_set_description(s, "cgroup-oom");
+                u->manager->cgroup_oom_event_source = TAKE_PTR(s);
+        }
+
+        r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT);
+        if (r < 0)
+                log_error_errno(r, "Failed to enable cgroup oom event source: %m");
+}
+
 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
         Manager *m = userdata;
 
@@ -2471,15 +2733,16 @@ static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents,
                                 /* The watch was just removed */
                                 continue;
 
-                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
-                        if (!u) /* Not that inotify might deliver
-                                 * events for a watch even after it
-                                 * was removed, because it was queued
-                                 * before the removal. Let's ignore
-                                 * this here safely. */
-                                continue;
+                        /* Note that inotify might deliver events for a watch even after it was removed,
+                         * because it was queued before the removal. Let's ignore this here safely. */
+
+                        u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
+                        if (u)
+                                unit_add_to_cgroup_empty_queue(u);
 
-                        unit_add_to_cgroup_empty_queue(u);
+                        u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
+                        if (u)
+                                unit_add_to_cgroup_oom_queue(u);
                 }
         }
 }
@@ -2567,6 +2830,9 @@ int manager_setup_cgroup(Manager *m) {
         if (r < 0)
                 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
 
+        /* Schedule cgroup empty checks early, but after having processed service notification messages or
+         * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
+         * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
         r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
         if (r < 0)
                 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
@@ -2593,9 +2859,10 @@ int manager_setup_cgroup(Manager *m) {
                 if (r < 0)
                         return log_error_errno(r, "Failed to watch control group inotify object: %m");
 
-                /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
-                 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
-                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
+                /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
+                 * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
+                 * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
+                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
                 if (r < 0)
                         return log_error_errno(r, "Failed to set priority of inotify event source: %m");
 
@@ -2665,7 +2932,8 @@ void manager_shutdown_cgroup(Manager *m, bool delete) {
 
         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
 
-        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
+        m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
+        m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);
 
         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
@@ -2780,7 +3048,7 @@ int unit_get_memory_current(Unit *u, uint64_t *ret) {
 
         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
         if (unit_has_host_root_cgroup(u))
-                return procfs_memory_get_current(ret);
+                return procfs_memory_get_used(ret);
 
         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                 return -ENODATA;
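
The clamping arithmetic introduced above in cgroup_cpu_adjust_period() can be checked in isolation. The standalone sketch below is not part of the patch: usec_t, MIN, MAX3 and the time constants are redefined locally as stand-ins for systemd's time-util.h/macro.h definitions. It reproduces the same formula and shows how a too-small CPUQuotaPeriodSec= request is raised so that both the period and the per-period quota slice stay at or above the kernel's 1ms resolution, while never exceeding the 1s cap:

/* Standalone illustration of the period clamping above; not systemd code. */
#include <inttypes.h>
#include <stdio.h>

typedef uint64_t usec_t;

#define USEC_PER_SEC  ((usec_t) 1000000ULL)
#define USEC_PER_MSEC ((usec_t) 1000ULL)

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MAX3(a, b, c) MAX(MAX(a, b), (c))

/* Same formula as cgroup_cpu_adjust_period(): the period must be at least
 * `resolution`, large enough that quota * period / USEC_PER_SEC is also at
 * least `resolution`, and no larger than `max_period`. */
static usec_t adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
        return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
}

int main(void) {
        /* CPUQuotaPeriodSec=1us with a 1% quota (10ms of CPU per second):
         * 1us is below the 1ms kernel resolution, and a 1% quota needs a
         * period of at least 100ms for the quota slice itself to reach 1ms. */
        usec_t p = adjust_period(1, 10 * USEC_PER_MSEC, USEC_PER_MSEC, USEC_PER_SEC);
        printf("clamped period: %" PRIu64 " us\n", p); /* prints 100000 */
        return 0;
}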