1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
6 #include "errno-util.h"
9 #include "format-util.h"
10 #include "oomd-util.h"
11 #include "parse-util.h"
12 #include "path-util.h"
13 #include "procfs-util.h"
14 #include "signal-util.h"
15 #include "sort-util.h"
16 #include "stat-util.h"
17 #include "stdio-util.h"
18 #include "user-util.h"
/* Defines the hash ops object oomd_cgroup_ctx_hash_ops; oomd_cgroup_context_free() is supplied as the
 * last argument, i.e. the destructor applied to stored values.
 * NOTE(review): the key type and hash/compare arguments are not visible in this view — confirm
 * against the full file. */
20 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
21 oomd_cgroup_ctx_hash_ops
,
26 oomd_cgroup_context_free
);
/* Logging callback handed to cg_kill()/cg_kill_recursive() by oomd_cgroup_kill(): emits a debug
 * message naming the PID and the signal (as a string) of the kill attempt. userdata is not used by
 * the visible statement.
 * NOTE(review): the return statement is not visible in this view. */
28 static int log_kill(pid_t pid
, int sig
, void *userdata
) {
29 log_debug("oomd attempting to kill " PID_FMT
" with %s", pid
, signal_to_string(sig
));
/* Adds num_procs_killed to the decimal counter stored in the extended attribute `xattr` of the
 * cgroup `path`.
 *
 * The current value is read with cg_get_xattr_malloc(), parsed with safe_atou64() when non-empty
 * (curr_count stays 0 otherwise), checked against overflow, formatted into buf and written back
 * with cg_set_xattr().
 * NOTE(review): the statements executed when each check fails, and the final return, are not
 * visible in this view. */
33 static int increment_oomd_xattr(const char *path
, const char *xattr
, uint64_t num_procs_killed
) {
34 _cleanup_free_
char *value
= NULL
;
35 char buf
[DECIMAL_STR_MAX(uint64_t) + 1];
36 uint64_t curr_count
= 0;
42 r
= cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER
, path
, xattr
, &value
);
/* Only failures other than "xattr absent" match this condition; an absent xattr simply leaves
 * value empty and the count at 0. */
43 if (r
< 0 && !ERRNO_IS_XATTR_ABSENT(r
))
46 if (!isempty(value
)) {
47 r
= safe_atou64(value
, &curr_count
);
/* Overflow guard: true when curr_count + num_procs_killed would exceed UINT64_MAX. */
52 if (curr_count
> UINT64_MAX
- num_procs_killed
)
55 xsprintf(buf
, "%"PRIu64
, curr_count
+ num_procs_killed
);
56 r
= cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER
, path
, xattr
, buf
, strlen(buf
), 0);
/* Destructor for OomdCGroupContext objects; it is the value destructor registered in
 * oomd_cgroup_ctx_hash_ops.
 * NOTE(review): the body is not visible in this view — the exact cleanup steps and the returned
 * pointer need to be confirmed against the full file. */
63 OomdCGroupContext
*oomd_cgroup_context_free(OomdCGroupContext
*ctx
) {
/* Collects the cgroup contexts of h whose memory pressure (avg10) has been above their
 * mem_pressure_limit for at least `duration`.
 *
 * For each context above its limit, mem_pressure_limit_hit_start is stamped with the current
 * monotonic time if it was still 0, and the context pointer is added to a freshly allocated set once
 * the elapsed time reaches `duration`. If the set ends up non-empty, ownership of it is passed to
 * *ret.
 * NOTE(review): allocation/insert error handling and the return values are not visible in this
 * view. */
71 int oomd_pressure_above(Hashmap
*h
, usec_t duration
, Set
**ret
) {
72 _cleanup_set_free_ Set
*targets
= NULL
;
73 OomdCGroupContext
*ctx
;
80 targets
= set_new(NULL
);
84 HASHMAP_FOREACH_KEY(ctx
, key
, h
) {
85 if (ctx
->memory_pressure
.avg10
> ctx
->mem_pressure_limit
) {
/* First sample above the limit: remember when the excursion started. */
88 if (ctx
->mem_pressure_limit_hit_start
== 0)
89 ctx
->mem_pressure_limit_hit_start
= now(CLOCK_MONOTONIC
);
91 diff
= now(CLOCK_MONOTONIC
) - ctx
->mem_pressure_limit_hit_start
;
92 if (diff
>= duration
) {
93 r
= set_put(targets
, ctx
);
/* Clears the start timestamp. NOTE(review): the branch this belongs to is not visible — presumably
 * the "pressure not above limit" case; confirm. */
98 ctx
->mem_pressure_limit_hit_start
= 0;
101 if (!set_isempty(targets
)) {
102 *ret
= TAKE_PTR(targets
);
110 uint64_t oomd_pgscan_rate(const OomdCGroupContext
*c
) {
111 uint64_t last_pgscan
;
115 /* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero.
116 * pgscan is monotonic and in practice should not decrease (except in the recreation case). */
117 last_pgscan
= c
->last_pgscan
;
118 if (c
->last_pgscan
> c
->pgscan
) {
119 log_debug("Last pgscan %"PRIu64
" greater than current pgscan %"PRIu64
" for %s. Using last pgscan of zero.",
120 c
->last_pgscan
, c
->pgscan
, c
->path
);
124 return c
->pgscan
- last_pgscan
;
127 bool oomd_mem_available_below(const OomdSystemContext
*ctx
, int threshold_permyriad
) {
128 uint64_t mem_threshold
;
131 assert(threshold_permyriad
<= 10000);
133 mem_threshold
= ctx
->mem_total
* threshold_permyriad
/ (uint64_t) 10000;
134 return LESS_BY(ctx
->mem_total
, ctx
->mem_used
) < mem_threshold
;
137 bool oomd_swap_free_below(const OomdSystemContext
*ctx
, int threshold_permyriad
) {
138 uint64_t swap_threshold
;
141 assert(threshold_permyriad
<= 10000);
143 swap_threshold
= ctx
->swap_total
* threshold_permyriad
/ (uint64_t) 10000;
144 return (ctx
->swap_total
- ctx
->swap_used
) < swap_threshold
;
/* Determines ctx->preference for the cgroup ctx->path, evaluated relative to `prefix` (an empty
 * prefix is mapped to the root by empty_to_root()).
 *
 * - ctx->path must be located below prefix, otherwise EINVAL is logged at debug level and returned.
 * - The owners of ctx->path and of prefix are looked up with cg_get_owner(); if the UIDs differ the
 *   preference is set to MANAGED_OOM_PREFERENCE_NONE.
 * - The boolean xattrs user.oomd_avoid and user.oomd_omit are read; a positive result selects
 *   MANAGED_OOM_PREFERENCE_AVOID resp. MANAGED_OOM_PREFERENCE_OMIT, anything else leaves the
 *   preference as it was. Because "omit" is evaluated last, it takes precedence when both are set.
 *
 * NOTE(review): several conditions (the checks guarding the log_debug_errno()/log_oom_debug()
 * returns, and what follows the UID mismatch assignment) are not visible in this view. */
147 int oomd_fetch_cgroup_oom_preference(OomdCGroupContext
*ctx
, const char *prefix
) {
153 prefix
= empty_to_root(prefix
);
155 if (!path_startswith(ctx
->path
, prefix
))
156 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
),
157 "%s is not a descendant of %s", ctx
->path
, prefix
);
159 r
= cg_get_owner(SYSTEMD_CGROUP_CONTROLLER
, ctx
->path
, &uid
);
161 return log_debug_errno(r
, "Failed to get owner/group from %s: %m", ctx
->path
);
166 r
= cg_get_owner(SYSTEMD_CGROUP_CONTROLLER
, prefix
, &prefix_uid
);
168 return log_debug_errno(r
, "Failed to get owner/group from %s: %m", prefix
);
/* Different owners: the xattr-based preference of this cgroup is not honoured. */
170 if (uid
!= prefix_uid
) {
171 ctx
->preference
= MANAGED_OOM_PREFERENCE_NONE
;
176 /* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used
177 * as an optional feature of systemd-oomd (and the system might not even support them). */
178 r
= cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER
, ctx
->path
, "user.oomd_avoid");
180 return log_oom_debug();
181 if (r
< 0 && !ERRNO_IS_XATTR_ABSENT(r
))
182 log_debug_errno(r
, "Failed to get xattr user.oomd_avoid, ignoring: %m");
183 ctx
->preference
= r
> 0 ? MANAGED_OOM_PREFERENCE_AVOID
: ctx
->preference
;
185 r
= cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER
, ctx
->path
, "user.oomd_omit");
187 return log_oom_debug();
188 if (r
< 0 && !ERRNO_IS_XATTR_ABSENT(r
))
189 log_debug_errno(r
, "Failed to get xattr user.oomd_omit, ignoring: %m");
190 ctx
->preference
= r
> 0 ? MANAGED_OOM_PREFERENCE_OMIT
: ctx
->preference
;
/* Builds an array of the cgroup contexts stored in h, sorts it with compare_func (via
 * typesafe_qsort()) and hands the array to *ret.
 *
 * The array is allocated with room for hashmap_size(h) pointers. While iterating, contexts whose
 * path is not below `prefix` (when both are set) and contexts whose preference is
 * MANAGED_OOM_PREFERENCE_OMIT are singled out; the preference is refreshed with
 * oomd_fetch_cgroup_oom_preference() first.
 * NOTE(review): the statements taken for those conditions (presumably `continue`), the handling of
 * r, and the return value (presumably the element count k, given the INT_MAX assertion) are not
 * visible in this view. */
195 int oomd_sort_cgroup_contexts(Hashmap
*h
, oomd_compare_t compare_func
, const char *prefix
, OomdCGroupContext
***ret
) {
196 _cleanup_free_ OomdCGroupContext
**sorted
= NULL
;
197 OomdCGroupContext
*item
;
202 assert(compare_func
);
205 sorted
= new0(OomdCGroupContext
*, hashmap_size(h
));
209 HASHMAP_FOREACH(item
, h
) {
210 /* Skip over cgroups that are not valid candidates or are explicitly marked for omission */
211 if (item
->path
&& prefix
&& !path_startswith(item
->path
, prefix
))
214 r
= oomd_fetch_cgroup_oom_preference(item
, prefix
);
218 if (item
->preference
== MANAGED_OOM_PREFERENCE_OMIT
)
224 typesafe_qsort(sorted
, k
, compare_func
);
226 *ret
= TAKE_PTR(sorted
);
/* The element count must be representable as an int. */
228 assert(k
<= INT_MAX
);
/* Sends SIGKILL to the processes of the cgroup `path`, using cg_kill_recursive() or cg_kill()
 * (never signalling the calling process itself, per CGROUP_IGNORE_SELF), and records the PIDs that
 * were killed in a set.
 *
 * A dry-run message ("Would have tried to kill …") is logged instead for dry runs. The counter xattr
 * user.oomd_ooms is incremented by 1 before the kill and user.oomd_kill by the number of killed PIDs
 * afterwards; failures to update either are only logged at debug level. The visible final return
 * yields non-zero iff at least one PID was killed.
 * NOTE(review): the conditions selecting dry-run vs. real kill and recursive vs. non-recursive kill
 * (presumably dry_run and recurse), as well as the error returns, are not visible in this view. */
232 int oomd_cgroup_kill(const char *path
, bool recurse
, bool dry_run
) {
233 _cleanup_set_free_ Set
*pids_killed
= NULL
;
239 _cleanup_free_
char *cg_path
= NULL
;
241 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &cg_path
);
245 log_info("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path
, true_false(recurse
));
249 pids_killed
= set_new(NULL
);
253 r
= increment_oomd_xattr(path
, "user.oomd_ooms", 1);
255 log_debug_errno(r
, "Failed to set user.oomd_ooms before kill: %m");
258 r
= cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER
, path
, SIGKILL
, CGROUP_IGNORE_SELF
, pids_killed
, log_kill
, NULL
);
260 r
= cg_kill(SYSTEMD_CGROUP_CONTROLLER
, path
, SIGKILL
, CGROUP_IGNORE_SELF
, pids_killed
, log_kill
, NULL
);
262 /* The cgroup could have been cleaned up after we have sent SIGKILL to all of the processes, but before
263 * we could do one last iteration of cgroup.procs to check. Or the service unit could have exited and
264 * was removed between picking candidates and coming into this function. In either case, let's log
265 * about it and let the caller decide what to do once they know how many PIDs were killed. */
266 if (IN_SET(r
, -ENOENT
, -ENODEV
))
267 log_debug_errno(r
, "Error when sending SIGKILL to processes in cgroup path %s, ignoring: %m", path
);
271 if (set_isempty(pids_killed
))
272 log_debug("Nothing killed when attempting to kill %s", path
);
274 r
= increment_oomd_xattr(path
, "user.oomd_kill", set_size(pids_killed
));
276 log_debug_errno(r
, "Failed to set user.oomd_kill on kill: %m");
278 return set_size(pids_killed
) != 0;
281 typedef void (*dump_candidate_func
)(const OomdCGroupContext
*ctx
, FILE *f
, const char *prefix
);
/* Logs, at LOG_INFO level, a summary of the kill candidates: a header line with the number n of
 * cgroups considered, followed by the first dump_until entries of `sorted`, each rendered by
 * dump_func with a tab as prefix.
 *
 * The text is assembled in an in-memory stream (open_memstream_unlocked()), flushed with
 * fflush_and_check() and then passed to log_dump().
 * NOTE(review): the error checks after opening/flushing the stream are not visible in this view. */
283 static int dump_kill_candidates(OomdCGroupContext
**sorted
, int n
, int dump_until
, dump_candidate_func dump_func
) {
284 /* Try dumping top offenders, ignoring any errors that might happen. */
285 _cleanup_free_
char *dump
= NULL
;
286 _cleanup_fclose_
FILE *f
= NULL
;
290 f
= open_memstream_unlocked(&dump
, &size
);
294 fprintf(f
, "Considered %d cgroups for killing, top candidates were:\n", n
);
295 for (int i
= 0; i
< dump_until
; i
++)
296 dump_func(sorted
[i
], f
, "\t");
298 r
= fflush_and_check(f
);
307 return log_dump(LOG_INFO
, dump
);
/* Picks a cgroup below `prefix` to kill based on pgscan rate and memory usage.
 *
 * The contexts of h are sorted with compare_pgscan_rate_and_memory_usage; the sorted list is then
 * walked in order, ignoring entries with neither pgscan nor memory usage, and oomd_cgroup_kill() is
 * attempted (recursively, honouring dry_run) on each remaining entry. For the selected entry the
 * path is duplicated into *ret_selected and dump_until is extended so that the entry is part of the
 * candidate dump, which is written via dump_kill_candidates() with the memory pressure dumper.
 * NOTE(review): the checks on n/r, the loop exit and the return values are not visible in this
 * view. */
310 int oomd_kill_by_pgscan_rate(Hashmap
*h
, const char *prefix
, bool dry_run
, char **ret_selected
) {
311 _cleanup_free_ OomdCGroupContext
**sorted
= NULL
;
316 assert(ret_selected
);
318 n
= oomd_sort_cgroup_contexts(h
, compare_pgscan_rate_and_memory_usage
, prefix
, &sorted
);
/* By default dump at most DUMP_ON_KILL_COUNT candidates. */
322 dump_until
= MIN(n
, DUMP_ON_KILL_COUNT
);
323 for (int i
= 0; i
< n
; i
++) {
324 /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure.
325 * Continue since there might be "avoid" cgroups at the end. */
326 if (sorted
[i
]->pgscan
== 0 && sorted
[i
]->current_memory_usage
== 0)
329 r
= oomd_cgroup_kill(sorted
[i
]->path
, /* recurse= */ true, /* dry_run= */ dry_run
);
331 return r
; /* Treat oom as a hard error */
335 continue; /* Try to find something else to kill */
/* Make sure the selected entry itself is included in the dump. */
338 dump_until
= MAX(dump_until
, i
+ 1);
339 char *selected
= strdup(sorted
[i
]->path
);
342 *ret_selected
= selected
;
347 dump_kill_candidates(sorted
, n
, dump_until
, oomd_dump_memory_pressure_cgroup_context
);
/* Picks a cgroup to kill based on swap usage.
 *
 * The contexts of h are sorted with compare_swap_usage (no prefix filter); the sorted list is then
 * walked in order, ignoring entries whose swap_usage does not exceed threshold_usage, and
 * oomd_cgroup_kill() is attempted (recursively, honouring dry_run) on each remaining entry. For the
 * selected entry the path is duplicated into *ret_selected and dump_until is extended so that the
 * entry is part of the candidate dump, which is written via dump_kill_candidates() with the swap
 * dumper.
 * NOTE(review): the checks on n/r, the loop exit and the return values are not visible in this
 * view. */
352 int oomd_kill_by_swap_usage(Hashmap
*h
, uint64_t threshold_usage
, bool dry_run
, char **ret_selected
) {
353 _cleanup_free_ OomdCGroupContext
**sorted
= NULL
;
358 assert(ret_selected
);
360 n
= oomd_sort_cgroup_contexts(h
, compare_swap_usage
, NULL
, &sorted
);
/* By default dump at most DUMP_ON_KILL_COUNT candidates. */
364 dump_until
= MIN(n
, DUMP_ON_KILL_COUNT
);
365 /* Try to kill cgroups with non-zero swap usage until we either succeed in killing or we get to a cgroup with
366 * no swap usage. Only cgroups whose swap usage exceeds the threshold are considered for killing. */
367 for (int i
= 0; i
< n
; i
++) {
368 /* Skip over cgroups with not enough swap usage. Don't break since there might be "avoid"
369 * cgroups at the end. */
370 if (sorted
[i
]->swap_usage
<= threshold_usage
)
373 r
= oomd_cgroup_kill(sorted
[i
]->path
, /* recurse= */ true, /* dry_run= */ dry_run
);
375 return r
; /* Treat oom as a hard error */
379 continue; /* Try to find something else to kill */
/* Make sure the selected entry itself is included in the dump. */
382 dump_until
= MAX(dump_until
, i
+ 1);
383 char *selected
= strdup(sorted
[i
]->path
);
386 *ret_selected
= selected
;
391 dump_kill_candidates(sorted
, n
, dump_until
, oomd_dump_swap_cgroup_context
);
/* Allocates a new OomdCGroupContext for the cgroup `path`, fills it from the cgroup's attribute
 * files and passes ownership to *ret.
 *
 * Data gathered by the visible calls:
 * - memory_pressure: parsed from the cgroup's memory.pressure file (PRESSURE_TYPE_FULL),
 * - current_memory_usage: from procfs_memory_get_used() or from memory.current,
 * - memory_min, memory_low, swap_usage: from memory.min, memory.low and memory.swap.current,
 * - pgscan: the "pgscan" key of memory.stat, converted with safe_atou64(),
 * - path: a copy of `path` (empty mapped to root); preference starts as MANAGED_OOM_PREFERENCE_NONE.
 *
 * Failures are logged at debug level and returned.
 * NOTE(review): the conditions guarding each call and return are not visible in this view —
 * presumably is_root selects the procfs path and skips the per-cgroup files; confirm. */
396 int oomd_cgroup_context_acquire(const char *path
, OomdCGroupContext
**ret
) {
397 _cleanup_(oomd_cgroup_context_freep
) OomdCGroupContext
*ctx
= NULL
;
398 _cleanup_free_
char *p
= NULL
, *val
= NULL
;
405 ctx
= new0(OomdCGroupContext
, 1);
409 is_root
= empty_or_root(path
);
410 ctx
->preference
= MANAGED_OOM_PREFERENCE_NONE
;
412 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, "memory.pressure", &p
);
414 return log_debug_errno(r
, "Error getting cgroup memory pressure path from %s: %m", path
);
416 r
= read_resource_pressure(p
, PRESSURE_TYPE_FULL
, &ctx
->memory_pressure
);
418 return log_debug_errno(r
, "Error parsing memory pressure from %s: %m", p
);
421 r
= procfs_memory_get_used(&ctx
->current_memory_usage
);
423 return log_debug_errno(r
, "Error getting memory used from procfs: %m");
425 r
= cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER
, path
, "memory.current", &ctx
->current_memory_usage
);
427 return log_debug_errno(r
, "Error getting memory.current from %s: %m", path
);
429 r
= cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER
, path
, "memory.min", &ctx
->memory_min
);
431 return log_debug_errno(r
, "Error getting memory.min from %s: %m", path
);
433 r
= cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER
, path
, "memory.low", &ctx
->memory_low
);
435 return log_debug_errno(r
, "Error getting memory.low from %s: %m", path
);
437 r
= cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER
, path
, "memory.swap.current", &ctx
->swap_usage
);
439 /* The kernel can be compiled without support for memory.swap.* files,
440 * or it can be disabled with boot param 'swapaccount=0' */
441 log_once(LOG_WARNING
, "No kernel support for memory.swap.current from %s (try boot param swapaccount=1), ignoring.", path
);
443 return log_debug_errno(r
, "Error getting memory.swap.current from %s: %m", path
);
445 r
= cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER
, path
, "memory.stat", STRV_MAKE("pgscan"), &val
);
447 return log_debug_errno(r
, "Error getting pgscan from memory.stat under %s: %m", path
);
449 r
= safe_atou64(val
, &ctx
->pgscan
);
451 return log_debug_errno(r
, "Error converting pgscan value to uint64_t: %m");
454 ctx
->path
= strdup(empty_to_root(path
));
/* Success: hand the fully populated context over to the caller. */
458 *ret
= TAKE_PTR(ctx
);
/* Reads system-wide memory and swap figures from the meminfo-style file proc_meminfo_path.
 *
 * The file is read line by line; the MemTotal:, MemAvailable:, SwapTotal: and SwapFree: lines are
 * converted with convert_meminfo_value_to_uint64_bytes(), and a bit is recorded in field_filled for
 * each field seen. If not all four fields were found, or if MemAvailable exceeds MemTotal or
 * SwapFree exceeds SwapTotal, EINVAL is logged at debug level and returned. Otherwise the used
 * amounts are derived as total minus available/free.
 * NOTE(review): the open/read error handling, the loop exit once all fields are filled, the
 * arguments of the two "cannot be greater" messages, and the final store into *ret are not visible
 * in this view. */
462 int oomd_system_context_acquire(const char *proc_meminfo_path
, OomdSystemContext
*ret
) {
463 _cleanup_fclose_
FILE *f
= NULL
;
464 unsigned field_filled
= 0;
465 OomdSystemContext ctx
= {};
466 uint64_t mem_available
, swap_free
;
/* One bit per meminfo field that must be present; ALL is the mask of a complete read. */
471 MEM_AVAILABLE
= 1U << 1,
472 SWAP_TOTAL
= 1U << 2,
474 ALL
= MEM_TOTAL
|MEM_AVAILABLE
|SWAP_TOTAL
|SWAP_FREE
,
477 assert(proc_meminfo_path
);
480 f
= fopen(proc_meminfo_path
, "re");
485 _cleanup_free_
char *line
= NULL
;
488 r
= read_line(f
, LONG_LINE_MAX
, &line
);
494 if ((word
= startswith(line
, "MemTotal:"))) {
495 field_filled
|= MEM_TOTAL
;
496 r
= convert_meminfo_value_to_uint64_bytes(word
, &ctx
.mem_total
);
497 } else if ((word
= startswith(line
, "MemAvailable:"))) {
498 field_filled
|= MEM_AVAILABLE
;
499 r
= convert_meminfo_value_to_uint64_bytes(word
, &mem_available
);
500 } else if ((word
= startswith(line
, "SwapTotal:"))) {
501 field_filled
|= SWAP_TOTAL
;
502 r
= convert_meminfo_value_to_uint64_bytes(word
, &ctx
.swap_total
);
503 } else if ((word
= startswith(line
, "SwapFree:"))) {
504 field_filled
|= SWAP_FREE
;
505 r
= convert_meminfo_value_to_uint64_bytes(word
, &swap_free
);
510 return log_debug_errno(r
, "Error converting '%s' from %s to uint64_t: %m", line
, proc_meminfo_path
);
512 if (field_filled
== ALL
)
516 if (field_filled
!= ALL
)
517 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
), "%s is missing expected fields", proc_meminfo_path
);
/* Sanity checks that also make the subtractions below safe against unsigned wrap-around. */
519 if (mem_available
> ctx
.mem_total
)
520 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
),
521 "MemAvailable (%" PRIu64
") cannot be greater than MemTotal (%" PRIu64
") %m",
525 if (swap_free
> ctx
.swap_total
)
526 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
),
527 "SwapFree (%" PRIu64
") cannot be greater than SwapTotal (%" PRIu64
") %m",
531 ctx
.mem_used
= ctx
.mem_total
- mem_available
;
532 ctx
.swap_used
= ctx
.swap_total
- swap_free
;
/* Acquires a fresh context for the cgroup `path` (empty mapped to root) and inserts it into new_h,
 * keyed by the context's own path string.
 *
 * State carried across samples — last_pgscan (taken from the old pgscan), mem_pressure_limit,
 * mem_pressure_limit_hit_start and last_had_mem_reclaim — is copied from the entry for the same path
 * in old_h. If the resulting pgscan rate is positive, last_had_mem_reclaim is set to the current
 * monotonic time.
 * NOTE(review): the check that old_ctx was actually found, the handling of the hashmap_put()
 * result (including release of ownership of curr_ctx) and the return values are not visible in this
 * view. */
538 int oomd_insert_cgroup_context(Hashmap
*old_h
, Hashmap
*new_h
, const char *path
) {
539 _cleanup_(oomd_cgroup_context_freep
) OomdCGroupContext
*curr_ctx
= NULL
;
540 OomdCGroupContext
*old_ctx
;
546 path
= empty_to_root(path
);
548 r
= oomd_cgroup_context_acquire(path
, &curr_ctx
);
550 return log_debug_errno(r
, "Failed to get OomdCGroupContext for %s: %m", path
);
552 assert_se(streq(path
, curr_ctx
->path
));
554 old_ctx
= hashmap_get(old_h
, path
);
/* Carry per-cgroup history over from the previous sample. */
556 curr_ctx
->last_pgscan
= old_ctx
->pgscan
;
557 curr_ctx
->mem_pressure_limit
= old_ctx
->mem_pressure_limit
;
558 curr_ctx
->mem_pressure_limit_hit_start
= old_ctx
->mem_pressure_limit_hit_start
;
559 curr_ctx
->last_had_mem_reclaim
= old_ctx
->last_had_mem_reclaim
;
562 if (oomd_pgscan_rate(curr_ctx
) > 0)
563 curr_ctx
->last_had_mem_reclaim
= now(CLOCK_MONOTONIC
);
565 r
= hashmap_put(new_h
, curr_ctx
->path
, curr_ctx
);
/* For every context in curr_h, copies the state carried across samples from the entry with the same
 * path in old_h: last_pgscan (taken from the old pgscan), mem_pressure_limit,
 * mem_pressure_limit_hit_start and last_had_mem_reclaim. If the resulting pgscan rate is positive,
 * last_had_mem_reclaim is set to the current monotonic time.
 * NOTE(review): the handling of paths that have no entry in old_h is not visible in this view
 * (presumably they are skipped; confirm). */
573 void oomd_update_cgroup_contexts_between_hashmaps(Hashmap
*old_h
, Hashmap
*curr_h
) {
574 OomdCGroupContext
*ctx
;
579 HASHMAP_FOREACH(ctx
, curr_h
) {
580 OomdCGroupContext
*old_ctx
;
582 old_ctx
= hashmap_get(old_h
, ctx
->path
);
586 ctx
->last_pgscan
= old_ctx
->pgscan
;
587 ctx
->mem_pressure_limit
= old_ctx
->mem_pressure_limit
;
588 ctx
->mem_pressure_limit_hit_start
= old_ctx
->mem_pressure_limit_hit_start
;
589 ctx
->last_had_mem_reclaim
= old_ctx
->last_had_mem_reclaim
;
591 if (oomd_pgscan_rate(ctx
) > 0)
592 ctx
->last_had_mem_reclaim
= now(CLOCK_MONOTONIC
);
/* Dumps the swap-related view of a cgroup context: the path and, for non-root cgroups, the swap
 * usage formatted as a byte size. For the root cgroup the swap line instead refers the reader to
 * the system context. Every line starts with prefix (NULL is treated as empty).
 * NOTE(review): the output calls themselves (presumably fprintf() to f) and the first format line
 * are not visible in this view. */
596 void oomd_dump_swap_cgroup_context(const OomdCGroupContext
*ctx
, FILE *f
, const char *prefix
) {
600 if (!empty_or_root(ctx
->path
))
603 "%s\tSwap Usage: %s\n",
604 strempty(prefix
), ctx
->path
,
605 strempty(prefix
), FORMAT_BYTES(ctx
->swap_usage
));
609 "%s\tSwap Usage: (see System Context)\n",
610 strempty(prefix
), ctx
->path
,
/* Dumps the memory-pressure view of a cgroup context: path, memory pressure limit, the
 * avg10/avg60/avg300 pressure values (each split into integer and two-digit decimal part), the total
 * as a time span, and the current memory usage. For non-root cgroups memory.min, memory.low, pgscan
 * and the previous pgscan are printed as well. Every line starts with prefix (NULL is treated as
 * empty).
 * NOTE(review): the output calls themselves (presumably fprintf() to f) and the first format line
 * are not visible in this view. */
614 void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext
*ctx
, FILE *f
, const char *prefix
) {
620 "%s\tMemory Pressure Limit: %lu.%02lu%%\n"
621 "%s\tPressure: Avg10: %lu.%02lu Avg60: %lu.%02lu Avg300: %lu.%02lu Total: %s\n"
622 "%s\tCurrent Memory Usage: %s\n",
623 strempty(prefix
), ctx
->path
,
624 strempty(prefix
), LOADAVG_INT_SIDE(ctx
->mem_pressure_limit
), LOADAVG_DECIMAL_SIDE(ctx
->mem_pressure_limit
),
626 LOADAVG_INT_SIDE(ctx
->memory_pressure
.avg10
), LOADAVG_DECIMAL_SIDE(ctx
->memory_pressure
.avg10
),
627 LOADAVG_INT_SIDE(ctx
->memory_pressure
.avg60
), LOADAVG_DECIMAL_SIDE(ctx
->memory_pressure
.avg60
),
628 LOADAVG_INT_SIDE(ctx
->memory_pressure
.avg300
), LOADAVG_DECIMAL_SIDE(ctx
->memory_pressure
.avg300
),
629 FORMAT_TIMESPAN(ctx
->memory_pressure
.total
, USEC_PER_SEC
),
630 strempty(prefix
), FORMAT_BYTES(ctx
->current_memory_usage
));
/* The per-cgroup protection and pgscan figures are only printed for non-root cgroups. */
632 if (!empty_or_root(ctx
->path
))
634 "%s\tMemory Min: %s\n"
635 "%s\tMemory Low: %s\n"
636 "%s\tPgscan: %" PRIu64
"\n"
637 "%s\tLast Pgscan: %" PRIu64
"\n",
638 strempty(prefix
), FORMAT_BYTES_CGROUP_PROTECTION(ctx
->memory_min
),
639 strempty(prefix
), FORMAT_BYTES_CGROUP_PROTECTION(ctx
->memory_low
),
640 strempty(prefix
), ctx
->pgscan
,
641 strempty(prefix
), ctx
->last_pgscan
);
/* Dumps the system-wide context: used and total memory on one line, used and total swap on the
 * next, each value formatted as a byte size.
 * NOTE(review): the output call itself (presumably fprintf() to f) and the prefix arguments are not
 * visible in this view. */
644 void oomd_dump_system_context(const OomdSystemContext
*ctx
, FILE *f
, const char *prefix
) {
649 "%sMemory: Used: %s Total: %s\n"
650 "%sSwap: Used: %s Total: %s\n",
652 FORMAT_BYTES(ctx
->mem_used
),
653 FORMAT_BYTES(ctx
->mem_total
),
655 FORMAT_BYTES(ctx
->swap_used
),
656 FORMAT_BYTES(ctx
->swap_total
));