1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
7 #include "format-util.h"
9 #include "parse-util.h"
10 #include "path-util.h"
11 #include "procfs-util.h"
12 #include "signal-util.h"
13 #include "sort-util.h"
14 #include "stat-util.h"
15 #include "stdio-util.h"
/* Defines the hash operations table oomd_cgroup_ctx_hash_ops. Going by the macro name, values stored
 * in a hashmap using these ops are destroyed with oomd_cgroup_context_free(). The remaining macro
 * arguments (key handling) are not visible in this excerpt. */
17 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
18 oomd_cgroup_ctx_hash_ops
,
23 oomd_cgroup_context_free
);
/* Logging callback handed to cg_kill()/cg_kill_recursive() by oomd_cgroup_kill(): emits a debug
 * message with the PID about to be signalled and the name of the signal.
 *
 * 'userdata' is not referenced in the visible lines. The return statement lies outside this
 * excerpt. */
25 static int log_kill(pid_t pid
, int sig
, void *userdata
) {
26 log_debug("oomd attempting to kill " PID_FMT
" with %s", pid
, signal_to_string(sig
));
/* Adds num_procs_killed to the unsigned decimal counter kept in the extended attribute 'xattr' of the
 * cgroup at 'path', and writes the new total back as a decimal string.
 *
 *   path             - cgroup path, passed to the cg_*xattr helpers with SYSTEMD_CGROUP_CONTROLLER
 *   xattr            - name of the extended attribute holding the counter
 *   num_procs_killed - amount to add to the stored counter
 *
 * NOTE(review): the statements guarded by the checks below (presumably error returns) and the final
 * return are not visible in this excerpt — confirm against the full file. */
30 static int increment_oomd_xattr(const char *path
, const char *xattr
, uint64_t num_procs_killed
) {
31 _cleanup_free_
char *value
= NULL
;
32 char buf
[DECIMAL_STR_MAX(uint64_t) + 1];
33 uint64_t curr_count
= 0;
39 r
= cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER
, path
, xattr
, &value
);
/* -ENODATA is deliberately excluded from the failure check; presumably a cgroup without the attribute
 * is then treated as having a count of 0 (curr_count keeps its initializer). */
40 if (r
< 0 && r
!= -ENODATA
)
43 if (!isempty(value
)) {
44 r
= safe_atou64(value
, &curr_count
);
/* Guard against curr_count + num_procs_killed wrapping around UINT64_MAX. */
49 if (curr_count
> UINT64_MAX
- num_procs_killed
)
52 xsprintf(buf
, "%"PRIu64
, curr_count
+ num_procs_killed
);
53 r
= cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER
, path
, xattr
, buf
, strlen(buf
), 0);
/* Destructor for OomdCGroupContext objects; it is named as the value destructor of
 * oomd_cgroup_ctx_hash_ops. Only the signature is visible in this excerpt, so which members are
 * released and what is returned cannot be stated here. */
60 OomdCGroupContext
*oomd_cgroup_context_free(OomdCGroupContext
*ctx
) {
/* Collects the cgroup contexts in 'h' whose avg10 memory pressure has stayed above their own
 * mem_pressure_limit for at least 'duration'.
 *
 * For every context with memory_pressure.avg10 > mem_pressure_limit, the monotonic time of the first
 * hit is stored in last_hit_mem_pressure_limit (if it is still 0). Once the time elapsed since that
 * timestamp reaches 'duration', the context is added to a freshly allocated set. When the set ends up
 * non-empty, ownership of it is transferred to *ret.
 *
 * NOTE(review): return values and error paths are not visible in this excerpt. */
68 int oomd_pressure_above(Hashmap
*h
, usec_t duration
, Set
**ret
) {
69 _cleanup_set_free_ Set
*targets
= NULL
;
70 OomdCGroupContext
*ctx
;
77 targets
= set_new(NULL
);
81 HASHMAP_FOREACH_KEY(ctx
, key
, h
) {
82 if (ctx
->memory_pressure
.avg10
> ctx
->mem_pressure_limit
) {
/* A zero timestamp means "not currently above the limit": start the clock now. */
85 if (ctx
->last_hit_mem_pressure_limit
== 0)
86 ctx
->last_hit_mem_pressure_limit
= now(CLOCK_MONOTONIC
);
88 diff
= now(CLOCK_MONOTONIC
) - ctx
->last_hit_mem_pressure_limit
;
89 if (diff
>= duration
) {
90 r
= set_put(targets
, ctx
);
/* NOTE(review): presumably the branch for contexts that are no longer above the limit (resets the
 * clock); the enclosing else is not visible in this excerpt. */
95 ctx
->last_hit_mem_pressure_limit
= 0;
98 if (!set_isempty(targets
)) {
99 *ret
= TAKE_PTR(targets
);
/* Reports whether there was reclaim activity across the contexts in 'h', by comparing the sum of all
 * pgscan values against the sum of all last_pgscan values.
 *
 * Both sums are uint64_t and may wrap around, so the number of wrap-arounds is tracked separately for
 * each sum (pgscan_of / last_pgscan_of). If both wrapped equally often the sums themselves are
 * compared; otherwise the side with more wrap-arounds is the larger one.
 *
 * Returns true if the current total is greater than the previous total. */
107 bool oomd_memory_reclaim(Hashmap
*h
) {
108 uint64_t pgscan
= 0, pgscan_of
= 0, last_pgscan
= 0, last_pgscan_of
= 0;
109 OomdCGroupContext
*ctx
;
113 /* If the sum of all the current pgscan values is greater than the sum of all the last_pgscan values,
114 * there was reclaim activity. Used along with pressure checks to decide whether to take action. */
116 HASHMAP_FOREACH(ctx
, h
) {
/* Unsigned addition wrapped if the result is smaller than either operand. */
119 sum
= pgscan
+ ctx
->pgscan
;
120 if (sum
< pgscan
|| sum
< ctx
->pgscan
)
121 pgscan_of
++; /* count overflows */
124 sum
= last_pgscan
+ ctx
->last_pgscan
;
125 if (sum
< last_pgscan
|| sum
< ctx
->last_pgscan
)
126 last_pgscan_of
++; /* count overflows */
130 /* overflow counts are the same, return sums comparison */
131 if (last_pgscan_of
== pgscan_of
)
132 return pgscan
> last_pgscan
;
/* Overflow counts differ: the total that wrapped more often is the larger one. */
134 return pgscan_of
> last_pgscan_of
;
/* Returns true when the free swap (swap_total - swap_used) is below the fraction
 * threshold_permyriad / 10000 of swap_total.
 *
 * threshold_permyriad is asserted to be at most 10000.
 *
 * NOTE(review): no lower bound on threshold_permyriad is asserted in the visible lines; a negative
 * value would take part in the multiplication below — confirm callers never pass one. */
137 bool oomd_swap_free_below(const OomdSystemContext
*ctx
, int threshold_permyriad
) {
138 uint64_t swap_threshold
;
141 assert(threshold_permyriad
<= 10000);
/* Multiply first, then divide, so that small thresholds are not truncated to zero early. */
143 swap_threshold
= ctx
->swap_total
* threshold_permyriad
/ (uint64_t) 10000;
144 return (ctx
->swap_total
- ctx
->swap_used
) < swap_threshold
;
/* Builds an array of the contexts in 'h' that are eligible candidates, sorts it with compare_func and
 * transfers ownership of the array to *ret.
 *
 * A context is left out when both its path and 'prefix' are set and the path does not start with
 * 'prefix', or when its preference is MANAGED_OOM_PREFERENCE_OMIT. The array is allocated with room
 * for every entry of 'h', so it can never be too small for the filtered result.
 *
 * NOTE(review): the return statement is not visible in this excerpt. Callers in this file use the
 * result as the element count, and the final assert checks that the count 'k' fits into an int. */
147 int oomd_sort_cgroup_contexts(Hashmap
*h
, oomd_compare_t compare_func
, const char *prefix
, OomdCGroupContext
***ret
) {
148 _cleanup_free_ OomdCGroupContext
**sorted
= NULL
;
149 OomdCGroupContext
*item
;
153 assert(compare_func
);
156 sorted
= new0(OomdCGroupContext
*, hashmap_size(h
));
160 HASHMAP_FOREACH(item
, h
) {
161 /* Skip over cgroups that are not valid candidates or are explicitly marked for omission */
162 if ((item
->path
&& prefix
&& !path_startswith(item
->path
, prefix
)) || item
->preference
== MANAGED_OOM_PREFERENCE_OMIT
)
168 typesafe_qsort(sorted
, k
, compare_func
);
170 *ret
= TAKE_PTR(sorted
);
172 assert(k
<= INT_MAX
);
/* Sends SIGKILL to the processes of the cgroup at 'path' and afterwards adds the number of PIDs
 * recorded in pids_killed to the cgroup's "user.oomd_kill" xattr.
 *
 *   path    - cgroup path, used with SYSTEMD_CGROUP_CONTROLLER
 *   recurse - NOTE(review): presumably selects cg_kill_recursive() over cg_kill(); the selecting
 *             condition is not visible in this excerpt
 *   dry_run - a debug message describing the would-be kill is logged for this mode
 *             NOTE(review): presumably followed by an early return — not visible here
 *
 * Both kill helpers are invoked with CGROUP_IGNORE_SELF and log_kill() as per-PID callback. A failure
 * to update the xattr is only logged at debug level. The visible return yields 1 if at least one PID
 * was recorded in pids_killed and 0 otherwise; other return paths are not visible in this excerpt. */
176 int oomd_cgroup_kill(const char *path
, bool recurse
, bool dry_run
) {
177 _cleanup_set_free_ Set
*pids_killed
= NULL
;
183 _cleanup_free_
char *cg_path
= NULL
;
185 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &cg_path
);
189 log_debug("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path
, true_false(recurse
));
193 pids_killed
= set_new(NULL
);
198 r
= cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER
, path
, SIGKILL
, CGROUP_IGNORE_SELF
, pids_killed
, log_kill
, NULL
);
200 r
= cg_kill(SYSTEMD_CGROUP_CONTROLLER
, path
, SIGKILL
, CGROUP_IGNORE_SELF
, pids_killed
, log_kill
, NULL
);
/* Bookkeeping only: the kill counter xattr is bumped by the number of PIDs collected above. */
204 r
= increment_oomd_xattr(path
, "user.oomd_kill", set_size(pids_killed
));
206 log_debug_errno(r
, "Failed to set user.oomd_kill on kill: %m");
208 return set_size(pids_killed
) != 0;
/* Selects a cgroup from 'h' (limited to paths under 'prefix' by oomd_sort_cgroup_contexts()) and kills
 * it recursively via oomd_cgroup_kill(). Candidates are tried in the order produced by
 * compare_pgscan_rate_and_memory_usage.
 *
 * Candidates with neither pgscan nor current memory usage are skipped. For the chosen cgroup a
 * strdup()ed copy of its path is stored in *ret_selected.
 *
 * NOTE(review): the conditions guarding the continue/return statements below, and the function's
 * final return value, are not visible in this excerpt; the trailing comments on those statements are
 * the only visible description of each case. */
211 int oomd_kill_by_pgscan_rate(Hashmap
*h
, const char *prefix
, bool dry_run
, char **ret_selected
) {
212 _cleanup_free_ OomdCGroupContext
**sorted
= NULL
;
216 assert(ret_selected
);
218 n
= oomd_sort_cgroup_contexts(h
, compare_pgscan_rate_and_memory_usage
, prefix
, &sorted
);
222 for (int i
= 0; i
< n
; i
++) {
223 /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure.
224 * Continue since there might be "avoid" cgroups at the end. */
225 if (sorted
[i
]->pgscan
== 0 && sorted
[i
]->current_memory_usage
== 0)
228 r
= oomd_cgroup_kill(sorted
[i
]->path
, true, dry_run
);
230 continue; /* We didn't find anything to kill */
232 return r
; /* Treat oom as a hard error */
236 continue; /* Try to find something else to kill */
239 char *selected
= strdup(sorted
[i
]->path
);
242 *ret_selected
= selected
;
/* Selects a cgroup from 'h' and kills it recursively via oomd_cgroup_kill(). Candidates are tried in
 * the order produced by compare_swap_usage; no path prefix filter is applied (NULL is passed).
 *
 * Candidates with a swap_usage of 0 are skipped. For the chosen cgroup a strdup()ed copy of its path
 * is stored in *ret_selected.
 *
 * NOTE(review): the conditions guarding the continue/return statements below, and the function's
 * final return value, are not visible in this excerpt; the trailing comments on those statements are
 * the only visible description of each case. */
249 int oomd_kill_by_swap_usage(Hashmap
*h
, bool dry_run
, char **ret_selected
) {
250 _cleanup_free_ OomdCGroupContext
**sorted
= NULL
;
254 assert(ret_selected
);
256 n
= oomd_sort_cgroup_contexts(h
, compare_swap_usage
, NULL
, &sorted
);
260 /* Try to kill cgroups with non-zero swap usage until we either succeed in
261 * killing or we get to a cgroup with no swap usage. */
262 for (int i
= 0; i
< n
; i
++) {
263 /* Skip over cgroups with no resource usage.
264 * Continue since there might be "avoid" cgroups at the end. */
265 if (sorted
[i
]->swap_usage
== 0)
268 r
= oomd_cgroup_kill(sorted
[i
]->path
, true, dry_run
);
270 continue; /* We didn't find anything to kill */
272 return r
; /* Treat oom as a hard error */
276 continue; /* Try to find something else to kill */
279 char *selected
= strdup(sorted
[i
]->path
);
282 *ret_selected
= selected
;
/* Allocates a new OomdCGroupContext for the cgroup at 'path', fills it from the cgroup's attributes
 * and transfers ownership to *ret.
 *
 * Data sources visible in this excerpt:
 *   - memory.pressure, parsed with PRESSURE_TYPE_FULL      -> memory_pressure
 *   - xattrs user.oomd_avoid / user.oomd_omit              -> preference
 *                                                            (initially MANAGED_OOM_PREFERENCE_NONE)
 *   - procfs_memory_get_used() or the memory.current file  -> current_memory_usage
 *   - memory.min, memory.low, memory.swap.current          -> memory_min, memory_low, swap_usage
 *   - the "pgscan" key of memory.stat                      -> pgscan
 * The stored path is a strdup()ed copy of empty_to_root(path).
 *
 * Each failure that is reported is logged at debug level with the attribute or path it concerns.
 *
 * NOTE(review): the conditions guarding the log/return statements, the handling of the owner uid and
 * the final return are not visible in this excerpt — confirm against the full file. */
289 int oomd_cgroup_context_acquire(const char *path
, OomdCGroupContext
**ret
) {
290 _cleanup_(oomd_cgroup_context_freep
) OomdCGroupContext
*ctx
= NULL
;
291 _cleanup_free_
char *p
= NULL
, *val
= NULL
;
299 ctx
= new0(OomdCGroupContext
, 1);
303 is_root
= empty_or_root(path
);
304 ctx
->preference
= MANAGED_OOM_PREFERENCE_NONE
;
306 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, "memory.pressure", &p
);
308 return log_debug_errno(r
, "Error getting cgroup memory pressure path from %s: %m", path
);
310 r
= read_resource_pressure(p
, PRESSURE_TYPE_FULL
, &ctx
->memory_pressure
);
312 return log_debug_errno(r
, "Error parsing memory pressure from %s: %m", p
);
314 r
= cg_get_owner(SYSTEMD_CGROUP_CONTROLLER
, path
, &uid
);
316 log_debug_errno(r
, "Failed to get owner/group from %s: %m", path
);
318 /* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used
319 * as an optional feature of systemd-oomd (and the system might not even support them). */
320 r
= cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER
, path
, "user.oomd_avoid");
/* Only an explicit true (r == 1) changes the preference; anything else keeps the current value. */
323 ctx
->preference
= r
== 1 ? MANAGED_OOM_PREFERENCE_AVOID
: ctx
->preference
;
325 r
= cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER
, path
, "user.oomd_omit");
/* Evaluated after "avoid", so a true "omit" xattr overrides an "avoid" set just above. */
328 ctx
->preference
= r
== 1 ? MANAGED_OOM_PREFERENCE_OMIT
: ctx
->preference
;
/* NOTE(review): presumably the root-cgroup case (is_root is computed above), with the remaining
 * cgroup attribute reads applying to non-root cgroups; the selecting condition is not visible here. */
332 r
= procfs_memory_get_used(&ctx
->current_memory_usage
);
334 return log_debug_errno(r
, "Error getting memory used from procfs: %m");
336 r
= cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER
, path
, "memory.current", &ctx
->current_memory_usage
);
338 return log_debug_errno(r
, "Error getting memory.current from %s: %m", path
);
340 r
= cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER
, path
, "memory.min", &ctx
->memory_min
);
342 return log_debug_errno(r
, "Error getting memory.min from %s: %m", path
);
344 r
= cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER
, path
, "memory.low", &ctx
->memory_low
);
346 return log_debug_errno(r
, "Error getting memory.low from %s: %m", path
);
348 r
= cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER
, path
, "memory.swap.current", &ctx
->swap_usage
);
350 return log_debug_errno(r
, "Error getting memory.swap.current from %s: %m", path
);
352 r
= cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER
, path
, "memory.stat", STRV_MAKE("pgscan"), &val
);
354 return log_debug_errno(r
, "Error getting pgscan from memory.stat under %s: %m", path
);
356 r
= safe_atou64(val
, &ctx
->pgscan
);
358 return log_debug_errno(r
, "Error converting pgscan value to uint64_t: %m");
361 ctx
->path
= strdup(empty_to_root(path
));
/* Hand the fully populated context to the caller; TAKE_PTR() clears the local pointer, so the
 * cleanup handler no longer frees the context. */
365 *ret
= TAKE_PTR(ctx
);
/* Sums up swap size and swap usage over the records of the file at 'proc_swaps_path' into a local
 * OomdSystemContext (swap_total / swap_used).
 *
 * The file is opened with mode "re". A first fscanf() consumes five whitespace-separated fields
 * without storing them (the result is deliberately ignored). The record format that follows skips
 * two fields, reads two uint64_t values (size, used) and skips one more field. Each pair read is
 * multiplied by 1024 and added to the totals.
 * NOTE(review): the factor 1024 presumably converts KiB to bytes — confirm against the file format.
 *
 * EOF together with feof() is tested separately from the two logged failures below; read errors are
 * logged with errno, and unparsable content is logged as EIO.
 *
 * NOTE(review): the loop construct, the fscanf() call owning the format string, the assignment to
 * *ret and the return value are not visible in this excerpt. */
369 int oomd_system_context_acquire(const char *proc_swaps_path
, OomdSystemContext
*ret
) {
370 _cleanup_fclose_
FILE *f
= NULL
;
371 OomdSystemContext ctx
= {};
374 assert(proc_swaps_path
);
377 f
= fopen(proc_swaps_path
, "re");
381 (void) fscanf(f
, "%*s %*s %*s %*s %*s\n");
384 uint64_t total
, used
;
387 "%*s " /* device/file */
388 "%*s " /* type of swap */
389 "%" PRIu64
" " /* swap size */
390 "%" PRIu64
" " /* used */
391 "%*s\n", /* priority */
394 if (r
== EOF
&& feof(f
))
399 return log_debug_errno(errno
, "Error reading from %s: %m", proc_swaps_path
);
401 return log_debug_errno(SYNTHETIC_ERRNO(EIO
),
402 "Failed to parse values from %s: %m", proc_swaps_path
);
405 ctx
.swap_total
+= total
* 1024U;
406 ctx
.swap_used
+= used
* 1024U;
/* Acquires a fresh context for 'path' (normalized with empty_to_root()) and inserts it into new_h,
 * keyed by the context's own path string.
 *
 * If old_h holds a context for the same path, its pgscan becomes the new context's last_pgscan, and
 * mem_pressure_limit and last_hit_mem_pressure_limit are carried over unchanged.
 *
 * NOTE(review): the NULL check on old_ctx, the handling of the hashmap_put() result (including
 * releasing ownership of curr_ctx on success) and the return value are not visible in this excerpt. */
413 int oomd_insert_cgroup_context(Hashmap
*old_h
, Hashmap
*new_h
, const char *path
) {
414 _cleanup_(oomd_cgroup_context_freep
) OomdCGroupContext
*curr_ctx
= NULL
;
415 OomdCGroupContext
*old_ctx
;
421 path
= empty_to_root(path
);
423 r
= oomd_cgroup_context_acquire(path
, &curr_ctx
);
425 return log_debug_errno(r
, "Failed to get OomdCGroupContext for %s: %m", path
);
/* The acquired context must carry exactly the normalized path that is used for the lookup below. */
427 assert_se(streq(path
, curr_ctx
->path
));
429 old_ctx
= hashmap_get(old_h
, path
);
431 curr_ctx
->last_pgscan
= old_ctx
->pgscan
;
432 curr_ctx
->mem_pressure_limit
= old_ctx
->mem_pressure_limit
;
433 curr_ctx
->last_hit_mem_pressure_limit
= old_ctx
->last_hit_mem_pressure_limit
;
436 r
= hashmap_put(new_h
, curr_ctx
->path
, curr_ctx
);
/* For every context in curr_h, looks up the context with the same path in old_h and carries state
 * over from it: the old pgscan becomes last_pgscan, and mem_pressure_limit and
 * last_hit_mem_pressure_limit are copied unchanged.
 *
 * NOTE(review): the handling of paths missing from old_h (old_ctx == NULL) is not visible in this
 * excerpt. */
444 void oomd_update_cgroup_contexts_between_hashmaps(Hashmap
*old_h
, Hashmap
*curr_h
) {
445 OomdCGroupContext
*ctx
;
450 HASHMAP_FOREACH(ctx
, curr_h
) {
451 OomdCGroupContext
*old_ctx
;
453 old_ctx
= hashmap_get(old_h
, ctx
->path
);
457 ctx
->last_pgscan
= old_ctx
->pgscan
;
458 ctx
->mem_pressure_limit
= old_ctx
->mem_pressure_limit
;
459 ctx
->last_hit_mem_pressure_limit
= old_ctx
->last_hit_mem_pressure_limit
;
/* Prints the swap-related view of a cgroup context, every output line starting with 'prefix'
 * (a NULL prefix is printed as the empty string via strempty()).
 *
 * For a non-root path the swap usage is formatted with format_bytes(); the other branch prints the
 * fixed text "(see System Context)" instead of a number.
 *
 * NOTE(review): the output calls themselves and the first format line of each branch are not visible
 * in this excerpt; presumably the output goes to 'f'. */
463 void oomd_dump_swap_cgroup_context(const OomdCGroupContext
*ctx
, FILE *f
, const char *prefix
) {
464 char swap
[FORMAT_BYTES_MAX
];
469 if (!empty_or_root(ctx
->path
))
472 "%s\tSwap Usage: %s\n",
473 strempty(prefix
), ctx
->path
,
474 strempty(prefix
), format_bytes(swap
, sizeof(swap
), ctx
->swap_usage
));
478 "%s\tSwap Usage: (see System Context)\n",
479 strempty(prefix
), ctx
->path
,
/* Prints the memory-pressure view of a cgroup context, every output line starting with 'prefix'
 * (a NULL prefix is printed as the empty string via strempty()).
 *
 * Always printed: the memory pressure limit, the avg10/avg60/avg300 pressure values (each split into
 * integer and two-digit fractional part with LOAD_INT()/LOAD_FRAC()), the pressure total formatted
 * as a time span, and the current memory usage formatted as bytes.
 * Printed only for non-root paths: memory.min and memory.low (via format_bytes_cgroup_protection()),
 * pgscan and last_pgscan.
 *
 * NOTE(review): the output calls themselves and the first format line are not visible in this
 * excerpt; presumably the output goes to 'f'. */
483 void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext
*ctx
, FILE *f
, const char *prefix
) {
484 char tbuf
[FORMAT_TIMESPAN_MAX
], mem_use
[FORMAT_BYTES_MAX
];
485 char mem_min
[FORMAT_BYTES_MAX
], mem_low
[FORMAT_BYTES_MAX
];
492 "%s\tMemory Pressure Limit: %lu.%02lu%%\n"
493 "%s\tPressure: Avg10: %lu.%02lu Avg60: %lu.%02lu Avg300: %lu.%02lu Total: %s\n"
494 "%s\tCurrent Memory Usage: %s\n",
495 strempty(prefix
), ctx
->path
,
496 strempty(prefix
), LOAD_INT(ctx
->mem_pressure_limit
), LOAD_FRAC(ctx
->mem_pressure_limit
),
498 LOAD_INT(ctx
->memory_pressure
.avg10
), LOAD_FRAC(ctx
->memory_pressure
.avg10
),
499 LOAD_INT(ctx
->memory_pressure
.avg60
), LOAD_FRAC(ctx
->memory_pressure
.avg60
),
500 LOAD_INT(ctx
->memory_pressure
.avg300
), LOAD_FRAC(ctx
->memory_pressure
.avg300
),
501 format_timespan(tbuf
, sizeof(tbuf
), ctx
->memory_pressure
.total
, USEC_PER_SEC
),
502 strempty(prefix
), format_bytes(mem_use
, sizeof(mem_use
), ctx
->current_memory_usage
));
/* The cgroup-specific details below are only printed for non-root paths. */
504 if (!empty_or_root(ctx
->path
))
506 "%s\tMemory Min: %s\n"
507 "%s\tMemory Low: %s\n"
508 "%s\tPgscan: %" PRIu64
"\n"
509 "%s\tLast Pgscan: %" PRIu64
"\n",
510 strempty(prefix
), format_bytes_cgroup_protection(mem_min
, sizeof(mem_min
), ctx
->memory_min
),
511 strempty(prefix
), format_bytes_cgroup_protection(mem_low
, sizeof(mem_low
), ctx
->memory_low
),
512 strempty(prefix
), ctx
->pgscan
,
513 strempty(prefix
), ctx
->last_pgscan
);
/* Prints the system-wide swap figures of 'ctx' on one line ("Swap: Used: ... Total: ..."), with
 * swap_used and swap_total each rendered by format_bytes() into its own stack buffer.
 *
 * NOTE(review): the output call and the argument filling the leading "%s" are not visible in this
 * excerpt; presumably the line is written to 'f' and starts with 'prefix'. */
516 void oomd_dump_system_context(const OomdSystemContext
*ctx
, FILE *f
, const char *prefix
) {
517 char used
[FORMAT_BYTES_MAX
], total
[FORMAT_BYTES_MAX
];
523 "%sSwap: Used: %s Total: %s\n",
525 format_bytes(used
, sizeof(used
), ctx
->swap_used
),
526 format_bytes(total
, sizeof(total
), ctx
->swap_total
));