]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/oom/oomd-util.c
docs: add coding style example
[thirdparty/systemd.git] / src / oom / oomd-util.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
61ff7397
AZ
2
3#include <sys/xattr.h>
4#include <unistd.h>
5
6#include "fd-util.h"
7#include "format-util.h"
8#include "oomd-util.h"
9#include "parse-util.h"
10#include "path-util.h"
11#include "procfs-util.h"
12#include "signal-util.h"
13#include "sort-util.h"
14#include "stat-util.h"
15#include "stdio-util.h"
16
/* Defines oomd_cgroup_ctx_hash_ops: hash operations for maps keyed by C strings (hashed with
 * string_hash_func, compared with string_compare_func) whose values are OomdCGroupContext objects,
 * with oomd_cgroup_context_free registered as the value destructor. */
17DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
18 oomd_cgroup_ctx_hash_ops,
19 char,
20 string_hash_func,
21 string_compare_func,
22 OomdCGroupContext,
23 oomd_cgroup_context_free);
24
25static int log_kill(pid_t pid, int sig, void *userdata) {
26 log_debug("oomd attempting to kill " PID_FMT " with %s", pid, signal_to_string(sig));
27 return 0;
28}
29
30static int increment_oomd_xattr(const char *path, const char *xattr, uint64_t num_procs_killed) {
31 _cleanup_free_ char *value = NULL;
32 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
33 uint64_t curr_count = 0;
34 int r;
35
36 assert(path);
37 assert(xattr);
38
39 r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, xattr, &value);
40 if (r < 0 && r != -ENODATA)
41 return r;
42
43 if (!isempty(value)) {
44 r = safe_atou64(value, &curr_count);
45 if (r < 0)
46 return r;
47 }
48
49 if (curr_count > UINT64_MAX - num_procs_killed)
50 return -EOVERFLOW;
51
52 xsprintf(buf, "%"PRIu64, curr_count + num_procs_killed);
53 r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, path, xattr, buf, strlen(buf), 0);
54 if (r < 0)
55 return r;
56
57 return 0;
58}
59
60OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) {
61 if (!ctx)
62 return NULL;
63
64 free(ctx->path);
65 return mfree(ctx);
66}
67
68int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
69 _cleanup_set_free_ Set *targets = NULL;
70 OomdCGroupContext *ctx;
71 char *key;
72 int r;
73
74 assert(h);
75 assert(ret);
76
77 targets = set_new(NULL);
78 if (!targets)
79 return -ENOMEM;
80
81 HASHMAP_FOREACH_KEY(ctx, key, h) {
82 if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) {
83 usec_t diff;
84
69c8f025
AZ
85 if (ctx->mem_pressure_limit_hit_start == 0)
86 ctx->mem_pressure_limit_hit_start = now(CLOCK_MONOTONIC);
61ff7397 87
69c8f025 88 diff = now(CLOCK_MONOTONIC) - ctx->mem_pressure_limit_hit_start;
61ff7397
AZ
89 if (diff >= duration) {
90 r = set_put(targets, ctx);
91 if (r < 0)
92 return -ENOMEM;
93 }
94 } else
69c8f025 95 ctx->mem_pressure_limit_hit_start = 0;
61ff7397
AZ
96 }
97
98 if (!set_isempty(targets)) {
99 *ret = TAKE_PTR(targets);
100 return 1;
101 }
102
103 *ret = NULL;
104 return 0;
105}
106
37d8020c
AZ
107uint64_t oomd_pgscan_rate(const OomdCGroupContext *c) {
108 uint64_t last_pgscan;
109
110 assert(c);
111
112 /* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero.
113 * pgscan is monotonic and in practice should not decrease (except in the recreation case). */
114 last_pgscan = c->last_pgscan;
115 if (c->last_pgscan > c->pgscan) {
116 log_debug("Last pgscan %"PRIu64" greater than current pgscan %"PRIu64" for %s. Using last pgscan of zero.",
117 c->last_pgscan, c->pgscan, c->path);
118 last_pgscan = 0;
119 }
120
121 return c->pgscan - last_pgscan;
122}
123
d06e7fb5 124bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad) {
61ff7397
AZ
125 uint64_t swap_threshold;
126
127 assert(ctx);
d06e7fb5 128 assert(threshold_permyriad <= 10000);
61ff7397 129
d06e7fb5 130 swap_threshold = ctx->swap_total * threshold_permyriad / (uint64_t) 10000;
61ff7397
AZ
131 return (ctx->swap_total - ctx->swap_used) < swap_threshold;
132}
133
134int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret) {
135 _cleanup_free_ OomdCGroupContext **sorted = NULL;
136 OomdCGroupContext *item;
137 size_t k = 0;
138
139 assert(h);
140 assert(compare_func);
141 assert(ret);
142
143 sorted = new0(OomdCGroupContext*, hashmap_size(h));
144 if (!sorted)
145 return -ENOMEM;
146
147 HASHMAP_FOREACH(item, h) {
59331b8e
AZ
148 /* Skip over cgroups that are not valid candidates or are explicitly marked for omission */
149 if ((item->path && prefix && !path_startswith(item->path, prefix)) || item->preference == MANAGED_OOM_PREFERENCE_OMIT)
61ff7397
AZ
150 continue;
151
152 sorted[k++] = item;
153 }
154
155 typesafe_qsort(sorted, k, compare_func);
156
157 *ret = TAKE_PTR(sorted);
158
159 assert(k <= INT_MAX);
160 return (int) k;
161}
162
163int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) {
164 _cleanup_set_free_ Set *pids_killed = NULL;
165 int r;
166
167 assert(path);
168
169 if (dry_run) {
170 _cleanup_free_ char *cg_path = NULL;
171
172 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &cg_path);
173 if (r < 0)
174 return r;
175
176 log_debug("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path, true_false(recurse));
177 return 0;
178 }
179
180 pids_killed = set_new(NULL);
181 if (!pids_killed)
182 return -ENOMEM;
183
184 if (recurse)
185 r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
186 else
187 r = cg_kill(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
188 if (r < 0)
189 return r;
190
e3038333 191 r = increment_oomd_xattr(path, "user.oomd_kill", set_size(pids_killed));
61ff7397 192 if (r < 0)
e3038333 193 log_debug_errno(r, "Failed to set user.oomd_kill on kill: %m");
61ff7397
AZ
194
195 return set_size(pids_killed) != 0;
196}
197
37a7e159 198int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected) {
61ff7397 199 _cleanup_free_ OomdCGroupContext **sorted = NULL;
f94a80ab 200 int n, r, ret = 0;
61ff7397
AZ
201
202 assert(h);
37a7e159 203 assert(ret_selected);
61ff7397 204
f94a80ab
ZJS
205 n = oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, prefix, &sorted);
206 if (n < 0)
207 return n;
61ff7397 208
f94a80ab 209 for (int i = 0; i < n; i++) {
37a7e159
AZ
210 /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure.
211 * Continue since there might be "avoid" cgroups at the end. */
74f834e9 212 if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0)
59331b8e 213 continue;
61ff7397
AZ
214
215 r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
37a7e159
AZ
216 if (r == 0)
217 continue; /* We didn't find anything to kill */
218 if (r == -ENOMEM)
219 return r; /* Treat oom as a hard error */
220 if (r < 0) {
221 if (ret == 0)
222 ret = r;
223 continue; /* Try to find something else to kill */
224 }
225
226 char *selected = strdup(sorted[i]->path);
227 if (!selected)
228 return -ENOMEM;
229 *ret_selected = selected;
230 return 1;
61ff7397
AZ
231 }
232
37a7e159 233 return ret;
61ff7397
AZ
234}
235
685b0985 236int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected) {
61ff7397 237 _cleanup_free_ OomdCGroupContext **sorted = NULL;
f94a80ab 238 int n, r, ret = 0;
61ff7397
AZ
239
240 assert(h);
37a7e159 241 assert(ret_selected);
61ff7397 242
f94a80ab
ZJS
243 n = oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted);
244 if (n < 0)
245 return n;
61ff7397 246
685b0985
AZ
247 /* Try to kill cgroups with non-zero swap usage until we either succeed in killing or we get to a cgroup with
248 * no swap usage. Threshold killing only cgroups with more than threshold swap usage. */
f94a80ab 249 for (int i = 0; i < n; i++) {
685b0985
AZ
250 /* Skip over cgroups with not enough swap usage. Don't break since there might be "avoid"
251 * cgroups at the end. */
252 if (sorted[i]->swap_usage <= threshold_usage)
59331b8e 253 continue;
61ff7397
AZ
254
255 r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
37a7e159
AZ
256 if (r == 0)
257 continue; /* We didn't find anything to kill */
258 if (r == -ENOMEM)
259 return r; /* Treat oom as a hard error */
260 if (r < 0) {
261 if (ret == 0)
262 ret = r;
263 continue; /* Try to find something else to kill */
264 }
265
266 char *selected = strdup(sorted[i]->path);
267 if (!selected)
268 return -ENOMEM;
269 *ret_selected = selected;
270 return 1;
61ff7397
AZ
271 }
272
37a7e159 273 return ret;
61ff7397
AZ
274}
275
276int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
277 _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
278 _cleanup_free_ char *p = NULL, *val = NULL;
279 bool is_root;
59331b8e 280 uid_t uid;
61ff7397
AZ
281 int r;
282
283 assert(path);
284 assert(ret);
285
286 ctx = new0(OomdCGroupContext, 1);
287 if (!ctx)
288 return -ENOMEM;
289
290 is_root = empty_or_root(path);
59331b8e 291 ctx->preference = MANAGED_OOM_PREFERENCE_NONE;
61ff7397
AZ
292
293 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p);
294 if (r < 0)
295 return log_debug_errno(r, "Error getting cgroup memory pressure path from %s: %m", path);
296
297 r = read_resource_pressure(p, PRESSURE_TYPE_FULL, &ctx->memory_pressure);
298 if (r < 0)
299 return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p);
300
59331b8e
AZ
301 r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, path, &uid);
302 if (r < 0)
303 log_debug_errno(r, "Failed to get owner/group from %s: %m", path);
304 else if (uid == 0) {
305 /* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used
306 * as an optional feature of systemd-oomd (and the system might not even support them). */
307 r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_avoid");
308 if (r == -ENOMEM)
309 return r;
310 ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_AVOID : ctx->preference;
311
312 r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_omit");
313 if (r == -ENOMEM)
314 return r;
315 ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_OMIT : ctx->preference;
316 }
317
61ff7397
AZ
318 if (is_root) {
319 r = procfs_memory_get_used(&ctx->current_memory_usage);
320 if (r < 0)
321 return log_debug_errno(r, "Error getting memory used from procfs: %m");
322 } else {
323 r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.current", &ctx->current_memory_usage);
324 if (r < 0)
325 return log_debug_errno(r, "Error getting memory.current from %s: %m", path);
326
327 r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.min", &ctx->memory_min);
328 if (r < 0)
329 return log_debug_errno(r, "Error getting memory.min from %s: %m", path);
330
331 r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.low", &ctx->memory_low);
332 if (r < 0)
333 return log_debug_errno(r, "Error getting memory.low from %s: %m", path);
334
335 r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.swap.current", &ctx->swap_usage);
13540027
DS
336 if (r == -ENODATA)
337 /* The kernel can be compiled without support for memory.swap.* files,
338 * or it can be disabled with boot param 'swapaccount=0' */
339 log_once(LOG_WARNING, "No kernel support for memory.swap.current from %s (try boot param swapaccount=1), ignoring.", path);
340 else if (r < 0)
61ff7397
AZ
341 return log_debug_errno(r, "Error getting memory.swap.current from %s: %m", path);
342
343 r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, path, "memory.stat", STRV_MAKE("pgscan"), &val);
344 if (r < 0)
345 return log_debug_errno(r, "Error getting pgscan from memory.stat under %s: %m", path);
346
347 r = safe_atou64(val, &ctx->pgscan);
348 if (r < 0)
349 return log_debug_errno(r, "Error converting pgscan value to uint64_t: %m");
350 }
351
352 ctx->path = strdup(empty_to_root(path));
353 if (!ctx->path)
354 return -ENOMEM;
355
356 *ret = TAKE_PTR(ctx);
357 return 0;
358}
359
360int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret) {
361 _cleanup_fclose_ FILE *f = NULL;
362 OomdSystemContext ctx = {};
363 int r;
364
365 assert(proc_swaps_path);
366 assert(ret);
367
368 f = fopen(proc_swaps_path, "re");
369 if (!f)
370 return -errno;
371
372 (void) fscanf(f, "%*s %*s %*s %*s %*s\n");
373
374 for (;;) {
375 uint64_t total, used;
376
377 r = fscanf(f,
378 "%*s " /* device/file */
379 "%*s " /* type of swap */
380 "%" PRIu64 " " /* swap size */
381 "%" PRIu64 " " /* used */
382 "%*s\n", /* priority */
383 &total, &used);
384
385 if (r == EOF && feof(f))
386 break;
387
388 if (r != 2) {
389 if (ferror(f))
390 return log_debug_errno(errno, "Error reading from %s: %m", proc_swaps_path);
391
392 return log_debug_errno(SYNTHETIC_ERRNO(EIO),
393 "Failed to parse values from %s: %m", proc_swaps_path);
394 }
395
396 ctx.swap_total += total * 1024U;
397 ctx.swap_used += used * 1024U;
398 }
399
400 *ret = ctx;
401 return 0;
402}
403
404int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) {
405 _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *curr_ctx = NULL;
45da27fa 406 OomdCGroupContext *old_ctx;
61ff7397
AZ
407 int r;
408
409 assert(new_h);
410 assert(path);
411
50c0578b
AZ
412 path = empty_to_root(path);
413
61ff7397
AZ
414 r = oomd_cgroup_context_acquire(path, &curr_ctx);
415 if (r < 0)
416 return log_debug_errno(r, "Failed to get OomdCGroupContext for %s: %m", path);
417
50c0578b
AZ
418 assert_se(streq(path, curr_ctx->path));
419
61ff7397
AZ
420 old_ctx = hashmap_get(old_h, path);
421 if (old_ctx) {
422 curr_ctx->last_pgscan = old_ctx->pgscan;
423 curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
69c8f025 424 curr_ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
df637ede 425 curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
61ff7397
AZ
426 }
427
df637ede
AZ
428 if (oomd_pgscan_rate(curr_ctx) > 0)
429 curr_ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
430
45da27fa 431 r = hashmap_put(new_h, curr_ctx->path, curr_ctx);
61ff7397
AZ
432 if (r < 0)
433 return r;
434
45da27fa 435 TAKE_PTR(curr_ctx);
61ff7397
AZ
436 return 0;
437}
5c616ecf 438
b037a6da
AZ
439void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_h) {
440 OomdCGroupContext *ctx;
441
442 assert(old_h);
443 assert(curr_h);
444
445 HASHMAP_FOREACH(ctx, curr_h) {
446 OomdCGroupContext *old_ctx;
447
448 old_ctx = hashmap_get(old_h, ctx->path);
449 if (!old_ctx)
450 continue;
451
452 ctx->last_pgscan = old_ctx->pgscan;
453 ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
69c8f025 454 ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
df637ede
AZ
455 ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
456
457 if (oomd_pgscan_rate(ctx) > 0)
458 ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
b037a6da
AZ
459 }
460}
461
5c616ecf
AZ
462void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) {
463 char swap[FORMAT_BYTES_MAX];
464
465 assert(ctx);
466 assert(f);
467
468 if (!empty_or_root(ctx->path))
469 fprintf(f,
470 "%sPath: %s\n"
471 "%s\tSwap Usage: %s\n",
472 strempty(prefix), ctx->path,
473 strempty(prefix), format_bytes(swap, sizeof(swap), ctx->swap_usage));
474 else
475 fprintf(f,
476 "%sPath: %s\n"
477 "%s\tSwap Usage: (see System Context)\n",
478 strempty(prefix), ctx->path,
479 strempty(prefix));
480}
481
482void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) {
483 char tbuf[FORMAT_TIMESPAN_MAX], mem_use[FORMAT_BYTES_MAX];
484 char mem_min[FORMAT_BYTES_MAX], mem_low[FORMAT_BYTES_MAX];
485
486 assert(ctx);
487 assert(f);
488
489 fprintf(f,
490 "%sPath: %s\n"
0a9f9344 491 "%s\tMemory Pressure Limit: %lu.%02lu%%\n"
5c616ecf
AZ
492 "%s\tPressure: Avg10: %lu.%02lu Avg60: %lu.%02lu Avg300: %lu.%02lu Total: %s\n"
493 "%s\tCurrent Memory Usage: %s\n",
494 strempty(prefix), ctx->path,
0a9f9344 495 strempty(prefix), LOAD_INT(ctx->mem_pressure_limit), LOAD_FRAC(ctx->mem_pressure_limit),
5c616ecf
AZ
496 strempty(prefix),
497 LOAD_INT(ctx->memory_pressure.avg10), LOAD_FRAC(ctx->memory_pressure.avg10),
498 LOAD_INT(ctx->memory_pressure.avg60), LOAD_FRAC(ctx->memory_pressure.avg60),
499 LOAD_INT(ctx->memory_pressure.avg300), LOAD_FRAC(ctx->memory_pressure.avg300),
500 format_timespan(tbuf, sizeof(tbuf), ctx->memory_pressure.total, USEC_PER_SEC),
501 strempty(prefix), format_bytes(mem_use, sizeof(mem_use), ctx->current_memory_usage));
502
503 if (!empty_or_root(ctx->path))
504 fprintf(f,
505 "%s\tMemory Min: %s\n"
506 "%s\tMemory Low: %s\n"
bb081240
AZ
507 "%s\tPgscan: %" PRIu64 "\n"
508 "%s\tLast Pgscan: %" PRIu64 "\n",
5c616ecf
AZ
509 strempty(prefix), format_bytes_cgroup_protection(mem_min, sizeof(mem_min), ctx->memory_min),
510 strempty(prefix), format_bytes_cgroup_protection(mem_low, sizeof(mem_low), ctx->memory_low),
bb081240
AZ
511 strempty(prefix), ctx->pgscan,
512 strempty(prefix), ctx->last_pgscan);
5c616ecf
AZ
513}
514
515void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix) {
516 char used[FORMAT_BYTES_MAX], total[FORMAT_BYTES_MAX];
517
518 assert(ctx);
519 assert(f);
520
521 fprintf(f,
522 "%sSwap: Used: %s Total: %s\n",
523 strempty(prefix),
524 format_bytes(used, sizeof(used), ctx->swap_used),
525 format_bytes(total, sizeof(total), ctx->swap_total));
526}