]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/oom/oomd-util.c
resolved: reply using unicast mDNS when appropriate
[thirdparty/systemd.git] / src / oom / oomd-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <sys/xattr.h>
4 #include <unistd.h>
5
6 #include "fd-util.h"
7 #include "format-util.h"
8 #include "oomd-util.h"
9 #include "parse-util.h"
10 #include "path-util.h"
11 #include "procfs-util.h"
12 #include "signal-util.h"
13 #include "sort-util.h"
14 #include "stat-util.h"
15 #include "stdio-util.h"
16
/* Defines oomd_cgroup_ctx_hash_ops: keys are of type char and are hashed/compared with
 * string_hash_func/string_compare_func; values are of type OomdCGroupContext and
 * oomd_cgroup_context_free is registered as their destructor. */
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
        oomd_cgroup_ctx_hash_ops,
        char,
        string_hash_func,
        string_compare_func,
        OomdCGroupContext,
        oomd_cgroup_context_free);
24
25 static int log_kill(pid_t pid, int sig, void *userdata) {
26 log_debug("oomd attempting to kill " PID_FMT " with %s", pid, signal_to_string(sig));
27 return 0;
28 }
29
30 static int increment_oomd_xattr(const char *path, const char *xattr, uint64_t num_procs_killed) {
31 _cleanup_free_ char *value = NULL;
32 char buf[DECIMAL_STR_MAX(uint64_t) + 1];
33 uint64_t curr_count = 0;
34 int r;
35
36 assert(path);
37 assert(xattr);
38
39 r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, xattr, &value);
40 if (r < 0 && r != -ENODATA)
41 return r;
42
43 if (!isempty(value)) {
44 r = safe_atou64(value, &curr_count);
45 if (r < 0)
46 return r;
47 }
48
49 if (curr_count > UINT64_MAX - num_procs_killed)
50 return -EOVERFLOW;
51
52 xsprintf(buf, "%"PRIu64, curr_count + num_procs_killed);
53 r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, path, xattr, buf, strlen(buf), 0);
54 if (r < 0)
55 return r;
56
57 return 0;
58 }
59
60 OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) {
61 if (!ctx)
62 return NULL;
63
64 free(ctx->path);
65 return mfree(ctx);
66 }
67
68 int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
69 _cleanup_set_free_ Set *targets = NULL;
70 OomdCGroupContext *ctx;
71 char *key;
72 int r;
73
74 assert(h);
75 assert(ret);
76
77 targets = set_new(NULL);
78 if (!targets)
79 return -ENOMEM;
80
81 HASHMAP_FOREACH_KEY(ctx, key, h) {
82 if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) {
83 usec_t diff;
84
85 if (ctx->last_hit_mem_pressure_limit == 0)
86 ctx->last_hit_mem_pressure_limit = now(CLOCK_MONOTONIC);
87
88 diff = now(CLOCK_MONOTONIC) - ctx->last_hit_mem_pressure_limit;
89 if (diff >= duration) {
90 r = set_put(targets, ctx);
91 if (r < 0)
92 return -ENOMEM;
93 }
94 } else
95 ctx->last_hit_mem_pressure_limit = 0;
96 }
97
98 if (!set_isempty(targets)) {
99 *ret = TAKE_PTR(targets);
100 return 1;
101 }
102
103 *ret = NULL;
104 return 0;
105 }
106
107 bool oomd_memory_reclaim(Hashmap *h) {
108 uint64_t pgscan = 0, pgscan_of = 0, last_pgscan = 0, last_pgscan_of = 0;
109 OomdCGroupContext *ctx;
110
111 assert(h);
112
113 /* If sum of all the current pgscan values are greater than the sum of all the last_pgscan values,
114 * there was reclaim activity. Used along with pressure checks to decide whether to take action. */
115
116 HASHMAP_FOREACH(ctx, h) {
117 uint64_t sum;
118
119 sum = pgscan + ctx->pgscan;
120 if (sum < pgscan || sum < ctx->pgscan)
121 pgscan_of++; /* count overflows */
122 pgscan = sum;
123
124 sum = last_pgscan + ctx->last_pgscan;
125 if (sum < last_pgscan || sum < ctx->last_pgscan)
126 last_pgscan_of++; /* count overflows */
127 last_pgscan = sum;
128 }
129
130 /* overflow counts are the same, return sums comparison */
131 if (last_pgscan_of == pgscan_of)
132 return pgscan > last_pgscan;
133
134 return pgscan_of > last_pgscan_of;
135 }
136
137 bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad) {
138 uint64_t swap_threshold;
139
140 assert(ctx);
141 assert(threshold_permyriad <= 10000);
142
143 swap_threshold = ctx->swap_total * threshold_permyriad / (uint64_t) 10000;
144 return (ctx->swap_total - ctx->swap_used) < swap_threshold;
145 }
146
147 int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret) {
148 _cleanup_free_ OomdCGroupContext **sorted = NULL;
149 OomdCGroupContext *item;
150 size_t k = 0;
151
152 assert(h);
153 assert(compare_func);
154 assert(ret);
155
156 sorted = new0(OomdCGroupContext*, hashmap_size(h));
157 if (!sorted)
158 return -ENOMEM;
159
160 HASHMAP_FOREACH(item, h) {
161 /* Skip over cgroups that are not valid candidates or are explicitly marked for omission */
162 if ((item->path && prefix && !path_startswith(item->path, prefix)) || item->preference == MANAGED_OOM_PREFERENCE_OMIT)
163 continue;
164
165 sorted[k++] = item;
166 }
167
168 typesafe_qsort(sorted, k, compare_func);
169
170 *ret = TAKE_PTR(sorted);
171
172 assert(k <= INT_MAX);
173 return (int) k;
174 }
175
176 int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) {
177 _cleanup_set_free_ Set *pids_killed = NULL;
178 int r;
179
180 assert(path);
181
182 if (dry_run) {
183 _cleanup_free_ char *cg_path = NULL;
184
185 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &cg_path);
186 if (r < 0)
187 return r;
188
189 log_debug("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path, true_false(recurse));
190 return 0;
191 }
192
193 pids_killed = set_new(NULL);
194 if (!pids_killed)
195 return -ENOMEM;
196
197 if (recurse)
198 r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
199 else
200 r = cg_kill(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
201 if (r < 0)
202 return r;
203
204 r = increment_oomd_xattr(path, "user.oomd_kill", set_size(pids_killed));
205 if (r < 0)
206 log_debug_errno(r, "Failed to set user.oomd_kill on kill: %m");
207
208 return set_size(pids_killed) != 0;
209 }
210
211 int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected) {
212 _cleanup_free_ OomdCGroupContext **sorted = NULL;
213 int n, r, ret = 0;
214
215 assert(h);
216 assert(ret_selected);
217
218 n = oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, prefix, &sorted);
219 if (n < 0)
220 return n;
221
222 for (int i = 0; i < n; i++) {
223 /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure.
224 * Continue since there might be "avoid" cgroups at the end. */
225 if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0)
226 continue;
227
228 r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
229 if (r == 0)
230 continue; /* We didn't find anything to kill */
231 if (r == -ENOMEM)
232 return r; /* Treat oom as a hard error */
233 if (r < 0) {
234 if (ret == 0)
235 ret = r;
236 continue; /* Try to find something else to kill */
237 }
238
239 char *selected = strdup(sorted[i]->path);
240 if (!selected)
241 return -ENOMEM;
242 *ret_selected = selected;
243 return 1;
244 }
245
246 return ret;
247 }
248
249 int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected) {
250 _cleanup_free_ OomdCGroupContext **sorted = NULL;
251 int n, r, ret = 0;
252
253 assert(h);
254 assert(ret_selected);
255
256 n = oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted);
257 if (n < 0)
258 return n;
259
260 /* Try to kill cgroups with non-zero swap usage until we either succeed in
261 * killing or we get to a cgroup with no swap usage. */
262 for (int i = 0; i < n; i++) {
263 /* Skip over cgroups with no resource usage.
264 * Continue break since there might be "avoid" cgroups at the end. */
265 if (sorted[i]->swap_usage == 0)
266 continue;
267
268 r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
269 if (r == 0)
270 continue; /* We didn't find anything to kill */
271 if (r == -ENOMEM)
272 return r; /* Treat oom as a hard error */
273 if (r < 0) {
274 if (ret == 0)
275 ret = r;
276 continue; /* Try to find something else to kill */
277 }
278
279 char *selected = strdup(sorted[i]->path);
280 if (!selected)
281 return -ENOMEM;
282 *ret_selected = selected;
283 return 1;
284 }
285
286 return ret;
287 }
288
289 int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
290 _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
291 _cleanup_free_ char *p = NULL, *val = NULL;
292 bool is_root;
293 uid_t uid;
294 int r;
295
296 assert(path);
297 assert(ret);
298
299 ctx = new0(OomdCGroupContext, 1);
300 if (!ctx)
301 return -ENOMEM;
302
303 is_root = empty_or_root(path);
304 ctx->preference = MANAGED_OOM_PREFERENCE_NONE;
305
306 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p);
307 if (r < 0)
308 return log_debug_errno(r, "Error getting cgroup memory pressure path from %s: %m", path);
309
310 r = read_resource_pressure(p, PRESSURE_TYPE_FULL, &ctx->memory_pressure);
311 if (r < 0)
312 return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p);
313
314 r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, path, &uid);
315 if (r < 0)
316 log_debug_errno(r, "Failed to get owner/group from %s: %m", path);
317 else if (uid == 0) {
318 /* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used
319 * as an optional feature of systemd-oomd (and the system might not even support them). */
320 r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_avoid");
321 if (r == -ENOMEM)
322 return r;
323 ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_AVOID : ctx->preference;
324
325 r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_omit");
326 if (r == -ENOMEM)
327 return r;
328 ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_OMIT : ctx->preference;
329 }
330
331 if (is_root) {
332 r = procfs_memory_get_used(&ctx->current_memory_usage);
333 if (r < 0)
334 return log_debug_errno(r, "Error getting memory used from procfs: %m");
335 } else {
336 r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.current", &ctx->current_memory_usage);
337 if (r < 0)
338 return log_debug_errno(r, "Error getting memory.current from %s: %m", path);
339
340 r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.min", &ctx->memory_min);
341 if (r < 0)
342 return log_debug_errno(r, "Error getting memory.min from %s: %m", path);
343
344 r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.low", &ctx->memory_low);
345 if (r < 0)
346 return log_debug_errno(r, "Error getting memory.low from %s: %m", path);
347
348 r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.swap.current", &ctx->swap_usage);
349 if (r < 0)
350 return log_debug_errno(r, "Error getting memory.swap.current from %s: %m", path);
351
352 r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, path, "memory.stat", STRV_MAKE("pgscan"), &val);
353 if (r < 0)
354 return log_debug_errno(r, "Error getting pgscan from memory.stat under %s: %m", path);
355
356 r = safe_atou64(val, &ctx->pgscan);
357 if (r < 0)
358 return log_debug_errno(r, "Error converting pgscan value to uint64_t: %m");
359 }
360
361 ctx->path = strdup(empty_to_root(path));
362 if (!ctx->path)
363 return -ENOMEM;
364
365 *ret = TAKE_PTR(ctx);
366 return 0;
367 }
368
369 int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret) {
370 _cleanup_fclose_ FILE *f = NULL;
371 OomdSystemContext ctx = {};
372 int r;
373
374 assert(proc_swaps_path);
375 assert(ret);
376
377 f = fopen(proc_swaps_path, "re");
378 if (!f)
379 return -errno;
380
381 (void) fscanf(f, "%*s %*s %*s %*s %*s\n");
382
383 for (;;) {
384 uint64_t total, used;
385
386 r = fscanf(f,
387 "%*s " /* device/file */
388 "%*s " /* type of swap */
389 "%" PRIu64 " " /* swap size */
390 "%" PRIu64 " " /* used */
391 "%*s\n", /* priority */
392 &total, &used);
393
394 if (r == EOF && feof(f))
395 break;
396
397 if (r != 2) {
398 if (ferror(f))
399 return log_debug_errno(errno, "Error reading from %s: %m", proc_swaps_path);
400
401 return log_debug_errno(SYNTHETIC_ERRNO(EIO),
402 "Failed to parse values from %s: %m", proc_swaps_path);
403 }
404
405 ctx.swap_total += total * 1024U;
406 ctx.swap_used += used * 1024U;
407 }
408
409 *ret = ctx;
410 return 0;
411 }
412
413 int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) {
414 _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *curr_ctx = NULL;
415 OomdCGroupContext *old_ctx;
416 int r;
417
418 assert(new_h);
419 assert(path);
420
421 path = empty_to_root(path);
422
423 r = oomd_cgroup_context_acquire(path, &curr_ctx);
424 if (r < 0)
425 return log_debug_errno(r, "Failed to get OomdCGroupContext for %s: %m", path);
426
427 assert_se(streq(path, curr_ctx->path));
428
429 old_ctx = hashmap_get(old_h, path);
430 if (old_ctx) {
431 curr_ctx->last_pgscan = old_ctx->pgscan;
432 curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
433 curr_ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
434 }
435
436 r = hashmap_put(new_h, curr_ctx->path, curr_ctx);
437 if (r < 0)
438 return r;
439
440 TAKE_PTR(curr_ctx);
441 return 0;
442 }
443
444 void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_h) {
445 OomdCGroupContext *ctx;
446
447 assert(old_h);
448 assert(curr_h);
449
450 HASHMAP_FOREACH(ctx, curr_h) {
451 OomdCGroupContext *old_ctx;
452
453 old_ctx = hashmap_get(old_h, ctx->path);
454 if (!old_ctx)
455 continue;
456
457 ctx->last_pgscan = old_ctx->pgscan;
458 ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
459 ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
460 }
461 }
462
463 void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) {
464 char swap[FORMAT_BYTES_MAX];
465
466 assert(ctx);
467 assert(f);
468
469 if (!empty_or_root(ctx->path))
470 fprintf(f,
471 "%sPath: %s\n"
472 "%s\tSwap Usage: %s\n",
473 strempty(prefix), ctx->path,
474 strempty(prefix), format_bytes(swap, sizeof(swap), ctx->swap_usage));
475 else
476 fprintf(f,
477 "%sPath: %s\n"
478 "%s\tSwap Usage: (see System Context)\n",
479 strempty(prefix), ctx->path,
480 strempty(prefix));
481 }
482
483 void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) {
484 char tbuf[FORMAT_TIMESPAN_MAX], mem_use[FORMAT_BYTES_MAX];
485 char mem_min[FORMAT_BYTES_MAX], mem_low[FORMAT_BYTES_MAX];
486
487 assert(ctx);
488 assert(f);
489
490 fprintf(f,
491 "%sPath: %s\n"
492 "%s\tMemory Pressure Limit: %lu.%02lu%%\n"
493 "%s\tPressure: Avg10: %lu.%02lu Avg60: %lu.%02lu Avg300: %lu.%02lu Total: %s\n"
494 "%s\tCurrent Memory Usage: %s\n",
495 strempty(prefix), ctx->path,
496 strempty(prefix), LOAD_INT(ctx->mem_pressure_limit), LOAD_FRAC(ctx->mem_pressure_limit),
497 strempty(prefix),
498 LOAD_INT(ctx->memory_pressure.avg10), LOAD_FRAC(ctx->memory_pressure.avg10),
499 LOAD_INT(ctx->memory_pressure.avg60), LOAD_FRAC(ctx->memory_pressure.avg60),
500 LOAD_INT(ctx->memory_pressure.avg300), LOAD_FRAC(ctx->memory_pressure.avg300),
501 format_timespan(tbuf, sizeof(tbuf), ctx->memory_pressure.total, USEC_PER_SEC),
502 strempty(prefix), format_bytes(mem_use, sizeof(mem_use), ctx->current_memory_usage));
503
504 if (!empty_or_root(ctx->path))
505 fprintf(f,
506 "%s\tMemory Min: %s\n"
507 "%s\tMemory Low: %s\n"
508 "%s\tPgscan: %" PRIu64 "\n"
509 "%s\tLast Pgscan: %" PRIu64 "\n",
510 strempty(prefix), format_bytes_cgroup_protection(mem_min, sizeof(mem_min), ctx->memory_min),
511 strempty(prefix), format_bytes_cgroup_protection(mem_low, sizeof(mem_low), ctx->memory_low),
512 strempty(prefix), ctx->pgscan,
513 strempty(prefix), ctx->last_pgscan);
514 }
515
516 void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix) {
517 char used[FORMAT_BYTES_MAX], total[FORMAT_BYTES_MAX];
518
519 assert(ctx);
520 assert(f);
521
522 fprintf(f,
523 "%sSwap: Used: %s Total: %s\n",
524 strempty(prefix),
525 format_bytes(used, sizeof(used), ctx->swap_used),
526 format_bytes(total, sizeof(total), ctx->swap_total));
527 }