]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/oom/oomd-manager.c
Merge pull request #18481 from keszybz/rpm-restart-post-trans
[thirdparty/systemd.git] / src / oom / oomd-manager.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
9de5e321 2
5c616ecf
AZ
3#include "bus-log-control-api.h"
4#include "bus-util.h"
5#include "bus-polkit.h"
9de5e321
AZ
6#include "cgroup-util.h"
7#include "fd-util.h"
8#include "fileio.h"
408a3bbd 9#include "memory-util.h"
5c616ecf 10#include "oomd-manager-bus.h"
9de5e321
AZ
11#include "oomd-manager.h"
12#include "path-util.h"
13
14typedef struct ManagedOOMReply {
15 ManagedOOMMode mode;
16 char *path;
17 char *property;
18 unsigned limit;
19} ManagedOOMReply;
20
21static void managed_oom_reply_destroy(ManagedOOMReply *reply) {
22 assert(reply);
23 free(reply->path);
24 free(reply->property);
25}
26
27static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
28 ManagedOOMMode *mode = userdata, m;
29 const char *s;
30
31 assert(mode);
32 assert_se(s = json_variant_string(v));
33
34 m = managed_oom_mode_from_string(s);
35 if (m < 0)
7211c853 36 return json_log(v, flags, m, "%s is not a valid ManagedOOMMode", s);
9de5e321
AZ
37
38 *mode = m;
39 return 0;
40}
41
42static int process_managed_oom_reply(
43 Varlink *link,
44 JsonVariant *parameters,
45 const char *error_id,
46 VarlinkReplyFlags flags,
47 void *userdata) {
48 JsonVariant *c, *cgroups;
49 Manager *m = userdata;
50 int r = 0;
51
52 assert(m);
53
54 static const JsonDispatch dispatch_table[] = {
55 { "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMReply, mode), JSON_MANDATORY },
56 { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, path), JSON_MANDATORY },
57 { "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, property), JSON_MANDATORY },
58 { "limit", JSON_VARIANT_UNSIGNED, json_dispatch_unsigned, offsetof(ManagedOOMReply, limit), 0 },
59 {},
60 };
61
62 if (error_id) {
63 r = -EIO;
64 log_debug("Error getting ManagedOOM cgroups: %s", error_id);
65 goto finish;
66 }
67
68 cgroups = json_variant_by_key(parameters, "cgroups");
69 if (!cgroups) {
70 r = -EINVAL;
71 goto finish;
72 }
73
74 /* Skip malformed elements and keep processing in case the others are good */
75 JSON_VARIANT_ARRAY_FOREACH(c, cgroups) {
76 _cleanup_(managed_oom_reply_destroy) ManagedOOMReply reply = {};
77 OomdCGroupContext *ctx;
78 Hashmap *monitor_hm;
79 loadavg_t limit;
80 int ret;
81
82 if (!json_variant_is_object(c))
83 continue;
84
85 ret = json_dispatch(c, dispatch_table, NULL, 0, &reply);
86 if (ret == -ENOMEM) {
87 r = ret;
88 goto finish;
89 } else if (ret < 0)
90 continue;
91
92 monitor_hm = streq(reply.property, "ManagedOOMSwap") ?
93 m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;
94
95 if (reply.mode == MANAGED_OOM_AUTO) {
df7f3eab 96 (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(reply.path)));
9de5e321
AZ
97 continue;
98 }
99
100 limit = m->default_mem_pressure_limit;
101
102 if (streq(reply.property, "ManagedOOMMemoryPressure")) {
0a9f9344 103 if (reply.limit > 10000)
9de5e321
AZ
104 continue;
105 else if (reply.limit != 0) {
0a9f9344 106 ret = store_loadavg_fixed_point((unsigned long) reply.limit / 100, (unsigned long) reply.limit % 100, &limit);
9de5e321
AZ
107 if (ret < 0)
108 continue;
109 }
110 }
111
df7f3eab 112 ret = oomd_insert_cgroup_context(NULL, monitor_hm, empty_to_root(reply.path));
9de5e321
AZ
113 if (ret == -ENOMEM) {
114 r = ret;
115 goto finish;
116 }
117
118 /* Always update the limit in case it was changed. For non-memory pressure detection the value is
119 * ignored so always updating it here is not a problem. */
df7f3eab 120 ctx = hashmap_get(monitor_hm, empty_to_root(reply.path));
9de5e321
AZ
121 if (ctx)
122 ctx->mem_pressure_limit = limit;
123 }
124
125finish:
126 if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
127 m->varlink = varlink_close_unref(link);
128
129 return r;
130}
131
132/* Fill `new_h` with `path`'s descendent OomdCGroupContexts. Only include descendent cgroups that are possible
133 * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
134 *
135 * This function ignores most errors in order to handle cgroups that may have been cleaned up while populating
136 * the hashmap.
137 *
138 * `new_h` is of the form { key: cgroup paths -> value: OomdCGroupContext } */
139static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
140 _cleanup_free_ char *subpath = NULL;
141 _cleanup_closedir_ DIR *d = NULL;
142 int r;
143
144 assert(new_h);
145 assert(path);
146
147 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
148 if (r < 0)
149 return r;
150
151 r = cg_read_subgroup(d, &subpath);
152 if (r < 0)
153 return r;
154 else if (r == 0) { /* No subgroups? We're a leaf node */
155 r = oomd_insert_cgroup_context(NULL, new_h, path);
156 return (r == -ENOMEM) ? r : 0;
157 }
158
159 do {
160 _cleanup_free_ char *cg_path = NULL;
161 bool oom_group;
162
163 cg_path = path_join(empty_to_root(path), subpath);
164 if (!cg_path)
165 return -ENOMEM;
166
167 subpath = mfree(subpath);
168
169 r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group);
170 /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
171 if (r < 0)
172 return (r == -ENOMEM) ? r : 0;
173
349a2003 174 if (oom_group)
9de5e321 175 r = oomd_insert_cgroup_context(NULL, new_h, cg_path);
349a2003 176 else
9de5e321 177 r = recursively_get_cgroup_context(new_h, cg_path);
349a2003
AZ
178 if (r == -ENOMEM)
179 return r;
9de5e321
AZ
180 } while ((r = cg_read_subgroup(d, &subpath)) > 0);
181
182 return 0;
183}
184
185static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) {
186 _cleanup_hashmap_free_ Hashmap *new_base = NULL;
187 OomdCGroupContext *ctx;
188 int r;
189
190 assert(monitored_cgroups);
191
192 new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops);
193 if (!new_base)
194 return -ENOMEM;
195
196 HASHMAP_FOREACH(ctx, *monitored_cgroups) {
197 /* Skip most errors since the cgroup we're trying to update might not exist anymore. */
198 r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path);
199 if (r == -ENOMEM)
200 return r;
201 }
202
203 hashmap_free(*monitored_cgroups);
204 *monitored_cgroups = TAKE_PTR(new_base);
205
206 return 0;
207}
208
209static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) {
210 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
211 OomdCGroupContext *ctx;
212 int r;
213
214 assert(monitored_cgroups);
215 assert(ret_candidates);
216
217 candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
218 if (!candidates)
219 return -ENOMEM;
220
221 HASHMAP_FOREACH(ctx, monitored_cgroups) {
222 r = recursively_get_cgroup_context(candidates, ctx->path);
223 if (r == -ENOMEM)
224 return r;
225 }
226
227 *ret_candidates = TAKE_PTR(candidates);
228
229 return 0;
230}
231
232static int acquire_managed_oom_connect(Manager *m) {
233 _cleanup_(varlink_close_unrefp) Varlink *link = NULL;
234 int r;
235
236 assert(m);
237 assert(m->event);
238
239 r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM);
240 if (r < 0)
241 return log_error_errno(r, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM);
242
243 (void) varlink_set_userdata(link, m);
244 (void) varlink_set_description(link, "oomd");
245 (void) varlink_set_relative_timeout(link, USEC_INFINITY);
246
247 r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
248 if (r < 0)
249 return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
250
251 r = varlink_bind_reply(link, process_managed_oom_reply);
252 if (r < 0)
253 return log_error_errno(r, "Failed to bind reply callback: %m");
254
255 r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL);
256 if (r < 0)
257 return log_error_errno(r, "Failed to observe varlink call: %m");
258
259 m->varlink = TAKE_PTR(link);
260 return 0;
261}
262
263static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
264 _cleanup_set_free_ Set *targets = NULL;
265 Manager *m = userdata;
266 usec_t usec_now;
267 int r;
268
269 assert(s);
270 assert(userdata);
271
272 /* Reset timer */
273 r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
274 if (r < 0)
275 return log_error_errno(r, "Failed to reset event timer");
276
277 r = sd_event_source_set_time_relative(s, INTERVAL_USEC);
278 if (r < 0)
279 return log_error_errno(r, "Failed to set relative time for timer");
280
281 /* Reconnect if our connection dropped */
282 if (!m->varlink) {
283 r = acquire_managed_oom_connect(m);
284 if (r < 0)
285 return log_error_errno(r, "Failed to acquire varlink connection");
286 }
287
288 /* Update the cgroups used for detection/action */
289 r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts);
290 if (r == -ENOMEM)
291 return log_error_errno(r, "Failed to update monitored swap cgroup contexts");
292
293 r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
294 if (r == -ENOMEM)
295 return log_error_errno(r, "Failed to update monitored memory pressure cgroup contexts");
296
297 r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
408a3bbd
AZ
298 /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
299 * Allow ENOENT in the event that swap is disabled on the system. */
300 if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
9de5e321 301 return log_error_errno(r, "Failed to acquire system context");
408a3bbd
AZ
302 else if (r == -ENOENT)
303 zero(m->system_context);
9de5e321 304
924c89e9
AZ
305 if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts))
306 m->last_reclaim_at = usec_now;
307
9de5e321
AZ
308 /* If we're still recovering from a kill, don't try to kill again yet */
309 if (m->post_action_delay_start > 0) {
310 if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
311 return 0;
312 else
313 m->post_action_delay_start = 0;
314 }
315
c20aa7b1 316 r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
9de5e321
AZ
317 if (r == -ENOMEM)
318 return log_error_errno(r, "Failed to check if memory pressure exceeded limits");
319 else if (r == 1) {
924c89e9 320 /* Check if there was reclaim activity in the given interval. The concern is the following case:
9de5e321
AZ
321 * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
322 * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
323 * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
324 * to kill something (it won't help anyways). */
924c89e9 325 if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) {
9de5e321
AZ
326 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
327 OomdCGroupContext *t;
328
329 r = get_monitored_cgroup_contexts_candidates(m->monitored_mem_pressure_cgroup_contexts, &candidates);
330 if (r == -ENOMEM)
331 return log_error_errno(r, "Failed to get monitored memory pressure cgroup candidates");
332
333 SET_FOREACH(t, targets) {
334 log_notice("Memory pressure for %s is greater than %lu for more than %"PRIu64" seconds and there was reclaim activity",
c20aa7b1 335 t->path, LOAD_INT(t->mem_pressure_limit), m->default_mem_pressure_duration_usec / USEC_PER_SEC);
9de5e321
AZ
336
337 r = oomd_kill_by_pgscan(candidates, t->path, m->dry_run);
338 if (r == -ENOMEM)
339 return log_error_errno(r, "Failed to kill cgroup processes by pgscan");
340 if (r < 0)
341 log_info("Failed to kill any cgroup(s) under %s based on pressure", t->path);
342 else {
343 /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
344 m->post_action_delay_start = usec_now;
345 return 0;
346 }
347 }
348 }
349 }
350
351 if (oomd_swap_free_below(&m->system_context, (100 - m->swap_used_limit))) {
352 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
353
354 log_notice("Swap used (%"PRIu64") / total (%"PRIu64") is more than %u%%",
355 m->system_context.swap_used, m->system_context.swap_total, m->swap_used_limit);
356
357 r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
358 if (r == -ENOMEM)
359 return log_error_errno(r, "Failed to get monitored swap cgroup candidates");
360
361 r = oomd_kill_by_swap_usage(candidates, m->dry_run);
362 if (r == -ENOMEM)
363 return log_error_errno(r, "Failed to kill cgroup processes by swap usage");
364 if (r < 0)
365 log_info("Failed to kill any cgroup(s) based on swap");
366 else {
367 m->post_action_delay_start = usec_now;
368 return 0;
369 }
370 }
371
372 return 0;
373}
374
375static int monitor_cgroup_contexts(Manager *m) {
376 _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
377 int r;
378
379 assert(m);
380 assert(m->event);
381
382 r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_cgroup_contexts_handler, m);
383 if (r < 0)
384 return r;
385
386 r = sd_event_source_set_exit_on_failure(s, true);
387 if (r < 0)
388 return r;
389
390 r = sd_event_source_set_enabled(s, SD_EVENT_ON);
391 if (r < 0)
392 return r;
393
394 (void) sd_event_source_set_description(s, "oomd-timer");
395
396 m->cgroup_context_event_source = TAKE_PTR(s);
397 return 0;
398}
399
400void manager_free(Manager *m) {
401 assert(m);
402
403 varlink_close_unref(m->varlink);
404 sd_event_source_unref(m->cgroup_context_event_source);
405 sd_event_unref(m->event);
406
5c616ecf
AZ
407 bus_verify_polkit_async_registry_free(m->polkit_registry);
408 sd_bus_flush_close_unref(m->bus);
409
9de5e321
AZ
410 hashmap_free(m->monitored_swap_cgroup_contexts);
411 hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
412
413 free(m);
414}
415
416int manager_new(Manager **ret) {
417 _cleanup_(manager_freep) Manager *m = NULL;
418 int r;
419
420 assert(ret);
421
422 m = new0(Manager, 1);
423 if (!m)
424 return -ENOMEM;
425
426 r = sd_event_default(&m->event);
427 if (r < 0)
428 return r;
429
430 (void) sd_event_set_watchdog(m->event, true);
431
432 r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL);
433 if (r < 0)
434 return r;
435
436 r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL);
437 if (r < 0)
438 return r;
439
440 m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
441 if (!m->monitored_swap_cgroup_contexts)
442 return -ENOMEM;
443
444 m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
445 if (!m->monitored_mem_pressure_cgroup_contexts)
446 return -ENOMEM;
447
448 *ret = TAKE_PTR(m);
449 return 0;
450}
451
5c616ecf
AZ
452static int manager_connect_bus(Manager *m) {
453 int r;
454
455 assert(m);
456 assert(!m->bus);
457
458 r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom");
459 if (r < 0)
460 return log_error_errno(r, "Failed to connect to bus: %m");
461
c9a00f5a 462 r = bus_add_implementation(m->bus, &manager_object, m);
5c616ecf 463 if (r < 0)
c9a00f5a 464 return r;
5c616ecf
AZ
465
466 r = bus_log_control_api_register(m->bus);
467 if (r < 0)
468 return r;
469
470 r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL);
471 if (r < 0)
472 return log_error_errno(r, "Failed to request name: %m");
473
474 r = sd_bus_attach_event(m->bus, m->event, 0);
475 if (r < 0)
476 return log_error_errno(r, "Failed to attach bus to event loop: %m");
477
478 return 0;
479}
480
0a9f9344
AZ
481int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit_permyriad, usec_t mem_pressure_usec) {
482 unsigned long l, f;
9de5e321
AZ
483 int r;
484
485 assert(m);
486
487 m->dry_run = dry_run;
488
489 m->swap_used_limit = swap_used_limit != -1 ? swap_used_limit : DEFAULT_SWAP_USED_LIMIT;
490 assert(m->swap_used_limit <= 100);
491
0a9f9344
AZ
492 if (mem_pressure_limit_permyriad != -1) {
493 assert(mem_pressure_limit_permyriad <= 10000);
494
495 l = mem_pressure_limit_permyriad / 100;
496 f = mem_pressure_limit_permyriad % 100;
497 } else {
498 l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT;
499 f = 0;
500 }
501 r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit);
9de5e321
AZ
502 if (r < 0)
503 return r;
504
c20aa7b1
AZ
505 m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC;
506
5c616ecf
AZ
507 r = manager_connect_bus(m);
508 if (r < 0)
509 return r;
510
9de5e321
AZ
511 r = acquire_managed_oom_connect(m);
512 if (r < 0)
513 return r;
514
515 r = monitor_cgroup_contexts(m);
516 if (r < 0)
517 return r;
518
519 return 0;
520}
5c616ecf
AZ
521
522int manager_get_dump_string(Manager *m, char **ret) {
523 _cleanup_free_ char *dump = NULL;
524 _cleanup_fclose_ FILE *f = NULL;
c20aa7b1 525 char buf[FORMAT_TIMESPAN_MAX];
5c616ecf
AZ
526 OomdCGroupContext *c;
527 size_t size;
528 char *key;
529 int r;
530
531 assert(m);
532 assert(ret);
533
534 f = open_memstream_unlocked(&dump, &size);
535 if (!f)
536 return -errno;
537
538 fprintf(f,
539 "Dry Run: %s\n"
540 "Swap Used Limit: %u%%\n"
0a9f9344 541 "Default Memory Pressure Limit: %lu.%02lu%%\n"
c20aa7b1 542 "Default Memory Pressure Duration: %s\n"
5c616ecf
AZ
543 "System Context:\n",
544 yes_no(m->dry_run),
545 m->swap_used_limit,
0a9f9344 546 LOAD_INT(m->default_mem_pressure_limit), LOAD_FRAC(m->default_mem_pressure_limit),
c20aa7b1 547 format_timespan(buf, sizeof(buf), m->default_mem_pressure_duration_usec, USEC_PER_SEC));
5c616ecf
AZ
548 oomd_dump_system_context(&m->system_context, f, "\t");
549
550 fprintf(f, "Swap Monitored CGroups:\n");
551 HASHMAP_FOREACH_KEY(c, key, m->monitored_swap_cgroup_contexts)
552 oomd_dump_swap_cgroup_context(c, f, "\t");
553
554 fprintf(f, "Memory Pressure Monitored CGroups:\n");
555 HASHMAP_FOREACH_KEY(c, key, m->monitored_mem_pressure_cgroup_contexts)
556 oomd_dump_memory_pressure_cgroup_context(c, f, "\t");
557
558 r = fflush_and_check(f);
559 if (r < 0)
560 return r;
561
562 f = safe_fclose(f);
563
564 *ret = TAKE_PTR(dump);
565 return 0;
566}