]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/oom/oomd-manager.c
network: also introduce UseDomains= for [DHCPv6] section
[thirdparty/systemd.git] / src / oom / oomd-manager.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include "bus-log-control-api.h"
4 #include "bus-util.h"
5 #include "bus-polkit.h"
6 #include "cgroup-util.h"
7 #include "fd-util.h"
8 #include "fileio.h"
9 #include "memory-util.h"
10 #include "oomd-manager-bus.h"
11 #include "oomd-manager.h"
12 #include "path-util.h"
13 #include "percent-util.h"
14
15 typedef struct ManagedOOMReply {
16 ManagedOOMMode mode;
17 char *path;
18 char *property;
19 uint32_t limit;
20 } ManagedOOMReply;
21
22 static void managed_oom_reply_destroy(ManagedOOMReply *reply) {
23 assert(reply);
24 free(reply->path);
25 free(reply->property);
26 }
27
28 static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
29 ManagedOOMMode *mode = userdata, m;
30 const char *s;
31
32 assert(mode);
33 assert_se(s = json_variant_string(v));
34
35 m = managed_oom_mode_from_string(s);
36 if (m < 0)
37 return json_log(v, flags, m, "%s is not a valid ManagedOOMMode", s);
38
39 *mode = m;
40 return 0;
41 }
42
43 static int process_managed_oom_reply(
44 Varlink *link,
45 JsonVariant *parameters,
46 const char *error_id,
47 VarlinkReplyFlags flags,
48 void *userdata) {
49 JsonVariant *c, *cgroups;
50 Manager *m = userdata;
51 int r = 0;
52
53 assert(m);
54
55 static const JsonDispatch dispatch_table[] = {
56 { "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMReply, mode), JSON_MANDATORY },
57 { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, path), JSON_MANDATORY },
58 { "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, property), JSON_MANDATORY },
59 { "limit", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(ManagedOOMReply, limit), 0 },
60 {},
61 };
62
63 if (error_id) {
64 r = -EIO;
65 log_debug("Error getting ManagedOOM cgroups: %s", error_id);
66 goto finish;
67 }
68
69 cgroups = json_variant_by_key(parameters, "cgroups");
70 if (!cgroups) {
71 r = -EINVAL;
72 goto finish;
73 }
74
75 /* Skip malformed elements and keep processing in case the others are good */
76 JSON_VARIANT_ARRAY_FOREACH(c, cgroups) {
77 _cleanup_(managed_oom_reply_destroy) ManagedOOMReply reply = {};
78 OomdCGroupContext *ctx;
79 Hashmap *monitor_hm;
80 loadavg_t limit;
81 int ret;
82
83 if (!json_variant_is_object(c))
84 continue;
85
86 ret = json_dispatch(c, dispatch_table, NULL, 0, &reply);
87 if (ret == -ENOMEM) {
88 r = ret;
89 goto finish;
90 }
91 if (ret < 0)
92 continue;
93
94 monitor_hm = streq(reply.property, "ManagedOOMSwap") ?
95 m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;
96
97 if (reply.mode == MANAGED_OOM_AUTO) {
98 (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(reply.path)));
99 continue;
100 }
101
102 limit = m->default_mem_pressure_limit;
103
104 if (streq(reply.property, "ManagedOOMMemoryPressure") && reply.limit > 0) {
105 int permyriad = UINT32_SCALE_TO_PERMYRIAD(reply.limit);
106
107 ret = store_loadavg_fixed_point(
108 (unsigned long) permyriad / 100,
109 (unsigned long) permyriad % 100,
110 &limit);
111 if (ret < 0)
112 continue;
113 }
114
115 ret = oomd_insert_cgroup_context(NULL, monitor_hm, reply.path);
116 if (ret == -ENOMEM) {
117 r = ret;
118 goto finish;
119 }
120
121 /* Always update the limit in case it was changed. For non-memory pressure detection the value is
122 * ignored so always updating it here is not a problem. */
123 ctx = hashmap_get(monitor_hm, empty_to_root(reply.path));
124 if (ctx)
125 ctx->mem_pressure_limit = limit;
126 }
127
128 finish:
129 if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
130 m->varlink = varlink_close_unref(link);
131
132 return r;
133 }
134
135 /* Fill `new_h` with `path`'s descendent OomdCGroupContexts. Only include descendent cgroups that are possible
136 * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
137 *
138 * This function ignores most errors in order to handle cgroups that may have been cleaned up while populating
139 * the hashmap.
140 *
141 * `new_h` is of the form { key: cgroup paths -> value: OomdCGroupContext } */
142 static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
143 _cleanup_free_ char *subpath = NULL;
144 _cleanup_closedir_ DIR *d = NULL;
145 int r;
146
147 assert(new_h);
148 assert(path);
149
150 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
151 if (r < 0)
152 return r;
153
154 r = cg_read_subgroup(d, &subpath);
155 if (r < 0)
156 return r;
157 else if (r == 0) { /* No subgroups? We're a leaf node */
158 r = oomd_insert_cgroup_context(NULL, new_h, path);
159 return (r == -ENOMEM) ? r : 0;
160 }
161
162 do {
163 _cleanup_free_ char *cg_path = NULL;
164 bool oom_group;
165
166 cg_path = path_join(empty_to_root(path), subpath);
167 if (!cg_path)
168 return -ENOMEM;
169
170 subpath = mfree(subpath);
171
172 r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group);
173 /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
174 if (r < 0)
175 return (r == -ENOMEM) ? r : 0;
176
177 if (oom_group)
178 r = oomd_insert_cgroup_context(NULL, new_h, cg_path);
179 else
180 r = recursively_get_cgroup_context(new_h, cg_path);
181 if (r == -ENOMEM)
182 return r;
183 } while ((r = cg_read_subgroup(d, &subpath)) > 0);
184
185 return 0;
186 }
187
188 static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) {
189 _cleanup_hashmap_free_ Hashmap *new_base = NULL;
190 OomdCGroupContext *ctx;
191 int r;
192
193 assert(monitored_cgroups);
194
195 new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops);
196 if (!new_base)
197 return -ENOMEM;
198
199 HASHMAP_FOREACH(ctx, *monitored_cgroups) {
200 /* Skip most errors since the cgroup we're trying to update might not exist anymore. */
201 r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path);
202 if (r == -ENOMEM)
203 return r;
204 }
205
206 hashmap_free(*monitored_cgroups);
207 *monitored_cgroups = TAKE_PTR(new_base);
208
209 return 0;
210 }
211
212 static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) {
213 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
214 OomdCGroupContext *ctx;
215 int r;
216
217 assert(monitored_cgroups);
218 assert(ret_candidates);
219
220 candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
221 if (!candidates)
222 return -ENOMEM;
223
224 HASHMAP_FOREACH(ctx, monitored_cgroups) {
225 r = recursively_get_cgroup_context(candidates, ctx->path);
226 if (r == -ENOMEM)
227 return r;
228 }
229
230 *ret_candidates = TAKE_PTR(candidates);
231
232 return 0;
233 }
234
235 static int acquire_managed_oom_connect(Manager *m) {
236 _cleanup_(varlink_close_unrefp) Varlink *link = NULL;
237 int r;
238
239 assert(m);
240 assert(m->event);
241
242 r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM);
243 if (r < 0)
244 return log_error_errno(r, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM);
245
246 (void) varlink_set_userdata(link, m);
247 (void) varlink_set_description(link, "oomd");
248 (void) varlink_set_relative_timeout(link, USEC_INFINITY);
249
250 r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
251 if (r < 0)
252 return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
253
254 r = varlink_bind_reply(link, process_managed_oom_reply);
255 if (r < 0)
256 return log_error_errno(r, "Failed to bind reply callback: %m");
257
258 r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL);
259 if (r < 0)
260 return log_error_errno(r, "Failed to observe varlink call: %m");
261
262 m->varlink = TAKE_PTR(link);
263 return 0;
264 }
265
266 static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
267 _cleanup_set_free_ Set *targets = NULL;
268 Manager *m = userdata;
269 usec_t usec_now;
270 int r;
271
272 assert(s);
273 assert(userdata);
274
275 /* Reset timer */
276 r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
277 if (r < 0)
278 return log_error_errno(r, "Failed to reset event timer");
279
280 r = sd_event_source_set_time_relative(s, INTERVAL_USEC);
281 if (r < 0)
282 return log_error_errno(r, "Failed to set relative time for timer");
283
284 /* Reconnect if our connection dropped */
285 if (!m->varlink) {
286 r = acquire_managed_oom_connect(m);
287 if (r < 0)
288 return log_error_errno(r, "Failed to acquire varlink connection");
289 }
290
291 /* Update the cgroups used for detection/action */
292 r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts);
293 if (r == -ENOMEM)
294 return log_error_errno(r, "Failed to update monitored swap cgroup contexts");
295
296 r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
297 if (r == -ENOMEM)
298 return log_error_errno(r, "Failed to update monitored memory pressure cgroup contexts");
299
300 r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
301 /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
302 * Allow ENOENT in the event that swap is disabled on the system. */
303 if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
304 return log_error_errno(r, "Failed to acquire system context");
305 else if (r == -ENOENT)
306 zero(m->system_context);
307
308 if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts))
309 m->last_reclaim_at = usec_now;
310
311 /* If we're still recovering from a kill, don't try to kill again yet */
312 if (m->post_action_delay_start > 0) {
313 if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
314 return 0;
315 else
316 m->post_action_delay_start = 0;
317 }
318
319 r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
320 if (r == -ENOMEM)
321 return log_error_errno(r, "Failed to check if memory pressure exceeded limits");
322 else if (r == 1) {
323 /* Check if there was reclaim activity in the given interval. The concern is the following case:
324 * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
325 * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
326 * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
327 * to kill something (it won't help anyways). */
328 if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) {
329 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
330 OomdCGroupContext *t;
331
332 r = get_monitored_cgroup_contexts_candidates(m->monitored_mem_pressure_cgroup_contexts, &candidates);
333 if (r == -ENOMEM)
334 return log_error_errno(r, "Failed to get monitored memory pressure cgroup candidates");
335
336 SET_FOREACH(t, targets) {
337 log_notice("Memory pressure for %s is greater than %lu for more than %"PRIu64" seconds and there was reclaim activity",
338 t->path, LOAD_INT(t->mem_pressure_limit), m->default_mem_pressure_duration_usec / USEC_PER_SEC);
339
340 r = oomd_kill_by_pgscan(candidates, t->path, m->dry_run);
341 if (r == -ENOMEM)
342 return log_error_errno(r, "Failed to kill cgroup processes by pgscan");
343 if (r < 0)
344 log_info("Failed to kill any cgroup(s) under %s based on pressure", t->path);
345 else {
346 /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
347 m->post_action_delay_start = usec_now;
348 return 0;
349 }
350 }
351 }
352 }
353
354 if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
355 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
356
357 log_notice("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
358 m->system_context.swap_used, m->system_context.swap_total, PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
359
360 r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
361 if (r == -ENOMEM)
362 return log_error_errno(r, "Failed to get monitored swap cgroup candidates");
363
364 r = oomd_kill_by_swap_usage(candidates, m->dry_run);
365 if (r == -ENOMEM)
366 return log_error_errno(r, "Failed to kill cgroup processes by swap usage");
367 if (r < 0)
368 log_info("Failed to kill any cgroup(s) based on swap");
369 else {
370 m->post_action_delay_start = usec_now;
371 return 0;
372 }
373 }
374
375 return 0;
376 }
377
378 static int monitor_cgroup_contexts(Manager *m) {
379 _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
380 int r;
381
382 assert(m);
383 assert(m->event);
384
385 r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_cgroup_contexts_handler, m);
386 if (r < 0)
387 return r;
388
389 r = sd_event_source_set_exit_on_failure(s, true);
390 if (r < 0)
391 return r;
392
393 r = sd_event_source_set_enabled(s, SD_EVENT_ON);
394 if (r < 0)
395 return r;
396
397 (void) sd_event_source_set_description(s, "oomd-timer");
398
399 m->cgroup_context_event_source = TAKE_PTR(s);
400 return 0;
401 }
402
403 Manager* manager_free(Manager *m) {
404 assert(m);
405
406 varlink_close_unref(m->varlink);
407 sd_event_source_unref(m->cgroup_context_event_source);
408 sd_event_unref(m->event);
409
410 bus_verify_polkit_async_registry_free(m->polkit_registry);
411 sd_bus_flush_close_unref(m->bus);
412
413 hashmap_free(m->monitored_swap_cgroup_contexts);
414 hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
415
416 return mfree(m);
417 }
418
419 int manager_new(Manager **ret) {
420 _cleanup_(manager_freep) Manager *m = NULL;
421 int r;
422
423 assert(ret);
424
425 m = new0(Manager, 1);
426 if (!m)
427 return -ENOMEM;
428
429 r = sd_event_default(&m->event);
430 if (r < 0)
431 return r;
432
433 (void) sd_event_set_watchdog(m->event, true);
434
435 r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL);
436 if (r < 0)
437 return r;
438
439 r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL);
440 if (r < 0)
441 return r;
442
443 m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
444 if (!m->monitored_swap_cgroup_contexts)
445 return -ENOMEM;
446
447 m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
448 if (!m->monitored_mem_pressure_cgroup_contexts)
449 return -ENOMEM;
450
451 *ret = TAKE_PTR(m);
452 return 0;
453 }
454
455 static int manager_connect_bus(Manager *m) {
456 int r;
457
458 assert(m);
459 assert(!m->bus);
460
461 r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom");
462 if (r < 0)
463 return log_error_errno(r, "Failed to connect to bus: %m");
464
465 r = bus_add_implementation(m->bus, &manager_object, m);
466 if (r < 0)
467 return r;
468
469 r = bus_log_control_api_register(m->bus);
470 if (r < 0)
471 return r;
472
473 r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL);
474 if (r < 0)
475 return log_error_errno(r, "Failed to request name: %m");
476
477 r = sd_bus_attach_event(m->bus, m->event, 0);
478 if (r < 0)
479 return log_error_errno(r, "Failed to attach bus to event loop: %m");
480
481 return 0;
482 }
483
484 int manager_start(
485 Manager *m,
486 bool dry_run,
487 int swap_used_limit_permyriad,
488 int mem_pressure_limit_permyriad,
489 usec_t mem_pressure_usec) {
490
491 unsigned long l, f;
492 int r;
493
494 assert(m);
495
496 m->dry_run = dry_run;
497
498 m->swap_used_limit_permyriad = swap_used_limit_permyriad >= 0 ? swap_used_limit_permyriad : DEFAULT_SWAP_USED_LIMIT_PERCENT * 100;
499 assert(m->swap_used_limit_permyriad <= 10000);
500
501 if (mem_pressure_limit_permyriad >= 0) {
502 assert(mem_pressure_limit_permyriad <= 10000);
503
504 l = mem_pressure_limit_permyriad / 100;
505 f = mem_pressure_limit_permyriad % 100;
506 } else {
507 l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT;
508 f = 0;
509 }
510 r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit);
511 if (r < 0)
512 return r;
513
514 m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC;
515
516 r = manager_connect_bus(m);
517 if (r < 0)
518 return r;
519
520 r = acquire_managed_oom_connect(m);
521 if (r < 0)
522 return r;
523
524 r = monitor_cgroup_contexts(m);
525 if (r < 0)
526 return r;
527
528 return 0;
529 }
530
531 int manager_get_dump_string(Manager *m, char **ret) {
532 _cleanup_free_ char *dump = NULL;
533 _cleanup_fclose_ FILE *f = NULL;
534 char buf[FORMAT_TIMESPAN_MAX];
535 OomdCGroupContext *c;
536 size_t size;
537 char *key;
538 int r;
539
540 assert(m);
541 assert(ret);
542
543 f = open_memstream_unlocked(&dump, &size);
544 if (!f)
545 return -errno;
546
547 fprintf(f,
548 "Dry Run: %s\n"
549 "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
550 "Default Memory Pressure Limit: %lu.%02lu%%\n"
551 "Default Memory Pressure Duration: %s\n"
552 "System Context:\n",
553 yes_no(m->dry_run),
554 PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad),
555 LOAD_INT(m->default_mem_pressure_limit), LOAD_FRAC(m->default_mem_pressure_limit),
556 format_timespan(buf, sizeof(buf), m->default_mem_pressure_duration_usec, USEC_PER_SEC));
557 oomd_dump_system_context(&m->system_context, f, "\t");
558
559 fprintf(f, "Swap Monitored CGroups:\n");
560 HASHMAP_FOREACH_KEY(c, key, m->monitored_swap_cgroup_contexts)
561 oomd_dump_swap_cgroup_context(c, f, "\t");
562
563 fprintf(f, "Memory Pressure Monitored CGroups:\n");
564 HASHMAP_FOREACH_KEY(c, key, m->monitored_mem_pressure_cgroup_contexts)
565 oomd_dump_memory_pressure_cgroup_context(c, f, "\t");
566
567 r = fflush_and_check(f);
568 if (r < 0)
569 return r;
570
571 f = safe_fclose(f);
572
573 *ret = TAKE_PTR(dump);
574 return 0;
575 }