]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/oom/oomd-manager.c
Merge pull request #17185 from yuwata/ethtool-update
[thirdparty/systemd.git] / src / oom / oomd-manager.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include "bus-log-control-api.h"
4 #include "bus-util.h"
5 #include "bus-polkit.h"
6 #include "cgroup-util.h"
7 #include "fd-util.h"
8 #include "fileio.h"
9 #include "oomd-manager-bus.h"
10 #include "oomd-manager.h"
11 #include "path-util.h"
12
13 typedef struct ManagedOOMReply {
14 ManagedOOMMode mode;
15 char *path;
16 char *property;
17 unsigned limit;
18 } ManagedOOMReply;
19
20 static void managed_oom_reply_destroy(ManagedOOMReply *reply) {
21 assert(reply);
22 free(reply->path);
23 free(reply->property);
24 }
25
26 static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
27 ManagedOOMMode *mode = userdata, m;
28 const char *s;
29
30 assert(mode);
31 assert_se(s = json_variant_string(v));
32
33 m = managed_oom_mode_from_string(s);
34 if (m < 0)
35 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), "%s is not a valid ManagedOOMMode", s);
36
37 *mode = m;
38 return 0;
39 }
40
41 static int process_managed_oom_reply(
42 Varlink *link,
43 JsonVariant *parameters,
44 const char *error_id,
45 VarlinkReplyFlags flags,
46 void *userdata) {
47 JsonVariant *c, *cgroups;
48 Manager *m = userdata;
49 int r = 0;
50
51 assert(m);
52
53 static const JsonDispatch dispatch_table[] = {
54 { "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMReply, mode), JSON_MANDATORY },
55 { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, path), JSON_MANDATORY },
56 { "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, property), JSON_MANDATORY },
57 { "limit", JSON_VARIANT_UNSIGNED, json_dispatch_unsigned, offsetof(ManagedOOMReply, limit), 0 },
58 {},
59 };
60
61 if (error_id) {
62 r = -EIO;
63 log_debug("Error getting ManagedOOM cgroups: %s", error_id);
64 goto finish;
65 }
66
67 cgroups = json_variant_by_key(parameters, "cgroups");
68 if (!cgroups) {
69 r = -EINVAL;
70 goto finish;
71 }
72
73 /* Skip malformed elements and keep processing in case the others are good */
74 JSON_VARIANT_ARRAY_FOREACH(c, cgroups) {
75 _cleanup_(managed_oom_reply_destroy) ManagedOOMReply reply = {};
76 OomdCGroupContext *ctx;
77 Hashmap *monitor_hm;
78 loadavg_t limit;
79 int ret;
80
81 if (!json_variant_is_object(c))
82 continue;
83
84 ret = json_dispatch(c, dispatch_table, NULL, 0, &reply);
85 if (ret == -ENOMEM) {
86 r = ret;
87 goto finish;
88 } else if (ret < 0)
89 continue;
90
91 monitor_hm = streq(reply.property, "ManagedOOMSwap") ?
92 m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;
93
94 if (reply.mode == MANAGED_OOM_AUTO) {
95 (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, reply.path));
96 continue;
97 }
98
99 limit = m->default_mem_pressure_limit;
100
101 if (streq(reply.property, "ManagedOOMMemoryPressure")) {
102 if (reply.limit > 100)
103 continue;
104 else if (reply.limit != 0) {
105 ret = store_loadavg_fixed_point((unsigned long) reply.limit, 0, &limit);
106 if (ret < 0)
107 continue;
108 }
109 }
110
111 ret = oomd_insert_cgroup_context(NULL, monitor_hm, reply.path);
112 if (ret == -ENOMEM) {
113 r = ret;
114 goto finish;
115 }
116
117 /* Always update the limit in case it was changed. For non-memory pressure detection the value is
118 * ignored so always updating it here is not a problem. */
119 ctx = hashmap_get(monitor_hm, reply.path);
120 if (ctx)
121 ctx->mem_pressure_limit = limit;
122 }
123
124 finish:
125 if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
126 m->varlink = varlink_close_unref(link);
127
128 return r;
129 }
130
131 /* Fill `new_h` with `path`'s descendent OomdCGroupContexts. Only include descendent cgroups that are possible
132 * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
133 *
134 * This function ignores most errors in order to handle cgroups that may have been cleaned up while populating
135 * the hashmap.
136 *
137 * `new_h` is of the form { key: cgroup paths -> value: OomdCGroupContext } */
138 static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
139 _cleanup_free_ char *subpath = NULL;
140 _cleanup_closedir_ DIR *d = NULL;
141 int r;
142
143 assert(new_h);
144 assert(path);
145
146 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
147 if (r < 0)
148 return r;
149
150 r = cg_read_subgroup(d, &subpath);
151 if (r < 0)
152 return r;
153 else if (r == 0) { /* No subgroups? We're a leaf node */
154 r = oomd_insert_cgroup_context(NULL, new_h, path);
155 return (r == -ENOMEM) ? r : 0;
156 }
157
158 do {
159 _cleanup_free_ char *cg_path = NULL;
160 bool oom_group;
161
162 cg_path = path_join(empty_to_root(path), subpath);
163 if (!cg_path)
164 return -ENOMEM;
165
166 subpath = mfree(subpath);
167
168 r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group);
169 /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
170 if (r < 0)
171 return (r == -ENOMEM) ? r : 0;
172
173 if (oom_group)
174 r = oomd_insert_cgroup_context(NULL, new_h, cg_path);
175 else
176 r = recursively_get_cgroup_context(new_h, cg_path);
177 if (r == -ENOMEM)
178 return r;
179 } while ((r = cg_read_subgroup(d, &subpath)) > 0);
180
181 return 0;
182 }
183
184 static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) {
185 _cleanup_hashmap_free_ Hashmap *new_base = NULL;
186 OomdCGroupContext *ctx;
187 int r;
188
189 assert(monitored_cgroups);
190
191 new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops);
192 if (!new_base)
193 return -ENOMEM;
194
195 HASHMAP_FOREACH(ctx, *monitored_cgroups) {
196 /* Skip most errors since the cgroup we're trying to update might not exist anymore. */
197 r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path);
198 if (r == -ENOMEM)
199 return r;
200 }
201
202 hashmap_free(*monitored_cgroups);
203 *monitored_cgroups = TAKE_PTR(new_base);
204
205 return 0;
206 }
207
208 static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) {
209 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
210 OomdCGroupContext *ctx;
211 int r;
212
213 assert(monitored_cgroups);
214 assert(ret_candidates);
215
216 candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
217 if (!candidates)
218 return -ENOMEM;
219
220 HASHMAP_FOREACH(ctx, monitored_cgroups) {
221 r = recursively_get_cgroup_context(candidates, ctx->path);
222 if (r == -ENOMEM)
223 return r;
224 }
225
226 *ret_candidates = TAKE_PTR(candidates);
227
228 return 0;
229 }
230
231 static int acquire_managed_oom_connect(Manager *m) {
232 _cleanup_(varlink_close_unrefp) Varlink *link = NULL;
233 int r;
234
235 assert(m);
236 assert(m->event);
237
238 r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM);
239 if (r < 0)
240 return log_error_errno(r, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM);
241
242 (void) varlink_set_userdata(link, m);
243 (void) varlink_set_description(link, "oomd");
244 (void) varlink_set_relative_timeout(link, USEC_INFINITY);
245
246 r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
247 if (r < 0)
248 return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
249
250 r = varlink_bind_reply(link, process_managed_oom_reply);
251 if (r < 0)
252 return log_error_errno(r, "Failed to bind reply callback: %m");
253
254 r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL);
255 if (r < 0)
256 return log_error_errno(r, "Failed to observe varlink call: %m");
257
258 m->varlink = TAKE_PTR(link);
259 return 0;
260 }
261
262 static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
263 _cleanup_set_free_ Set *targets = NULL;
264 Manager *m = userdata;
265 usec_t usec_now;
266 int r;
267
268 assert(s);
269 assert(userdata);
270
271 /* Reset timer */
272 r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
273 if (r < 0)
274 return log_error_errno(r, "Failed to reset event timer");
275
276 r = sd_event_source_set_time_relative(s, INTERVAL_USEC);
277 if (r < 0)
278 return log_error_errno(r, "Failed to set relative time for timer");
279
280 /* Reconnect if our connection dropped */
281 if (!m->varlink) {
282 r = acquire_managed_oom_connect(m);
283 if (r < 0)
284 return log_error_errno(r, "Failed to acquire varlink connection");
285 }
286
287 /* Update the cgroups used for detection/action */
288 r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts);
289 if (r == -ENOMEM)
290 return log_error_errno(r, "Failed to update monitored swap cgroup contexts");
291
292 r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
293 if (r == -ENOMEM)
294 return log_error_errno(r, "Failed to update monitored memory pressure cgroup contexts");
295
296 r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
297 /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM */
298 if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
299 return log_error_errno(r, "Failed to acquire system context");
300
301 /* If we're still recovering from a kill, don't try to kill again yet */
302 if (m->post_action_delay_start > 0) {
303 if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
304 return 0;
305 else
306 m->post_action_delay_start = 0;
307 }
308
309 r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, PRESSURE_DURATION_USEC, &targets);
310 if (r == -ENOMEM)
311 return log_error_errno(r, "Failed to check if memory pressure exceeded limits");
312 else if (r == 1) {
313 /* Check if there was reclaim activity in the last interval. The concern is the following case:
314 * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
315 * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
316 * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
317 * to kill something (it won't help anyways). */
318 if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts)) {
319 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
320 OomdCGroupContext *t;
321
322 r = get_monitored_cgroup_contexts_candidates(m->monitored_mem_pressure_cgroup_contexts, &candidates);
323 if (r == -ENOMEM)
324 return log_error_errno(r, "Failed to get monitored memory pressure cgroup candidates");
325
326 SET_FOREACH(t, targets) {
327 log_notice("Memory pressure for %s is greater than %lu for more than %"PRIu64" seconds and there was reclaim activity",
328 t->path, LOAD_INT(t->mem_pressure_limit), PRESSURE_DURATION_USEC / USEC_PER_SEC);
329
330 r = oomd_kill_by_pgscan(candidates, t->path, m->dry_run);
331 if (r == -ENOMEM)
332 return log_error_errno(r, "Failed to kill cgroup processes by pgscan");
333 if (r < 0)
334 log_info("Failed to kill any cgroup(s) under %s based on pressure", t->path);
335 else {
336 /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
337 m->post_action_delay_start = usec_now;
338 return 0;
339 }
340 }
341 }
342 }
343
344 if (oomd_swap_free_below(&m->system_context, (100 - m->swap_used_limit))) {
345 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
346
347 log_notice("Swap used (%"PRIu64") / total (%"PRIu64") is more than %u%%",
348 m->system_context.swap_used, m->system_context.swap_total, m->swap_used_limit);
349
350 r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
351 if (r == -ENOMEM)
352 return log_error_errno(r, "Failed to get monitored swap cgroup candidates");
353
354 r = oomd_kill_by_swap_usage(candidates, m->dry_run);
355 if (r == -ENOMEM)
356 return log_error_errno(r, "Failed to kill cgroup processes by swap usage");
357 if (r < 0)
358 log_info("Failed to kill any cgroup(s) based on swap");
359 else {
360 m->post_action_delay_start = usec_now;
361 return 0;
362 }
363 }
364
365 return 0;
366 }
367
368 static int monitor_cgroup_contexts(Manager *m) {
369 _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
370 int r;
371
372 assert(m);
373 assert(m->event);
374
375 r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_cgroup_contexts_handler, m);
376 if (r < 0)
377 return r;
378
379 r = sd_event_source_set_exit_on_failure(s, true);
380 if (r < 0)
381 return r;
382
383 r = sd_event_source_set_enabled(s, SD_EVENT_ON);
384 if (r < 0)
385 return r;
386
387 (void) sd_event_source_set_description(s, "oomd-timer");
388
389 m->cgroup_context_event_source = TAKE_PTR(s);
390 return 0;
391 }
392
393 void manager_free(Manager *m) {
394 assert(m);
395
396 varlink_close_unref(m->varlink);
397 sd_event_source_unref(m->cgroup_context_event_source);
398 sd_event_unref(m->event);
399
400 bus_verify_polkit_async_registry_free(m->polkit_registry);
401 sd_bus_flush_close_unref(m->bus);
402
403 hashmap_free(m->monitored_swap_cgroup_contexts);
404 hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
405
406 free(m);
407 }
408
409 int manager_new(Manager **ret) {
410 _cleanup_(manager_freep) Manager *m = NULL;
411 int r;
412
413 assert(ret);
414
415 m = new0(Manager, 1);
416 if (!m)
417 return -ENOMEM;
418
419 r = sd_event_default(&m->event);
420 if (r < 0)
421 return r;
422
423 (void) sd_event_set_watchdog(m->event, true);
424
425 r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL);
426 if (r < 0)
427 return r;
428
429 r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL);
430 if (r < 0)
431 return r;
432
433 m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
434 if (!m->monitored_swap_cgroup_contexts)
435 return -ENOMEM;
436
437 m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
438 if (!m->monitored_mem_pressure_cgroup_contexts)
439 return -ENOMEM;
440
441 *ret = TAKE_PTR(m);
442 return 0;
443 }
444
445 static int manager_connect_bus(Manager *m) {
446 int r;
447
448 assert(m);
449 assert(!m->bus);
450
451 r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom");
452 if (r < 0)
453 return log_error_errno(r, "Failed to connect to bus: %m");
454
455 r = bus_add_implementation(m->bus, &manager_object, m);
456 if (r < 0)
457 return r;
458
459 r = bus_log_control_api_register(m->bus);
460 if (r < 0)
461 return r;
462
463 r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL);
464 if (r < 0)
465 return log_error_errno(r, "Failed to request name: %m");
466
467 r = sd_bus_attach_event(m->bus, m->event, 0);
468 if (r < 0)
469 return log_error_errno(r, "Failed to attach bus to event loop: %m");
470
471 return 0;
472 }
473
474 int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit) {
475 unsigned long l;
476 int r;
477
478 assert(m);
479
480 m->dry_run = dry_run;
481
482 m->swap_used_limit = swap_used_limit != -1 ? swap_used_limit : DEFAULT_SWAP_USED_LIMIT;
483 assert(m->swap_used_limit <= 100);
484
485 l = mem_pressure_limit != -1 ? mem_pressure_limit : DEFAULT_MEM_PRESSURE_LIMIT;
486 r = store_loadavg_fixed_point(l, 0, &m->default_mem_pressure_limit);
487 if (r < 0)
488 return r;
489
490 r = manager_connect_bus(m);
491 if (r < 0)
492 return r;
493
494 r = acquire_managed_oom_connect(m);
495 if (r < 0)
496 return r;
497
498 r = monitor_cgroup_contexts(m);
499 if (r < 0)
500 return r;
501
502 return 0;
503 }
504
505 int manager_get_dump_string(Manager *m, char **ret) {
506 _cleanup_free_ char *dump = NULL;
507 _cleanup_fclose_ FILE *f = NULL;
508 OomdCGroupContext *c;
509 size_t size;
510 char *key;
511 int r;
512
513 assert(m);
514 assert(ret);
515
516 f = open_memstream_unlocked(&dump, &size);
517 if (!f)
518 return -errno;
519
520 fprintf(f,
521 "Dry Run: %s\n"
522 "Swap Used Limit: %u%%\n"
523 "Default Memory Pressure Limit: %lu%%\n"
524 "System Context:\n",
525 yes_no(m->dry_run),
526 m->swap_used_limit,
527 LOAD_INT(m->default_mem_pressure_limit));
528 oomd_dump_system_context(&m->system_context, f, "\t");
529
530 fprintf(f, "Swap Monitored CGroups:\n");
531 HASHMAP_FOREACH_KEY(c, key, m->monitored_swap_cgroup_contexts)
532 oomd_dump_swap_cgroup_context(c, f, "\t");
533
534 fprintf(f, "Memory Pressure Monitored CGroups:\n");
535 HASHMAP_FOREACH_KEY(c, key, m->monitored_mem_pressure_cgroup_contexts)
536 oomd_dump_memory_pressure_cgroup_context(c, f, "\t");
537
538 r = fflush_and_check(f);
539 if (r < 0)
540 return r;
541
542 f = safe_fclose(f);
543
544 *ret = TAKE_PTR(dump);
545 return 0;
546 }