]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/oom/oomd-manager.c
tree-wide: use ASSERT_PTR more
[thirdparty/systemd.git] / src / oom / oomd-manager.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
9de5e321 2
064a5c14
DDM
3#include "sd-daemon.h"
4
5c616ecf
AZ
5#include "bus-log-control-api.h"
6#include "bus-util.h"
7#include "bus-polkit.h"
9de5e321
AZ
8#include "cgroup-util.h"
9#include "fd-util.h"
10#include "fileio.h"
064a5c14 11#include "format-util.h"
408a3bbd 12#include "memory-util.h"
5c616ecf 13#include "oomd-manager-bus.h"
9de5e321
AZ
14#include "oomd-manager.h"
15#include "path-util.h"
d9d3f05d 16#include "percent-util.h"
9de5e321 17
71feeae4 18typedef struct ManagedOOMMessage {
9de5e321
AZ
19 ManagedOOMMode mode;
20 char *path;
21 char *property;
d06e7fb5 22 uint32_t limit;
71feeae4 23} ManagedOOMMessage;
9de5e321 24
71feeae4
DDM
25static void managed_oom_message_destroy(ManagedOOMMessage *message) {
26 assert(message);
27 free(message->path);
28 free(message->property);
9de5e321
AZ
29}
30
31static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
32 ManagedOOMMode *mode = userdata, m;
33 const char *s;
34
35 assert(mode);
36 assert_se(s = json_variant_string(v));
37
38 m = managed_oom_mode_from_string(s);
39 if (m < 0)
7211c853 40 return json_log(v, flags, m, "%s is not a valid ManagedOOMMode", s);
9de5e321
AZ
41
42 *mode = m;
43 return 0;
44}
45
064a5c14 46static int process_managed_oom_message(Manager *m, uid_t uid, JsonVariant *parameters) {
9de5e321 47 JsonVariant *c, *cgroups;
71feeae4 48 int r;
9de5e321
AZ
49
50 static const JsonDispatch dispatch_table[] = {
71feeae4
DDM
51 { "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMMessage, mode), JSON_MANDATORY },
52 { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, path), JSON_MANDATORY },
53 { "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, property), JSON_MANDATORY },
54 { "limit", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(ManagedOOMMessage, limit), 0 },
9de5e321
AZ
55 {},
56 };
57
71feeae4
DDM
58 assert(m);
59 assert(parameters);
9de5e321
AZ
60
61 cgroups = json_variant_by_key(parameters, "cgroups");
71feeae4
DDM
62 if (!cgroups)
63 return -EINVAL;
9de5e321
AZ
64
65 /* Skip malformed elements and keep processing in case the others are good */
66 JSON_VARIANT_ARRAY_FOREACH(c, cgroups) {
71feeae4 67 _cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = {};
9de5e321
AZ
68 OomdCGroupContext *ctx;
69 Hashmap *monitor_hm;
70 loadavg_t limit;
9de5e321
AZ
71
72 if (!json_variant_is_object(c))
73 continue;
74
71feeae4
DDM
75 r = json_dispatch(c, dispatch_table, NULL, 0, &message);
76 if (r == -ENOMEM)
77 return r;
78 if (r < 0)
9de5e321
AZ
79 continue;
80
064a5c14
DDM
81 if (uid != 0) {
82 uid_t cg_uid;
83
84 r = cg_path_get_owner_uid(message.path, &cg_uid);
85 if (r < 0) {
b6f6df4c 86 log_debug_errno(r, "Failed to get cgroup %s owner uid: %m", message.path);
064a5c14
DDM
87 continue;
88 }
89
90 /* Let's not be lenient for permission errors and skip processing if we receive an
91 * update for a cgroup that doesn't belong to the user. */
92 if (uid != cg_uid)
93 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
94 "cgroup path owner UID does not match sender uid "
95 "(" UID_FMT " != " UID_FMT ")", uid, cg_uid);
96 }
97
71feeae4 98 monitor_hm = streq(message.property, "ManagedOOMSwap") ?
9de5e321
AZ
99 m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;
100
71feeae4
DDM
101 if (message.mode == MANAGED_OOM_AUTO) {
102 (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(message.path)));
9de5e321
AZ
103 continue;
104 }
105
106 limit = m->default_mem_pressure_limit;
107
71feeae4
DDM
108 if (streq(message.property, "ManagedOOMMemoryPressure") && message.limit > 0) {
109 int permyriad = UINT32_SCALE_TO_PERMYRIAD(message.limit);
d06e7fb5 110
5f1d6ebd 111 r = store_loadavg_fixed_point(permyriad / 100LU, permyriad % 100LU, &limit);
71feeae4 112 if (r < 0)
9de5e321 113 continue;
9de5e321
AZ
114 }
115
71feeae4
DDM
116 r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path);
117 if (r == -ENOMEM)
118 return r;
119 if (r < 0 && r != -EEXIST)
120 log_debug_errno(r, "Failed to insert message, ignoring: %m");
9de5e321
AZ
121
122 /* Always update the limit in case it was changed. For non-memory pressure detection the value is
123 * ignored so always updating it here is not a problem. */
71feeae4 124 ctx = hashmap_get(monitor_hm, empty_to_root(message.path));
9de5e321
AZ
125 if (ctx)
126 ctx->mem_pressure_limit = limit;
127 }
128
71feeae4
DDM
129 return 0;
130}
131
064a5c14
DDM
132static int process_managed_oom_request(
133 Varlink *link,
134 JsonVariant *parameters,
135 VarlinkMethodFlags flags,
136 void *userdata) {
99534007 137 Manager *m = ASSERT_PTR(userdata);
064a5c14
DDM
138 uid_t uid;
139 int r;
140
064a5c14
DDM
141 r = varlink_get_peer_uid(link, &uid);
142 if (r < 0)
143 return log_error_errno(r, "Failed to get varlink peer uid: %m");
144
145 return process_managed_oom_message(m, uid, parameters);
146}
147
71feeae4
DDM
148static int process_managed_oom_reply(
149 Varlink *link,
150 JsonVariant *parameters,
151 const char *error_id,
152 VarlinkReplyFlags flags,
153 void *userdata) {
99534007 154 Manager *m = ASSERT_PTR(userdata);
064a5c14 155 uid_t uid;
71feeae4
DDM
156 int r;
157
71feeae4
DDM
158 if (error_id) {
159 r = -EIO;
160 log_debug("Error getting ManagedOOM cgroups: %s", error_id);
161 goto finish;
162 }
163
064a5c14
DDM
164 r = varlink_get_peer_uid(link, &uid);
165 if (r < 0) {
166 log_error_errno(r, "Failed to get varlink peer uid: %m");
167 goto finish;
168 }
169
170 r = process_managed_oom_message(m, uid, parameters);
71feeae4 171
9de5e321
AZ
172finish:
173 if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
064a5c14 174 m->varlink_client = varlink_close_unref(link);
9de5e321
AZ
175
176 return r;
177}
178
4d620b90 179/* Fill 'new_h' with 'path's descendant OomdCGroupContexts. Only include descendant cgroups that are possible
9de5e321
AZ
180 * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
181 *
4d620b90
ZJS
182 * This function ignores most errors in order to handle cgroups that may have been cleaned up while
183 * populating the hashmap.
9de5e321 184 *
4d620b90 185 * 'new_h' is of the form { key: cgroup paths -> value: OomdCGroupContext } */
9de5e321
AZ
186static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
187 _cleanup_free_ char *subpath = NULL;
188 _cleanup_closedir_ DIR *d = NULL;
189 int r;
190
191 assert(new_h);
192 assert(path);
193
194 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
195 if (r < 0)
196 return r;
197
198 r = cg_read_subgroup(d, &subpath);
199 if (r < 0)
200 return r;
201 else if (r == 0) { /* No subgroups? We're a leaf node */
202 r = oomd_insert_cgroup_context(NULL, new_h, path);
77b04c0a
AZ
203 if (r == -ENOMEM)
204 return r;
205 if (r < 0)
206 log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", path);
207 return 0;
9de5e321
AZ
208 }
209
210 do {
211 _cleanup_free_ char *cg_path = NULL;
212 bool oom_group;
213
214 cg_path = path_join(empty_to_root(path), subpath);
215 if (!cg_path)
216 return -ENOMEM;
217
218 subpath = mfree(subpath);
219
220 r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group);
221 /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
77b04c0a
AZ
222 if (r == -ENOMEM)
223 return r;
224 if (r < 0) {
225 log_debug_errno(r, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path);
226 return 0;
227 }
9de5e321 228
349a2003 229 if (oom_group)
9de5e321 230 r = oomd_insert_cgroup_context(NULL, new_h, cg_path);
349a2003 231 else
9de5e321 232 r = recursively_get_cgroup_context(new_h, cg_path);
349a2003
AZ
233 if (r == -ENOMEM)
234 return r;
77b04c0a
AZ
235 if (r < 0)
236 log_debug_errno(r, "Failed to insert or recursively get from %s, ignoring: %m", cg_path);
9de5e321
AZ
237 } while ((r = cg_read_subgroup(d, &subpath)) > 0);
238
239 return 0;
240}
241
242static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) {
243 _cleanup_hashmap_free_ Hashmap *new_base = NULL;
244 OomdCGroupContext *ctx;
245 int r;
246
247 assert(monitored_cgroups);
248
249 new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops);
250 if (!new_base)
251 return -ENOMEM;
252
253 HASHMAP_FOREACH(ctx, *monitored_cgroups) {
254 /* Skip most errors since the cgroup we're trying to update might not exist anymore. */
255 r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path);
256 if (r == -ENOMEM)
257 return r;
77b04c0a
AZ
258 if (r < 0 && !IN_SET(r, -EEXIST, -ENOENT))
259 log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", ctx->path);
9de5e321
AZ
260 }
261
262 hashmap_free(*monitored_cgroups);
263 *monitored_cgroups = TAKE_PTR(new_base);
264
265 return 0;
266}
267
268static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) {
269 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
270 OomdCGroupContext *ctx;
271 int r;
272
273 assert(monitored_cgroups);
274 assert(ret_candidates);
275
276 candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
277 if (!candidates)
278 return -ENOMEM;
279
280 HASHMAP_FOREACH(ctx, monitored_cgroups) {
281 r = recursively_get_cgroup_context(candidates, ctx->path);
282 if (r == -ENOMEM)
283 return r;
77b04c0a
AZ
284 if (r < 0)
285 log_debug_errno(r, "Failed to recursively get contexts for %s, ignoring: %m", ctx->path);
9de5e321
AZ
286 }
287
288 *ret_candidates = TAKE_PTR(candidates);
289
290 return 0;
291}
292
91cbb4bd
AZ
293static int update_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **candidates) {
294 _cleanup_hashmap_free_ Hashmap *new_candidates = NULL;
295 int r;
296
297 assert(monitored_cgroups);
298 assert(candidates);
299 assert(*candidates);
300
301 r = get_monitored_cgroup_contexts_candidates(monitored_cgroups, &new_candidates);
302 if (r < 0)
303 return log_debug_errno(r, "Failed to get candidate contexts: %m");
304
305 oomd_update_cgroup_contexts_between_hashmaps(*candidates, new_candidates);
306
307 hashmap_free(*candidates);
308 *candidates = TAKE_PTR(new_candidates);
309
310 return 0;
311}
312
9de5e321
AZ
313static int acquire_managed_oom_connect(Manager *m) {
314 _cleanup_(varlink_close_unrefp) Varlink *link = NULL;
315 int r;
316
317 assert(m);
318 assert(m->event);
319
064a5c14 320 r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM);
9de5e321 321 if (r < 0)
064a5c14 322 return log_error_errno(r, "Failed to connect to " VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM ": %m");
9de5e321
AZ
323
324 (void) varlink_set_userdata(link, m);
325 (void) varlink_set_description(link, "oomd");
326 (void) varlink_set_relative_timeout(link, USEC_INFINITY);
327
328 r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
329 if (r < 0)
330 return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
331
332 r = varlink_bind_reply(link, process_managed_oom_reply);
333 if (r < 0)
334 return log_error_errno(r, "Failed to bind reply callback: %m");
335
336 r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL);
337 if (r < 0)
338 return log_error_errno(r, "Failed to observe varlink call: %m");
339
064a5c14 340 m->varlink_client = TAKE_PTR(link);
9de5e321
AZ
341 return 0;
342}
343
81d66fab 344static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
99534007 345 Manager *m = ASSERT_PTR(userdata);
9de5e321
AZ
346 usec_t usec_now;
347 int r;
348
349 assert(s);
9de5e321
AZ
350
351 /* Reset timer */
352 r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
353 if (r < 0)
77b04c0a 354 return log_error_errno(r, "Failed to reset event timer: %m");
9de5e321 355
81d66fab 356 r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC);
9de5e321 357 if (r < 0)
77b04c0a 358 return log_error_errno(r, "Failed to set relative time for timer: %m");
9de5e321
AZ
359
360 /* Reconnect if our connection dropped */
064a5c14 361 if (!m->varlink_client) {
9de5e321
AZ
362 r = acquire_managed_oom_connect(m);
363 if (r < 0)
77b04c0a 364 return log_error_errno(r, "Failed to acquire varlink connection: %m");
9de5e321
AZ
365 }
366
47136b9d
AZ
367 /* We still try to acquire system information for oomctl even if no units want swap monitoring */
368 r = oomd_system_context_acquire("/proc/meminfo", &m->system_context);
369 /* If there are no units depending on swap actions, the only error we exit on is ENOMEM. */
370 if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
81d66fab
AZ
371 return log_error_errno(r, "Failed to acquire system context: %m");
372
373 /* Return early if nothing is requesting swap monitoring */
374 if (hashmap_isempty(m->monitored_swap_cgroup_contexts))
375 return 0;
376
377 /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the
378 * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts
379 * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent
380 * nodes are the ones that matter). */
381
030bc91c
NR
382 /* Check amount of memory available and swap free so we don't free up swap when memory is still available. */
383 if (oomd_mem_available_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) &&
cb5ce676 384 oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
81d66fab
AZ
385 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
386 _cleanup_free_ char *selected = NULL;
685b0985 387 uint64_t threshold;
81d66fab 388
cb5ce676
AZ
389 log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and "
390 "swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
391 m->system_context.mem_used, m->system_context.mem_total,
81d66fab
AZ
392 m->system_context.swap_used, m->system_context.swap_total,
393 PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
394
395 r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
396 if (r == -ENOMEM)
397 return log_oom();
398 if (r < 0)
399 log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
400
685b0985
AZ
401 threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
402 r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected);
81d66fab
AZ
403 if (r == -ENOMEM)
404 return log_oom();
405 if (r < 0)
406 log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
407 else {
d784a8d4 408 if (selected && r > 0) {
cb5ce676
AZ
409 log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and "
410 "swap used (%"PRIu64") / total (%"PRIu64") being more than "
81d66fab 411 PERMYRIAD_AS_PERCENT_FORMAT_STR,
cb5ce676
AZ
412 selected,
413 m->system_context.mem_used, m->system_context.mem_total,
414 m->system_context.swap_used, m->system_context.swap_total,
81d66fab 415 PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
d784a8d4
OS
416
417 /* send dbus signal */
418 (void) sd_bus_emit_signal(m->bus,
419 "/org/freedesktop/oom1",
420 "org.freedesktop.oom1.Manager",
421 "Killed",
422 "ss",
423 selected,
424 "memory-used");
425 }
81d66fab
AZ
426 return 0;
427 }
428 }
429
430 return 0;
431}
432
cb13961a
AZ
433static void clear_candidate_hashmapp(Manager **m) {
434 if (*m)
435 hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates);
436}
437
81d66fab 438static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
cb13961a
AZ
439 /* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we
440 * update the candidate data (in which case clear_candidates will be NULL). */
d7ac0952 441 _unused_ _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata;
81d66fab 442 _cleanup_set_free_ Set *targets = NULL;
cb13961a 443 bool in_post_action_delay = false;
99534007 444 Manager *m = ASSERT_PTR(userdata);
81d66fab
AZ
445 usec_t usec_now;
446 int r;
447
448 assert(s);
81d66fab
AZ
449
450 /* Reset timer */
451 r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
77b04c0a 452 if (r < 0)
81d66fab
AZ
453 return log_error_errno(r, "Failed to reset event timer: %m");
454
455 r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC);
456 if (r < 0)
457 return log_error_errno(r, "Failed to set relative time for timer: %m");
458
459 /* Reconnect if our connection dropped */
064a5c14 460 if (!m->varlink_client) {
81d66fab
AZ
461 r = acquire_managed_oom_connect(m);
462 if (r < 0)
463 return log_error_errno(r, "Failed to acquire varlink connection: %m");
464 }
9de5e321 465
81d66fab 466 /* Return early if nothing is requesting memory pressure monitoring */
cb13961a 467 if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts))
81d66fab 468 return 0;
81d66fab
AZ
469
470 /* Update the cgroups used for detection/action */
9de5e321
AZ
471 r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
472 if (r == -ENOMEM)
77b04c0a
AZ
473 return log_oom();
474 if (r < 0)
475 log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m");
9de5e321 476
81d66fab
AZ
477 /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
478 * values and go on a kill storm. */
479 if (m->mem_pressure_post_action_delay_start > 0) {
480 if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
cb13961a 481 in_post_action_delay = true;
9de5e321 482 else
81d66fab 483 m->mem_pressure_post_action_delay_start = 0;
9de5e321
AZ
484 }
485
c20aa7b1 486 r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
9de5e321 487 if (r == -ENOMEM)
77b04c0a
AZ
488 return log_oom();
489 if (r < 0)
490 log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
cb13961a 491 else if (r == 1 && !in_post_action_delay) {
df637ede
AZ
492 OomdCGroupContext *t;
493 SET_FOREACH(t, targets) {
494 _cleanup_free_ char *selected = NULL;
df637ede
AZ
495
496 /* Check if there was reclaim activity in the given interval. The concern is the following case:
497 * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
498 * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
499 * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
500 * to kill something (it won't help anyways). */
501 if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC)
502 continue;
503
504 log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
505 t->path,
3542da24
LB
506 LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10),
507 LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit),
5291f26d 508 FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
df637ede 509
cb13961a
AZ
510 r = update_monitored_cgroup_contexts_candidates(
511 m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
512 if (r == -ENOMEM)
513 return log_oom();
514 if (r < 0)
515 log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
516 else
517 clear_candidates = NULL;
518
df637ede
AZ
519 r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
520 if (r == -ENOMEM)
521 return log_oom();
522 if (r < 0)
523 log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
524 else {
914d4e99
AZ
525 /* Don't act on all the high pressure cgroups at once; return as soon as we kill one.
526 * If r == 0 then it means there were not eligible candidates, the candidate cgroup
527 * disappeared, or the candidate cgroup has no processes by the time we tried to kill
528 * it. In either case, go through the event loop again and select a new candidate if
529 * pressure is still high. */
df637ede 530 m->mem_pressure_post_action_delay_start = usec_now;
d784a8d4 531 if (selected && r > 0) {
df637ede
AZ
532 log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
533 " for > %s with reclaim activity",
534 selected, t->path,
3542da24
LB
535 LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10),
536 LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit),
5291f26d 537 FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
d784a8d4
OS
538
539 /* send dbus signal */
540 (void) sd_bus_emit_signal(m->bus,
541 "/org/freedesktop/oom1",
542 "org.freedesktop.oom1.Manager",
543 "Killed",
544 "ss",
545 selected,
546 "memory-pressure");
547 }
df637ede 548 return 0;
9de5e321
AZ
549 }
550 }
cb13961a
AZ
551 } else {
552 /* If any monitored cgroup is over their pressure limit, get all the kill candidates for every
553 * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill
554 * might happen.
555 * Candidate cgroup data will continue to get updated during the post-action delay period in case
556 * pressure continues to be high after a kill. */
557 OomdCGroupContext *c;
558 HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) {
559 if (c->mem_pressure_limit_hit_start == 0)
560 continue;
561
562 r = update_monitored_cgroup_contexts_candidates(
563 m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
564 if (r == -ENOMEM)
565 return log_oom();
566 if (r < 0)
567 log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
568 else {
569 clear_candidates = NULL;
570 break;
571 }
572 }
9de5e321
AZ
573 }
574
81d66fab
AZ
575 return 0;
576}
9de5e321 577
81d66fab
AZ
578static int monitor_swap_contexts(Manager *m) {
579 _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
580 int r;
9de5e321 581
81d66fab
AZ
582 assert(m);
583 assert(m->event);
9de5e321 584
81d66fab
AZ
585 r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m);
586 if (r < 0)
587 return r;
588
589 r = sd_event_source_set_exit_on_failure(s, true);
590 if (r < 0)
591 return r;
9de5e321 592
81d66fab
AZ
593 r = sd_event_source_set_enabled(s, SD_EVENT_ON);
594 if (r < 0)
595 return r;
596
597 (void) sd_event_source_set_description(s, "oomd-swap-timer");
598
599 m->swap_context_event_source = TAKE_PTR(s);
9de5e321
AZ
600 return 0;
601}
602
81d66fab 603static int monitor_memory_pressure_contexts(Manager *m) {
9de5e321
AZ
604 _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
605 int r;
606
607 assert(m);
608 assert(m->event);
609
81d66fab 610 r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m);
9de5e321
AZ
611 if (r < 0)
612 return r;
613
614 r = sd_event_source_set_exit_on_failure(s, true);
615 if (r < 0)
616 return r;
617
618 r = sd_event_source_set_enabled(s, SD_EVENT_ON);
619 if (r < 0)
620 return r;
621
81d66fab 622 (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer");
9de5e321 623
81d66fab 624 m->mem_pressure_context_event_source = TAKE_PTR(s);
9de5e321
AZ
625 return 0;
626}
627
75db809a 628Manager* manager_free(Manager *m) {
9de5e321
AZ
629 assert(m);
630
064a5c14
DDM
631 varlink_server_unref(m->varlink_server);
632 varlink_close_unref(m->varlink_client);
81d66fab
AZ
633 sd_event_source_unref(m->swap_context_event_source);
634 sd_event_source_unref(m->mem_pressure_context_event_source);
9de5e321
AZ
635 sd_event_unref(m->event);
636
5c616ecf
AZ
637 bus_verify_polkit_async_registry_free(m->polkit_registry);
638 sd_bus_flush_close_unref(m->bus);
639
9de5e321
AZ
640 hashmap_free(m->monitored_swap_cgroup_contexts);
641 hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
91cbb4bd 642 hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates);
9de5e321 643
75db809a 644 return mfree(m);
9de5e321
AZ
645}
646
647int manager_new(Manager **ret) {
648 _cleanup_(manager_freep) Manager *m = NULL;
649 int r;
650
651 assert(ret);
652
653 m = new0(Manager, 1);
654 if (!m)
655 return -ENOMEM;
656
657 r = sd_event_default(&m->event);
658 if (r < 0)
659 return r;
660
661 (void) sd_event_set_watchdog(m->event, true);
662
663 r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL);
664 if (r < 0)
665 return r;
666
667 r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL);
668 if (r < 0)
669 return r;
670
671 m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
672 if (!m->monitored_swap_cgroup_contexts)
673 return -ENOMEM;
674
675 m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
676 if (!m->monitored_mem_pressure_cgroup_contexts)
677 return -ENOMEM;
678
91cbb4bd
AZ
679 m->monitored_mem_pressure_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
680 if (!m->monitored_mem_pressure_cgroup_contexts_candidates)
681 return -ENOMEM;
682
9de5e321
AZ
683 *ret = TAKE_PTR(m);
684 return 0;
685}
686
5c616ecf
AZ
687static int manager_connect_bus(Manager *m) {
688 int r;
689
690 assert(m);
691 assert(!m->bus);
692
693 r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom");
694 if (r < 0)
695 return log_error_errno(r, "Failed to connect to bus: %m");
696
c9a00f5a 697 r = bus_add_implementation(m->bus, &manager_object, m);
5c616ecf 698 if (r < 0)
c9a00f5a 699 return r;
5c616ecf
AZ
700
701 r = bus_log_control_api_register(m->bus);
702 if (r < 0)
703 return r;
704
705 r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL);
706 if (r < 0)
707 return log_error_errno(r, "Failed to request name: %m");
708
709 r = sd_bus_attach_event(m->bus, m->event, 0);
710 if (r < 0)
711 return log_error_errno(r, "Failed to attach bus to event loop: %m");
712
713 return 0;
714}
715
064a5c14
DDM
716static int manager_varlink_init(Manager *m, int fd) {
717 _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
718 int r;
719
720 assert(m);
721 assert(!m->varlink_server);
722
723 r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA);
724 if (r < 0)
725 return log_error_errno(r, "Failed to allocate varlink server object: %m");
726
727 varlink_server_set_userdata(s, m);
728
729 r = varlink_server_bind_method(s, "io.systemd.oom.ReportManagedOOMCGroups", process_managed_oom_request);
730 if (r < 0)
731 return log_error_errno(r, "Failed to register varlink method: %m");
732
733 if (fd < 0)
734 r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM_USER, 0666);
735 else
736 r = varlink_server_listen_fd(s, fd);
737 if (r < 0)
738 return log_error_errno(r, "Failed to bind to varlink socket: %m");
739
740 r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL);
741 if (r < 0)
742 return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
743
744 log_debug("Initialized systemd-oomd varlink server");
745
746 m->varlink_server = TAKE_PTR(s);
747 return 0;
748}
749
d06e7fb5
LP
750int manager_start(
751 Manager *m,
752 bool dry_run,
753 int swap_used_limit_permyriad,
754 int mem_pressure_limit_permyriad,
064a5c14
DDM
755 usec_t mem_pressure_usec,
756 int fd) {
d06e7fb5 757
0a9f9344 758 unsigned long l, f;
9de5e321
AZ
759 int r;
760
761 assert(m);
762
763 m->dry_run = dry_run;
764
d06e7fb5
LP
765 m->swap_used_limit_permyriad = swap_used_limit_permyriad >= 0 ? swap_used_limit_permyriad : DEFAULT_SWAP_USED_LIMIT_PERCENT * 100;
766 assert(m->swap_used_limit_permyriad <= 10000);
9de5e321 767
d06e7fb5 768 if (mem_pressure_limit_permyriad >= 0) {
0a9f9344
AZ
769 assert(mem_pressure_limit_permyriad <= 10000);
770
771 l = mem_pressure_limit_permyriad / 100;
772 f = mem_pressure_limit_permyriad % 100;
773 } else {
774 l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT;
775 f = 0;
776 }
777 r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit);
9de5e321
AZ
778 if (r < 0)
779 return r;
780
c20aa7b1
AZ
781 m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC;
782
5c616ecf
AZ
783 r = manager_connect_bus(m);
784 if (r < 0)
785 return r;
786
9de5e321
AZ
787 r = acquire_managed_oom_connect(m);
788 if (r < 0)
789 return r;
790
064a5c14
DDM
791 r = manager_varlink_init(m, fd);
792 if (r < 0)
793 return r;
794
81d66fab
AZ
795 r = monitor_memory_pressure_contexts(m);
796 if (r < 0)
797 return r;
798
799 r = monitor_swap_contexts(m);
9de5e321
AZ
800 if (r < 0)
801 return r;
802
803 return 0;
804}
5c616ecf
AZ
805
806int manager_get_dump_string(Manager *m, char **ret) {
807 _cleanup_free_ char *dump = NULL;
808 _cleanup_fclose_ FILE *f = NULL;
809 OomdCGroupContext *c;
810 size_t size;
811 char *key;
812 int r;
813
814 assert(m);
815 assert(ret);
816
817 f = open_memstream_unlocked(&dump, &size);
818 if (!f)
819 return -errno;
820
821 fprintf(f,
822 "Dry Run: %s\n"
d06e7fb5 823 "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
0a9f9344 824 "Default Memory Pressure Limit: %lu.%02lu%%\n"
c20aa7b1 825 "Default Memory Pressure Duration: %s\n"
5c616ecf
AZ
826 "System Context:\n",
827 yes_no(m->dry_run),
d06e7fb5 828 PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad),
3542da24 829 LOADAVG_INT_SIDE(m->default_mem_pressure_limit), LOADAVG_DECIMAL_SIDE(m->default_mem_pressure_limit),
5291f26d 830 FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
5c616ecf
AZ
831 oomd_dump_system_context(&m->system_context, f, "\t");
832
833 fprintf(f, "Swap Monitored CGroups:\n");
834 HASHMAP_FOREACH_KEY(c, key, m->monitored_swap_cgroup_contexts)
835 oomd_dump_swap_cgroup_context(c, f, "\t");
836
837 fprintf(f, "Memory Pressure Monitored CGroups:\n");
838 HASHMAP_FOREACH_KEY(c, key, m->monitored_mem_pressure_cgroup_contexts)
839 oomd_dump_memory_pressure_cgroup_context(c, f, "\t");
840
841 r = fflush_and_check(f);
842 if (r < 0)
843 return r;
844
845 f = safe_fclose(f);
846
847 *ret = TAKE_PTR(dump);
848 return 0;
849}