]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/oom/oomd-manager.c
varlink,json: introduce new varlink_dispatch() helper
[thirdparty/systemd.git] / src / oom / oomd-manager.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
9de5e321 2
064a5c14
DDM
3#include "sd-daemon.h"
4
5c616ecf
AZ
5#include "bus-log-control-api.h"
6#include "bus-util.h"
7#include "bus-polkit.h"
9de5e321
AZ
8#include "cgroup-util.h"
9#include "fd-util.h"
10#include "fileio.h"
064a5c14 11#include "format-util.h"
408a3bbd 12#include "memory-util.h"
2485b7e2 13#include "memstream-util.h"
5c616ecf 14#include "oomd-manager-bus.h"
9de5e321
AZ
15#include "oomd-manager.h"
16#include "path-util.h"
d9d3f05d 17#include "percent-util.h"
abef4a7b 18#include "varlink-io.systemd.oom.h"
9de5e321 19
71feeae4 20typedef struct ManagedOOMMessage {
9de5e321
AZ
21 ManagedOOMMode mode;
22 char *path;
23 char *property;
d06e7fb5 24 uint32_t limit;
71feeae4 25} ManagedOOMMessage;
9de5e321 26
71feeae4
DDM
27static void managed_oom_message_destroy(ManagedOOMMessage *message) {
28 assert(message);
29 free(message->path);
30 free(message->property);
9de5e321
AZ
31}
32
33static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
34 ManagedOOMMode *mode = userdata, m;
35 const char *s;
36
37 assert(mode);
38 assert_se(s = json_variant_string(v));
39
40 m = managed_oom_mode_from_string(s);
41 if (m < 0)
7211c853 42 return json_log(v, flags, m, "%s is not a valid ManagedOOMMode", s);
9de5e321
AZ
43
44 *mode = m;
45 return 0;
46}
47
064a5c14 48static int process_managed_oom_message(Manager *m, uid_t uid, JsonVariant *parameters) {
9de5e321 49 JsonVariant *c, *cgroups;
71feeae4 50 int r;
9de5e321
AZ
51
52 static const JsonDispatch dispatch_table[] = {
71feeae4
DDM
53 { "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMMessage, mode), JSON_MANDATORY },
54 { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, path), JSON_MANDATORY },
55 { "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, property), JSON_MANDATORY },
56 { "limit", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(ManagedOOMMessage, limit), 0 },
9de5e321
AZ
57 {},
58 };
59
71feeae4
DDM
60 assert(m);
61 assert(parameters);
9de5e321
AZ
62
63 cgroups = json_variant_by_key(parameters, "cgroups");
71feeae4
DDM
64 if (!cgroups)
65 return -EINVAL;
9de5e321
AZ
66
67 /* Skip malformed elements and keep processing in case the others are good */
68 JSON_VARIANT_ARRAY_FOREACH(c, cgroups) {
71feeae4 69 _cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = {};
9de5e321
AZ
70 OomdCGroupContext *ctx;
71 Hashmap *monitor_hm;
72 loadavg_t limit;
9de5e321
AZ
73
74 if (!json_variant_is_object(c))
75 continue;
76
f1b622a0 77 r = json_dispatch(c, dispatch_table, 0, &message);
71feeae4
DDM
78 if (r == -ENOMEM)
79 return r;
80 if (r < 0)
9de5e321
AZ
81 continue;
82
064a5c14
DDM
83 if (uid != 0) {
84 uid_t cg_uid;
85
86 r = cg_path_get_owner_uid(message.path, &cg_uid);
87 if (r < 0) {
b6f6df4c 88 log_debug_errno(r, "Failed to get cgroup %s owner uid: %m", message.path);
064a5c14
DDM
89 continue;
90 }
91
92 /* Let's not be lenient for permission errors and skip processing if we receive an
93 * update for a cgroup that doesn't belong to the user. */
94 if (uid != cg_uid)
95 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
96 "cgroup path owner UID does not match sender uid "
97 "(" UID_FMT " != " UID_FMT ")", uid, cg_uid);
98 }
99
71feeae4 100 monitor_hm = streq(message.property, "ManagedOOMSwap") ?
9de5e321
AZ
101 m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;
102
71feeae4
DDM
103 if (message.mode == MANAGED_OOM_AUTO) {
104 (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(message.path)));
9de5e321
AZ
105 continue;
106 }
107
108 limit = m->default_mem_pressure_limit;
109
71feeae4
DDM
110 if (streq(message.property, "ManagedOOMMemoryPressure") && message.limit > 0) {
111 int permyriad = UINT32_SCALE_TO_PERMYRIAD(message.limit);
d06e7fb5 112
5f1d6ebd 113 r = store_loadavg_fixed_point(permyriad / 100LU, permyriad % 100LU, &limit);
71feeae4 114 if (r < 0)
9de5e321 115 continue;
9de5e321
AZ
116 }
117
71feeae4
DDM
118 r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path);
119 if (r == -ENOMEM)
120 return r;
121 if (r < 0 && r != -EEXIST)
122 log_debug_errno(r, "Failed to insert message, ignoring: %m");
9de5e321
AZ
123
124 /* Always update the limit in case it was changed. For non-memory pressure detection the value is
125 * ignored so always updating it here is not a problem. */
71feeae4 126 ctx = hashmap_get(monitor_hm, empty_to_root(message.path));
9de5e321
AZ
127 if (ctx)
128 ctx->mem_pressure_limit = limit;
129 }
130
b63beb4d
CH
131 /* Toggle wake-ups for "ManagedOOMSwap" if entries are present. */
132 r = sd_event_source_set_enabled(m->swap_context_event_source,
133 hashmap_isempty(m->monitored_swap_cgroup_contexts) ? SD_EVENT_OFF : SD_EVENT_ON);
134 if (r < 0)
135 return log_error_errno(r, "Failed to toggle enabled state of swap context source: %m");
136
71feeae4
DDM
137 return 0;
138}
139
064a5c14
DDM
140static int process_managed_oom_request(
141 Varlink *link,
142 JsonVariant *parameters,
143 VarlinkMethodFlags flags,
144 void *userdata) {
99534007 145 Manager *m = ASSERT_PTR(userdata);
064a5c14
DDM
146 uid_t uid;
147 int r;
148
064a5c14
DDM
149 r = varlink_get_peer_uid(link, &uid);
150 if (r < 0)
151 return log_error_errno(r, "Failed to get varlink peer uid: %m");
152
153 return process_managed_oom_message(m, uid, parameters);
154}
155
71feeae4
DDM
156static int process_managed_oom_reply(
157 Varlink *link,
158 JsonVariant *parameters,
159 const char *error_id,
160 VarlinkReplyFlags flags,
161 void *userdata) {
99534007 162 Manager *m = ASSERT_PTR(userdata);
064a5c14 163 uid_t uid;
71feeae4
DDM
164 int r;
165
71feeae4
DDM
166 if (error_id) {
167 r = -EIO;
168 log_debug("Error getting ManagedOOM cgroups: %s", error_id);
169 goto finish;
170 }
171
064a5c14
DDM
172 r = varlink_get_peer_uid(link, &uid);
173 if (r < 0) {
174 log_error_errno(r, "Failed to get varlink peer uid: %m");
175 goto finish;
176 }
177
178 r = process_managed_oom_message(m, uid, parameters);
71feeae4 179
9de5e321
AZ
180finish:
181 if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
064a5c14 182 m->varlink_client = varlink_close_unref(link);
9de5e321
AZ
183
184 return r;
185}
186
4d620b90 187/* Fill 'new_h' with 'path's descendant OomdCGroupContexts. Only include descendant cgroups that are possible
9de5e321
AZ
188 * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
189 *
4d620b90
ZJS
190 * This function ignores most errors in order to handle cgroups that may have been cleaned up while
191 * populating the hashmap.
9de5e321 192 *
4d620b90 193 * 'new_h' is of the form { key: cgroup paths -> value: OomdCGroupContext } */
9de5e321
AZ
194static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
195 _cleanup_free_ char *subpath = NULL;
196 _cleanup_closedir_ DIR *d = NULL;
197 int r;
198
199 assert(new_h);
200 assert(path);
201
202 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
203 if (r < 0)
204 return r;
205
206 r = cg_read_subgroup(d, &subpath);
207 if (r < 0)
208 return r;
209 else if (r == 0) { /* No subgroups? We're a leaf node */
210 r = oomd_insert_cgroup_context(NULL, new_h, path);
77b04c0a
AZ
211 if (r == -ENOMEM)
212 return r;
213 if (r < 0)
214 log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", path);
215 return 0;
9de5e321
AZ
216 }
217
218 do {
219 _cleanup_free_ char *cg_path = NULL;
220 bool oom_group;
221
222 cg_path = path_join(empty_to_root(path), subpath);
223 if (!cg_path)
224 return -ENOMEM;
225
226 subpath = mfree(subpath);
227
228 r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group);
229 /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
77b04c0a
AZ
230 if (r == -ENOMEM)
231 return r;
232 if (r < 0) {
233 log_debug_errno(r, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path);
234 return 0;
235 }
9de5e321 236
349a2003 237 if (oom_group)
9de5e321 238 r = oomd_insert_cgroup_context(NULL, new_h, cg_path);
349a2003 239 else
9de5e321 240 r = recursively_get_cgroup_context(new_h, cg_path);
349a2003
AZ
241 if (r == -ENOMEM)
242 return r;
77b04c0a
AZ
243 if (r < 0)
244 log_debug_errno(r, "Failed to insert or recursively get from %s, ignoring: %m", cg_path);
9de5e321
AZ
245 } while ((r = cg_read_subgroup(d, &subpath)) > 0);
246
247 return 0;
248}
249
250static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) {
251 _cleanup_hashmap_free_ Hashmap *new_base = NULL;
252 OomdCGroupContext *ctx;
253 int r;
254
255 assert(monitored_cgroups);
256
257 new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops);
258 if (!new_base)
259 return -ENOMEM;
260
261 HASHMAP_FOREACH(ctx, *monitored_cgroups) {
262 /* Skip most errors since the cgroup we're trying to update might not exist anymore. */
263 r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path);
264 if (r == -ENOMEM)
265 return r;
77b04c0a
AZ
266 if (r < 0 && !IN_SET(r, -EEXIST, -ENOENT))
267 log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", ctx->path);
9de5e321
AZ
268 }
269
270 hashmap_free(*monitored_cgroups);
271 *monitored_cgroups = TAKE_PTR(new_base);
272
273 return 0;
274}
275
276static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) {
277 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
278 OomdCGroupContext *ctx;
279 int r;
280
281 assert(monitored_cgroups);
282 assert(ret_candidates);
283
284 candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
285 if (!candidates)
286 return -ENOMEM;
287
288 HASHMAP_FOREACH(ctx, monitored_cgroups) {
289 r = recursively_get_cgroup_context(candidates, ctx->path);
290 if (r == -ENOMEM)
291 return r;
77b04c0a
AZ
292 if (r < 0)
293 log_debug_errno(r, "Failed to recursively get contexts for %s, ignoring: %m", ctx->path);
9de5e321
AZ
294 }
295
296 *ret_candidates = TAKE_PTR(candidates);
297
298 return 0;
299}
300
91cbb4bd
AZ
301static int update_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **candidates) {
302 _cleanup_hashmap_free_ Hashmap *new_candidates = NULL;
303 int r;
304
305 assert(monitored_cgroups);
306 assert(candidates);
307 assert(*candidates);
308
309 r = get_monitored_cgroup_contexts_candidates(monitored_cgroups, &new_candidates);
310 if (r < 0)
311 return log_debug_errno(r, "Failed to get candidate contexts: %m");
312
313 oomd_update_cgroup_contexts_between_hashmaps(*candidates, new_candidates);
314
315 hashmap_free(*candidates);
316 *candidates = TAKE_PTR(new_candidates);
317
318 return 0;
319}
320
9de5e321
AZ
321static int acquire_managed_oom_connect(Manager *m) {
322 _cleanup_(varlink_close_unrefp) Varlink *link = NULL;
323 int r;
324
325 assert(m);
326 assert(m->event);
327
064a5c14 328 r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM);
9de5e321 329 if (r < 0)
064a5c14 330 return log_error_errno(r, "Failed to connect to " VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM ": %m");
9de5e321
AZ
331
332 (void) varlink_set_userdata(link, m);
333 (void) varlink_set_description(link, "oomd");
334 (void) varlink_set_relative_timeout(link, USEC_INFINITY);
335
336 r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
337 if (r < 0)
338 return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
339
340 r = varlink_bind_reply(link, process_managed_oom_reply);
341 if (r < 0)
342 return log_error_errno(r, "Failed to bind reply callback: %m");
343
344 r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL);
345 if (r < 0)
346 return log_error_errno(r, "Failed to observe varlink call: %m");
347
064a5c14 348 m->varlink_client = TAKE_PTR(link);
9de5e321
AZ
349 return 0;
350}
351
81d66fab 352static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
99534007 353 Manager *m = ASSERT_PTR(userdata);
9de5e321
AZ
354 usec_t usec_now;
355 int r;
356
357 assert(s);
b63beb4d 358 assert(!hashmap_isempty(m->monitored_swap_cgroup_contexts));
9de5e321
AZ
359
360 /* Reset timer */
361 r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
362 if (r < 0)
77b04c0a 363 return log_error_errno(r, "Failed to reset event timer: %m");
9de5e321 364
81d66fab 365 r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC);
9de5e321 366 if (r < 0)
77b04c0a 367 return log_error_errno(r, "Failed to set relative time for timer: %m");
9de5e321
AZ
368
369 /* Reconnect if our connection dropped */
064a5c14 370 if (!m->varlink_client) {
9de5e321
AZ
371 r = acquire_managed_oom_connect(m);
372 if (r < 0)
77b04c0a 373 return log_error_errno(r, "Failed to acquire varlink connection: %m");
9de5e321
AZ
374 }
375
47136b9d
AZ
376 /* We still try to acquire system information for oomctl even if no units want swap monitoring */
377 r = oomd_system_context_acquire("/proc/meminfo", &m->system_context);
378 /* If there are no units depending on swap actions, the only error we exit on is ENOMEM. */
b63beb4d 379 if (r < 0)
81d66fab
AZ
380 return log_error_errno(r, "Failed to acquire system context: %m");
381
81d66fab
AZ
382 /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the
383 * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts
384 * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent
385 * nodes are the ones that matter). */
386
030bc91c
NR
387 /* Check amount of memory available and swap free so we don't free up swap when memory is still available. */
388 if (oomd_mem_available_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) &&
cb5ce676 389 oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
81d66fab
AZ
390 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
391 _cleanup_free_ char *selected = NULL;
685b0985 392 uint64_t threshold;
81d66fab 393
cb5ce676
AZ
394 log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and "
395 "swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
396 m->system_context.mem_used, m->system_context.mem_total,
81d66fab
AZ
397 m->system_context.swap_used, m->system_context.swap_total,
398 PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
399
400 r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
401 if (r == -ENOMEM)
402 return log_oom();
403 if (r < 0)
404 log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
405
685b0985
AZ
406 threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
407 r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected);
81d66fab
AZ
408 if (r == -ENOMEM)
409 return log_oom();
410 if (r < 0)
0923b425 411 log_notice_errno(r, "Failed to kill any cgroups based on swap: %m");
81d66fab 412 else {
d784a8d4 413 if (selected && r > 0) {
cb5ce676
AZ
414 log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and "
415 "swap used (%"PRIu64") / total (%"PRIu64") being more than "
81d66fab 416 PERMYRIAD_AS_PERCENT_FORMAT_STR,
cb5ce676
AZ
417 selected,
418 m->system_context.mem_used, m->system_context.mem_total,
419 m->system_context.swap_used, m->system_context.swap_total,
81d66fab 420 PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
d784a8d4
OS
421
422 /* send dbus signal */
423 (void) sd_bus_emit_signal(m->bus,
424 "/org/freedesktop/oom1",
425 "org.freedesktop.oom1.Manager",
426 "Killed",
427 "ss",
428 selected,
429 "memory-used");
430 }
81d66fab
AZ
431 return 0;
432 }
433 }
434
435 return 0;
436}
437
cb13961a
AZ
438static void clear_candidate_hashmapp(Manager **m) {
439 if (*m)
440 hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates);
441}
442
81d66fab 443static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
cb13961a
AZ
444 /* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we
445 * update the candidate data (in which case clear_candidates will be NULL). */
d7ac0952 446 _unused_ _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata;
81d66fab 447 _cleanup_set_free_ Set *targets = NULL;
cb13961a 448 bool in_post_action_delay = false;
99534007 449 Manager *m = ASSERT_PTR(userdata);
81d66fab
AZ
450 usec_t usec_now;
451 int r;
452
453 assert(s);
81d66fab
AZ
454
455 /* Reset timer */
456 r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
77b04c0a 457 if (r < 0)
81d66fab
AZ
458 return log_error_errno(r, "Failed to reset event timer: %m");
459
460 r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC);
461 if (r < 0)
462 return log_error_errno(r, "Failed to set relative time for timer: %m");
463
464 /* Reconnect if our connection dropped */
064a5c14 465 if (!m->varlink_client) {
81d66fab
AZ
466 r = acquire_managed_oom_connect(m);
467 if (r < 0)
468 return log_error_errno(r, "Failed to acquire varlink connection: %m");
469 }
9de5e321 470
81d66fab 471 /* Return early if nothing is requesting memory pressure monitoring */
cb13961a 472 if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts))
81d66fab 473 return 0;
81d66fab
AZ
474
475 /* Update the cgroups used for detection/action */
9de5e321
AZ
476 r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
477 if (r == -ENOMEM)
77b04c0a
AZ
478 return log_oom();
479 if (r < 0)
480 log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m");
9de5e321 481
81d66fab
AZ
482 /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
483 * values and go on a kill storm. */
484 if (m->mem_pressure_post_action_delay_start > 0) {
485 if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
cb13961a 486 in_post_action_delay = true;
9de5e321 487 else
81d66fab 488 m->mem_pressure_post_action_delay_start = 0;
9de5e321
AZ
489 }
490
c20aa7b1 491 r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
9de5e321 492 if (r == -ENOMEM)
77b04c0a
AZ
493 return log_oom();
494 if (r < 0)
495 log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
cb13961a 496 else if (r == 1 && !in_post_action_delay) {
df637ede
AZ
497 OomdCGroupContext *t;
498 SET_FOREACH(t, targets) {
499 _cleanup_free_ char *selected = NULL;
df637ede
AZ
500
501 /* Check if there was reclaim activity in the given interval. The concern is the following case:
502 * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
503 * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
504 * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
505 * to kill something (it won't help anyways). */
506 if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC)
507 continue;
508
509 log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
510 t->path,
3542da24
LB
511 LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10),
512 LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit),
5291f26d 513 FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
df637ede 514
cb13961a
AZ
515 r = update_monitored_cgroup_contexts_candidates(
516 m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
517 if (r == -ENOMEM)
518 return log_oom();
519 if (r < 0)
520 log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
521 else
522 clear_candidates = NULL;
523
ebfb6019
ZJS
524 r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates,
525 /* prefix= */ t->path,
526 /* dry_run= */ m->dry_run,
527 &selected);
df637ede
AZ
528 if (r == -ENOMEM)
529 return log_oom();
530 if (r < 0)
0923b425 531 log_notice_errno(r, "Failed to kill any cgroups under %s based on pressure: %m", t->path);
df637ede 532 else {
914d4e99
AZ
533 /* Don't act on all the high pressure cgroups at once; return as soon as we kill one.
534 * If r == 0 then it means there were not eligible candidates, the candidate cgroup
535 * disappeared, or the candidate cgroup has no processes by the time we tried to kill
536 * it. In either case, go through the event loop again and select a new candidate if
537 * pressure is still high. */
df637ede 538 m->mem_pressure_post_action_delay_start = usec_now;
d784a8d4 539 if (selected && r > 0) {
df637ede
AZ
540 log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
541 " for > %s with reclaim activity",
542 selected, t->path,
3542da24
LB
543 LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10),
544 LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit),
5291f26d 545 FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
d784a8d4
OS
546
547 /* send dbus signal */
548 (void) sd_bus_emit_signal(m->bus,
549 "/org/freedesktop/oom1",
550 "org.freedesktop.oom1.Manager",
551 "Killed",
552 "ss",
553 selected,
554 "memory-pressure");
555 }
df637ede 556 return 0;
9de5e321
AZ
557 }
558 }
cb13961a
AZ
559 } else {
560 /* If any monitored cgroup is over their pressure limit, get all the kill candidates for every
561 * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill
562 * might happen.
563 * Candidate cgroup data will continue to get updated during the post-action delay period in case
564 * pressure continues to be high after a kill. */
565 OomdCGroupContext *c;
566 HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) {
567 if (c->mem_pressure_limit_hit_start == 0)
568 continue;
569
570 r = update_monitored_cgroup_contexts_candidates(
571 m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
572 if (r == -ENOMEM)
573 return log_oom();
574 if (r < 0)
575 log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
576 else {
577 clear_candidates = NULL;
578 break;
579 }
580 }
9de5e321
AZ
581 }
582
81d66fab
AZ
583 return 0;
584}
9de5e321 585
81d66fab
AZ
586static int monitor_swap_contexts(Manager *m) {
587 _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
588 int r;
9de5e321 589
81d66fab
AZ
590 assert(m);
591 assert(m->event);
9de5e321 592
81d66fab
AZ
593 r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m);
594 if (r < 0)
595 return r;
596
597 r = sd_event_source_set_exit_on_failure(s, true);
598 if (r < 0)
599 return r;
9de5e321 600
b63beb4d 601 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
81d66fab
AZ
602 if (r < 0)
603 return r;
604
605 (void) sd_event_source_set_description(s, "oomd-swap-timer");
606
607 m->swap_context_event_source = TAKE_PTR(s);
9de5e321
AZ
608 return 0;
609}
610
81d66fab 611static int monitor_memory_pressure_contexts(Manager *m) {
9de5e321
AZ
612 _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
613 int r;
614
615 assert(m);
616 assert(m->event);
617
81d66fab 618 r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m);
9de5e321
AZ
619 if (r < 0)
620 return r;
621
622 r = sd_event_source_set_exit_on_failure(s, true);
623 if (r < 0)
624 return r;
625
626 r = sd_event_source_set_enabled(s, SD_EVENT_ON);
627 if (r < 0)
628 return r;
629
81d66fab 630 (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer");
9de5e321 631
81d66fab 632 m->mem_pressure_context_event_source = TAKE_PTR(s);
9de5e321
AZ
633 return 0;
634}
635
75db809a 636Manager* manager_free(Manager *m) {
9de5e321
AZ
637 assert(m);
638
064a5c14
DDM
639 varlink_server_unref(m->varlink_server);
640 varlink_close_unref(m->varlink_client);
81d66fab
AZ
641 sd_event_source_unref(m->swap_context_event_source);
642 sd_event_source_unref(m->mem_pressure_context_event_source);
9de5e321
AZ
643 sd_event_unref(m->event);
644
5c616ecf
AZ
645 bus_verify_polkit_async_registry_free(m->polkit_registry);
646 sd_bus_flush_close_unref(m->bus);
647
9de5e321
AZ
648 hashmap_free(m->monitored_swap_cgroup_contexts);
649 hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
91cbb4bd 650 hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates);
9de5e321 651
75db809a 652 return mfree(m);
9de5e321
AZ
653}
654
655int manager_new(Manager **ret) {
656 _cleanup_(manager_freep) Manager *m = NULL;
657 int r;
658
659 assert(ret);
660
661 m = new0(Manager, 1);
662 if (!m)
663 return -ENOMEM;
664
665 r = sd_event_default(&m->event);
666 if (r < 0)
667 return r;
668
669 (void) sd_event_set_watchdog(m->event, true);
670
671 r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL);
672 if (r < 0)
673 return r;
674
675 r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL);
676 if (r < 0)
677 return r;
678
679 m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
680 if (!m->monitored_swap_cgroup_contexts)
681 return -ENOMEM;
682
683 m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
684 if (!m->monitored_mem_pressure_cgroup_contexts)
685 return -ENOMEM;
686
91cbb4bd
AZ
687 m->monitored_mem_pressure_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
688 if (!m->monitored_mem_pressure_cgroup_contexts_candidates)
689 return -ENOMEM;
690
9de5e321
AZ
691 *ret = TAKE_PTR(m);
692 return 0;
693}
694
5c616ecf
AZ
695static int manager_connect_bus(Manager *m) {
696 int r;
697
698 assert(m);
699 assert(!m->bus);
700
701 r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom");
702 if (r < 0)
703 return log_error_errno(r, "Failed to connect to bus: %m");
704
c9a00f5a 705 r = bus_add_implementation(m->bus, &manager_object, m);
5c616ecf 706 if (r < 0)
c9a00f5a 707 return r;
5c616ecf
AZ
708
709 r = bus_log_control_api_register(m->bus);
710 if (r < 0)
711 return r;
712
713 r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL);
714 if (r < 0)
715 return log_error_errno(r, "Failed to request name: %m");
716
717 r = sd_bus_attach_event(m->bus, m->event, 0);
718 if (r < 0)
719 return log_error_errno(r, "Failed to attach bus to event loop: %m");
720
721 return 0;
722}
723
064a5c14
DDM
724static int manager_varlink_init(Manager *m, int fd) {
725 _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
726 int r;
727
728 assert(m);
729 assert(!m->varlink_server);
730
731 r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA);
732 if (r < 0)
733 return log_error_errno(r, "Failed to allocate varlink server object: %m");
734
735 varlink_server_set_userdata(s, m);
736
abef4a7b
LP
737 r = varlink_server_add_interface(s, &vl_interface_io_systemd_oom);
738 if (r < 0)
739 return log_error_errno(r, "Failed to add oom interface to varlink server: %m");
740
064a5c14
DDM
741 r = varlink_server_bind_method(s, "io.systemd.oom.ReportManagedOOMCGroups", process_managed_oom_request);
742 if (r < 0)
743 return log_error_errno(r, "Failed to register varlink method: %m");
744
745 if (fd < 0)
746 r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM_USER, 0666);
747 else
748 r = varlink_server_listen_fd(s, fd);
749 if (r < 0)
750 return log_error_errno(r, "Failed to bind to varlink socket: %m");
751
752 r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL);
753 if (r < 0)
754 return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
755
756 log_debug("Initialized systemd-oomd varlink server");
757
758 m->varlink_server = TAKE_PTR(s);
759 return 0;
760}
761
d06e7fb5
LP
762int manager_start(
763 Manager *m,
764 bool dry_run,
765 int swap_used_limit_permyriad,
766 int mem_pressure_limit_permyriad,
064a5c14
DDM
767 usec_t mem_pressure_usec,
768 int fd) {
d06e7fb5 769
0a9f9344 770 unsigned long l, f;
9de5e321
AZ
771 int r;
772
773 assert(m);
774
775 m->dry_run = dry_run;
776
d06e7fb5
LP
777 m->swap_used_limit_permyriad = swap_used_limit_permyriad >= 0 ? swap_used_limit_permyriad : DEFAULT_SWAP_USED_LIMIT_PERCENT * 100;
778 assert(m->swap_used_limit_permyriad <= 10000);
9de5e321 779
d06e7fb5 780 if (mem_pressure_limit_permyriad >= 0) {
0a9f9344
AZ
781 assert(mem_pressure_limit_permyriad <= 10000);
782
783 l = mem_pressure_limit_permyriad / 100;
784 f = mem_pressure_limit_permyriad % 100;
785 } else {
786 l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT;
787 f = 0;
788 }
789 r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit);
9de5e321
AZ
790 if (r < 0)
791 return r;
792
c20aa7b1
AZ
793 m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC;
794
5c616ecf
AZ
795 r = manager_connect_bus(m);
796 if (r < 0)
797 return r;
798
9de5e321
AZ
799 r = acquire_managed_oom_connect(m);
800 if (r < 0)
801 return r;
802
064a5c14
DDM
803 r = manager_varlink_init(m, fd);
804 if (r < 0)
805 return r;
806
81d66fab
AZ
807 r = monitor_memory_pressure_contexts(m);
808 if (r < 0)
809 return r;
810
811 r = monitor_swap_contexts(m);
9de5e321
AZ
812 if (r < 0)
813 return r;
814
815 return 0;
816}
5c616ecf
AZ
817
818int manager_get_dump_string(Manager *m, char **ret) {
2485b7e2 819 _cleanup_(memstream_done) MemStream ms = {};
5c616ecf 820 OomdCGroupContext *c;
2485b7e2 821 FILE *f;
5c616ecf
AZ
822
823 assert(m);
824 assert(ret);
825
2485b7e2 826 f = memstream_init(&ms);
5c616ecf 827 if (!f)
2485b7e2 828 return -ENOMEM;
5c616ecf
AZ
829
830 fprintf(f,
831 "Dry Run: %s\n"
d06e7fb5 832 "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
0a9f9344 833 "Default Memory Pressure Limit: %lu.%02lu%%\n"
c20aa7b1 834 "Default Memory Pressure Duration: %s\n"
5c616ecf
AZ
835 "System Context:\n",
836 yes_no(m->dry_run),
d06e7fb5 837 PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad),
3542da24 838 LOADAVG_INT_SIDE(m->default_mem_pressure_limit), LOADAVG_DECIMAL_SIDE(m->default_mem_pressure_limit),
5291f26d 839 FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
5c616ecf
AZ
840 oomd_dump_system_context(&m->system_context, f, "\t");
841
842 fprintf(f, "Swap Monitored CGroups:\n");
64377c60 843 HASHMAP_FOREACH(c, m->monitored_swap_cgroup_contexts)
5c616ecf
AZ
844 oomd_dump_swap_cgroup_context(c, f, "\t");
845
846 fprintf(f, "Memory Pressure Monitored CGroups:\n");
64377c60 847 HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts)
5c616ecf
AZ
848 oomd_dump_memory_pressure_cgroup_context(c, f, "\t");
849
2485b7e2 850 return memstream_finalize(&ms, ret, NULL);
5c616ecf 851}