1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
3 #include "bus-log-control-api.h"
5 #include "bus-polkit.h"
6 #include "cgroup-util.h"
9 #include "memory-util.h"
10 #include "oomd-manager-bus.h"
11 #include "oomd-manager.h"
12 #include "path-util.h"
13 #include "percent-util.h"
/* One parsed element of the "cgroups" array of a ManagedOOM reply. The dispatch table in
 * process_managed_oom_reply() fills its mode, path, property and limit members.
 * NOTE(review): the member declarations themselves are not visible in this excerpt. */
15 typedef struct ManagedOOMReply
{
/* Cleanup handler for a ManagedOOMReply; process_managed_oom_reply() attaches it to its
 * stack-allocated reply through _cleanup_(). Frees the string stored in reply->property.
 * NOTE(review): only the release of the property member is visible in this excerpt; confirm
 * that the path member is released as well. */
22 static void managed_oom_reply_destroy(ManagedOOMReply
*reply
) {
25 free(reply
->property
);
/* JSON dispatch callback for the "mode" key (see the dispatch table in
 * process_managed_oom_reply()).
 *
 * v        - JSON variant holding the mode as a string; it must yield a string, otherwise
 *            assert_se() aborts.
 * userdata - points at the ManagedOOMMode destination.
 *
 * The string is translated with managed_oom_mode_from_string(); for a string that is not a
 * valid mode the problem is reported through json_log() and its result is returned.
 * NOTE(review): the validity check, the store into *mode and the success return are outside
 * this excerpt. */
28 static int managed_oom_mode(const char *name
, JsonVariant
*v
, JsonDispatchFlags flags
, void *userdata
) {
29 ManagedOOMMode
*mode
= userdata
, m
;
33 assert_se(s
= json_variant_string(v
));
35 m
= managed_oom_mode_from_string(s
);
37 return json_log(v
, flags
, m
, "%s is not a valid ManagedOOMMode", s
);
/* Varlink reply callback; acquire_managed_oom_connect() binds it to the ManagedOOM link.
 *
 * parameters - JSON object of the reply; its "cgroups" member is iterated as an array.
 * flags      - varlink reply flags; checked for VARLINK_REPLY_CONTINUES at the end.
 * userdata   - the Manager (set with varlink_set_userdata() when connecting).
 *
 * Each array element is parsed into a ManagedOOMReply and applied to one of the Manager's
 * monitored hashmaps: "ManagedOOMSwap" selects monitored_swap_cgroup_contexts, any other
 * property selects monitored_mem_pressure_cgroup_contexts.
 * NOTE(review): several parameters, local declarations, error checks and the return
 * statements are not visible in this excerpt. */
43 static int process_managed_oom_reply(
45 JsonVariant
*parameters
,
47 VarlinkReplyFlags flags
,
49 JsonVariant
*c
, *cgroups
;
50 Manager
*m
= userdata
;
/* Maps the JSON keys of one cgroup entry onto ManagedOOMReply members. "limit" is the only
 * key that is not flagged JSON_MANDATORY. */
55 static const JsonDispatch dispatch_table
[] = {
56 { "mode", JSON_VARIANT_STRING
, managed_oom_mode
, offsetof(ManagedOOMReply
, mode
), JSON_MANDATORY
},
57 { "path", JSON_VARIANT_STRING
, json_dispatch_string
, offsetof(ManagedOOMReply
, path
), JSON_MANDATORY
},
58 { "property", JSON_VARIANT_STRING
, json_dispatch_string
, offsetof(ManagedOOMReply
, property
), JSON_MANDATORY
},
59 { "limit", JSON_VARIANT_UNSIGNED
, json_dispatch_uint32
, offsetof(ManagedOOMReply
, limit
), 0 },
65 log_debug("Error getting ManagedOOM cgroups: %s", error_id
);
69 cgroups
= json_variant_by_key(parameters
, "cgroups");
75 /* Skip malformed elements and keep processing in case the others are good */
76 JSON_VARIANT_ARRAY_FOREACH(c
, cgroups
) {
77 _cleanup_(managed_oom_reply_destroy
) ManagedOOMReply reply
= {};
78 OomdCGroupContext
*ctx
;
83 if (!json_variant_is_object(c
))
86 ret
= json_dispatch(c
, dispatch_table
, NULL
, 0, &reply
);
/* Pick the hashmap that tracks cgroups for this property. */
94 monitor_hm
= streq(reply
.property
, "ManagedOOMSwap") ?
95 m
->monitored_swap_cgroup_contexts
: m
->monitored_mem_pressure_cgroup_contexts
;
/* Mode "auto": remove the context tracked for this path (the root path is used when
 * reply.path is empty) and free it. */
97 if (reply
.mode
== MANAGED_OOM_AUTO
) {
98 (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm
, empty_to_root(reply
.path
)));
/* Start from the manager-wide default limit; the branch below handles a positive
 * reply.limit for ManagedOOMMemoryPressure. */
102 limit
= m
->default_mem_pressure_limit
;
104 if (streq(reply
.property
, "ManagedOOMMemoryPressure") && reply
.limit
> 0) {
/* Rescale the 32-bit limit to permyriad, then split it into integer and
 * hundredths parts for the fixed-point conversion. */
105 int permyriad
= UINT32_SCALE_TO_PERMYRIAD(reply
.limit
);
107 ret
= store_loadavg_fixed_point(
108 (unsigned long) permyriad
/ 100,
109 (unsigned long) permyriad
% 100,
115 ret
= oomd_insert_cgroup_context(NULL
, monitor_hm
, reply
.path
);
116 if (ret
== -ENOMEM
) {
/* An already-present entry (-EEXIST) is not reported; other failures are logged at
 * debug level. */
120 if (ret
< 0 && ret
!= -EEXIST
)
121 log_debug_errno(ret
, "Failed to insert reply, ignoring: %m");
123 /* Always update the limit in case it was changed. For non-memory pressure detection the value is
124 * ignored so always updating it here is not a problem. */
125 ctx
= hashmap_get(monitor_hm
, empty_to_root(reply
.path
));
127 ctx
->mem_pressure_limit
= limit
;
/* When the reply is not flagged as continuing, close the link and store the value
 * returned by varlink_close_unref() in m->varlink. */
131 if (!FLAGS_SET(flags
, VARLINK_REPLY_CONTINUES
))
132 m
->varlink
= varlink_close_unref(link
);
137 /* Fill `new_h` with `path`'s descendent OomdCGroupContexts. Only include descendent cgroups that are possible
138 * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
140 * This function ignores most errors in order to handle cgroups that may have been cleaned up while populating
143 * `new_h` is of the form { key: cgroup paths -> value: OomdCGroupContext } */
144 static int recursively_get_cgroup_context(Hashmap
*new_h
, const char *path
) {
145 _cleanup_free_
char *subpath
= NULL
;
146 _cleanup_closedir_
DIR *d
= NULL
;
/* Open an enumeration of the subgroups of path. */
152 r
= cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER
, path
, &d
);
/* Read the first subgroup name up front: a result of 0 means there are none, in which case
 * path itself is inserted as a leaf. */
156 r
= cg_read_subgroup(d
, &subpath
);
159 else if (r
== 0) { /* No subgroups? We're a leaf node */
160 r
= oomd_insert_cgroup_context(NULL
, new_h
, path
);
164 log_debug_errno(r
, "Failed to insert context for %s, ignoring: %m", path
);
169 _cleanup_free_
char *cg_path
= NULL
;
/* Full path of the child: path (or the root path when path is empty) joined with the
 * subgroup name that was just read. */
172 cg_path
= path_join(empty_to_root(path
), subpath
);
/* Release the name and reset the pointer; the cg_read_subgroup() call in the loop
 * condition stores the next name into the same variable. */
176 subpath
= mfree(subpath
);
178 r
= cg_get_attribute_as_bool("memory", cg_path
, "memory.oom.group", &oom_group
);
179 /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
183 log_debug_errno(r
, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path
);
/* NOTE(review): the condition choosing between inserting cg_path as a candidate and
 * recursing into it is outside this excerpt; presumably it is the oom_group value
 * read above. */
188 r
= oomd_insert_cgroup_context(NULL
, new_h
, cg_path
);
190 r
= recursively_get_cgroup_context(new_h
, cg_path
);
194 log_debug_errno(r
, "Failed to insert or recursively get from %s, ignoring: %m", cg_path
);
195 } while ((r
= cg_read_subgroup(d
, &subpath
)) > 0);
/* Rebuilds the hashmap that *monitored_cgroups points to.
 *
 * A new hashmap is created and a context is inserted for the path of every context currently
 * in *monitored_cgroups (the old hashmap is handed to oomd_insert_cgroup_context() as its
 * first argument). Afterwards the old hashmap is freed and *monitored_cgroups takes ownership
 * of the new one.
 *
 * Insert failures other than -EEXIST and -ENOENT are logged at debug level.
 * NOTE(review): allocation-failure handling and the return statements are not visible in
 * this excerpt. */
200 static int update_monitored_cgroup_contexts(Hashmap
**monitored_cgroups
) {
201 _cleanup_hashmap_free_ Hashmap
*new_base
= NULL
;
202 OomdCGroupContext
*ctx
;
205 assert(monitored_cgroups
);
207 new_base
= hashmap_new(&oomd_cgroup_ctx_hash_ops
);
211 HASHMAP_FOREACH(ctx
, *monitored_cgroups
) {
212 /* Skip most errors since the cgroup we're trying to update might not exist anymore. */
213 r
= oomd_insert_cgroup_context(*monitored_cgroups
, new_base
, ctx
->path
);
216 if (r
< 0 && !IN_SET(r
, -EEXIST
, -ENOENT
))
217 log_debug_errno(r
, "Failed to insert context for %s, ignoring: %m", ctx
->path
);
/* Swap in the rebuilt map; TAKE_PTR() keeps the cleanup attribute from freeing it. */
220 hashmap_free(*monitored_cgroups
);
221 *monitored_cgroups
= TAKE_PTR(new_base
);
/* Builds a new hashmap of candidate contexts for the given monitored cgroups.
 *
 * For the path of every context in monitored_cgroups, recursively_get_cgroup_context() is
 * called to fill the new hashmap. Ownership of the hashmap is passed to the caller through
 * *ret_candidates.
 * NOTE(review): the error checks around the logged failure and the return statements are
 * not visible in this excerpt. */
226 static int get_monitored_cgroup_contexts_candidates(Hashmap
*monitored_cgroups
, Hashmap
**ret_candidates
) {
227 _cleanup_hashmap_free_ Hashmap
*candidates
= NULL
;
228 OomdCGroupContext
*ctx
;
231 assert(monitored_cgroups
);
232 assert(ret_candidates
);
234 candidates
= hashmap_new(&oomd_cgroup_ctx_hash_ops
);
238 HASHMAP_FOREACH(ctx
, monitored_cgroups
) {
239 r
= recursively_get_cgroup_context(candidates
, ctx
->path
);
243 log_debug_errno(r
, "Failed to recursively get contexts for %s, ignoring: %m", ctx
->path
);
246 *ret_candidates
= TAKE_PTR(candidates
);
/* Replaces *candidates with a freshly gathered candidate set for monitored_cgroups.
 *
 * The new set comes from get_monitored_cgroup_contexts_candidates(); a failure there is
 * logged at debug level and returned. Before the old set is freed,
 * oomd_update_cgroup_contexts_between_hashmaps() is called with the old and the new hashmap
 * (presumably to carry per-cgroup state over - confirm against its definition). */
251 static int update_monitored_cgroup_contexts_candidates(Hashmap
*monitored_cgroups
, Hashmap
**candidates
) {
252 _cleanup_hashmap_free_ Hashmap
*new_candidates
= NULL
;
255 assert(monitored_cgroups
);
259 r
= get_monitored_cgroup_contexts_candidates(monitored_cgroups
, &new_candidates
);
261 return log_debug_errno(r
, "Failed to get candidate contexts: %m");
263 oomd_update_cgroup_contexts_between_hashmaps(*candidates
, new_candidates
);
/* Drop the old set and hand the new one to the caller's pointer. */
265 hashmap_free(*candidates
);
266 *candidates
= TAKE_PTR(new_candidates
);
/* Establishes the varlink connection used to receive ManagedOOM cgroup updates.
 *
 * Steps: connect to VARLINK_ADDR_PATH_MANAGED_OOM, set the Manager as userdata, attach the
 * link to m->event, bind process_managed_oom_reply() as the reply callback and issue the
 * observe call for "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups". On success the link
 * is stored in m->varlink.
 *
 * Each failing step is logged with log_error_errno() and its result returned; the
 * _cleanup_ attribute on link then closes and unrefs the connection. */
271 static int acquire_managed_oom_connect(Manager
*m
) {
272 _cleanup_(varlink_close_unrefp
) Varlink
*link
= NULL
;
278 r
= varlink_connect_address(&link
, VARLINK_ADDR_PATH_MANAGED_OOM
);
280 return log_error_errno(r
, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM
);
/* Best-effort settings (results ignored): the Manager becomes the callback userdata, the
 * link is described as "oomd" and its relative timeout is set to USEC_INFINITY. */
282 (void) varlink_set_userdata(link
, m
);
283 (void) varlink_set_description(link
, "oomd");
284 (void) varlink_set_relative_timeout(link
, USEC_INFINITY
);
286 r
= varlink_attach_event(link
, m
->event
, SD_EVENT_PRIORITY_NORMAL
);
288 return log_error_errno(r
, "Failed to attach varlink connection to event loop: %m");
290 r
= varlink_bind_reply(link
, process_managed_oom_reply
);
292 return log_error_errno(r
, "Failed to bind reply callback: %m");
294 r
= varlink_observe(link
, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL
);
296 return log_error_errno(r
, "Failed to observe varlink call: %m");
/* Success: the Manager now owns the link. */
298 m
->varlink
= TAKE_PTR(link
);
/* Timer callback installed by monitor_cgroup_contexts(); userdata is the Manager.
 *
 * On each run it:
 *   1. reads the current CLOCK_MONOTONIC time and re-arms the timer INTERVAL_USEC ahead;
 *   2. re-acquires the varlink connection (see the "Reconnect" comment below);
 *   3. refreshes the swap and memory-pressure monitored contexts and the memory-pressure
 *      candidates, and acquires the system context from /proc/swaps;
 *   4. for memory-pressure targets with recent reclaim activity, calls
 *      oomd_kill_by_pgscan_rate();
 *   5. when oomd_swap_free_below() reports true, calls oomd_kill_by_swap_usage() on freshly
 *      gathered swap candidates.
 * After an action, post_action_delay_start is set to the current time.
 * NOTE(review): many conditionals and all plain return statements are not visible in this
 * excerpt, so the exact branch structure must be confirmed against the full file. */
302 static int monitor_cgroup_contexts_handler(sd_event_source
*s
, uint64_t usec
, void *userdata
) {
303 _cleanup_set_free_ Set
*targets
= NULL
;
304 Manager
*m
= userdata
;
312 r
= sd_event_now(sd_event_source_get_event(s
), CLOCK_MONOTONIC
, &usec_now
);
314 return log_error_errno(r
, "Failed to reset event timer: %m");
316 r
= sd_event_source_set_time_relative(s
, INTERVAL_USEC
);
318 return log_error_errno(r
, "Failed to set relative time for timer: %m");
320 /* Reconnect if our connection dropped */
322 r
= acquire_managed_oom_connect(m
);
324 return log_error_errno(r
, "Failed to acquire varlink connection: %m");
327 /* Update the cgroups used for detection/action */
328 r
= update_monitored_cgroup_contexts(&m
->monitored_swap_cgroup_contexts
);
332 log_debug_errno(r
, "Failed to update monitored swap cgroup contexts, ignoring: %m");
334 r
= update_monitored_cgroup_contexts(&m
->monitored_mem_pressure_cgroup_contexts
);
338 log_debug_errno(r
, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m");
340 r
= update_monitored_cgroup_contexts_candidates(
341 m
->monitored_mem_pressure_cgroup_contexts
, &m
->monitored_mem_pressure_cgroup_contexts_candidates
);
345 log_debug_errno(r
, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
347 r
= oomd_system_context_acquire("/proc/swaps", &m
->system_context
);
348 /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
349 * Allow ENOENT in the event that swap is disabled on the system. */
350 if (r
== -ENOMEM
|| (r
< 0 && r
!= -ENOENT
&& !hashmap_isempty(m
->monitored_swap_cgroup_contexts
)))
351 return log_error_errno(r
, "Failed to acquire system context: %m");
352 else if (r
== -ENOENT
)
353 zero(m
->system_context
);
/* Remember the time of the most recent run in which oomd_memory_reclaim() reported true. */
355 if (oomd_memory_reclaim(m
->monitored_mem_pressure_cgroup_contexts
))
356 m
->last_reclaim_at
= usec_now
;
358 /* If we're still recovering from a kill, don't try to kill again yet */
359 if (m
->post_action_delay_start
> 0) {
360 if (m
->post_action_delay_start
+ POST_ACTION_DELAY_USEC
> usec_now
)
363 m
->post_action_delay_start
= 0;
/* Collect into targets the monitored cgroups that oomd_pressure_above() reports for the
 * configured duration. */
366 r
= oomd_pressure_above(m
->monitored_mem_pressure_cgroup_contexts
, m
->default_mem_pressure_duration_usec
, &targets
);
370 log_debug_errno(r
, "Failed to check if memory pressure exceeded limits, ignoring: %m");
372 /* Check if there was reclaim activity in the given interval. The concern is the following case:
373 * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
374 * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
375 * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
376 * to kill something (it won't help anyways). */
377 if ((usec_now
- m
->last_reclaim_at
) <= RECLAIM_DURATION_USEC
) {
378 OomdCGroupContext
*t
;
380 SET_FOREACH(t
, targets
) {
381 _cleanup_free_
char *selected
= NULL
;
382 char ts
[FORMAT_TIMESPAN_MAX
];
384 log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
386 LOAD_INT(t
->memory_pressure
.avg10
), LOAD_FRAC(t
->memory_pressure
.avg10
),
387 LOAD_INT(t
->mem_pressure_limit
), LOAD_FRAC(t
->mem_pressure_limit
),
388 format_timespan(ts
, sizeof ts
,
389 m
->default_mem_pressure_duration_usec
,
/* The kill is chosen among the memory-pressure candidates, restricted by the target's
 * path; m->dry_run is forwarded to the kill helper. */
392 r
= oomd_kill_by_pgscan_rate(m
->monitored_mem_pressure_cgroup_contexts_candidates
, t
->path
, m
->dry_run
, &selected
);
396 log_notice_errno(r
, "Failed to kill any cgroup(s) under %s based on pressure: %m", t
->path
);
398 /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
399 m
->post_action_delay_start
= usec_now
;
401 log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
402 " for > %s with reclaim activity",
404 LOAD_INT(t
->memory_pressure
.avg10
), LOAD_FRAC(t
->memory_pressure
.avg10
),
405 LOAD_INT(t
->mem_pressure_limit
), LOAD_FRAC(t
->mem_pressure_limit
),
406 format_timespan(ts
, sizeof ts
,
407 m
->default_mem_pressure_duration_usec
,
/* Swap check: the threshold passed is 10000 (permyriad) minus the configured used limit. */
415 if (oomd_swap_free_below(&m
->system_context
, 10000 - m
->swap_used_limit_permyriad
)) {
416 _cleanup_hashmap_free_ Hashmap
*candidates
= NULL
;
417 _cleanup_free_
char *selected
= NULL
;
419 log_debug("Swap used (%"PRIu64
") / total (%"PRIu64
") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR
,
420 m
->system_context
.swap_used
, m
->system_context
.swap_total
,
421 PERMYRIAD_AS_PERCENT_FORMAT_VAL(m
->swap_used_limit_permyriad
));
/* Swap candidates are gathered on demand here rather than kept in the Manager. */
423 r
= get_monitored_cgroup_contexts_candidates(m
->monitored_swap_cgroup_contexts
, &candidates
);
427 log_debug_errno(r
, "Failed to get monitored swap cgroup candidates, ignoring: %m");
429 r
= oomd_kill_by_swap_usage(candidates
, m
->dry_run
, &selected
);
433 log_notice_errno(r
, "Failed to kill any cgroup(s) based on swap: %m");
435 m
->post_action_delay_start
= usec_now
;
437 log_notice("Killed %s due to swap used (%"PRIu64
") / total (%"PRIu64
") being more than "
438 PERMYRIAD_AS_PERCENT_FORMAT_STR
,
439 selected
, m
->system_context
.swap_used
, m
->system_context
.swap_total
,
440 PERMYRIAD_AS_PERCENT_FORMAT_VAL(m
->swap_used_limit_permyriad
));
/* Installs the timer that drives monitor_cgroup_contexts_handler().
 *
 * A CLOCK_MONOTONIC time source is added to m->event with time 0 and accuracy 0 and the
 * Manager as userdata. It is then marked exit-on-failure, enabled with SD_EVENT_ON and
 * described as "oomd-timer". Ownership of the source moves to
 * m->cgroup_context_event_source.
 * NOTE(review): the error checks after each call and the return statements are not visible
 * in this excerpt. */
448 static int monitor_cgroup_contexts(Manager
*m
) {
449 _cleanup_(sd_event_source_unrefp
) sd_event_source
*s
= NULL
;
455 r
= sd_event_add_time(m
->event
, &s
, CLOCK_MONOTONIC
, 0, 0, monitor_cgroup_contexts_handler
, m
);
459 r
= sd_event_source_set_exit_on_failure(s
, true);
463 r
= sd_event_source_set_enabled(s
, SD_EVENT_ON
);
467 (void) sd_event_source_set_description(s
, "oomd-timer");
469 m
->cgroup_context_event_source
= TAKE_PTR(s
);
/* Releases the resources held by a Manager: the varlink connection, the timer event source,
 * the event loop, the polkit registry, the bus connection (flushed and closed) and the three
 * monitored-cgroup hashmaps.
 * NOTE(review): the handling of a NULL argument, the release of m itself and the returned
 * value are not visible in this excerpt. */
473 Manager
* manager_free(Manager
*m
) {
476 varlink_close_unref(m
->varlink
);
477 sd_event_source_unref(m
->cgroup_context_event_source
);
478 sd_event_unref(m
->event
);
480 bus_verify_polkit_async_registry_free(m
->polkit_registry
);
481 sd_bus_flush_close_unref(m
->bus
);
483 hashmap_free(m
->monitored_swap_cgroup_contexts
);
484 hashmap_free(m
->monitored_mem_pressure_cgroup_contexts
);
485 hashmap_free(m
->monitored_mem_pressure_cgroup_contexts_candidates
);
/* Allocates and initializes a Manager.
 *
 * The zero-initialized Manager gets the default event loop with the watchdog enabled, signal
 * sources for SIGINT and SIGTERM registered without a callback, and three empty hashmaps
 * (swap contexts, memory-pressure contexts, memory-pressure candidates), all created with
 * oomd_cgroup_ctx_hash_ops. The object is held under _cleanup_(manager_freep) while it is
 * being set up.
 * NOTE(review): the failure returns and the hand-over through *ret are not visible in this
 * excerpt. */
490 int manager_new(Manager
**ret
) {
491 _cleanup_(manager_freep
) Manager
*m
= NULL
;
496 m
= new0(Manager
, 1);
500 r
= sd_event_default(&m
->event
);
504 (void) sd_event_set_watchdog(m
->event
, true);
506 r
= sd_event_add_signal(m
->event
, NULL
, SIGINT
, NULL
, NULL
);
510 r
= sd_event_add_signal(m
->event
, NULL
, SIGTERM
, NULL
, NULL
);
514 m
->monitored_swap_cgroup_contexts
= hashmap_new(&oomd_cgroup_ctx_hash_ops
);
515 if (!m
->monitored_swap_cgroup_contexts
)
518 m
->monitored_mem_pressure_cgroup_contexts
= hashmap_new(&oomd_cgroup_ctx_hash_ops
);
519 if (!m
->monitored_mem_pressure_cgroup_contexts
)
522 m
->monitored_mem_pressure_cgroup_contexts_candidates
= hashmap_new(&oomd_cgroup_ctx_hash_ops
);
523 if (!m
->monitored_mem_pressure_cgroup_contexts_candidates
)
/* Sets up the Manager's D-Bus side.
 *
 * Opens the system bus into m->bus with the description "bus-api-oom", registers the
 * manager_object implementation and the log control API on it, asynchronously requests the
 * name "org.freedesktop.oom1" and attaches the bus to m->event.
 *
 * The visible failure paths log with log_error_errno() and return its result.
 * NOTE(review): the error checks for the two registration calls and the final return are
 * not visible in this excerpt. */
530 static int manager_connect_bus(Manager
*m
) {
536 r
= bus_open_system_watch_bind_with_description(&m
->bus
, "bus-api-oom");
538 return log_error_errno(r
, "Failed to connect to bus: %m");
540 r
= bus_add_implementation(m
->bus
, &manager_object
, m
);
544 r
= bus_log_control_api_register(m
->bus
);
548 r
= sd_bus_request_name_async(m
->bus
, NULL
, "org.freedesktop.oom1", 0, NULL
, NULL
);
550 return log_error_errno(r
, "Failed to request name: %m");
552 r
= sd_bus_attach_event(m
->bus
, m
->event
, 0);
554 return log_error_errno(r
, "Failed to attach bus to event loop: %m");
562 int swap_used_limit_permyriad
,
563 int mem_pressure_limit_permyriad
,
564 usec_t mem_pressure_usec
) {
/* NOTE(review): the line carrying this function's name and leading parameters is not part
 * of this excerpt (presumably manager_start() - confirm).
 *
 * Applies the runtime configuration to the Manager and starts its services: stores dry_run,
 * resolves the swap-used and memory-pressure limits (a negative argument selects the
 * built-in default), stores the pressure duration, then connects to the bus, opens the
 * ManagedOOM varlink connection and installs the monitoring timer. */
571 m
->dry_run
= dry_run
;
/* The default is expressed in percent, hence the scaling by 100 to permyriad. */
573 m
->swap_used_limit_permyriad
= swap_used_limit_permyriad
>= 0 ? swap_used_limit_permyriad
: DEFAULT_SWAP_USED_LIMIT_PERCENT
* 100;
574 assert(m
->swap_used_limit_permyriad
<= 10000);
576 if (mem_pressure_limit_permyriad
>= 0) {
577 assert(mem_pressure_limit_permyriad
<= 10000);
/* Split the permyriad value into integer and hundredths parts for the fixed-point store. */
579 l
= mem_pressure_limit_permyriad
/ 100;
580 f
= mem_pressure_limit_permyriad
% 100;
582 l
= DEFAULT_MEM_PRESSURE_LIMIT_PERCENT
;
585 r
= store_loadavg_fixed_point(l
, f
, &m
->default_mem_pressure_limit
);
/* A duration of 0 selects DEFAULT_MEM_PRESSURE_DURATION_USEC. */
589 m
->default_mem_pressure_duration_usec
= mem_pressure_usec
?: DEFAULT_MEM_PRESSURE_DURATION_USEC
;
591 r
= manager_connect_bus(m
);
595 r
= acquire_managed_oom_connect(m
);
599 r
= monitor_cgroup_contexts(m
);
/* Produces a textual dump of the Manager's state.
 *
 * The text is written to a memory stream backed by dump: the swap-used limit, the default
 * memory-pressure limit and duration, the system context, and then one entry per cgroup in
 * the swap and memory-pressure monitored hashmaps. After a successful flush, ownership of
 * the buffer passes to the caller through *ret.
 * NOTE(review): the opening of the header fprintf(), the error checks and the return
 * statements are not visible in this excerpt. */
606 int manager_get_dump_string(Manager
*m
, char **ret
) {
607 _cleanup_free_
char *dump
= NULL
;
608 _cleanup_fclose_
FILE *f
= NULL
;
609 char buf
[FORMAT_TIMESPAN_MAX
];
610 OomdCGroupContext
*c
;
618 f
= open_memstream_unlocked(&dump
, &size
);
624 "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR
"\n"
625 "Default Memory Pressure Limit: %lu.%02lu%%\n"
626 "Default Memory Pressure Duration: %s\n"
629 PERMYRIAD_AS_PERCENT_FORMAT_VAL(m
->swap_used_limit_permyriad
),
630 LOAD_INT(m
->default_mem_pressure_limit
), LOAD_FRAC(m
->default_mem_pressure_limit
),
631 format_timespan(buf
, sizeof(buf
), m
->default_mem_pressure_duration_usec
, USEC_PER_SEC
));
632 oomd_dump_system_context(&m
->system_context
, f
, "\t");
634 fprintf(f
, "Swap Monitored CGroups:\n");
635 HASHMAP_FOREACH_KEY(c
, key
, m
->monitored_swap_cgroup_contexts
)
636 oomd_dump_swap_cgroup_context(c
, f
, "\t");
638 fprintf(f
, "Memory Pressure Monitored CGroups:\n");
639 HASHMAP_FOREACH_KEY(c
, key
, m
->monitored_mem_pressure_cgroup_contexts
)
640 oomd_dump_memory_pressure_cgroup_context(c
, f
, "\t");
/* Flush and check the stream before the buffer is handed out. */
642 r
= fflush_and_check(f
);
648 *ret
= TAKE_PTR(dump
);