1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 #include "bus-log-control-api.h"
5 #include "bus-polkit.h"
6 #include "cgroup-util.h"
9 #include "oomd-manager-bus.h"
10 #include "oomd-manager.h"
11 #include "path-util.h"
/* One decoded element of the "cgroups" array of a ManagedOOM varlink reply. The dispatch table in
 * process_managed_oom_reply() writes its mode, path, property and limit members via offsetof(). */
13 typedef struct ManagedOOMReply
{
/* Cleanup handler for a ManagedOOMReply; process_managed_oom_reply() attaches it to its per-element
 * reply with _cleanup_(). Frees the reply's property string.
 * NOTE(review): other string members (e.g. path) are presumably released here too — confirm. */
20 static void managed_oom_reply_destroy(ManagedOOMReply
*reply
) {
23 free(reply
->property
);
/* JSON dispatch callback used for the "mode" field of a ManagedOOM reply element.
 *
 * v        - JSON variant holding the mode as a string (asserted to be a string below)
 * flags    - dispatch flags, forwarded to json_log() when reporting a bad value
 * userdata - points at the ManagedOOMMode to fill in
 *
 * The string is converted with managed_oom_mode_from_string(). An unrecognised string is logged via
 * json_log() with EINVAL.
 * NOTE(review): the check that selects the error path and the final store into *mode are not
 * visible here — confirm against the full function. */
26 static int managed_oom_mode(const char *name
, JsonVariant
*v
, JsonDispatchFlags flags
, void *userdata
) {
27 ManagedOOMMode
*mode
= userdata
, m
;
31 assert_se(s
= json_variant_string(v
));
33 m
= managed_oom_mode_from_string(s
);
35 return json_log(v
, flags
, SYNTHETIC_ERRNO(EINVAL
), "%s is not a valid ManagedOOMMode", s
);
/* Varlink reply callback; acquire_managed_oom_connect() binds it with varlink_bind_reply().
 *
 * parameters - JSON object of the reply; its "cgroups" member is iterated as an array
 * flags      - varlink reply flags; without VARLINK_REPLY_CONTINUES the connection is closed
 * userdata   - the Manager (assigned to m below)
 *
 * For every array element the fields "mode", "path", "property" (mandatory) and "limit" (optional)
 * are dispatched into a ManagedOOMReply. The "property" value picks the hashmap to update:
 * "ManagedOOMSwap" selects m->monitored_swap_cgroup_contexts, anything else selects
 * m->monitored_mem_pressure_cgroup_contexts. */
41 static int process_managed_oom_reply(
43 JsonVariant
*parameters
,
45 VarlinkReplyFlags flags
,
47 JsonVariant
*c
, *cgroups
;
48 Manager
*m
= userdata
;
/* Maps JSON keys of one "cgroups" element onto ManagedOOMReply members */
53 static const JsonDispatch dispatch_table
[] = {
54 { "mode", JSON_VARIANT_STRING
, managed_oom_mode
, offsetof(ManagedOOMReply
, mode
), JSON_MANDATORY
},
55 { "path", JSON_VARIANT_STRING
, json_dispatch_string
, offsetof(ManagedOOMReply
, path
), JSON_MANDATORY
},
56 { "property", JSON_VARIANT_STRING
, json_dispatch_string
, offsetof(ManagedOOMReply
, property
), JSON_MANDATORY
},
57 { "limit", JSON_VARIANT_UNSIGNED
, json_dispatch_unsigned
, offsetof(ManagedOOMReply
, limit
), 0 },
/* NOTE(review): presumably reached only when the reply carries an error id — guard not visible here */
63 log_debug("Error getting ManagedOOM cgroups: %s", error_id
);
67 cgroups
= json_variant_by_key(parameters
, "cgroups");
73 /* Skip malformed elements and keep processing in case the others are good */
74 JSON_VARIANT_ARRAY_FOREACH(c
, cgroups
) {
75 _cleanup_(managed_oom_reply_destroy
) ManagedOOMReply reply
= {};
76 OomdCGroupContext
*ctx
;
81 if (!json_variant_is_object(c
))
84 ret
= json_dispatch(c
, dispatch_table
, NULL
, 0, &reply
);
/* Choose the hashmap that tracks this kind of monitoring */
91 monitor_hm
= streq(reply
.property
, "ManagedOOMSwap") ?
92 m
->monitored_swap_cgroup_contexts
: m
->monitored_mem_pressure_cgroup_contexts
;
/* Mode "auto": stop monitoring this path by removing and freeing its context */
94 if (reply
.mode
== MANAGED_OOM_AUTO
) {
95 (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm
, reply
.path
));
/* Start from the manager-wide default; a non-zero per-cgroup limit replaces it below */
99 limit
= m
->default_mem_pressure_limit
;
101 if (streq(reply
.property
, "ManagedOOMMemoryPressure")) {
/* NOTE(review): what happens for limits above 100 is not visible here — presumably the element is skipped */
102 if (reply
.limit
> 100)
104 else if (reply
.limit
!= 0) {
105 ret
= store_loadavg_fixed_point((unsigned long) reply
.limit
, 0, &limit
);
111 ret
= oomd_insert_cgroup_context(NULL
, monitor_hm
, reply
.path
);
112 if (ret
== -ENOMEM
) {
117 /* Always update the limit in case it was changed. For non-memory pressure detection the value is
118 * ignored so always updating it here is not a problem. */
119 ctx
= hashmap_get(monitor_hm
, reply
.path
);
/* NOTE(review): hashmap_get() may return NULL; a guard is presumably present on a line not shown — confirm */
121 ctx
->mem_pressure_limit
= limit
;
/* The server will send nothing more on this call: close the link and reset m->varlink */
125 if (!FLAGS_SET(flags
, VARLINK_REPLY_CONTINUES
))
126 m
->varlink
= varlink_close_unref(link
);
131 /* Fill `new_h` with `path`'s descendent OomdCGroupContexts. Only include descendent cgroups that are possible
132 * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
134 * This function ignores most errors in order to handle cgroups that may have been cleaned up while populating
137 * `new_h` is of the form { key: cgroup paths -> value: OomdCGroupContext } */
/* Subgroups are enumerated with cg_enumerate_subgroups()/cg_read_subgroup(); each child path is built
 * with path_join(empty_to_root(path), subpath). On the return statements visible below only -ENOMEM
 * is propagated, any other result is turned into 0. */
138 static int recursively_get_cgroup_context(Hashmap
*new_h
, const char *path
) {
139 _cleanup_free_
char *subpath
= NULL
;
140 _cleanup_closedir_
DIR *d
= NULL
;
146 r
= cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER
, path
, &d
);
/* Read the first child to find out whether `path` has any subgroups at all */
150 r
= cg_read_subgroup(d
, &subpath
);
153 else if (r
== 0) { /* No subgroups? We're a leaf node */
154 r
= oomd_insert_cgroup_context(NULL
, new_h
, path
);
155 return (r
== -ENOMEM
) ? r
: 0;
159 _cleanup_free_
char *cg_path
= NULL
;
162 cg_path
= path_join(empty_to_root(path
), subpath
);
/* Release the name now so the cg_read_subgroup() call in the loop condition can store the next one */
166 subpath
= mfree(subpath
);
168 r
= cg_get_attribute_as_bool("memory", cg_path
, "memory.oom.group", &oom_group
);
169 /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
171 return (r
== -ENOMEM
) ? r
: 0;
/* NOTE(review): the condition choosing between the insert and the recursion below is not shown;
 * presumably the child is inserted when oom_group is set and descended into otherwise — confirm */
174 r
= oomd_insert_cgroup_context(NULL
, new_h
, cg_path
);
176 r
= recursively_get_cgroup_context(new_h
, cg_path
);
179 } while ((r
= cg_read_subgroup(d
, &subpath
)) > 0);
/* Rebuilds the hashmap behind *monitored_cgroups.
 *
 * A new hashmap is allocated, and for every context currently in *monitored_cgroups a context for
 * the same path is inserted into it with oomd_insert_cgroup_context(), which receives the old map as
 * its first argument. The old map is then freed and *monitored_cgroups takes ownership of the new
 * one. monitored_cgroups itself must not be NULL (asserted). */
184 static int update_monitored_cgroup_contexts(Hashmap
**monitored_cgroups
) {
185 _cleanup_hashmap_free_ Hashmap
*new_base
= NULL
;
186 OomdCGroupContext
*ctx
;
189 assert(monitored_cgroups
);
191 new_base
= hashmap_new(&oomd_cgroup_ctx_hash_ops
);
195 HASHMAP_FOREACH(ctx
, *monitored_cgroups
) {
196 /* Skip most errors since the cgroup we're trying to update might not exist anymore. */
197 r
= oomd_insert_cgroup_context(*monitored_cgroups
, new_base
, ctx
->path
);
/* Swap in the refreshed map; TAKE_PTR() keeps the cleanup attribute from freeing it */
202 hashmap_free(*monitored_cgroups
);
203 *monitored_cgroups
= TAKE_PTR(new_base
);
/* Collects the kill candidates below all monitored cgroups.
 *
 * monitored_cgroups - hashmap of OomdCGroupContext entries to start from (must not be NULL)
 * ret_candidates    - receives a newly allocated hashmap (must not be NULL)
 *
 * For every monitored context recursively_get_cgroup_context() is called on its path to fill the
 * new hashmap, whose ownership is handed to *ret_candidates at the end. */
208 static int get_monitored_cgroup_contexts_candidates(Hashmap
*monitored_cgroups
, Hashmap
**ret_candidates
) {
209 _cleanup_hashmap_free_ Hashmap
*candidates
= NULL
;
210 OomdCGroupContext
*ctx
;
213 assert(monitored_cgroups
);
214 assert(ret_candidates
);
216 candidates
= hashmap_new(&oomd_cgroup_ctx_hash_ops
);
220 HASHMAP_FOREACH(ctx
, monitored_cgroups
) {
221 r
= recursively_get_cgroup_context(candidates
, ctx
->path
);
226 *ret_candidates
= TAKE_PTR(candidates
);
/* Establishes the varlink subscription that delivers ManagedOOM cgroup settings.
 *
 * Connects to VARLINK_ADDR_PATH_MANAGED_OOM, sets m as userdata, "oomd" as description and an
 * infinite relative timeout, attaches the link to m->event at normal priority, binds
 * process_managed_oom_reply() as reply callback and starts observing
 * io.systemd.ManagedOOM.SubscribeManagedOOMCGroups. Each visible failure path logs an error and
 * returns it; the _cleanup_ attribute then closes the half-configured link. On success the link is
 * moved into m->varlink. */
231 static int acquire_managed_oom_connect(Manager
*m
) {
232 _cleanup_(varlink_close_unrefp
) Varlink
*link
= NULL
;
238 r
= varlink_connect_address(&link
, VARLINK_ADDR_PATH_MANAGED_OOM
);
240 return log_error_errno(r
, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM
);
242 (void) varlink_set_userdata(link
, m
);
243 (void) varlink_set_description(link
, "oomd");
244 (void) varlink_set_relative_timeout(link
, USEC_INFINITY
);
246 r
= varlink_attach_event(link
, m
->event
, SD_EVENT_PRIORITY_NORMAL
);
248 return log_error_errno(r
, "Failed to attach varlink connection to event loop: %m");
250 r
= varlink_bind_reply(link
, process_managed_oom_reply
);
252 return log_error_errno(r
, "Failed to bind reply callback: %m");
254 r
= varlink_observe(link
, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL
);
256 return log_error_errno(r
, "Failed to observe varlink call: %m");
/* Hand ownership to the manager so the cleanup attribute does not close the link */
258 m
->varlink
= TAKE_PTR(link
);
/* Periodic timer callback; monitor_cgroup_contexts() registers it on the event loop.
 *
 * s        - the timer event source, re-armed here INTERVAL_USEC into the future
 * userdata - the Manager (assigned to m below)
 *
 * Steps visible below, in order:
 *  1. read the CLOCK_MONOTONIC time into usec_now and re-arm the timer
 *  2. (re)acquire the varlink connection and refresh both monitored hashmaps
 *  3. refresh m->system_context from /proc/swaps
 *  4. honour the post-action delay that follows an earlier kill
 *  5. memory pressure: collect targets with oomd_pressure_above() and, if there was reclaim
 *     activity, try oomd_kill_by_pgscan() on each target's path
 *  6. swap: if oomd_swap_free_below() reports free swap under the limit, try
 *     oomd_kill_by_swap_usage()
 * After an action m->post_action_delay_start is set to usec_now. */
262 static int monitor_cgroup_contexts_handler(sd_event_source
*s
, uint64_t usec
, void *userdata
) {
263 _cleanup_set_free_ Set
*targets
= NULL
;
264 Manager
*m
= userdata
;
272 r
= sd_event_now(sd_event_source_get_event(s
), CLOCK_MONOTONIC
, &usec_now
);
/* NOTE(review): this message is attached to the sd_event_now() call above, which only reads the
 * clock — the wording looks mismatched. It also lacks %m, as does the next one, so the errno text
 * passed to log_error_errno() is not printed. */
274 return log_error_errno(r
, "Failed to reset event timer");
276 r
= sd_event_source_set_time_relative(s
, INTERVAL_USEC
);
278 return log_error_errno(r
, "Failed to set relative time for timer");
280 /* Reconnect if our connection dropped */
282 r
= acquire_managed_oom_connect(m
);
284 return log_error_errno(r
, "Failed to acquire varlink connection");
287 /* Update the cgroups used for detection/action */
288 r
= update_monitored_cgroup_contexts(&m
->monitored_swap_cgroup_contexts
);
290 return log_error_errno(r
, "Failed to update monitored swap cgroup contexts");
292 r
= update_monitored_cgroup_contexts(&m
->monitored_mem_pressure_cgroup_contexts
);
294 return log_error_errno(r
, "Failed to update monitored memory pressure cgroup contexts");
296 r
= oomd_system_context_acquire("/proc/swaps", &m
->system_context
);
297 /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM */
298 if (r
== -ENOMEM
|| (r
< 0 && !hashmap_isempty(m
->monitored_swap_cgroup_contexts
)))
299 return log_error_errno(r
, "Failed to acquire system context");
301 /* If we're still recovering from a kill, don't try to kill again yet */
302 if (m
->post_action_delay_start
> 0) {
/* NOTE(review): the statement taken while the delay is still running is not visible — presumably an early return */
303 if (m
->post_action_delay_start
+ POST_ACTION_DELAY_USEC
> usec_now
)
306 m
->post_action_delay_start
= 0;
309 r
= oomd_pressure_above(m
->monitored_mem_pressure_cgroup_contexts
, PRESSURE_DURATION_USEC
, &targets
);
311 return log_error_errno(r
, "Failed to check if memory pressure exceeded limits");
313 /* Check if there was reclaim activity in the last interval. The concern is the following case:
314 * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
315 * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
316 * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
317 * to kill something (it won't help anyway). */
318 if (oomd_memory_reclaim(m
->monitored_mem_pressure_cgroup_contexts
)) {
319 _cleanup_hashmap_free_ Hashmap
*candidates
= NULL
;
320 OomdCGroupContext
*t
;
322 r
= get_monitored_cgroup_contexts_candidates(m
->monitored_mem_pressure_cgroup_contexts
, &candidates
);
324 return log_error_errno(r
, "Failed to get monitored memory pressure cgroup candidates");
/* Walk the cgroups that oomd_pressure_above() reported */
326 SET_FOREACH(t
, targets
) {
327 log_notice("Memory pressure for %s is greater than %lu for more than %"PRIu64
" seconds and there was reclaim activity",
328 t
->path
, LOAD_INT(t
->mem_pressure_limit
), PRESSURE_DURATION_USEC
/ USEC_PER_SEC
);
330 r
= oomd_kill_by_pgscan(candidates
, t
->path
, m
->dry_run
);
332 return log_error_errno(r
, "Failed to kill cgroup processes by pgscan");
334 log_info("Failed to kill any cgroup(s) under %s based on pressure", t
->path
);
336 /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
337 m
->post_action_delay_start
= usec_now
;
/* The second argument is the free-swap percentage that corresponds to the configured used-swap limit */
344 if (oomd_swap_free_below(&m
->system_context
, (100 - m
->swap_used_limit
))) {
345 _cleanup_hashmap_free_ Hashmap
*candidates
= NULL
;
347 log_notice("Swap used (%"PRIu64
") / total (%"PRIu64
") is more than %u%%",
348 m
->system_context
.swap_used
, m
->system_context
.swap_total
, m
->swap_used_limit
);
350 r
= get_monitored_cgroup_contexts_candidates(m
->monitored_swap_cgroup_contexts
, &candidates
);
352 return log_error_errno(r
, "Failed to get monitored swap cgroup candidates");
354 r
= oomd_kill_by_swap_usage(candidates
, m
->dry_run
);
356 return log_error_errno(r
, "Failed to kill cgroup processes by swap usage");
358 log_info("Failed to kill any cgroup(s) based on swap");
/* Start the post-action delay that is checked near the top of this handler */
360 m
->post_action_delay_start
= usec_now
;
/* Installs the periodic monitoring timer.
 *
 * Adds monitor_cgroup_contexts_handler() to m->event as a CLOCK_MONOTONIC time source (time 0,
 * accuracy 0) with m as userdata, marks the source exit-on-failure, enables it with SD_EVENT_ON,
 * names it "oomd-timer" and finally stores it in m->cgroup_context_event_source. */
368 static int monitor_cgroup_contexts(Manager
*m
) {
369 _cleanup_(sd_event_source_unrefp
) sd_event_source
*s
= NULL
;
375 r
= sd_event_add_time(m
->event
, &s
, CLOCK_MONOTONIC
, 0, 0, monitor_cgroup_contexts_handler
, m
);
379 r
= sd_event_source_set_exit_on_failure(s
, true);
383 r
= sd_event_source_set_enabled(s
, SD_EVENT_ON
);
/* The description is cosmetic, so its result is deliberately ignored */
387 (void) sd_event_source_set_description(s
, "oomd-timer");
389 m
->cgroup_context_event_source
= TAKE_PTR(s
);
/* Releases the resources held by a Manager: the varlink connection, the timer event source, the
 * event loop reference, the polkit registry, the bus connection (flushed and closed) and both
 * monitored-cgroup hashmaps.
 * NOTE(review): any NULL check on m and the release of m itself are not visible here — confirm. */
393 void manager_free(Manager
*m
) {
396 varlink_close_unref(m
->varlink
);
397 sd_event_source_unref(m
->cgroup_context_event_source
);
398 sd_event_unref(m
->event
);
400 bus_verify_polkit_async_registry_free(m
->polkit_registry
);
401 sd_bus_flush_close_unref(m
->bus
);
403 hashmap_free(m
->monitored_swap_cgroup_contexts
);
404 hashmap_free(m
->monitored_mem_pressure_cgroup_contexts
);
/* Allocates and initialises a Manager.
 *
 * A zeroed Manager is allocated, the default event loop is acquired into m->event with watchdog
 * support switched on, SIGINT and SIGTERM are added as signal sources with a NULL handler, and the
 * swap and memory-pressure hashmaps are created with oomd_cgroup_ctx_hash_ops. The _cleanup_
 * attribute frees the partially built object on early exit.
 * NOTE(review): per the sd_event_add_signal() documentation a NULL handler makes the loop exit on
 * that signal; the hand-over to *ret is not visible here — confirm both. */
409 int manager_new(Manager
**ret
) {
410 _cleanup_(manager_freep
) Manager
*m
= NULL
;
415 m
= new0(Manager
, 1);
419 r
= sd_event_default(&m
->event
);
423 (void) sd_event_set_watchdog(m
->event
, true);
425 r
= sd_event_add_signal(m
->event
, NULL
, SIGINT
, NULL
, NULL
);
429 r
= sd_event_add_signal(m
->event
, NULL
, SIGTERM
, NULL
, NULL
);
433 m
->monitored_swap_cgroup_contexts
= hashmap_new(&oomd_cgroup_ctx_hash_ops
);
434 if (!m
->monitored_swap_cgroup_contexts
)
437 m
->monitored_mem_pressure_cgroup_contexts
= hashmap_new(&oomd_cgroup_ctx_hash_ops
);
438 if (!m
->monitored_mem_pressure_cgroup_contexts
)
/* Sets up the D-Bus side of the manager.
 *
 * Opens the system bus into m->bus (description "bus-api-oom"), registers the manager_object
 * implementation and the log control API on it, asynchronously requests the name
 * "org.freedesktop.oom1" and attaches the bus to m->event with priority 0. The visible failure
 * paths log an error and return it. */
445 static int manager_connect_bus(Manager
*m
) {
451 r
= bus_open_system_watch_bind_with_description(&m
->bus
, "bus-api-oom");
453 return log_error_errno(r
, "Failed to connect to bus: %m");
455 r
= bus_add_implementation(m
->bus
, &manager_object
, m
);
459 r
= bus_log_control_api_register(m
->bus
);
463 r
= sd_bus_request_name_async(m
->bus
, NULL
, "org.freedesktop.oom1", 0, NULL
, NULL
);
465 return log_error_errno(r
, "Failed to request name: %m");
467 r
= sd_bus_attach_event(m
->bus
, m
->event
, 0);
469 return log_error_errno(r
, "Failed to attach bus to event loop: %m");
/* Configures the manager and starts its services.
 *
 * dry_run            - stored in m->dry_run and later passed to the oomd_kill_by_*() calls
 * swap_used_limit    - used-swap limit; -1 selects DEFAULT_SWAP_USED_LIMIT. The resulting value
 *                      is asserted to be at most 100
 * mem_pressure_limit - default memory pressure limit; -1 selects DEFAULT_MEM_PRESSURE_LIMIT. It is
 *                      stored in m->default_mem_pressure_limit via store_loadavg_fixed_point()
 *
 * Afterwards the bus is connected, the ManagedOOM varlink subscription is acquired and the
 * monitoring timer is installed, in that order. */
474 int manager_start(Manager
*m
, bool dry_run
, int swap_used_limit
, int mem_pressure_limit
) {
480 m
->dry_run
= dry_run
;
482 m
->swap_used_limit
= swap_used_limit
!= -1 ? swap_used_limit
: DEFAULT_SWAP_USED_LIMIT
;
483 assert(m
->swap_used_limit
<= 100);
485 l
= mem_pressure_limit
!= -1 ? mem_pressure_limit
: DEFAULT_MEM_PRESSURE_LIMIT
;
486 r
= store_loadavg_fixed_point(l
, 0, &m
->default_mem_pressure_limit
);
490 r
= manager_connect_bus(m
);
494 r
= acquire_managed_oom_connect(m
);
498 r
= monitor_cgroup_contexts(m
);
505 int manager_get_dump_string(Manager
*m
, char **ret
) {
506 _cleanup_free_
char *dump
= NULL
;
507 _cleanup_fclose_
FILE *f
= NULL
;
508 OomdCGroupContext
*c
;
516 f
= open_memstream_unlocked(&dump
, &size
);
522 "Swap Used Limit: %u%%\n"
523 "Default Memory Pressure Limit: %lu%%\n"
527 LOAD_INT(m
->default_mem_pressure_limit
));
528 oomd_dump_system_context(&m
->system_context
, f
, "\t");
530 fprintf(f
, "Swap Monitored CGroups:\n");
531 HASHMAP_FOREACH_KEY(c
, key
, m
->monitored_swap_cgroup_contexts
)
532 oomd_dump_swap_cgroup_context(c
, f
, "\t");
534 fprintf(f
, "Memory Pressure Monitored CGroups:\n");
535 HASHMAP_FOREACH_KEY(c
, key
, m
->monitored_mem_pressure_cgroup_contexts
)
536 oomd_dump_memory_pressure_cgroup_context(c
, f
, "\t");
538 r
= fflush_and_check(f
);
544 *ret
= TAKE_PTR(dump
);