1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
3 #include "bus-log-control-api.h"
5 #include "bus-polkit.h"
6 #include "cgroup-util.h"
9 #include "memory-util.h"
10 #include "oomd-manager-bus.h"
11 #include "oomd-manager.h"
12 #include "path-util.h"
13 #include "percent-util.h"
15 typedef struct ManagedOOMReply
{
22 static void managed_oom_reply_destroy(ManagedOOMReply
*reply
) {
25 free(reply
->property
);
28 static int managed_oom_mode(const char *name
, JsonVariant
*v
, JsonDispatchFlags flags
, void *userdata
) {
29 ManagedOOMMode
*mode
= userdata
, m
;
33 assert_se(s
= json_variant_string(v
));
35 m
= managed_oom_mode_from_string(s
);
37 return json_log(v
, flags
, m
, "%s is not a valid ManagedOOMMode", s
);
43 static int process_managed_oom_reply(
45 JsonVariant
*parameters
,
47 VarlinkReplyFlags flags
,
49 JsonVariant
*c
, *cgroups
;
50 Manager
*m
= userdata
;
55 static const JsonDispatch dispatch_table
[] = {
56 { "mode", JSON_VARIANT_STRING
, managed_oom_mode
, offsetof(ManagedOOMReply
, mode
), JSON_MANDATORY
},
57 { "path", JSON_VARIANT_STRING
, json_dispatch_string
, offsetof(ManagedOOMReply
, path
), JSON_MANDATORY
},
58 { "property", JSON_VARIANT_STRING
, json_dispatch_string
, offsetof(ManagedOOMReply
, property
), JSON_MANDATORY
},
59 { "limit", JSON_VARIANT_UNSIGNED
, json_dispatch_uint32
, offsetof(ManagedOOMReply
, limit
), 0 },
65 log_debug("Error getting ManagedOOM cgroups: %s", error_id
);
69 cgroups
= json_variant_by_key(parameters
, "cgroups");
75 /* Skip malformed elements and keep processing in case the others are good */
76 JSON_VARIANT_ARRAY_FOREACH(c
, cgroups
) {
77 _cleanup_(managed_oom_reply_destroy
) ManagedOOMReply reply
= {};
78 OomdCGroupContext
*ctx
;
83 if (!json_variant_is_object(c
))
86 ret
= json_dispatch(c
, dispatch_table
, NULL
, 0, &reply
);
94 monitor_hm
= streq(reply
.property
, "ManagedOOMSwap") ?
95 m
->monitored_swap_cgroup_contexts
: m
->monitored_mem_pressure_cgroup_contexts
;
97 if (reply
.mode
== MANAGED_OOM_AUTO
) {
98 (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm
, empty_to_root(reply
.path
)));
102 limit
= m
->default_mem_pressure_limit
;
104 if (streq(reply
.property
, "ManagedOOMMemoryPressure") && reply
.limit
> 0) {
105 int permyriad
= UINT32_SCALE_TO_PERMYRIAD(reply
.limit
);
107 ret
= store_loadavg_fixed_point(
108 (unsigned long) permyriad
/ 100,
109 (unsigned long) permyriad
% 100,
115 ret
= oomd_insert_cgroup_context(NULL
, monitor_hm
, reply
.path
);
116 if (ret
== -ENOMEM
) {
121 /* Always update the limit in case it was changed. For detection other than memory pressure the value is
122 * ignored, so always updating it here is not a problem. */
123 ctx
= hashmap_get(monitor_hm
, empty_to_root(reply
.path
));
125 ctx
->mem_pressure_limit
= limit
;
129 if (!FLAGS_SET(flags
, VARLINK_REPLY_CONTINUES
))
130 m
->varlink
= varlink_close_unref(link
);
135 /* Fill `new_h` with `path`'s descendent OomdCGroupContexts. Only include descendent cgroups that are possible
136 * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
138 * This function ignores most errors in order to handle cgroups that may have been cleaned up while populating the hashmap.
141 * `new_h` is of the form { key: cgroup paths -> value: OomdCGroupContext } */
142 static int recursively_get_cgroup_context(Hashmap
*new_h
, const char *path
) {
143 _cleanup_free_
char *subpath
= NULL
;
144 _cleanup_closedir_
DIR *d
= NULL
;
150 r
= cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER
, path
, &d
);
154 r
= cg_read_subgroup(d
, &subpath
);
157 else if (r
== 0) { /* No subgroups? We're a leaf node */
158 r
= oomd_insert_cgroup_context(NULL
, new_h
, path
);
159 return (r
== -ENOMEM
) ? r
: 0;
163 _cleanup_free_
char *cg_path
= NULL
;
166 cg_path
= path_join(empty_to_root(path
), subpath
);
170 subpath
= mfree(subpath
);
172 r
= cg_get_attribute_as_bool("memory", cg_path
, "memory.oom.group", &oom_group
);
173 /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
175 return (r
== -ENOMEM
) ? r
: 0;
178 r
= oomd_insert_cgroup_context(NULL
, new_h
, cg_path
);
180 r
= recursively_get_cgroup_context(new_h
, cg_path
);
183 } while ((r
= cg_read_subgroup(d
, &subpath
)) > 0);
188 static int update_monitored_cgroup_contexts(Hashmap
**monitored_cgroups
) {
189 _cleanup_hashmap_free_ Hashmap
*new_base
= NULL
;
190 OomdCGroupContext
*ctx
;
193 assert(monitored_cgroups
);
195 new_base
= hashmap_new(&oomd_cgroup_ctx_hash_ops
);
199 HASHMAP_FOREACH(ctx
, *monitored_cgroups
) {
200 /* Skip most errors since the cgroup we're trying to update might not exist anymore. */
201 r
= oomd_insert_cgroup_context(*monitored_cgroups
, new_base
, ctx
->path
);
206 hashmap_free(*monitored_cgroups
);
207 *monitored_cgroups
= TAKE_PTR(new_base
);
212 static int get_monitored_cgroup_contexts_candidates(Hashmap
*monitored_cgroups
, Hashmap
**ret_candidates
) {
213 _cleanup_hashmap_free_ Hashmap
*candidates
= NULL
;
214 OomdCGroupContext
*ctx
;
217 assert(monitored_cgroups
);
218 assert(ret_candidates
);
220 candidates
= hashmap_new(&oomd_cgroup_ctx_hash_ops
);
224 HASHMAP_FOREACH(ctx
, monitored_cgroups
) {
225 r
= recursively_get_cgroup_context(candidates
, ctx
->path
);
230 *ret_candidates
= TAKE_PTR(candidates
);
235 static int acquire_managed_oom_connect(Manager
*m
) {
236 _cleanup_(varlink_close_unrefp
) Varlink
*link
= NULL
;
242 r
= varlink_connect_address(&link
, VARLINK_ADDR_PATH_MANAGED_OOM
);
244 return log_error_errno(r
, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM
);
246 (void) varlink_set_userdata(link
, m
);
247 (void) varlink_set_description(link
, "oomd");
248 (void) varlink_set_relative_timeout(link
, USEC_INFINITY
);
250 r
= varlink_attach_event(link
, m
->event
, SD_EVENT_PRIORITY_NORMAL
);
252 return log_error_errno(r
, "Failed to attach varlink connection to event loop: %m");
254 r
= varlink_bind_reply(link
, process_managed_oom_reply
);
256 return log_error_errno(r
, "Failed to bind reply callback: %m");
258 r
= varlink_observe(link
, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL
);
260 return log_error_errno(r
, "Failed to observe varlink call: %m");
262 m
->varlink
= TAKE_PTR(link
);
266 static int monitor_cgroup_contexts_handler(sd_event_source
*s
, uint64_t usec
, void *userdata
) {
267 _cleanup_set_free_ Set
*targets
= NULL
;
268 Manager
*m
= userdata
;
276 r
= sd_event_now(sd_event_source_get_event(s
), CLOCK_MONOTONIC
, &usec_now
);
278 return log_error_errno(r
, "Failed to reset event timer");
280 r
= sd_event_source_set_time_relative(s
, INTERVAL_USEC
);
282 return log_error_errno(r
, "Failed to set relative time for timer");
284 /* Reconnect if our connection dropped */
286 r
= acquire_managed_oom_connect(m
);
288 return log_error_errno(r
, "Failed to acquire varlink connection");
291 /* Update the cgroups used for detection/action */
292 r
= update_monitored_cgroup_contexts(&m
->monitored_swap_cgroup_contexts
);
294 return log_error_errno(r
, "Failed to update monitored swap cgroup contexts");
296 r
= update_monitored_cgroup_contexts(&m
->monitored_mem_pressure_cgroup_contexts
);
298 return log_error_errno(r
, "Failed to update monitored memory pressure cgroup contexts");
300 r
= oomd_system_context_acquire("/proc/swaps", &m
->system_context
);
301 /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
302 * Allow ENOENT in the event that swap is disabled on the system. */
303 if (r
== -ENOMEM
|| (r
< 0 && r
!= -ENOENT
&& !hashmap_isempty(m
->monitored_swap_cgroup_contexts
)))
304 return log_error_errno(r
, "Failed to acquire system context");
305 else if (r
== -ENOENT
)
306 zero(m
->system_context
);
308 if (oomd_memory_reclaim(m
->monitored_mem_pressure_cgroup_contexts
))
309 m
->last_reclaim_at
= usec_now
;
311 /* If we're still recovering from a kill, don't try to kill again yet */
312 if (m
->post_action_delay_start
> 0) {
313 if (m
->post_action_delay_start
+ POST_ACTION_DELAY_USEC
> usec_now
)
316 m
->post_action_delay_start
= 0;
319 r
= oomd_pressure_above(m
->monitored_mem_pressure_cgroup_contexts
, m
->default_mem_pressure_duration_usec
, &targets
);
321 return log_error_errno(r
, "Failed to check if memory pressure exceeded limits");
323 /* Check if there was reclaim activity in the given interval. The concern is the following case:
324 * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
325 * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
326 * this will cause pressure to remain high. Thus if there isn't any reclaim activity, there is no need
327 * to kill anything (it won't help anyway). */
328 if ((usec_now
- m
->last_reclaim_at
) <= RECLAIM_DURATION_USEC
) {
329 _cleanup_hashmap_free_ Hashmap
*candidates
= NULL
;
330 OomdCGroupContext
*t
;
332 r
= get_monitored_cgroup_contexts_candidates(m
->monitored_mem_pressure_cgroup_contexts
, &candidates
);
334 return log_error_errno(r
, "Failed to get monitored memory pressure cgroup candidates");
336 SET_FOREACH(t
, targets
) {
337 log_notice("Memory pressure for %s is greater than %lu for more than %"PRIu64
" seconds and there was reclaim activity",
338 t
->path
, LOAD_INT(t
->mem_pressure_limit
), m
->default_mem_pressure_duration_usec
/ USEC_PER_SEC
);
340 r
= oomd_kill_by_pgscan(candidates
, t
->path
, m
->dry_run
);
342 return log_error_errno(r
, "Failed to kill cgroup processes by pgscan");
344 log_info("Failed to kill any cgroup(s) under %s based on pressure", t
->path
);
346 /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
347 m
->post_action_delay_start
= usec_now
;
354 if (oomd_swap_free_below(&m
->system_context
, 10000 - m
->swap_used_limit_permyriad
)) {
355 _cleanup_hashmap_free_ Hashmap
*candidates
= NULL
;
357 log_notice("Swap used (%"PRIu64
") / total (%"PRIu64
") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR
,
358 m
->system_context
.swap_used
, m
->system_context
.swap_total
, PERMYRIAD_AS_PERCENT_FORMAT_VAL(m
->swap_used_limit_permyriad
));
360 r
= get_monitored_cgroup_contexts_candidates(m
->monitored_swap_cgroup_contexts
, &candidates
);
362 return log_error_errno(r
, "Failed to get monitored swap cgroup candidates");
364 r
= oomd_kill_by_swap_usage(candidates
, m
->dry_run
);
366 return log_error_errno(r
, "Failed to kill cgroup processes by swap usage");
368 log_info("Failed to kill any cgroup(s) based on swap");
370 m
->post_action_delay_start
= usec_now
;
378 static int monitor_cgroup_contexts(Manager
*m
) {
379 _cleanup_(sd_event_source_unrefp
) sd_event_source
*s
= NULL
;
385 r
= sd_event_add_time(m
->event
, &s
, CLOCK_MONOTONIC
, 0, 0, monitor_cgroup_contexts_handler
, m
);
389 r
= sd_event_source_set_exit_on_failure(s
, true);
393 r
= sd_event_source_set_enabled(s
, SD_EVENT_ON
);
397 (void) sd_event_source_set_description(s
, "oomd-timer");
399 m
->cgroup_context_event_source
= TAKE_PTR(s
);
403 Manager
* manager_free(Manager
*m
) {
406 varlink_close_unref(m
->varlink
);
407 sd_event_source_unref(m
->cgroup_context_event_source
);
408 sd_event_unref(m
->event
);
410 bus_verify_polkit_async_registry_free(m
->polkit_registry
);
411 sd_bus_flush_close_unref(m
->bus
);
413 hashmap_free(m
->monitored_swap_cgroup_contexts
);
414 hashmap_free(m
->monitored_mem_pressure_cgroup_contexts
);
419 int manager_new(Manager
**ret
) {
420 _cleanup_(manager_freep
) Manager
*m
= NULL
;
425 m
= new0(Manager
, 1);
429 r
= sd_event_default(&m
->event
);
433 (void) sd_event_set_watchdog(m
->event
, true);
435 r
= sd_event_add_signal(m
->event
, NULL
, SIGINT
, NULL
, NULL
);
439 r
= sd_event_add_signal(m
->event
, NULL
, SIGTERM
, NULL
, NULL
);
443 m
->monitored_swap_cgroup_contexts
= hashmap_new(&oomd_cgroup_ctx_hash_ops
);
444 if (!m
->monitored_swap_cgroup_contexts
)
447 m
->monitored_mem_pressure_cgroup_contexts
= hashmap_new(&oomd_cgroup_ctx_hash_ops
);
448 if (!m
->monitored_mem_pressure_cgroup_contexts
)
455 static int manager_connect_bus(Manager
*m
) {
461 r
= bus_open_system_watch_bind_with_description(&m
->bus
, "bus-api-oom");
463 return log_error_errno(r
, "Failed to connect to bus: %m");
465 r
= bus_add_implementation(m
->bus
, &manager_object
, m
);
469 r
= bus_log_control_api_register(m
->bus
);
473 r
= sd_bus_request_name_async(m
->bus
, NULL
, "org.freedesktop.oom1", 0, NULL
, NULL
);
475 return log_error_errno(r
, "Failed to request name: %m");
477 r
= sd_bus_attach_event(m
->bus
, m
->event
, 0);
479 return log_error_errno(r
, "Failed to attach bus to event loop: %m");
487 int swap_used_limit_permyriad
,
488 int mem_pressure_limit_permyriad
,
489 usec_t mem_pressure_usec
) {
496 m
->dry_run
= dry_run
;
498 m
->swap_used_limit_permyriad
= swap_used_limit_permyriad
>= 0 ? swap_used_limit_permyriad
: DEFAULT_SWAP_USED_LIMIT_PERCENT
* 100;
499 assert(m
->swap_used_limit_permyriad
<= 10000);
501 if (mem_pressure_limit_permyriad
>= 0) {
502 assert(mem_pressure_limit_permyriad
<= 10000);
504 l
= mem_pressure_limit_permyriad
/ 100;
505 f
= mem_pressure_limit_permyriad
% 100;
507 l
= DEFAULT_MEM_PRESSURE_LIMIT_PERCENT
;
510 r
= store_loadavg_fixed_point(l
, f
, &m
->default_mem_pressure_limit
);
514 m
->default_mem_pressure_duration_usec
= mem_pressure_usec
?: DEFAULT_MEM_PRESSURE_DURATION_USEC
;
516 r
= manager_connect_bus(m
);
520 r
= acquire_managed_oom_connect(m
);
524 r
= monitor_cgroup_contexts(m
);
531 int manager_get_dump_string(Manager
*m
, char **ret
) {
532 _cleanup_free_
char *dump
= NULL
;
533 _cleanup_fclose_
FILE *f
= NULL
;
534 char buf
[FORMAT_TIMESPAN_MAX
];
535 OomdCGroupContext
*c
;
543 f
= open_memstream_unlocked(&dump
, &size
);
549 "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR
"\n"
550 "Default Memory Pressure Limit: %lu.%02lu%%\n"
551 "Default Memory Pressure Duration: %s\n"
554 PERMYRIAD_AS_PERCENT_FORMAT_VAL(m
->swap_used_limit_permyriad
),
555 LOAD_INT(m
->default_mem_pressure_limit
), LOAD_FRAC(m
->default_mem_pressure_limit
),
556 format_timespan(buf
, sizeof(buf
), m
->default_mem_pressure_duration_usec
, USEC_PER_SEC
));
557 oomd_dump_system_context(&m
->system_context
, f
, "\t");
559 fprintf(f
, "Swap Monitored CGroups:\n");
560 HASHMAP_FOREACH_KEY(c
, key
, m
->monitored_swap_cgroup_contexts
)
561 oomd_dump_swap_cgroup_context(c
, f
, "\t");
563 fprintf(f
, "Memory Pressure Monitored CGroups:\n");
564 HASHMAP_FOREACH_KEY(c
, key
, m
->monitored_mem_pressure_cgroup_contexts
)
565 oomd_dump_memory_pressure_cgroup_context(c
, f
, "\t");
567 r
= fflush_and_check(f
);
573 *ret
= TAKE_PTR(dump
);