]>
Commit | Line | Data |
---|---|---|
db9ecf05 | 1 | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
9de5e321 | 2 | |
064a5c14 DDM |
3 | #include "sd-daemon.h" |
4 | ||
5c616ecf AZ |
5 | #include "bus-log-control-api.h" |
6 | #include "bus-util.h" | |
7 | #include "bus-polkit.h" | |
9de5e321 AZ |
8 | #include "cgroup-util.h" |
9 | #include "fd-util.h" | |
10 | #include "fileio.h" | |
064a5c14 | 11 | #include "format-util.h" |
408a3bbd | 12 | #include "memory-util.h" |
5c616ecf | 13 | #include "oomd-manager-bus.h" |
9de5e321 AZ |
14 | #include "oomd-manager.h" |
15 | #include "path-util.h" | |
d9d3f05d | 16 | #include "percent-util.h" |
9de5e321 | 17 | |
71feeae4 | 18 | typedef struct ManagedOOMMessage { |
9de5e321 AZ |
19 | ManagedOOMMode mode; |
20 | char *path; | |
21 | char *property; | |
d06e7fb5 | 22 | uint32_t limit; |
71feeae4 | 23 | } ManagedOOMMessage; |
9de5e321 | 24 | |
71feeae4 DDM |
25 | static void managed_oom_message_destroy(ManagedOOMMessage *message) { |
26 | assert(message); | |
27 | free(message->path); | |
28 | free(message->property); | |
9de5e321 AZ |
29 | } |
30 | ||
31 | static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { | |
32 | ManagedOOMMode *mode = userdata, m; | |
33 | const char *s; | |
34 | ||
35 | assert(mode); | |
36 | assert_se(s = json_variant_string(v)); | |
37 | ||
38 | m = managed_oom_mode_from_string(s); | |
39 | if (m < 0) | |
7211c853 | 40 | return json_log(v, flags, m, "%s is not a valid ManagedOOMMode", s); |
9de5e321 AZ |
41 | |
42 | *mode = m; | |
43 | return 0; | |
44 | } | |
45 | ||
064a5c14 | 46 | static int process_managed_oom_message(Manager *m, uid_t uid, JsonVariant *parameters) { |
9de5e321 | 47 | JsonVariant *c, *cgroups; |
71feeae4 | 48 | int r; |
9de5e321 AZ |
49 | |
50 | static const JsonDispatch dispatch_table[] = { | |
71feeae4 DDM |
51 | { "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMMessage, mode), JSON_MANDATORY }, |
52 | { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, path), JSON_MANDATORY }, | |
53 | { "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, property), JSON_MANDATORY }, | |
54 | { "limit", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(ManagedOOMMessage, limit), 0 }, | |
9de5e321 AZ |
55 | {}, |
56 | }; | |
57 | ||
71feeae4 DDM |
58 | assert(m); |
59 | assert(parameters); | |
9de5e321 AZ |
60 | |
61 | cgroups = json_variant_by_key(parameters, "cgroups"); | |
71feeae4 DDM |
62 | if (!cgroups) |
63 | return -EINVAL; | |
9de5e321 AZ |
64 | |
65 | /* Skip malformed elements and keep processing in case the others are good */ | |
66 | JSON_VARIANT_ARRAY_FOREACH(c, cgroups) { | |
71feeae4 | 67 | _cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = {}; |
9de5e321 AZ |
68 | OomdCGroupContext *ctx; |
69 | Hashmap *monitor_hm; | |
70 | loadavg_t limit; | |
9de5e321 AZ |
71 | |
72 | if (!json_variant_is_object(c)) | |
73 | continue; | |
74 | ||
71feeae4 DDM |
75 | r = json_dispatch(c, dispatch_table, NULL, 0, &message); |
76 | if (r == -ENOMEM) | |
77 | return r; | |
78 | if (r < 0) | |
9de5e321 AZ |
79 | continue; |
80 | ||
064a5c14 DDM |
81 | if (uid != 0) { |
82 | uid_t cg_uid; | |
83 | ||
84 | r = cg_path_get_owner_uid(message.path, &cg_uid); | |
85 | if (r < 0) { | |
b6f6df4c | 86 | log_debug_errno(r, "Failed to get cgroup %s owner uid: %m", message.path); |
064a5c14 DDM |
87 | continue; |
88 | } | |
89 | ||
90 | /* Let's not be lenient for permission errors and skip processing if we receive an | |
91 | * update for a cgroup that doesn't belong to the user. */ | |
92 | if (uid != cg_uid) | |
93 | return log_error_errno(SYNTHETIC_ERRNO(EPERM), | |
94 | "cgroup path owner UID does not match sender uid " | |
95 | "(" UID_FMT " != " UID_FMT ")", uid, cg_uid); | |
96 | } | |
97 | ||
71feeae4 | 98 | monitor_hm = streq(message.property, "ManagedOOMSwap") ? |
9de5e321 AZ |
99 | m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts; |
100 | ||
71feeae4 DDM |
101 | if (message.mode == MANAGED_OOM_AUTO) { |
102 | (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(message.path))); | |
9de5e321 AZ |
103 | continue; |
104 | } | |
105 | ||
106 | limit = m->default_mem_pressure_limit; | |
107 | ||
71feeae4 DDM |
108 | if (streq(message.property, "ManagedOOMMemoryPressure") && message.limit > 0) { |
109 | int permyriad = UINT32_SCALE_TO_PERMYRIAD(message.limit); | |
d06e7fb5 | 110 | |
5f1d6ebd | 111 | r = store_loadavg_fixed_point(permyriad / 100LU, permyriad % 100LU, &limit); |
71feeae4 | 112 | if (r < 0) |
9de5e321 | 113 | continue; |
9de5e321 AZ |
114 | } |
115 | ||
71feeae4 DDM |
116 | r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path); |
117 | if (r == -ENOMEM) | |
118 | return r; | |
119 | if (r < 0 && r != -EEXIST) | |
120 | log_debug_errno(r, "Failed to insert message, ignoring: %m"); | |
9de5e321 AZ |
121 | |
122 | /* Always update the limit in case it was changed. For non-memory pressure detection the value is | |
123 | * ignored so always updating it here is not a problem. */ | |
71feeae4 | 124 | ctx = hashmap_get(monitor_hm, empty_to_root(message.path)); |
9de5e321 AZ |
125 | if (ctx) |
126 | ctx->mem_pressure_limit = limit; | |
127 | } | |
128 | ||
71feeae4 DDM |
129 | return 0; |
130 | } | |
131 | ||
064a5c14 DDM |
132 | static int process_managed_oom_request( |
133 | Varlink *link, | |
134 | JsonVariant *parameters, | |
135 | VarlinkMethodFlags flags, | |
136 | void *userdata) { | |
137 | Manager *m = userdata; | |
138 | uid_t uid; | |
139 | int r; | |
140 | ||
141 | assert(m); | |
142 | ||
143 | r = varlink_get_peer_uid(link, &uid); | |
144 | if (r < 0) | |
145 | return log_error_errno(r, "Failed to get varlink peer uid: %m"); | |
146 | ||
147 | return process_managed_oom_message(m, uid, parameters); | |
148 | } | |
149 | ||
71feeae4 DDM |
150 | static int process_managed_oom_reply( |
151 | Varlink *link, | |
152 | JsonVariant *parameters, | |
153 | const char *error_id, | |
154 | VarlinkReplyFlags flags, | |
155 | void *userdata) { | |
156 | Manager *m = userdata; | |
064a5c14 | 157 | uid_t uid; |
71feeae4 DDM |
158 | int r; |
159 | ||
160 | assert(m); | |
161 | ||
162 | if (error_id) { | |
163 | r = -EIO; | |
164 | log_debug("Error getting ManagedOOM cgroups: %s", error_id); | |
165 | goto finish; | |
166 | } | |
167 | ||
064a5c14 DDM |
168 | r = varlink_get_peer_uid(link, &uid); |
169 | if (r < 0) { | |
170 | log_error_errno(r, "Failed to get varlink peer uid: %m"); | |
171 | goto finish; | |
172 | } | |
173 | ||
174 | r = process_managed_oom_message(m, uid, parameters); | |
71feeae4 | 175 | |
9de5e321 AZ |
176 | finish: |
177 | if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES)) | |
064a5c14 | 178 | m->varlink_client = varlink_close_unref(link); |
9de5e321 AZ |
179 | |
180 | return r; | |
181 | } | |
182 | ||
4d620b90 | 183 | /* Fill 'new_h' with 'path's descendant OomdCGroupContexts. Only include descendant cgroups that are possible |
9de5e321 AZ |
184 | * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1". |
185 | * | |
4d620b90 ZJS |
186 | * This function ignores most errors in order to handle cgroups that may have been cleaned up while |
187 | * populating the hashmap. | |
9de5e321 | 188 | * |
4d620b90 | 189 | * 'new_h' is of the form { key: cgroup paths -> value: OomdCGroupContext } */ |
9de5e321 AZ |
190 | static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) { |
191 | _cleanup_free_ char *subpath = NULL; | |
192 | _cleanup_closedir_ DIR *d = NULL; | |
193 | int r; | |
194 | ||
195 | assert(new_h); | |
196 | assert(path); | |
197 | ||
198 | r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d); | |
199 | if (r < 0) | |
200 | return r; | |
201 | ||
202 | r = cg_read_subgroup(d, &subpath); | |
203 | if (r < 0) | |
204 | return r; | |
205 | else if (r == 0) { /* No subgroups? We're a leaf node */ | |
206 | r = oomd_insert_cgroup_context(NULL, new_h, path); | |
77b04c0a AZ |
207 | if (r == -ENOMEM) |
208 | return r; | |
209 | if (r < 0) | |
210 | log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", path); | |
211 | return 0; | |
9de5e321 AZ |
212 | } |
213 | ||
214 | do { | |
215 | _cleanup_free_ char *cg_path = NULL; | |
216 | bool oom_group; | |
217 | ||
218 | cg_path = path_join(empty_to_root(path), subpath); | |
219 | if (!cg_path) | |
220 | return -ENOMEM; | |
221 | ||
222 | subpath = mfree(subpath); | |
223 | ||
224 | r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group); | |
225 | /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */ | |
77b04c0a AZ |
226 | if (r == -ENOMEM) |
227 | return r; | |
228 | if (r < 0) { | |
229 | log_debug_errno(r, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path); | |
230 | return 0; | |
231 | } | |
9de5e321 | 232 | |
349a2003 | 233 | if (oom_group) |
9de5e321 | 234 | r = oomd_insert_cgroup_context(NULL, new_h, cg_path); |
349a2003 | 235 | else |
9de5e321 | 236 | r = recursively_get_cgroup_context(new_h, cg_path); |
349a2003 AZ |
237 | if (r == -ENOMEM) |
238 | return r; | |
77b04c0a AZ |
239 | if (r < 0) |
240 | log_debug_errno(r, "Failed to insert or recursively get from %s, ignoring: %m", cg_path); | |
9de5e321 AZ |
241 | } while ((r = cg_read_subgroup(d, &subpath)) > 0); |
242 | ||
243 | return 0; | |
244 | } | |
245 | ||
246 | static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) { | |
247 | _cleanup_hashmap_free_ Hashmap *new_base = NULL; | |
248 | OomdCGroupContext *ctx; | |
249 | int r; | |
250 | ||
251 | assert(monitored_cgroups); | |
252 | ||
253 | new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops); | |
254 | if (!new_base) | |
255 | return -ENOMEM; | |
256 | ||
257 | HASHMAP_FOREACH(ctx, *monitored_cgroups) { | |
258 | /* Skip most errors since the cgroup we're trying to update might not exist anymore. */ | |
259 | r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path); | |
260 | if (r == -ENOMEM) | |
261 | return r; | |
77b04c0a AZ |
262 | if (r < 0 && !IN_SET(r, -EEXIST, -ENOENT)) |
263 | log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", ctx->path); | |
9de5e321 AZ |
264 | } |
265 | ||
266 | hashmap_free(*monitored_cgroups); | |
267 | *monitored_cgroups = TAKE_PTR(new_base); | |
268 | ||
269 | return 0; | |
270 | } | |
271 | ||
272 | static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) { | |
273 | _cleanup_hashmap_free_ Hashmap *candidates = NULL; | |
274 | OomdCGroupContext *ctx; | |
275 | int r; | |
276 | ||
277 | assert(monitored_cgroups); | |
278 | assert(ret_candidates); | |
279 | ||
280 | candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops); | |
281 | if (!candidates) | |
282 | return -ENOMEM; | |
283 | ||
284 | HASHMAP_FOREACH(ctx, monitored_cgroups) { | |
285 | r = recursively_get_cgroup_context(candidates, ctx->path); | |
286 | if (r == -ENOMEM) | |
287 | return r; | |
77b04c0a AZ |
288 | if (r < 0) |
289 | log_debug_errno(r, "Failed to recursively get contexts for %s, ignoring: %m", ctx->path); | |
9de5e321 AZ |
290 | } |
291 | ||
292 | *ret_candidates = TAKE_PTR(candidates); | |
293 | ||
294 | return 0; | |
295 | } | |
296 | ||
91cbb4bd AZ |
297 | static int update_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **candidates) { |
298 | _cleanup_hashmap_free_ Hashmap *new_candidates = NULL; | |
299 | int r; | |
300 | ||
301 | assert(monitored_cgroups); | |
302 | assert(candidates); | |
303 | assert(*candidates); | |
304 | ||
305 | r = get_monitored_cgroup_contexts_candidates(monitored_cgroups, &new_candidates); | |
306 | if (r < 0) | |
307 | return log_debug_errno(r, "Failed to get candidate contexts: %m"); | |
308 | ||
309 | oomd_update_cgroup_contexts_between_hashmaps(*candidates, new_candidates); | |
310 | ||
311 | hashmap_free(*candidates); | |
312 | *candidates = TAKE_PTR(new_candidates); | |
313 | ||
314 | return 0; | |
315 | } | |
316 | ||
9de5e321 AZ |
317 | static int acquire_managed_oom_connect(Manager *m) { |
318 | _cleanup_(varlink_close_unrefp) Varlink *link = NULL; | |
319 | int r; | |
320 | ||
321 | assert(m); | |
322 | assert(m->event); | |
323 | ||
064a5c14 | 324 | r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM); |
9de5e321 | 325 | if (r < 0) |
064a5c14 | 326 | return log_error_errno(r, "Failed to connect to " VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM ": %m"); |
9de5e321 AZ |
327 | |
328 | (void) varlink_set_userdata(link, m); | |
329 | (void) varlink_set_description(link, "oomd"); | |
330 | (void) varlink_set_relative_timeout(link, USEC_INFINITY); | |
331 | ||
332 | r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL); | |
333 | if (r < 0) | |
334 | return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); | |
335 | ||
336 | r = varlink_bind_reply(link, process_managed_oom_reply); | |
337 | if (r < 0) | |
338 | return log_error_errno(r, "Failed to bind reply callback: %m"); | |
339 | ||
340 | r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL); | |
341 | if (r < 0) | |
342 | return log_error_errno(r, "Failed to observe varlink call: %m"); | |
343 | ||
064a5c14 | 344 | m->varlink_client = TAKE_PTR(link); |
9de5e321 AZ |
345 | return 0; |
346 | } | |
347 | ||
81d66fab | 348 | static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { |
9de5e321 AZ |
349 | Manager *m = userdata; |
350 | usec_t usec_now; | |
351 | int r; | |
352 | ||
353 | assert(s); | |
354 | assert(userdata); | |
355 | ||
356 | /* Reset timer */ | |
357 | r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now); | |
358 | if (r < 0) | |
77b04c0a | 359 | return log_error_errno(r, "Failed to reset event timer: %m"); |
9de5e321 | 360 | |
81d66fab | 361 | r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC); |
9de5e321 | 362 | if (r < 0) |
77b04c0a | 363 | return log_error_errno(r, "Failed to set relative time for timer: %m"); |
9de5e321 AZ |
364 | |
365 | /* Reconnect if our connection dropped */ | |
064a5c14 | 366 | if (!m->varlink_client) { |
9de5e321 AZ |
367 | r = acquire_managed_oom_connect(m); |
368 | if (r < 0) | |
77b04c0a | 369 | return log_error_errno(r, "Failed to acquire varlink connection: %m"); |
9de5e321 AZ |
370 | } |
371 | ||
47136b9d AZ |
372 | /* We still try to acquire system information for oomctl even if no units want swap monitoring */ |
373 | r = oomd_system_context_acquire("/proc/meminfo", &m->system_context); | |
374 | /* If there are no units depending on swap actions, the only error we exit on is ENOMEM. */ | |
375 | if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts))) | |
81d66fab AZ |
376 | return log_error_errno(r, "Failed to acquire system context: %m"); |
377 | ||
378 | /* Return early if nothing is requesting swap monitoring */ | |
379 | if (hashmap_isempty(m->monitored_swap_cgroup_contexts)) | |
380 | return 0; | |
381 | ||
382 | /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the | |
383 | * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts | |
384 | * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent | |
385 | * nodes are the ones that matter). */ | |
386 | ||
030bc91c NR |
387 | /* Check amount of memory available and swap free so we don't free up swap when memory is still available. */ |
388 | if (oomd_mem_available_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) && | |
cb5ce676 | 389 | oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) { |
81d66fab AZ |
390 | _cleanup_hashmap_free_ Hashmap *candidates = NULL; |
391 | _cleanup_free_ char *selected = NULL; | |
685b0985 | 392 | uint64_t threshold; |
81d66fab | 393 | |
cb5ce676 AZ |
394 | log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and " |
395 | "swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR, | |
396 | m->system_context.mem_used, m->system_context.mem_total, | |
81d66fab AZ |
397 | m->system_context.swap_used, m->system_context.swap_total, |
398 | PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad)); | |
399 | ||
400 | r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates); | |
401 | if (r == -ENOMEM) | |
402 | return log_oom(); | |
403 | if (r < 0) | |
404 | log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m"); | |
405 | ||
685b0985 AZ |
406 | threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100; |
407 | r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected); | |
81d66fab AZ |
408 | if (r == -ENOMEM) |
409 | return log_oom(); | |
410 | if (r < 0) | |
411 | log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m"); | |
412 | else { | |
d784a8d4 | 413 | if (selected && r > 0) { |
cb5ce676 AZ |
414 | log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and " |
415 | "swap used (%"PRIu64") / total (%"PRIu64") being more than " | |
81d66fab | 416 | PERMYRIAD_AS_PERCENT_FORMAT_STR, |
cb5ce676 AZ |
417 | selected, |
418 | m->system_context.mem_used, m->system_context.mem_total, | |
419 | m->system_context.swap_used, m->system_context.swap_total, | |
81d66fab | 420 | PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad)); |
d784a8d4 OS |
421 | |
422 | /* send dbus signal */ | |
423 | (void) sd_bus_emit_signal(m->bus, | |
424 | "/org/freedesktop/oom1", | |
425 | "org.freedesktop.oom1.Manager", | |
426 | "Killed", | |
427 | "ss", | |
428 | selected, | |
429 | "memory-used"); | |
430 | } | |
81d66fab AZ |
431 | return 0; |
432 | } | |
433 | } | |
434 | ||
435 | return 0; | |
436 | } | |
437 | ||
cb13961a AZ |
438 | static void clear_candidate_hashmapp(Manager **m) { |
439 | if (*m) | |
440 | hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates); | |
441 | } | |
442 | ||
81d66fab | 443 | static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { |
cb13961a AZ |
444 | /* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we |
445 | * update the candidate data (in which case clear_candidates will be NULL). */ | |
d7ac0952 | 446 | _unused_ _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata; |
81d66fab | 447 | _cleanup_set_free_ Set *targets = NULL; |
cb13961a | 448 | bool in_post_action_delay = false; |
81d66fab AZ |
449 | Manager *m = userdata; |
450 | usec_t usec_now; | |
451 | int r; | |
452 | ||
453 | assert(s); | |
454 | assert(userdata); | |
455 | ||
456 | /* Reset timer */ | |
457 | r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now); | |
77b04c0a | 458 | if (r < 0) |
81d66fab AZ |
459 | return log_error_errno(r, "Failed to reset event timer: %m"); |
460 | ||
461 | r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC); | |
462 | if (r < 0) | |
463 | return log_error_errno(r, "Failed to set relative time for timer: %m"); | |
464 | ||
465 | /* Reconnect if our connection dropped */ | |
064a5c14 | 466 | if (!m->varlink_client) { |
81d66fab AZ |
467 | r = acquire_managed_oom_connect(m); |
468 | if (r < 0) | |
469 | return log_error_errno(r, "Failed to acquire varlink connection: %m"); | |
470 | } | |
9de5e321 | 471 | |
81d66fab | 472 | /* Return early if nothing is requesting memory pressure monitoring */ |
cb13961a | 473 | if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts)) |
81d66fab | 474 | return 0; |
81d66fab AZ |
475 | |
476 | /* Update the cgroups used for detection/action */ | |
9de5e321 AZ |
477 | r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts); |
478 | if (r == -ENOMEM) | |
77b04c0a AZ |
479 | return log_oom(); |
480 | if (r < 0) | |
481 | log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m"); | |
9de5e321 | 482 | |
81d66fab AZ |
483 | /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale |
484 | * values and go on a kill storm. */ | |
485 | if (m->mem_pressure_post_action_delay_start > 0) { | |
486 | if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now) | |
cb13961a | 487 | in_post_action_delay = true; |
9de5e321 | 488 | else |
81d66fab | 489 | m->mem_pressure_post_action_delay_start = 0; |
9de5e321 AZ |
490 | } |
491 | ||
c20aa7b1 | 492 | r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets); |
9de5e321 | 493 | if (r == -ENOMEM) |
77b04c0a AZ |
494 | return log_oom(); |
495 | if (r < 0) | |
496 | log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m"); | |
cb13961a | 497 | else if (r == 1 && !in_post_action_delay) { |
df637ede AZ |
498 | OomdCGroupContext *t; |
499 | SET_FOREACH(t, targets) { | |
500 | _cleanup_free_ char *selected = NULL; | |
df637ede AZ |
501 | |
502 | /* Check if there was reclaim activity in the given interval. The concern is the following case: | |
503 | * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending | |
504 | * cgroup. Even after this, well-behaved processes will fault in recently resident pages and | |
505 | * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need | |
506 | * to kill something (it won't help anyways). */ | |
507 | if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC) | |
508 | continue; | |
509 | ||
510 | log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity", | |
511 | t->path, | |
3542da24 LB |
512 | LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10), |
513 | LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit), | |
5291f26d | 514 | FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); |
df637ede | 515 | |
cb13961a AZ |
516 | r = update_monitored_cgroup_contexts_candidates( |
517 | m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); | |
518 | if (r == -ENOMEM) | |
519 | return log_oom(); | |
520 | if (r < 0) | |
521 | log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); | |
522 | else | |
523 | clear_candidates = NULL; | |
524 | ||
df637ede AZ |
525 | r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected); |
526 | if (r == -ENOMEM) | |
527 | return log_oom(); | |
528 | if (r < 0) | |
529 | log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path); | |
530 | else { | |
914d4e99 AZ |
531 | /* Don't act on all the high pressure cgroups at once; return as soon as we kill one. |
532 | * If r == 0 then it means there were not eligible candidates, the candidate cgroup | |
533 | * disappeared, or the candidate cgroup has no processes by the time we tried to kill | |
534 | * it. In either case, go through the event loop again and select a new candidate if | |
535 | * pressure is still high. */ | |
df637ede | 536 | m->mem_pressure_post_action_delay_start = usec_now; |
d784a8d4 | 537 | if (selected && r > 0) { |
df637ede AZ |
538 | log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%" |
539 | " for > %s with reclaim activity", | |
540 | selected, t->path, | |
3542da24 LB |
541 | LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10), |
542 | LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit), | |
5291f26d | 543 | FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); |
d784a8d4 OS |
544 | |
545 | /* send dbus signal */ | |
546 | (void) sd_bus_emit_signal(m->bus, | |
547 | "/org/freedesktop/oom1", | |
548 | "org.freedesktop.oom1.Manager", | |
549 | "Killed", | |
550 | "ss", | |
551 | selected, | |
552 | "memory-pressure"); | |
553 | } | |
df637ede | 554 | return 0; |
9de5e321 AZ |
555 | } |
556 | } | |
cb13961a AZ |
557 | } else { |
558 | /* If any monitored cgroup is over their pressure limit, get all the kill candidates for every | |
559 | * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill | |
560 | * might happen. | |
561 | * Candidate cgroup data will continue to get updated during the post-action delay period in case | |
562 | * pressure continues to be high after a kill. */ | |
563 | OomdCGroupContext *c; | |
564 | HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) { | |
565 | if (c->mem_pressure_limit_hit_start == 0) | |
566 | continue; | |
567 | ||
568 | r = update_monitored_cgroup_contexts_candidates( | |
569 | m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); | |
570 | if (r == -ENOMEM) | |
571 | return log_oom(); | |
572 | if (r < 0) | |
573 | log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); | |
574 | else { | |
575 | clear_candidates = NULL; | |
576 | break; | |
577 | } | |
578 | } | |
9de5e321 AZ |
579 | } |
580 | ||
81d66fab AZ |
581 | return 0; |
582 | } | |
9de5e321 | 583 | |
81d66fab AZ |
584 | static int monitor_swap_contexts(Manager *m) { |
585 | _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; | |
586 | int r; | |
9de5e321 | 587 | |
81d66fab AZ |
588 | assert(m); |
589 | assert(m->event); | |
9de5e321 | 590 | |
81d66fab AZ |
591 | r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m); |
592 | if (r < 0) | |
593 | return r; | |
594 | ||
595 | r = sd_event_source_set_exit_on_failure(s, true); | |
596 | if (r < 0) | |
597 | return r; | |
9de5e321 | 598 | |
81d66fab AZ |
599 | r = sd_event_source_set_enabled(s, SD_EVENT_ON); |
600 | if (r < 0) | |
601 | return r; | |
602 | ||
603 | (void) sd_event_source_set_description(s, "oomd-swap-timer"); | |
604 | ||
605 | m->swap_context_event_source = TAKE_PTR(s); | |
9de5e321 AZ |
606 | return 0; |
607 | } | |
608 | ||
81d66fab | 609 | static int monitor_memory_pressure_contexts(Manager *m) { |
9de5e321 AZ |
610 | _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; |
611 | int r; | |
612 | ||
613 | assert(m); | |
614 | assert(m->event); | |
615 | ||
81d66fab | 616 | r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m); |
9de5e321 AZ |
617 | if (r < 0) |
618 | return r; | |
619 | ||
620 | r = sd_event_source_set_exit_on_failure(s, true); | |
621 | if (r < 0) | |
622 | return r; | |
623 | ||
624 | r = sd_event_source_set_enabled(s, SD_EVENT_ON); | |
625 | if (r < 0) | |
626 | return r; | |
627 | ||
81d66fab | 628 | (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer"); |
9de5e321 | 629 | |
81d66fab | 630 | m->mem_pressure_context_event_source = TAKE_PTR(s); |
9de5e321 AZ |
631 | return 0; |
632 | } | |
633 | ||
75db809a | 634 | Manager* manager_free(Manager *m) { |
9de5e321 AZ |
635 | assert(m); |
636 | ||
064a5c14 DDM |
637 | varlink_server_unref(m->varlink_server); |
638 | varlink_close_unref(m->varlink_client); | |
81d66fab AZ |
639 | sd_event_source_unref(m->swap_context_event_source); |
640 | sd_event_source_unref(m->mem_pressure_context_event_source); | |
9de5e321 AZ |
641 | sd_event_unref(m->event); |
642 | ||
5c616ecf AZ |
643 | bus_verify_polkit_async_registry_free(m->polkit_registry); |
644 | sd_bus_flush_close_unref(m->bus); | |
645 | ||
9de5e321 AZ |
646 | hashmap_free(m->monitored_swap_cgroup_contexts); |
647 | hashmap_free(m->monitored_mem_pressure_cgroup_contexts); | |
91cbb4bd | 648 | hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates); |
9de5e321 | 649 | |
75db809a | 650 | return mfree(m); |
9de5e321 AZ |
651 | } |
652 | ||
653 | int manager_new(Manager **ret) { | |
654 | _cleanup_(manager_freep) Manager *m = NULL; | |
655 | int r; | |
656 | ||
657 | assert(ret); | |
658 | ||
659 | m = new0(Manager, 1); | |
660 | if (!m) | |
661 | return -ENOMEM; | |
662 | ||
663 | r = sd_event_default(&m->event); | |
664 | if (r < 0) | |
665 | return r; | |
666 | ||
667 | (void) sd_event_set_watchdog(m->event, true); | |
668 | ||
669 | r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL); | |
670 | if (r < 0) | |
671 | return r; | |
672 | ||
673 | r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL); | |
674 | if (r < 0) | |
675 | return r; | |
676 | ||
677 | m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops); | |
678 | if (!m->monitored_swap_cgroup_contexts) | |
679 | return -ENOMEM; | |
680 | ||
681 | m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops); | |
682 | if (!m->monitored_mem_pressure_cgroup_contexts) | |
683 | return -ENOMEM; | |
684 | ||
91cbb4bd AZ |
685 | m->monitored_mem_pressure_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops); |
686 | if (!m->monitored_mem_pressure_cgroup_contexts_candidates) | |
687 | return -ENOMEM; | |
688 | ||
9de5e321 AZ |
689 | *ret = TAKE_PTR(m); |
690 | return 0; | |
691 | } | |
692 | ||
5c616ecf AZ |
693 | static int manager_connect_bus(Manager *m) { |
694 | int r; | |
695 | ||
696 | assert(m); | |
697 | assert(!m->bus); | |
698 | ||
699 | r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom"); | |
700 | if (r < 0) | |
701 | return log_error_errno(r, "Failed to connect to bus: %m"); | |
702 | ||
c9a00f5a | 703 | r = bus_add_implementation(m->bus, &manager_object, m); |
5c616ecf | 704 | if (r < 0) |
c9a00f5a | 705 | return r; |
5c616ecf AZ |
706 | |
707 | r = bus_log_control_api_register(m->bus); | |
708 | if (r < 0) | |
709 | return r; | |
710 | ||
711 | r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL); | |
712 | if (r < 0) | |
713 | return log_error_errno(r, "Failed to request name: %m"); | |
714 | ||
715 | r = sd_bus_attach_event(m->bus, m->event, 0); | |
716 | if (r < 0) | |
717 | return log_error_errno(r, "Failed to attach bus to event loop: %m"); | |
718 | ||
719 | return 0; | |
720 | } | |
721 | ||
064a5c14 DDM |
722 | static int manager_varlink_init(Manager *m, int fd) { |
723 | _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; | |
724 | int r; | |
725 | ||
726 | assert(m); | |
727 | assert(!m->varlink_server); | |
728 | ||
729 | r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA); | |
730 | if (r < 0) | |
731 | return log_error_errno(r, "Failed to allocate varlink server object: %m"); | |
732 | ||
733 | varlink_server_set_userdata(s, m); | |
734 | ||
735 | r = varlink_server_bind_method(s, "io.systemd.oom.ReportManagedOOMCGroups", process_managed_oom_request); | |
736 | if (r < 0) | |
737 | return log_error_errno(r, "Failed to register varlink method: %m"); | |
738 | ||
739 | if (fd < 0) | |
740 | r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM_USER, 0666); | |
741 | else | |
742 | r = varlink_server_listen_fd(s, fd); | |
743 | if (r < 0) | |
744 | return log_error_errno(r, "Failed to bind to varlink socket: %m"); | |
745 | ||
746 | r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL); | |
747 | if (r < 0) | |
748 | return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); | |
749 | ||
750 | log_debug("Initialized systemd-oomd varlink server"); | |
751 | ||
752 | m->varlink_server = TAKE_PTR(s); | |
753 | return 0; | |
754 | } | |
755 | ||
d06e7fb5 LP |
756 | int manager_start( |
757 | Manager *m, | |
758 | bool dry_run, | |
759 | int swap_used_limit_permyriad, | |
760 | int mem_pressure_limit_permyriad, | |
064a5c14 DDM |
761 | usec_t mem_pressure_usec, |
762 | int fd) { | |
d06e7fb5 | 763 | |
0a9f9344 | 764 | unsigned long l, f; |
9de5e321 AZ |
765 | int r; |
766 | ||
767 | assert(m); | |
768 | ||
769 | m->dry_run = dry_run; | |
770 | ||
d06e7fb5 LP |
771 | m->swap_used_limit_permyriad = swap_used_limit_permyriad >= 0 ? swap_used_limit_permyriad : DEFAULT_SWAP_USED_LIMIT_PERCENT * 100; |
772 | assert(m->swap_used_limit_permyriad <= 10000); | |
9de5e321 | 773 | |
d06e7fb5 | 774 | if (mem_pressure_limit_permyriad >= 0) { |
0a9f9344 AZ |
775 | assert(mem_pressure_limit_permyriad <= 10000); |
776 | ||
777 | l = mem_pressure_limit_permyriad / 100; | |
778 | f = mem_pressure_limit_permyriad % 100; | |
779 | } else { | |
780 | l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT; | |
781 | f = 0; | |
782 | } | |
783 | r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit); | |
9de5e321 AZ |
784 | if (r < 0) |
785 | return r; | |
786 | ||
c20aa7b1 AZ |
787 | m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC; |
788 | ||
5c616ecf AZ |
789 | r = manager_connect_bus(m); |
790 | if (r < 0) | |
791 | return r; | |
792 | ||
9de5e321 AZ |
793 | r = acquire_managed_oom_connect(m); |
794 | if (r < 0) | |
795 | return r; | |
796 | ||
064a5c14 DDM |
797 | r = manager_varlink_init(m, fd); |
798 | if (r < 0) | |
799 | return r; | |
800 | ||
81d66fab AZ |
801 | r = monitor_memory_pressure_contexts(m); |
802 | if (r < 0) | |
803 | return r; | |
804 | ||
805 | r = monitor_swap_contexts(m); | |
9de5e321 AZ |
806 | if (r < 0) |
807 | return r; | |
808 | ||
809 | return 0; | |
810 | } | |
5c616ecf AZ |
811 | |
812 | int manager_get_dump_string(Manager *m, char **ret) { | |
813 | _cleanup_free_ char *dump = NULL; | |
814 | _cleanup_fclose_ FILE *f = NULL; | |
815 | OomdCGroupContext *c; | |
816 | size_t size; | |
817 | char *key; | |
818 | int r; | |
819 | ||
820 | assert(m); | |
821 | assert(ret); | |
822 | ||
823 | f = open_memstream_unlocked(&dump, &size); | |
824 | if (!f) | |
825 | return -errno; | |
826 | ||
827 | fprintf(f, | |
828 | "Dry Run: %s\n" | |
d06e7fb5 | 829 | "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n" |
0a9f9344 | 830 | "Default Memory Pressure Limit: %lu.%02lu%%\n" |
c20aa7b1 | 831 | "Default Memory Pressure Duration: %s\n" |
5c616ecf AZ |
832 | "System Context:\n", |
833 | yes_no(m->dry_run), | |
d06e7fb5 | 834 | PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad), |
3542da24 | 835 | LOADAVG_INT_SIDE(m->default_mem_pressure_limit), LOADAVG_DECIMAL_SIDE(m->default_mem_pressure_limit), |
5291f26d | 836 | FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); |
5c616ecf AZ |
837 | oomd_dump_system_context(&m->system_context, f, "\t"); |
838 | ||
839 | fprintf(f, "Swap Monitored CGroups:\n"); | |
840 | HASHMAP_FOREACH_KEY(c, key, m->monitored_swap_cgroup_contexts) | |
841 | oomd_dump_swap_cgroup_context(c, f, "\t"); | |
842 | ||
843 | fprintf(f, "Memory Pressure Monitored CGroups:\n"); | |
844 | HASHMAP_FOREACH_KEY(c, key, m->monitored_mem_pressure_cgroup_contexts) | |
845 | oomd_dump_memory_pressure_cgroup_context(c, f, "\t"); | |
846 | ||
847 | r = fflush_and_check(f); | |
848 | if (r < 0) | |
849 | return r; | |
850 | ||
851 | f = safe_fclose(f); | |
852 | ||
853 | *ret = TAKE_PTR(dump); | |
854 | return 0; | |
855 | } |