]>
Commit | Line | Data |
---|---|---|
db9ecf05 | 1 | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
9de5e321 | 2 | |
064a5c14 DDM |
3 | #include "sd-daemon.h" |
4 | ||
5c616ecf AZ |
5 | #include "bus-log-control-api.h" |
6 | #include "bus-util.h" | |
7 | #include "bus-polkit.h" | |
9de5e321 AZ |
8 | #include "cgroup-util.h" |
9 | #include "fd-util.h" | |
10 | #include "fileio.h" | |
064a5c14 | 11 | #include "format-util.h" |
408a3bbd | 12 | #include "memory-util.h" |
5c616ecf | 13 | #include "oomd-manager-bus.h" |
9de5e321 AZ |
14 | #include "oomd-manager.h" |
15 | #include "path-util.h" | |
d9d3f05d | 16 | #include "percent-util.h" |
9de5e321 | 17 | |
71feeae4 | 18 | typedef struct ManagedOOMMessage { |
9de5e321 AZ |
19 | ManagedOOMMode mode; |
20 | char *path; | |
21 | char *property; | |
d06e7fb5 | 22 | uint32_t limit; |
71feeae4 | 23 | } ManagedOOMMessage; |
9de5e321 | 24 | |
71feeae4 DDM |
25 | static void managed_oom_message_destroy(ManagedOOMMessage *message) { |
26 | assert(message); | |
27 | free(message->path); | |
28 | free(message->property); | |
9de5e321 AZ |
29 | } |
30 | ||
31 | static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { | |
32 | ManagedOOMMode *mode = userdata, m; | |
33 | const char *s; | |
34 | ||
35 | assert(mode); | |
36 | assert_se(s = json_variant_string(v)); | |
37 | ||
38 | m = managed_oom_mode_from_string(s); | |
39 | if (m < 0) | |
7211c853 | 40 | return json_log(v, flags, m, "%s is not a valid ManagedOOMMode", s); |
9de5e321 AZ |
41 | |
42 | *mode = m; | |
43 | return 0; | |
44 | } | |
45 | ||
064a5c14 | 46 | static int process_managed_oom_message(Manager *m, uid_t uid, JsonVariant *parameters) { |
9de5e321 | 47 | JsonVariant *c, *cgroups; |
71feeae4 | 48 | int r; |
9de5e321 AZ |
49 | |
50 | static const JsonDispatch dispatch_table[] = { | |
71feeae4 DDM |
51 | { "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMMessage, mode), JSON_MANDATORY }, |
52 | { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, path), JSON_MANDATORY }, | |
53 | { "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, property), JSON_MANDATORY }, | |
54 | { "limit", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(ManagedOOMMessage, limit), 0 }, | |
9de5e321 AZ |
55 | {}, |
56 | }; | |
57 | ||
71feeae4 DDM |
58 | assert(m); |
59 | assert(parameters); | |
9de5e321 AZ |
60 | |
61 | cgroups = json_variant_by_key(parameters, "cgroups"); | |
71feeae4 DDM |
62 | if (!cgroups) |
63 | return -EINVAL; | |
9de5e321 AZ |
64 | |
65 | /* Skip malformed elements and keep processing in case the others are good */ | |
66 | JSON_VARIANT_ARRAY_FOREACH(c, cgroups) { | |
71feeae4 | 67 | _cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = {}; |
9de5e321 AZ |
68 | OomdCGroupContext *ctx; |
69 | Hashmap *monitor_hm; | |
70 | loadavg_t limit; | |
9de5e321 AZ |
71 | |
72 | if (!json_variant_is_object(c)) | |
73 | continue; | |
74 | ||
71feeae4 DDM |
75 | r = json_dispatch(c, dispatch_table, NULL, 0, &message); |
76 | if (r == -ENOMEM) | |
77 | return r; | |
78 | if (r < 0) | |
9de5e321 AZ |
79 | continue; |
80 | ||
064a5c14 DDM |
81 | if (uid != 0) { |
82 | uid_t cg_uid; | |
83 | ||
84 | r = cg_path_get_owner_uid(message.path, &cg_uid); | |
85 | if (r < 0) { | |
b6f6df4c | 86 | log_debug_errno(r, "Failed to get cgroup %s owner uid: %m", message.path); |
064a5c14 DDM |
87 | continue; |
88 | } | |
89 | ||
90 | /* Let's not be lenient for permission errors and skip processing if we receive an | |
91 | * update for a cgroup that doesn't belong to the user. */ | |
92 | if (uid != cg_uid) | |
93 | return log_error_errno(SYNTHETIC_ERRNO(EPERM), | |
94 | "cgroup path owner UID does not match sender uid " | |
95 | "(" UID_FMT " != " UID_FMT ")", uid, cg_uid); | |
96 | } | |
97 | ||
71feeae4 | 98 | monitor_hm = streq(message.property, "ManagedOOMSwap") ? |
9de5e321 AZ |
99 | m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts; |
100 | ||
71feeae4 DDM |
101 | if (message.mode == MANAGED_OOM_AUTO) { |
102 | (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(message.path))); | |
9de5e321 AZ |
103 | continue; |
104 | } | |
105 | ||
106 | limit = m->default_mem_pressure_limit; | |
107 | ||
71feeae4 DDM |
108 | if (streq(message.property, "ManagedOOMMemoryPressure") && message.limit > 0) { |
109 | int permyriad = UINT32_SCALE_TO_PERMYRIAD(message.limit); | |
d06e7fb5 | 110 | |
5f1d6ebd | 111 | r = store_loadavg_fixed_point(permyriad / 100LU, permyriad % 100LU, &limit); |
71feeae4 | 112 | if (r < 0) |
9de5e321 | 113 | continue; |
9de5e321 AZ |
114 | } |
115 | ||
71feeae4 DDM |
116 | r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path); |
117 | if (r == -ENOMEM) | |
118 | return r; | |
119 | if (r < 0 && r != -EEXIST) | |
120 | log_debug_errno(r, "Failed to insert message, ignoring: %m"); | |
9de5e321 AZ |
121 | |
122 | /* Always update the limit in case it was changed. For non-memory pressure detection the value is | |
123 | * ignored so always updating it here is not a problem. */ | |
71feeae4 | 124 | ctx = hashmap_get(monitor_hm, empty_to_root(message.path)); |
9de5e321 AZ |
125 | if (ctx) |
126 | ctx->mem_pressure_limit = limit; | |
127 | } | |
128 | ||
71feeae4 DDM |
129 | return 0; |
130 | } | |
131 | ||
064a5c14 DDM |
132 | static int process_managed_oom_request( |
133 | Varlink *link, | |
134 | JsonVariant *parameters, | |
135 | VarlinkMethodFlags flags, | |
136 | void *userdata) { | |
99534007 | 137 | Manager *m = ASSERT_PTR(userdata); |
064a5c14 DDM |
138 | uid_t uid; |
139 | int r; | |
140 | ||
064a5c14 DDM |
141 | r = varlink_get_peer_uid(link, &uid); |
142 | if (r < 0) | |
143 | return log_error_errno(r, "Failed to get varlink peer uid: %m"); | |
144 | ||
145 | return process_managed_oom_message(m, uid, parameters); | |
146 | } | |
147 | ||
71feeae4 DDM |
148 | static int process_managed_oom_reply( |
149 | Varlink *link, | |
150 | JsonVariant *parameters, | |
151 | const char *error_id, | |
152 | VarlinkReplyFlags flags, | |
153 | void *userdata) { | |
99534007 | 154 | Manager *m = ASSERT_PTR(userdata); |
064a5c14 | 155 | uid_t uid; |
71feeae4 DDM |
156 | int r; |
157 | ||
71feeae4 DDM |
158 | if (error_id) { |
159 | r = -EIO; | |
160 | log_debug("Error getting ManagedOOM cgroups: %s", error_id); | |
161 | goto finish; | |
162 | } | |
163 | ||
064a5c14 DDM |
164 | r = varlink_get_peer_uid(link, &uid); |
165 | if (r < 0) { | |
166 | log_error_errno(r, "Failed to get varlink peer uid: %m"); | |
167 | goto finish; | |
168 | } | |
169 | ||
170 | r = process_managed_oom_message(m, uid, parameters); | |
71feeae4 | 171 | |
9de5e321 AZ |
172 | finish: |
173 | if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES)) | |
064a5c14 | 174 | m->varlink_client = varlink_close_unref(link); |
9de5e321 AZ |
175 | |
176 | return r; | |
177 | } | |
178 | ||
4d620b90 | 179 | /* Fill 'new_h' with 'path's descendant OomdCGroupContexts. Only include descendant cgroups that are possible |
9de5e321 AZ |
180 | * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1". |
181 | * | |
4d620b90 ZJS |
182 | * This function ignores most errors in order to handle cgroups that may have been cleaned up while |
183 | * populating the hashmap. | |
9de5e321 | 184 | * |
4d620b90 | 185 | * 'new_h' is of the form { key: cgroup paths -> value: OomdCGroupContext } */ |
9de5e321 AZ |
186 | static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) { |
187 | _cleanup_free_ char *subpath = NULL; | |
188 | _cleanup_closedir_ DIR *d = NULL; | |
189 | int r; | |
190 | ||
191 | assert(new_h); | |
192 | assert(path); | |
193 | ||
194 | r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d); | |
195 | if (r < 0) | |
196 | return r; | |
197 | ||
198 | r = cg_read_subgroup(d, &subpath); | |
199 | if (r < 0) | |
200 | return r; | |
201 | else if (r == 0) { /* No subgroups? We're a leaf node */ | |
202 | r = oomd_insert_cgroup_context(NULL, new_h, path); | |
77b04c0a AZ |
203 | if (r == -ENOMEM) |
204 | return r; | |
205 | if (r < 0) | |
206 | log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", path); | |
207 | return 0; | |
9de5e321 AZ |
208 | } |
209 | ||
210 | do { | |
211 | _cleanup_free_ char *cg_path = NULL; | |
212 | bool oom_group; | |
213 | ||
214 | cg_path = path_join(empty_to_root(path), subpath); | |
215 | if (!cg_path) | |
216 | return -ENOMEM; | |
217 | ||
218 | subpath = mfree(subpath); | |
219 | ||
220 | r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group); | |
221 | /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */ | |
77b04c0a AZ |
222 | if (r == -ENOMEM) |
223 | return r; | |
224 | if (r < 0) { | |
225 | log_debug_errno(r, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path); | |
226 | return 0; | |
227 | } | |
9de5e321 | 228 | |
349a2003 | 229 | if (oom_group) |
9de5e321 | 230 | r = oomd_insert_cgroup_context(NULL, new_h, cg_path); |
349a2003 | 231 | else |
9de5e321 | 232 | r = recursively_get_cgroup_context(new_h, cg_path); |
349a2003 AZ |
233 | if (r == -ENOMEM) |
234 | return r; | |
77b04c0a AZ |
235 | if (r < 0) |
236 | log_debug_errno(r, "Failed to insert or recursively get from %s, ignoring: %m", cg_path); | |
9de5e321 AZ |
237 | } while ((r = cg_read_subgroup(d, &subpath)) > 0); |
238 | ||
239 | return 0; | |
240 | } | |
241 | ||
242 | static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) { | |
243 | _cleanup_hashmap_free_ Hashmap *new_base = NULL; | |
244 | OomdCGroupContext *ctx; | |
245 | int r; | |
246 | ||
247 | assert(monitored_cgroups); | |
248 | ||
249 | new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops); | |
250 | if (!new_base) | |
251 | return -ENOMEM; | |
252 | ||
253 | HASHMAP_FOREACH(ctx, *monitored_cgroups) { | |
254 | /* Skip most errors since the cgroup we're trying to update might not exist anymore. */ | |
255 | r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path); | |
256 | if (r == -ENOMEM) | |
257 | return r; | |
77b04c0a AZ |
258 | if (r < 0 && !IN_SET(r, -EEXIST, -ENOENT)) |
259 | log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", ctx->path); | |
9de5e321 AZ |
260 | } |
261 | ||
262 | hashmap_free(*monitored_cgroups); | |
263 | *monitored_cgroups = TAKE_PTR(new_base); | |
264 | ||
265 | return 0; | |
266 | } | |
267 | ||
268 | static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) { | |
269 | _cleanup_hashmap_free_ Hashmap *candidates = NULL; | |
270 | OomdCGroupContext *ctx; | |
271 | int r; | |
272 | ||
273 | assert(monitored_cgroups); | |
274 | assert(ret_candidates); | |
275 | ||
276 | candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops); | |
277 | if (!candidates) | |
278 | return -ENOMEM; | |
279 | ||
280 | HASHMAP_FOREACH(ctx, monitored_cgroups) { | |
281 | r = recursively_get_cgroup_context(candidates, ctx->path); | |
282 | if (r == -ENOMEM) | |
283 | return r; | |
77b04c0a AZ |
284 | if (r < 0) |
285 | log_debug_errno(r, "Failed to recursively get contexts for %s, ignoring: %m", ctx->path); | |
9de5e321 AZ |
286 | } |
287 | ||
288 | *ret_candidates = TAKE_PTR(candidates); | |
289 | ||
290 | return 0; | |
291 | } | |
292 | ||
91cbb4bd AZ |
293 | static int update_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **candidates) { |
294 | _cleanup_hashmap_free_ Hashmap *new_candidates = NULL; | |
295 | int r; | |
296 | ||
297 | assert(monitored_cgroups); | |
298 | assert(candidates); | |
299 | assert(*candidates); | |
300 | ||
301 | r = get_monitored_cgroup_contexts_candidates(monitored_cgroups, &new_candidates); | |
302 | if (r < 0) | |
303 | return log_debug_errno(r, "Failed to get candidate contexts: %m"); | |
304 | ||
305 | oomd_update_cgroup_contexts_between_hashmaps(*candidates, new_candidates); | |
306 | ||
307 | hashmap_free(*candidates); | |
308 | *candidates = TAKE_PTR(new_candidates); | |
309 | ||
310 | return 0; | |
311 | } | |
312 | ||
9de5e321 AZ |
313 | static int acquire_managed_oom_connect(Manager *m) { |
314 | _cleanup_(varlink_close_unrefp) Varlink *link = NULL; | |
315 | int r; | |
316 | ||
317 | assert(m); | |
318 | assert(m->event); | |
319 | ||
064a5c14 | 320 | r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM); |
9de5e321 | 321 | if (r < 0) |
064a5c14 | 322 | return log_error_errno(r, "Failed to connect to " VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM ": %m"); |
9de5e321 AZ |
323 | |
324 | (void) varlink_set_userdata(link, m); | |
325 | (void) varlink_set_description(link, "oomd"); | |
326 | (void) varlink_set_relative_timeout(link, USEC_INFINITY); | |
327 | ||
328 | r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL); | |
329 | if (r < 0) | |
330 | return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); | |
331 | ||
332 | r = varlink_bind_reply(link, process_managed_oom_reply); | |
333 | if (r < 0) | |
334 | return log_error_errno(r, "Failed to bind reply callback: %m"); | |
335 | ||
336 | r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL); | |
337 | if (r < 0) | |
338 | return log_error_errno(r, "Failed to observe varlink call: %m"); | |
339 | ||
064a5c14 | 340 | m->varlink_client = TAKE_PTR(link); |
9de5e321 AZ |
341 | return 0; |
342 | } | |
343 | ||
81d66fab | 344 | static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { |
99534007 | 345 | Manager *m = ASSERT_PTR(userdata); |
9de5e321 AZ |
346 | usec_t usec_now; |
347 | int r; | |
348 | ||
349 | assert(s); | |
9de5e321 AZ |
350 | |
351 | /* Reset timer */ | |
352 | r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now); | |
353 | if (r < 0) | |
77b04c0a | 354 | return log_error_errno(r, "Failed to reset event timer: %m"); |
9de5e321 | 355 | |
81d66fab | 356 | r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC); |
9de5e321 | 357 | if (r < 0) |
77b04c0a | 358 | return log_error_errno(r, "Failed to set relative time for timer: %m"); |
9de5e321 AZ |
359 | |
360 | /* Reconnect if our connection dropped */ | |
064a5c14 | 361 | if (!m->varlink_client) { |
9de5e321 AZ |
362 | r = acquire_managed_oom_connect(m); |
363 | if (r < 0) | |
77b04c0a | 364 | return log_error_errno(r, "Failed to acquire varlink connection: %m"); |
9de5e321 AZ |
365 | } |
366 | ||
47136b9d AZ |
367 | /* We still try to acquire system information for oomctl even if no units want swap monitoring */ |
368 | r = oomd_system_context_acquire("/proc/meminfo", &m->system_context); | |
369 | /* If there are no units depending on swap actions, the only error we exit on is ENOMEM. */ | |
370 | if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts))) | |
81d66fab AZ |
371 | return log_error_errno(r, "Failed to acquire system context: %m"); |
372 | ||
373 | /* Return early if nothing is requesting swap monitoring */ | |
374 | if (hashmap_isempty(m->monitored_swap_cgroup_contexts)) | |
375 | return 0; | |
376 | ||
377 | /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the | |
378 | * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts | |
379 | * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent | |
380 | * nodes are the ones that matter). */ | |
381 | ||
030bc91c NR |
382 | /* Check amount of memory available and swap free so we don't free up swap when memory is still available. */ |
383 | if (oomd_mem_available_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) && | |
cb5ce676 | 384 | oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) { |
81d66fab AZ |
385 | _cleanup_hashmap_free_ Hashmap *candidates = NULL; |
386 | _cleanup_free_ char *selected = NULL; | |
685b0985 | 387 | uint64_t threshold; |
81d66fab | 388 | |
cb5ce676 AZ |
389 | log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and " |
390 | "swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR, | |
391 | m->system_context.mem_used, m->system_context.mem_total, | |
81d66fab AZ |
392 | m->system_context.swap_used, m->system_context.swap_total, |
393 | PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad)); | |
394 | ||
395 | r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates); | |
396 | if (r == -ENOMEM) | |
397 | return log_oom(); | |
398 | if (r < 0) | |
399 | log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m"); | |
400 | ||
685b0985 AZ |
401 | threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100; |
402 | r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected); | |
81d66fab AZ |
403 | if (r == -ENOMEM) |
404 | return log_oom(); | |
405 | if (r < 0) | |
406 | log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m"); | |
407 | else { | |
d784a8d4 | 408 | if (selected && r > 0) { |
cb5ce676 AZ |
409 | log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and " |
410 | "swap used (%"PRIu64") / total (%"PRIu64") being more than " | |
81d66fab | 411 | PERMYRIAD_AS_PERCENT_FORMAT_STR, |
cb5ce676 AZ |
412 | selected, |
413 | m->system_context.mem_used, m->system_context.mem_total, | |
414 | m->system_context.swap_used, m->system_context.swap_total, | |
81d66fab | 415 | PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad)); |
d784a8d4 OS |
416 | |
417 | /* send dbus signal */ | |
418 | (void) sd_bus_emit_signal(m->bus, | |
419 | "/org/freedesktop/oom1", | |
420 | "org.freedesktop.oom1.Manager", | |
421 | "Killed", | |
422 | "ss", | |
423 | selected, | |
424 | "memory-used"); | |
425 | } | |
81d66fab AZ |
426 | return 0; |
427 | } | |
428 | } | |
429 | ||
430 | return 0; | |
431 | } | |
432 | ||
cb13961a AZ |
433 | static void clear_candidate_hashmapp(Manager **m) { |
434 | if (*m) | |
435 | hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates); | |
436 | } | |
437 | ||
81d66fab | 438 | static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { |
cb13961a AZ |
439 | /* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we |
440 | * update the candidate data (in which case clear_candidates will be NULL). */ | |
d7ac0952 | 441 | _unused_ _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata; |
81d66fab | 442 | _cleanup_set_free_ Set *targets = NULL; |
cb13961a | 443 | bool in_post_action_delay = false; |
99534007 | 444 | Manager *m = ASSERT_PTR(userdata); |
81d66fab AZ |
445 | usec_t usec_now; |
446 | int r; | |
447 | ||
448 | assert(s); | |
81d66fab AZ |
449 | |
450 | /* Reset timer */ | |
451 | r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now); | |
77b04c0a | 452 | if (r < 0) |
81d66fab AZ |
453 | return log_error_errno(r, "Failed to reset event timer: %m"); |
454 | ||
455 | r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC); | |
456 | if (r < 0) | |
457 | return log_error_errno(r, "Failed to set relative time for timer: %m"); | |
458 | ||
459 | /* Reconnect if our connection dropped */ | |
064a5c14 | 460 | if (!m->varlink_client) { |
81d66fab AZ |
461 | r = acquire_managed_oom_connect(m); |
462 | if (r < 0) | |
463 | return log_error_errno(r, "Failed to acquire varlink connection: %m"); | |
464 | } | |
9de5e321 | 465 | |
81d66fab | 466 | /* Return early if nothing is requesting memory pressure monitoring */ |
cb13961a | 467 | if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts)) |
81d66fab | 468 | return 0; |
81d66fab AZ |
469 | |
470 | /* Update the cgroups used for detection/action */ | |
9de5e321 AZ |
471 | r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts); |
472 | if (r == -ENOMEM) | |
77b04c0a AZ |
473 | return log_oom(); |
474 | if (r < 0) | |
475 | log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m"); | |
9de5e321 | 476 | |
81d66fab AZ |
477 | /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale |
478 | * values and go on a kill storm. */ | |
479 | if (m->mem_pressure_post_action_delay_start > 0) { | |
480 | if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now) | |
cb13961a | 481 | in_post_action_delay = true; |
9de5e321 | 482 | else |
81d66fab | 483 | m->mem_pressure_post_action_delay_start = 0; |
9de5e321 AZ |
484 | } |
485 | ||
c20aa7b1 | 486 | r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets); |
9de5e321 | 487 | if (r == -ENOMEM) |
77b04c0a AZ |
488 | return log_oom(); |
489 | if (r < 0) | |
490 | log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m"); | |
cb13961a | 491 | else if (r == 1 && !in_post_action_delay) { |
df637ede AZ |
492 | OomdCGroupContext *t; |
493 | SET_FOREACH(t, targets) { | |
494 | _cleanup_free_ char *selected = NULL; | |
df637ede AZ |
495 | |
496 | /* Check if there was reclaim activity in the given interval. The concern is the following case: | |
497 | * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending | |
498 | * cgroup. Even after this, well-behaved processes will fault in recently resident pages and | |
499 | * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need | |
500 | * to kill something (it won't help anyways). */ | |
501 | if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC) | |
502 | continue; | |
503 | ||
504 | log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity", | |
505 | t->path, | |
3542da24 LB |
506 | LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10), |
507 | LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit), | |
5291f26d | 508 | FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); |
df637ede | 509 | |
cb13961a AZ |
510 | r = update_monitored_cgroup_contexts_candidates( |
511 | m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); | |
512 | if (r == -ENOMEM) | |
513 | return log_oom(); | |
514 | if (r < 0) | |
515 | log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); | |
516 | else | |
517 | clear_candidates = NULL; | |
518 | ||
df637ede AZ |
519 | r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected); |
520 | if (r == -ENOMEM) | |
521 | return log_oom(); | |
522 | if (r < 0) | |
523 | log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path); | |
524 | else { | |
914d4e99 AZ |
525 | /* Don't act on all the high pressure cgroups at once; return as soon as we kill one. |
526 | * If r == 0 then it means there were not eligible candidates, the candidate cgroup | |
527 | * disappeared, or the candidate cgroup has no processes by the time we tried to kill | |
528 | * it. In either case, go through the event loop again and select a new candidate if | |
529 | * pressure is still high. */ | |
df637ede | 530 | m->mem_pressure_post_action_delay_start = usec_now; |
d784a8d4 | 531 | if (selected && r > 0) { |
df637ede AZ |
532 | log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%" |
533 | " for > %s with reclaim activity", | |
534 | selected, t->path, | |
3542da24 LB |
535 | LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10), |
536 | LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit), | |
5291f26d | 537 | FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); |
d784a8d4 OS |
538 | |
539 | /* send dbus signal */ | |
540 | (void) sd_bus_emit_signal(m->bus, | |
541 | "/org/freedesktop/oom1", | |
542 | "org.freedesktop.oom1.Manager", | |
543 | "Killed", | |
544 | "ss", | |
545 | selected, | |
546 | "memory-pressure"); | |
547 | } | |
df637ede | 548 | return 0; |
9de5e321 AZ |
549 | } |
550 | } | |
cb13961a AZ |
551 | } else { |
552 | /* If any monitored cgroup is over their pressure limit, get all the kill candidates for every | |
553 | * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill | |
554 | * might happen. | |
555 | * Candidate cgroup data will continue to get updated during the post-action delay period in case | |
556 | * pressure continues to be high after a kill. */ | |
557 | OomdCGroupContext *c; | |
558 | HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) { | |
559 | if (c->mem_pressure_limit_hit_start == 0) | |
560 | continue; | |
561 | ||
562 | r = update_monitored_cgroup_contexts_candidates( | |
563 | m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); | |
564 | if (r == -ENOMEM) | |
565 | return log_oom(); | |
566 | if (r < 0) | |
567 | log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); | |
568 | else { | |
569 | clear_candidates = NULL; | |
570 | break; | |
571 | } | |
572 | } | |
9de5e321 AZ |
573 | } |
574 | ||
81d66fab AZ |
575 | return 0; |
576 | } | |
9de5e321 | 577 | |
81d66fab AZ |
578 | static int monitor_swap_contexts(Manager *m) { |
579 | _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; | |
580 | int r; | |
9de5e321 | 581 | |
81d66fab AZ |
582 | assert(m); |
583 | assert(m->event); | |
9de5e321 | 584 | |
81d66fab AZ |
585 | r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m); |
586 | if (r < 0) | |
587 | return r; | |
588 | ||
589 | r = sd_event_source_set_exit_on_failure(s, true); | |
590 | if (r < 0) | |
591 | return r; | |
9de5e321 | 592 | |
81d66fab AZ |
593 | r = sd_event_source_set_enabled(s, SD_EVENT_ON); |
594 | if (r < 0) | |
595 | return r; | |
596 | ||
597 | (void) sd_event_source_set_description(s, "oomd-swap-timer"); | |
598 | ||
599 | m->swap_context_event_source = TAKE_PTR(s); | |
9de5e321 AZ |
600 | return 0; |
601 | } | |
602 | ||
81d66fab | 603 | static int monitor_memory_pressure_contexts(Manager *m) { |
9de5e321 AZ |
604 | _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; |
605 | int r; | |
606 | ||
607 | assert(m); | |
608 | assert(m->event); | |
609 | ||
81d66fab | 610 | r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m); |
9de5e321 AZ |
611 | if (r < 0) |
612 | return r; | |
613 | ||
614 | r = sd_event_source_set_exit_on_failure(s, true); | |
615 | if (r < 0) | |
616 | return r; | |
617 | ||
618 | r = sd_event_source_set_enabled(s, SD_EVENT_ON); | |
619 | if (r < 0) | |
620 | return r; | |
621 | ||
81d66fab | 622 | (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer"); |
9de5e321 | 623 | |
81d66fab | 624 | m->mem_pressure_context_event_source = TAKE_PTR(s); |
9de5e321 AZ |
625 | return 0; |
626 | } | |
627 | ||
75db809a | 628 | Manager* manager_free(Manager *m) { |
9de5e321 AZ |
629 | assert(m); |
630 | ||
064a5c14 DDM |
631 | varlink_server_unref(m->varlink_server); |
632 | varlink_close_unref(m->varlink_client); | |
81d66fab AZ |
633 | sd_event_source_unref(m->swap_context_event_source); |
634 | sd_event_source_unref(m->mem_pressure_context_event_source); | |
9de5e321 AZ |
635 | sd_event_unref(m->event); |
636 | ||
5c616ecf AZ |
637 | bus_verify_polkit_async_registry_free(m->polkit_registry); |
638 | sd_bus_flush_close_unref(m->bus); | |
639 | ||
9de5e321 AZ |
640 | hashmap_free(m->monitored_swap_cgroup_contexts); |
641 | hashmap_free(m->monitored_mem_pressure_cgroup_contexts); | |
91cbb4bd | 642 | hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates); |
9de5e321 | 643 | |
75db809a | 644 | return mfree(m); |
9de5e321 AZ |
645 | } |
646 | ||
647 | int manager_new(Manager **ret) { | |
648 | _cleanup_(manager_freep) Manager *m = NULL; | |
649 | int r; | |
650 | ||
651 | assert(ret); | |
652 | ||
653 | m = new0(Manager, 1); | |
654 | if (!m) | |
655 | return -ENOMEM; | |
656 | ||
657 | r = sd_event_default(&m->event); | |
658 | if (r < 0) | |
659 | return r; | |
660 | ||
661 | (void) sd_event_set_watchdog(m->event, true); | |
662 | ||
663 | r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL); | |
664 | if (r < 0) | |
665 | return r; | |
666 | ||
667 | r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL); | |
668 | if (r < 0) | |
669 | return r; | |
670 | ||
671 | m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops); | |
672 | if (!m->monitored_swap_cgroup_contexts) | |
673 | return -ENOMEM; | |
674 | ||
675 | m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops); | |
676 | if (!m->monitored_mem_pressure_cgroup_contexts) | |
677 | return -ENOMEM; | |
678 | ||
91cbb4bd AZ |
679 | m->monitored_mem_pressure_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops); |
680 | if (!m->monitored_mem_pressure_cgroup_contexts_candidates) | |
681 | return -ENOMEM; | |
682 | ||
9de5e321 AZ |
683 | *ret = TAKE_PTR(m); |
684 | return 0; | |
685 | } | |
686 | ||
5c616ecf AZ |
687 | static int manager_connect_bus(Manager *m) { |
688 | int r; | |
689 | ||
690 | assert(m); | |
691 | assert(!m->bus); | |
692 | ||
693 | r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom"); | |
694 | if (r < 0) | |
695 | return log_error_errno(r, "Failed to connect to bus: %m"); | |
696 | ||
c9a00f5a | 697 | r = bus_add_implementation(m->bus, &manager_object, m); |
5c616ecf | 698 | if (r < 0) |
c9a00f5a | 699 | return r; |
5c616ecf AZ |
700 | |
701 | r = bus_log_control_api_register(m->bus); | |
702 | if (r < 0) | |
703 | return r; | |
704 | ||
705 | r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL); | |
706 | if (r < 0) | |
707 | return log_error_errno(r, "Failed to request name: %m"); | |
708 | ||
709 | r = sd_bus_attach_event(m->bus, m->event, 0); | |
710 | if (r < 0) | |
711 | return log_error_errno(r, "Failed to attach bus to event loop: %m"); | |
712 | ||
713 | return 0; | |
714 | } | |
715 | ||
064a5c14 DDM |
716 | static int manager_varlink_init(Manager *m, int fd) { |
717 | _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; | |
718 | int r; | |
719 | ||
720 | assert(m); | |
721 | assert(!m->varlink_server); | |
722 | ||
723 | r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA); | |
724 | if (r < 0) | |
725 | return log_error_errno(r, "Failed to allocate varlink server object: %m"); | |
726 | ||
727 | varlink_server_set_userdata(s, m); | |
728 | ||
729 | r = varlink_server_bind_method(s, "io.systemd.oom.ReportManagedOOMCGroups", process_managed_oom_request); | |
730 | if (r < 0) | |
731 | return log_error_errno(r, "Failed to register varlink method: %m"); | |
732 | ||
733 | if (fd < 0) | |
734 | r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM_USER, 0666); | |
735 | else | |
736 | r = varlink_server_listen_fd(s, fd); | |
737 | if (r < 0) | |
738 | return log_error_errno(r, "Failed to bind to varlink socket: %m"); | |
739 | ||
740 | r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL); | |
741 | if (r < 0) | |
742 | return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); | |
743 | ||
744 | log_debug("Initialized systemd-oomd varlink server"); | |
745 | ||
746 | m->varlink_server = TAKE_PTR(s); | |
747 | return 0; | |
748 | } | |
749 | ||
d06e7fb5 LP |
750 | int manager_start( |
751 | Manager *m, | |
752 | bool dry_run, | |
753 | int swap_used_limit_permyriad, | |
754 | int mem_pressure_limit_permyriad, | |
064a5c14 DDM |
755 | usec_t mem_pressure_usec, |
756 | int fd) { | |
d06e7fb5 | 757 | |
0a9f9344 | 758 | unsigned long l, f; |
9de5e321 AZ |
759 | int r; |
760 | ||
761 | assert(m); | |
762 | ||
763 | m->dry_run = dry_run; | |
764 | ||
d06e7fb5 LP |
765 | m->swap_used_limit_permyriad = swap_used_limit_permyriad >= 0 ? swap_used_limit_permyriad : DEFAULT_SWAP_USED_LIMIT_PERCENT * 100; |
766 | assert(m->swap_used_limit_permyriad <= 10000); | |
9de5e321 | 767 | |
d06e7fb5 | 768 | if (mem_pressure_limit_permyriad >= 0) { |
0a9f9344 AZ |
769 | assert(mem_pressure_limit_permyriad <= 10000); |
770 | ||
771 | l = mem_pressure_limit_permyriad / 100; | |
772 | f = mem_pressure_limit_permyriad % 100; | |
773 | } else { | |
774 | l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT; | |
775 | f = 0; | |
776 | } | |
777 | r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit); | |
9de5e321 AZ |
778 | if (r < 0) |
779 | return r; | |
780 | ||
c20aa7b1 AZ |
781 | m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC; |
782 | ||
5c616ecf AZ |
783 | r = manager_connect_bus(m); |
784 | if (r < 0) | |
785 | return r; | |
786 | ||
9de5e321 AZ |
787 | r = acquire_managed_oom_connect(m); |
788 | if (r < 0) | |
789 | return r; | |
790 | ||
064a5c14 DDM |
791 | r = manager_varlink_init(m, fd); |
792 | if (r < 0) | |
793 | return r; | |
794 | ||
81d66fab AZ |
795 | r = monitor_memory_pressure_contexts(m); |
796 | if (r < 0) | |
797 | return r; | |
798 | ||
799 | r = monitor_swap_contexts(m); | |
9de5e321 AZ |
800 | if (r < 0) |
801 | return r; | |
802 | ||
803 | return 0; | |
804 | } | |
5c616ecf AZ |
805 | |
806 | int manager_get_dump_string(Manager *m, char **ret) { | |
807 | _cleanup_free_ char *dump = NULL; | |
808 | _cleanup_fclose_ FILE *f = NULL; | |
809 | OomdCGroupContext *c; | |
810 | size_t size; | |
811 | char *key; | |
812 | int r; | |
813 | ||
814 | assert(m); | |
815 | assert(ret); | |
816 | ||
817 | f = open_memstream_unlocked(&dump, &size); | |
818 | if (!f) | |
819 | return -errno; | |
820 | ||
821 | fprintf(f, | |
822 | "Dry Run: %s\n" | |
d06e7fb5 | 823 | "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n" |
0a9f9344 | 824 | "Default Memory Pressure Limit: %lu.%02lu%%\n" |
c20aa7b1 | 825 | "Default Memory Pressure Duration: %s\n" |
5c616ecf AZ |
826 | "System Context:\n", |
827 | yes_no(m->dry_run), | |
d06e7fb5 | 828 | PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad), |
3542da24 | 829 | LOADAVG_INT_SIDE(m->default_mem_pressure_limit), LOADAVG_DECIMAL_SIDE(m->default_mem_pressure_limit), |
5291f26d | 830 | FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); |
5c616ecf AZ |
831 | oomd_dump_system_context(&m->system_context, f, "\t"); |
832 | ||
833 | fprintf(f, "Swap Monitored CGroups:\n"); | |
834 | HASHMAP_FOREACH_KEY(c, key, m->monitored_swap_cgroup_contexts) | |
835 | oomd_dump_swap_cgroup_context(c, f, "\t"); | |
836 | ||
837 | fprintf(f, "Memory Pressure Monitored CGroups:\n"); | |
838 | HASHMAP_FOREACH_KEY(c, key, m->monitored_mem_pressure_cgroup_contexts) | |
839 | oomd_dump_memory_pressure_cgroup_context(c, f, "\t"); | |
840 | ||
841 | r = fflush_and_check(f); | |
842 | if (r < 0) | |
843 | return r; | |
844 | ||
845 | f = safe_fclose(f); | |
846 | ||
847 | *ret = TAKE_PTR(dump); | |
848 | return 0; | |
849 | } |