]>
Commit | Line | Data |
---|---|---|
db9ecf05 | 1 | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
9de5e321 | 2 | |
064a5c14 DDM |
3 | #include "sd-daemon.h" |
4 | ||
5c616ecf AZ |
5 | #include "bus-log-control-api.h" |
6 | #include "bus-util.h" | |
7 | #include "bus-polkit.h" | |
9de5e321 AZ |
8 | #include "cgroup-util.h" |
9 | #include "fd-util.h" | |
10 | #include "fileio.h" | |
064a5c14 | 11 | #include "format-util.h" |
408a3bbd | 12 | #include "memory-util.h" |
2485b7e2 | 13 | #include "memstream-util.h" |
5c616ecf | 14 | #include "oomd-manager-bus.h" |
9de5e321 AZ |
15 | #include "oomd-manager.h" |
16 | #include "path-util.h" | |
d9d3f05d | 17 | #include "percent-util.h" |
abef4a7b | 18 | #include "varlink-io.systemd.oom.h" |
9de5e321 | 19 | |
71feeae4 | 20 | typedef struct ManagedOOMMessage { |
9de5e321 AZ |
21 | ManagedOOMMode mode; |
22 | char *path; | |
23 | char *property; | |
d06e7fb5 | 24 | uint32_t limit; |
71feeae4 | 25 | } ManagedOOMMessage; |
9de5e321 | 26 | |
71feeae4 DDM |
27 | static void managed_oom_message_destroy(ManagedOOMMessage *message) { |
28 | assert(message); | |
29 | free(message->path); | |
30 | free(message->property); | |
9de5e321 AZ |
31 | } |
32 | ||
33 | static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { | |
34 | ManagedOOMMode *mode = userdata, m; | |
35 | const char *s; | |
36 | ||
37 | assert(mode); | |
38 | assert_se(s = json_variant_string(v)); | |
39 | ||
40 | m = managed_oom_mode_from_string(s); | |
41 | if (m < 0) | |
7211c853 | 42 | return json_log(v, flags, m, "%s is not a valid ManagedOOMMode", s); |
9de5e321 AZ |
43 | |
44 | *mode = m; | |
45 | return 0; | |
46 | } | |
47 | ||
064a5c14 | 48 | static int process_managed_oom_message(Manager *m, uid_t uid, JsonVariant *parameters) { |
9de5e321 | 49 | JsonVariant *c, *cgroups; |
71feeae4 | 50 | int r; |
9de5e321 AZ |
51 | |
52 | static const JsonDispatch dispatch_table[] = { | |
71feeae4 DDM |
53 | { "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMMessage, mode), JSON_MANDATORY }, |
54 | { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, path), JSON_MANDATORY }, | |
55 | { "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, property), JSON_MANDATORY }, | |
56 | { "limit", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(ManagedOOMMessage, limit), 0 }, | |
9de5e321 AZ |
57 | {}, |
58 | }; | |
59 | ||
71feeae4 DDM |
60 | assert(m); |
61 | assert(parameters); | |
9de5e321 AZ |
62 | |
63 | cgroups = json_variant_by_key(parameters, "cgroups"); | |
71feeae4 DDM |
64 | if (!cgroups) |
65 | return -EINVAL; | |
9de5e321 AZ |
66 | |
67 | /* Skip malformed elements and keep processing in case the others are good */ | |
68 | JSON_VARIANT_ARRAY_FOREACH(c, cgroups) { | |
71feeae4 | 69 | _cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = {}; |
9de5e321 AZ |
70 | OomdCGroupContext *ctx; |
71 | Hashmap *monitor_hm; | |
72 | loadavg_t limit; | |
9de5e321 AZ |
73 | |
74 | if (!json_variant_is_object(c)) | |
75 | continue; | |
76 | ||
f1b622a0 | 77 | r = json_dispatch(c, dispatch_table, 0, &message); |
71feeae4 DDM |
78 | if (r == -ENOMEM) |
79 | return r; | |
80 | if (r < 0) | |
9de5e321 AZ |
81 | continue; |
82 | ||
064a5c14 DDM |
83 | if (uid != 0) { |
84 | uid_t cg_uid; | |
85 | ||
86 | r = cg_path_get_owner_uid(message.path, &cg_uid); | |
87 | if (r < 0) { | |
b6f6df4c | 88 | log_debug_errno(r, "Failed to get cgroup %s owner uid: %m", message.path); |
064a5c14 DDM |
89 | continue; |
90 | } | |
91 | ||
92 | /* Let's not be lenient for permission errors and skip processing if we receive an | |
93 | * update for a cgroup that doesn't belong to the user. */ | |
94 | if (uid != cg_uid) | |
95 | return log_error_errno(SYNTHETIC_ERRNO(EPERM), | |
96 | "cgroup path owner UID does not match sender uid " | |
97 | "(" UID_FMT " != " UID_FMT ")", uid, cg_uid); | |
98 | } | |
99 | ||
71feeae4 | 100 | monitor_hm = streq(message.property, "ManagedOOMSwap") ? |
9de5e321 AZ |
101 | m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts; |
102 | ||
71feeae4 DDM |
103 | if (message.mode == MANAGED_OOM_AUTO) { |
104 | (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(message.path))); | |
9de5e321 AZ |
105 | continue; |
106 | } | |
107 | ||
108 | limit = m->default_mem_pressure_limit; | |
109 | ||
71feeae4 DDM |
110 | if (streq(message.property, "ManagedOOMMemoryPressure") && message.limit > 0) { |
111 | int permyriad = UINT32_SCALE_TO_PERMYRIAD(message.limit); | |
d06e7fb5 | 112 | |
5f1d6ebd | 113 | r = store_loadavg_fixed_point(permyriad / 100LU, permyriad % 100LU, &limit); |
71feeae4 | 114 | if (r < 0) |
9de5e321 | 115 | continue; |
9de5e321 AZ |
116 | } |
117 | ||
71feeae4 DDM |
118 | r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path); |
119 | if (r == -ENOMEM) | |
120 | return r; | |
121 | if (r < 0 && r != -EEXIST) | |
122 | log_debug_errno(r, "Failed to insert message, ignoring: %m"); | |
9de5e321 AZ |
123 | |
124 | /* Always update the limit in case it was changed. For non-memory pressure detection the value is | |
125 | * ignored so always updating it here is not a problem. */ | |
71feeae4 | 126 | ctx = hashmap_get(monitor_hm, empty_to_root(message.path)); |
9de5e321 AZ |
127 | if (ctx) |
128 | ctx->mem_pressure_limit = limit; | |
129 | } | |
130 | ||
b63beb4d CH |
131 | /* Toggle wake-ups for "ManagedOOMSwap" if entries are present. */ |
132 | r = sd_event_source_set_enabled(m->swap_context_event_source, | |
133 | hashmap_isempty(m->monitored_swap_cgroup_contexts) ? SD_EVENT_OFF : SD_EVENT_ON); | |
134 | if (r < 0) | |
135 | return log_error_errno(r, "Failed to toggle enabled state of swap context source: %m"); | |
136 | ||
71feeae4 DDM |
137 | return 0; |
138 | } | |
139 | ||
064a5c14 DDM |
140 | static int process_managed_oom_request( |
141 | Varlink *link, | |
142 | JsonVariant *parameters, | |
143 | VarlinkMethodFlags flags, | |
144 | void *userdata) { | |
99534007 | 145 | Manager *m = ASSERT_PTR(userdata); |
064a5c14 DDM |
146 | uid_t uid; |
147 | int r; | |
148 | ||
064a5c14 DDM |
149 | r = varlink_get_peer_uid(link, &uid); |
150 | if (r < 0) | |
151 | return log_error_errno(r, "Failed to get varlink peer uid: %m"); | |
152 | ||
153 | return process_managed_oom_message(m, uid, parameters); | |
154 | } | |
155 | ||
71feeae4 DDM |
156 | static int process_managed_oom_reply( |
157 | Varlink *link, | |
158 | JsonVariant *parameters, | |
159 | const char *error_id, | |
160 | VarlinkReplyFlags flags, | |
161 | void *userdata) { | |
99534007 | 162 | Manager *m = ASSERT_PTR(userdata); |
064a5c14 | 163 | uid_t uid; |
71feeae4 DDM |
164 | int r; |
165 | ||
71feeae4 DDM |
166 | if (error_id) { |
167 | r = -EIO; | |
168 | log_debug("Error getting ManagedOOM cgroups: %s", error_id); | |
169 | goto finish; | |
170 | } | |
171 | ||
064a5c14 DDM |
172 | r = varlink_get_peer_uid(link, &uid); |
173 | if (r < 0) { | |
174 | log_error_errno(r, "Failed to get varlink peer uid: %m"); | |
175 | goto finish; | |
176 | } | |
177 | ||
178 | r = process_managed_oom_message(m, uid, parameters); | |
71feeae4 | 179 | |
9de5e321 AZ |
180 | finish: |
181 | if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES)) | |
064a5c14 | 182 | m->varlink_client = varlink_close_unref(link); |
9de5e321 AZ |
183 | |
184 | return r; | |
185 | } | |
186 | ||
4d620b90 | 187 | /* Fill 'new_h' with 'path's descendant OomdCGroupContexts. Only include descendant cgroups that are possible |
9de5e321 AZ |
188 | * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1". |
189 | * | |
4d620b90 ZJS |
190 | * This function ignores most errors in order to handle cgroups that may have been cleaned up while |
191 | * populating the hashmap. | |
9de5e321 | 192 | * |
4d620b90 | 193 | * 'new_h' is of the form { key: cgroup paths -> value: OomdCGroupContext } */ |
9de5e321 AZ |
194 | static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) { |
195 | _cleanup_free_ char *subpath = NULL; | |
196 | _cleanup_closedir_ DIR *d = NULL; | |
197 | int r; | |
198 | ||
199 | assert(new_h); | |
200 | assert(path); | |
201 | ||
202 | r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d); | |
203 | if (r < 0) | |
204 | return r; | |
205 | ||
206 | r = cg_read_subgroup(d, &subpath); | |
207 | if (r < 0) | |
208 | return r; | |
209 | else if (r == 0) { /* No subgroups? We're a leaf node */ | |
210 | r = oomd_insert_cgroup_context(NULL, new_h, path); | |
77b04c0a AZ |
211 | if (r == -ENOMEM) |
212 | return r; | |
213 | if (r < 0) | |
214 | log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", path); | |
215 | return 0; | |
9de5e321 AZ |
216 | } |
217 | ||
218 | do { | |
219 | _cleanup_free_ char *cg_path = NULL; | |
220 | bool oom_group; | |
221 | ||
222 | cg_path = path_join(empty_to_root(path), subpath); | |
223 | if (!cg_path) | |
224 | return -ENOMEM; | |
225 | ||
226 | subpath = mfree(subpath); | |
227 | ||
228 | r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group); | |
229 | /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */ | |
77b04c0a AZ |
230 | if (r == -ENOMEM) |
231 | return r; | |
232 | if (r < 0) { | |
233 | log_debug_errno(r, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path); | |
234 | return 0; | |
235 | } | |
9de5e321 | 236 | |
349a2003 | 237 | if (oom_group) |
9de5e321 | 238 | r = oomd_insert_cgroup_context(NULL, new_h, cg_path); |
349a2003 | 239 | else |
9de5e321 | 240 | r = recursively_get_cgroup_context(new_h, cg_path); |
349a2003 AZ |
241 | if (r == -ENOMEM) |
242 | return r; | |
77b04c0a AZ |
243 | if (r < 0) |
244 | log_debug_errno(r, "Failed to insert or recursively get from %s, ignoring: %m", cg_path); | |
9de5e321 AZ |
245 | } while ((r = cg_read_subgroup(d, &subpath)) > 0); |
246 | ||
247 | return 0; | |
248 | } | |
249 | ||
250 | static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) { | |
251 | _cleanup_hashmap_free_ Hashmap *new_base = NULL; | |
252 | OomdCGroupContext *ctx; | |
253 | int r; | |
254 | ||
255 | assert(monitored_cgroups); | |
256 | ||
257 | new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops); | |
258 | if (!new_base) | |
259 | return -ENOMEM; | |
260 | ||
261 | HASHMAP_FOREACH(ctx, *monitored_cgroups) { | |
262 | /* Skip most errors since the cgroup we're trying to update might not exist anymore. */ | |
263 | r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path); | |
264 | if (r == -ENOMEM) | |
265 | return r; | |
77b04c0a AZ |
266 | if (r < 0 && !IN_SET(r, -EEXIST, -ENOENT)) |
267 | log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", ctx->path); | |
9de5e321 AZ |
268 | } |
269 | ||
270 | hashmap_free(*monitored_cgroups); | |
271 | *monitored_cgroups = TAKE_PTR(new_base); | |
272 | ||
273 | return 0; | |
274 | } | |
275 | ||
276 | static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) { | |
277 | _cleanup_hashmap_free_ Hashmap *candidates = NULL; | |
278 | OomdCGroupContext *ctx; | |
279 | int r; | |
280 | ||
281 | assert(monitored_cgroups); | |
282 | assert(ret_candidates); | |
283 | ||
284 | candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops); | |
285 | if (!candidates) | |
286 | return -ENOMEM; | |
287 | ||
288 | HASHMAP_FOREACH(ctx, monitored_cgroups) { | |
289 | r = recursively_get_cgroup_context(candidates, ctx->path); | |
290 | if (r == -ENOMEM) | |
291 | return r; | |
77b04c0a AZ |
292 | if (r < 0) |
293 | log_debug_errno(r, "Failed to recursively get contexts for %s, ignoring: %m", ctx->path); | |
9de5e321 AZ |
294 | } |
295 | ||
296 | *ret_candidates = TAKE_PTR(candidates); | |
297 | ||
298 | return 0; | |
299 | } | |
300 | ||
91cbb4bd AZ |
301 | static int update_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **candidates) { |
302 | _cleanup_hashmap_free_ Hashmap *new_candidates = NULL; | |
303 | int r; | |
304 | ||
305 | assert(monitored_cgroups); | |
306 | assert(candidates); | |
307 | assert(*candidates); | |
308 | ||
309 | r = get_monitored_cgroup_contexts_candidates(monitored_cgroups, &new_candidates); | |
310 | if (r < 0) | |
311 | return log_debug_errno(r, "Failed to get candidate contexts: %m"); | |
312 | ||
313 | oomd_update_cgroup_contexts_between_hashmaps(*candidates, new_candidates); | |
314 | ||
315 | hashmap_free(*candidates); | |
316 | *candidates = TAKE_PTR(new_candidates); | |
317 | ||
318 | return 0; | |
319 | } | |
320 | ||
9de5e321 AZ |
321 | static int acquire_managed_oom_connect(Manager *m) { |
322 | _cleanup_(varlink_close_unrefp) Varlink *link = NULL; | |
323 | int r; | |
324 | ||
325 | assert(m); | |
326 | assert(m->event); | |
327 | ||
064a5c14 | 328 | r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM); |
9de5e321 | 329 | if (r < 0) |
064a5c14 | 330 | return log_error_errno(r, "Failed to connect to " VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM ": %m"); |
9de5e321 AZ |
331 | |
332 | (void) varlink_set_userdata(link, m); | |
333 | (void) varlink_set_description(link, "oomd"); | |
334 | (void) varlink_set_relative_timeout(link, USEC_INFINITY); | |
335 | ||
336 | r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL); | |
337 | if (r < 0) | |
338 | return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); | |
339 | ||
340 | r = varlink_bind_reply(link, process_managed_oom_reply); | |
341 | if (r < 0) | |
342 | return log_error_errno(r, "Failed to bind reply callback: %m"); | |
343 | ||
344 | r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL); | |
345 | if (r < 0) | |
346 | return log_error_errno(r, "Failed to observe varlink call: %m"); | |
347 | ||
064a5c14 | 348 | m->varlink_client = TAKE_PTR(link); |
9de5e321 AZ |
349 | return 0; |
350 | } | |
351 | ||
81d66fab | 352 | static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { |
99534007 | 353 | Manager *m = ASSERT_PTR(userdata); |
9de5e321 AZ |
354 | usec_t usec_now; |
355 | int r; | |
356 | ||
357 | assert(s); | |
b63beb4d | 358 | assert(!hashmap_isempty(m->monitored_swap_cgroup_contexts)); |
9de5e321 AZ |
359 | |
360 | /* Reset timer */ | |
361 | r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now); | |
362 | if (r < 0) | |
77b04c0a | 363 | return log_error_errno(r, "Failed to reset event timer: %m"); |
9de5e321 | 364 | |
81d66fab | 365 | r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC); |
9de5e321 | 366 | if (r < 0) |
77b04c0a | 367 | return log_error_errno(r, "Failed to set relative time for timer: %m"); |
9de5e321 AZ |
368 | |
369 | /* Reconnect if our connection dropped */ | |
064a5c14 | 370 | if (!m->varlink_client) { |
9de5e321 AZ |
371 | r = acquire_managed_oom_connect(m); |
372 | if (r < 0) | |
77b04c0a | 373 | return log_error_errno(r, "Failed to acquire varlink connection: %m"); |
9de5e321 AZ |
374 | } |
375 | ||
47136b9d AZ |
376 | /* We still try to acquire system information for oomctl even if no units want swap monitoring */ |
377 | r = oomd_system_context_acquire("/proc/meminfo", &m->system_context); | |
378 | /* If there are no units depending on swap actions, the only error we exit on is ENOMEM. */ | |
b63beb4d | 379 | if (r < 0) |
81d66fab AZ |
380 | return log_error_errno(r, "Failed to acquire system context: %m"); |
381 | ||
81d66fab AZ |
382 | /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the |
383 | * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts | |
384 | * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent | |
385 | * nodes are the ones that matter). */ | |
386 | ||
030bc91c NR |
387 | /* Check amount of memory available and swap free so we don't free up swap when memory is still available. */ |
388 | if (oomd_mem_available_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) && | |
cb5ce676 | 389 | oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) { |
81d66fab AZ |
390 | _cleanup_hashmap_free_ Hashmap *candidates = NULL; |
391 | _cleanup_free_ char *selected = NULL; | |
685b0985 | 392 | uint64_t threshold; |
81d66fab | 393 | |
cb5ce676 AZ |
394 | log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and " |
395 | "swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR, | |
396 | m->system_context.mem_used, m->system_context.mem_total, | |
81d66fab AZ |
397 | m->system_context.swap_used, m->system_context.swap_total, |
398 | PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad)); | |
399 | ||
400 | r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates); | |
401 | if (r == -ENOMEM) | |
402 | return log_oom(); | |
403 | if (r < 0) | |
404 | log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m"); | |
405 | ||
685b0985 AZ |
406 | threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100; |
407 | r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected); | |
81d66fab AZ |
408 | if (r == -ENOMEM) |
409 | return log_oom(); | |
410 | if (r < 0) | |
0923b425 | 411 | log_notice_errno(r, "Failed to kill any cgroups based on swap: %m"); |
81d66fab | 412 | else { |
d784a8d4 | 413 | if (selected && r > 0) { |
cb5ce676 AZ |
414 | log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and " |
415 | "swap used (%"PRIu64") / total (%"PRIu64") being more than " | |
81d66fab | 416 | PERMYRIAD_AS_PERCENT_FORMAT_STR, |
cb5ce676 AZ |
417 | selected, |
418 | m->system_context.mem_used, m->system_context.mem_total, | |
419 | m->system_context.swap_used, m->system_context.swap_total, | |
81d66fab | 420 | PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad)); |
d784a8d4 OS |
421 | |
422 | /* send dbus signal */ | |
423 | (void) sd_bus_emit_signal(m->bus, | |
424 | "/org/freedesktop/oom1", | |
425 | "org.freedesktop.oom1.Manager", | |
426 | "Killed", | |
427 | "ss", | |
428 | selected, | |
429 | "memory-used"); | |
430 | } | |
81d66fab AZ |
431 | return 0; |
432 | } | |
433 | } | |
434 | ||
435 | return 0; | |
436 | } | |
437 | ||
cb13961a AZ |
438 | static void clear_candidate_hashmapp(Manager **m) { |
439 | if (*m) | |
440 | hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates); | |
441 | } | |
442 | ||
81d66fab | 443 | static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { |
cb13961a AZ |
444 | /* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we |
445 | * update the candidate data (in which case clear_candidates will be NULL). */ | |
d7ac0952 | 446 | _unused_ _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata; |
81d66fab | 447 | _cleanup_set_free_ Set *targets = NULL; |
cb13961a | 448 | bool in_post_action_delay = false; |
99534007 | 449 | Manager *m = ASSERT_PTR(userdata); |
81d66fab AZ |
450 | usec_t usec_now; |
451 | int r; | |
452 | ||
453 | assert(s); | |
81d66fab AZ |
454 | |
455 | /* Reset timer */ | |
456 | r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now); | |
77b04c0a | 457 | if (r < 0) |
81d66fab AZ |
458 | return log_error_errno(r, "Failed to reset event timer: %m"); |
459 | ||
460 | r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC); | |
461 | if (r < 0) | |
462 | return log_error_errno(r, "Failed to set relative time for timer: %m"); | |
463 | ||
464 | /* Reconnect if our connection dropped */ | |
064a5c14 | 465 | if (!m->varlink_client) { |
81d66fab AZ |
466 | r = acquire_managed_oom_connect(m); |
467 | if (r < 0) | |
468 | return log_error_errno(r, "Failed to acquire varlink connection: %m"); | |
469 | } | |
9de5e321 | 470 | |
81d66fab | 471 | /* Return early if nothing is requesting memory pressure monitoring */ |
cb13961a | 472 | if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts)) |
81d66fab | 473 | return 0; |
81d66fab AZ |
474 | |
475 | /* Update the cgroups used for detection/action */ | |
9de5e321 AZ |
476 | r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts); |
477 | if (r == -ENOMEM) | |
77b04c0a AZ |
478 | return log_oom(); |
479 | if (r < 0) | |
480 | log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m"); | |
9de5e321 | 481 | |
81d66fab AZ |
482 | /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale |
483 | * values and go on a kill storm. */ | |
484 | if (m->mem_pressure_post_action_delay_start > 0) { | |
485 | if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now) | |
cb13961a | 486 | in_post_action_delay = true; |
9de5e321 | 487 | else |
81d66fab | 488 | m->mem_pressure_post_action_delay_start = 0; |
9de5e321 AZ |
489 | } |
490 | ||
c20aa7b1 | 491 | r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets); |
9de5e321 | 492 | if (r == -ENOMEM) |
77b04c0a AZ |
493 | return log_oom(); |
494 | if (r < 0) | |
495 | log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m"); | |
cb13961a | 496 | else if (r == 1 && !in_post_action_delay) { |
df637ede AZ |
497 | OomdCGroupContext *t; |
498 | SET_FOREACH(t, targets) { | |
499 | _cleanup_free_ char *selected = NULL; | |
df637ede AZ |
500 | |
501 | /* Check if there was reclaim activity in the given interval. The concern is the following case: | |
502 | * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending | |
503 | * cgroup. Even after this, well-behaved processes will fault in recently resident pages and | |
504 | * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need | |
505 | * to kill something (it won't help anyways). */ | |
506 | if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC) | |
507 | continue; | |
508 | ||
509 | log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity", | |
510 | t->path, | |
3542da24 LB |
511 | LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10), |
512 | LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit), | |
5291f26d | 513 | FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); |
df637ede | 514 | |
cb13961a AZ |
515 | r = update_monitored_cgroup_contexts_candidates( |
516 | m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); | |
517 | if (r == -ENOMEM) | |
518 | return log_oom(); | |
519 | if (r < 0) | |
520 | log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); | |
521 | else | |
522 | clear_candidates = NULL; | |
523 | ||
ebfb6019 ZJS |
524 | r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, |
525 | /* prefix= */ t->path, | |
526 | /* dry_run= */ m->dry_run, | |
527 | &selected); | |
df637ede AZ |
528 | if (r == -ENOMEM) |
529 | return log_oom(); | |
530 | if (r < 0) | |
0923b425 | 531 | log_notice_errno(r, "Failed to kill any cgroups under %s based on pressure: %m", t->path); |
df637ede | 532 | else { |
914d4e99 AZ |
533 | /* Don't act on all the high pressure cgroups at once; return as soon as we kill one. |
534 | * If r == 0 then it means there were not eligible candidates, the candidate cgroup | |
535 | * disappeared, or the candidate cgroup has no processes by the time we tried to kill | |
536 | * it. In either case, go through the event loop again and select a new candidate if | |
537 | * pressure is still high. */ | |
df637ede | 538 | m->mem_pressure_post_action_delay_start = usec_now; |
d784a8d4 | 539 | if (selected && r > 0) { |
df637ede AZ |
540 | log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%" |
541 | " for > %s with reclaim activity", | |
542 | selected, t->path, | |
3542da24 LB |
543 | LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10), |
544 | LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit), | |
5291f26d | 545 | FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); |
d784a8d4 OS |
546 | |
547 | /* send dbus signal */ | |
548 | (void) sd_bus_emit_signal(m->bus, | |
549 | "/org/freedesktop/oom1", | |
550 | "org.freedesktop.oom1.Manager", | |
551 | "Killed", | |
552 | "ss", | |
553 | selected, | |
554 | "memory-pressure"); | |
555 | } | |
df637ede | 556 | return 0; |
9de5e321 AZ |
557 | } |
558 | } | |
cb13961a AZ |
559 | } else { |
560 | /* If any monitored cgroup is over their pressure limit, get all the kill candidates for every | |
561 | * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill | |
562 | * might happen. | |
563 | * Candidate cgroup data will continue to get updated during the post-action delay period in case | |
564 | * pressure continues to be high after a kill. */ | |
565 | OomdCGroupContext *c; | |
566 | HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) { | |
567 | if (c->mem_pressure_limit_hit_start == 0) | |
568 | continue; | |
569 | ||
570 | r = update_monitored_cgroup_contexts_candidates( | |
571 | m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); | |
572 | if (r == -ENOMEM) | |
573 | return log_oom(); | |
574 | if (r < 0) | |
575 | log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); | |
576 | else { | |
577 | clear_candidates = NULL; | |
578 | break; | |
579 | } | |
580 | } | |
9de5e321 AZ |
581 | } |
582 | ||
81d66fab AZ |
583 | return 0; |
584 | } | |
9de5e321 | 585 | |
81d66fab AZ |
586 | static int monitor_swap_contexts(Manager *m) { |
587 | _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; | |
588 | int r; | |
9de5e321 | 589 | |
81d66fab AZ |
590 | assert(m); |
591 | assert(m->event); | |
9de5e321 | 592 | |
81d66fab AZ |
593 | r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m); |
594 | if (r < 0) | |
595 | return r; | |
596 | ||
597 | r = sd_event_source_set_exit_on_failure(s, true); | |
598 | if (r < 0) | |
599 | return r; | |
9de5e321 | 600 | |
b63beb4d | 601 | r = sd_event_source_set_enabled(s, SD_EVENT_OFF); |
81d66fab AZ |
602 | if (r < 0) |
603 | return r; | |
604 | ||
605 | (void) sd_event_source_set_description(s, "oomd-swap-timer"); | |
606 | ||
607 | m->swap_context_event_source = TAKE_PTR(s); | |
9de5e321 AZ |
608 | return 0; |
609 | } | |
610 | ||
81d66fab | 611 | static int monitor_memory_pressure_contexts(Manager *m) { |
9de5e321 AZ |
612 | _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; |
613 | int r; | |
614 | ||
615 | assert(m); | |
616 | assert(m->event); | |
617 | ||
81d66fab | 618 | r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m); |
9de5e321 AZ |
619 | if (r < 0) |
620 | return r; | |
621 | ||
622 | r = sd_event_source_set_exit_on_failure(s, true); | |
623 | if (r < 0) | |
624 | return r; | |
625 | ||
626 | r = sd_event_source_set_enabled(s, SD_EVENT_ON); | |
627 | if (r < 0) | |
628 | return r; | |
629 | ||
81d66fab | 630 | (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer"); |
9de5e321 | 631 | |
81d66fab | 632 | m->mem_pressure_context_event_source = TAKE_PTR(s); |
9de5e321 AZ |
633 | return 0; |
634 | } | |
635 | ||
75db809a | 636 | Manager* manager_free(Manager *m) { |
9de5e321 AZ |
637 | assert(m); |
638 | ||
064a5c14 DDM |
639 | varlink_server_unref(m->varlink_server); |
640 | varlink_close_unref(m->varlink_client); | |
81d66fab AZ |
641 | sd_event_source_unref(m->swap_context_event_source); |
642 | sd_event_source_unref(m->mem_pressure_context_event_source); | |
9de5e321 AZ |
643 | sd_event_unref(m->event); |
644 | ||
5c616ecf AZ |
645 | bus_verify_polkit_async_registry_free(m->polkit_registry); |
646 | sd_bus_flush_close_unref(m->bus); | |
647 | ||
9de5e321 AZ |
648 | hashmap_free(m->monitored_swap_cgroup_contexts); |
649 | hashmap_free(m->monitored_mem_pressure_cgroup_contexts); | |
91cbb4bd | 650 | hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates); |
9de5e321 | 651 | |
75db809a | 652 | return mfree(m); |
9de5e321 AZ |
653 | } |
654 | ||
655 | int manager_new(Manager **ret) { | |
656 | _cleanup_(manager_freep) Manager *m = NULL; | |
657 | int r; | |
658 | ||
659 | assert(ret); | |
660 | ||
661 | m = new0(Manager, 1); | |
662 | if (!m) | |
663 | return -ENOMEM; | |
664 | ||
665 | r = sd_event_default(&m->event); | |
666 | if (r < 0) | |
667 | return r; | |
668 | ||
669 | (void) sd_event_set_watchdog(m->event, true); | |
670 | ||
671 | r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL); | |
672 | if (r < 0) | |
673 | return r; | |
674 | ||
675 | r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL); | |
676 | if (r < 0) | |
677 | return r; | |
678 | ||
679 | m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops); | |
680 | if (!m->monitored_swap_cgroup_contexts) | |
681 | return -ENOMEM; | |
682 | ||
683 | m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops); | |
684 | if (!m->monitored_mem_pressure_cgroup_contexts) | |
685 | return -ENOMEM; | |
686 | ||
91cbb4bd AZ |
687 | m->monitored_mem_pressure_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops); |
688 | if (!m->monitored_mem_pressure_cgroup_contexts_candidates) | |
689 | return -ENOMEM; | |
690 | ||
9de5e321 AZ |
691 | *ret = TAKE_PTR(m); |
692 | return 0; | |
693 | } | |
694 | ||
5c616ecf AZ |
695 | static int manager_connect_bus(Manager *m) { |
696 | int r; | |
697 | ||
698 | assert(m); | |
699 | assert(!m->bus); | |
700 | ||
701 | r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom"); | |
702 | if (r < 0) | |
703 | return log_error_errno(r, "Failed to connect to bus: %m"); | |
704 | ||
c9a00f5a | 705 | r = bus_add_implementation(m->bus, &manager_object, m); |
5c616ecf | 706 | if (r < 0) |
c9a00f5a | 707 | return r; |
5c616ecf AZ |
708 | |
709 | r = bus_log_control_api_register(m->bus); | |
710 | if (r < 0) | |
711 | return r; | |
712 | ||
713 | r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL); | |
714 | if (r < 0) | |
715 | return log_error_errno(r, "Failed to request name: %m"); | |
716 | ||
717 | r = sd_bus_attach_event(m->bus, m->event, 0); | |
718 | if (r < 0) | |
719 | return log_error_errno(r, "Failed to attach bus to event loop: %m"); | |
720 | ||
721 | return 0; | |
722 | } | |
723 | ||
064a5c14 DDM |
724 | static int manager_varlink_init(Manager *m, int fd) { |
725 | _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; | |
726 | int r; | |
727 | ||
728 | assert(m); | |
729 | assert(!m->varlink_server); | |
730 | ||
731 | r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA); | |
732 | if (r < 0) | |
733 | return log_error_errno(r, "Failed to allocate varlink server object: %m"); | |
734 | ||
735 | varlink_server_set_userdata(s, m); | |
736 | ||
abef4a7b LP |
737 | r = varlink_server_add_interface(s, &vl_interface_io_systemd_oom); |
738 | if (r < 0) | |
739 | return log_error_errno(r, "Failed to add oom interface to varlink server: %m"); | |
740 | ||
064a5c14 DDM |
741 | r = varlink_server_bind_method(s, "io.systemd.oom.ReportManagedOOMCGroups", process_managed_oom_request); |
742 | if (r < 0) | |
743 | return log_error_errno(r, "Failed to register varlink method: %m"); | |
744 | ||
745 | if (fd < 0) | |
746 | r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM_USER, 0666); | |
747 | else | |
748 | r = varlink_server_listen_fd(s, fd); | |
749 | if (r < 0) | |
750 | return log_error_errno(r, "Failed to bind to varlink socket: %m"); | |
751 | ||
752 | r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL); | |
753 | if (r < 0) | |
754 | return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); | |
755 | ||
756 | log_debug("Initialized systemd-oomd varlink server"); | |
757 | ||
758 | m->varlink_server = TAKE_PTR(s); | |
759 | return 0; | |
760 | } | |
761 | ||
d06e7fb5 LP |
762 | int manager_start( |
763 | Manager *m, | |
764 | bool dry_run, | |
765 | int swap_used_limit_permyriad, | |
766 | int mem_pressure_limit_permyriad, | |
064a5c14 DDM |
767 | usec_t mem_pressure_usec, |
768 | int fd) { | |
d06e7fb5 | 769 | |
0a9f9344 | 770 | unsigned long l, f; |
9de5e321 AZ |
771 | int r; |
772 | ||
773 | assert(m); | |
774 | ||
775 | m->dry_run = dry_run; | |
776 | ||
d06e7fb5 LP |
777 | m->swap_used_limit_permyriad = swap_used_limit_permyriad >= 0 ? swap_used_limit_permyriad : DEFAULT_SWAP_USED_LIMIT_PERCENT * 100; |
778 | assert(m->swap_used_limit_permyriad <= 10000); | |
9de5e321 | 779 | |
d06e7fb5 | 780 | if (mem_pressure_limit_permyriad >= 0) { |
0a9f9344 AZ |
781 | assert(mem_pressure_limit_permyriad <= 10000); |
782 | ||
783 | l = mem_pressure_limit_permyriad / 100; | |
784 | f = mem_pressure_limit_permyriad % 100; | |
785 | } else { | |
786 | l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT; | |
787 | f = 0; | |
788 | } | |
789 | r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit); | |
9de5e321 AZ |
790 | if (r < 0) |
791 | return r; | |
792 | ||
c20aa7b1 AZ |
793 | m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC; |
794 | ||
5c616ecf AZ |
795 | r = manager_connect_bus(m); |
796 | if (r < 0) | |
797 | return r; | |
798 | ||
9de5e321 AZ |
799 | r = acquire_managed_oom_connect(m); |
800 | if (r < 0) | |
801 | return r; | |
802 | ||
064a5c14 DDM |
803 | r = manager_varlink_init(m, fd); |
804 | if (r < 0) | |
805 | return r; | |
806 | ||
81d66fab AZ |
807 | r = monitor_memory_pressure_contexts(m); |
808 | if (r < 0) | |
809 | return r; | |
810 | ||
811 | r = monitor_swap_contexts(m); | |
9de5e321 AZ |
812 | if (r < 0) |
813 | return r; | |
814 | ||
815 | return 0; | |
816 | } | |
5c616ecf AZ |
817 | |
818 | int manager_get_dump_string(Manager *m, char **ret) { | |
2485b7e2 | 819 | _cleanup_(memstream_done) MemStream ms = {}; |
5c616ecf | 820 | OomdCGroupContext *c; |
2485b7e2 | 821 | FILE *f; |
5c616ecf AZ |
822 | |
823 | assert(m); | |
824 | assert(ret); | |
825 | ||
2485b7e2 | 826 | f = memstream_init(&ms); |
5c616ecf | 827 | if (!f) |
2485b7e2 | 828 | return -ENOMEM; |
5c616ecf AZ |
829 | |
830 | fprintf(f, | |
831 | "Dry Run: %s\n" | |
d06e7fb5 | 832 | "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n" |
0a9f9344 | 833 | "Default Memory Pressure Limit: %lu.%02lu%%\n" |
c20aa7b1 | 834 | "Default Memory Pressure Duration: %s\n" |
5c616ecf AZ |
835 | "System Context:\n", |
836 | yes_no(m->dry_run), | |
d06e7fb5 | 837 | PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad), |
3542da24 | 838 | LOADAVG_INT_SIDE(m->default_mem_pressure_limit), LOADAVG_DECIMAL_SIDE(m->default_mem_pressure_limit), |
5291f26d | 839 | FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); |
5c616ecf AZ |
840 | oomd_dump_system_context(&m->system_context, f, "\t"); |
841 | ||
842 | fprintf(f, "Swap Monitored CGroups:\n"); | |
64377c60 | 843 | HASHMAP_FOREACH(c, m->monitored_swap_cgroup_contexts) |
5c616ecf AZ |
844 | oomd_dump_swap_cgroup_context(c, f, "\t"); |
845 | ||
846 | fprintf(f, "Memory Pressure Monitored CGroups:\n"); | |
64377c60 | 847 | HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) |
5c616ecf AZ |
848 | oomd_dump_memory_pressure_cgroup_context(c, f, "\t"); |
849 | ||
2485b7e2 | 850 | return memstream_finalize(&ms, ret, NULL); |
5c616ecf | 851 | } |