]>
Commit | Line | Data |
---|---|---|
db9ecf05 | 1 | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
61ff7397 AZ |
2 | |
3 | #include <sys/xattr.h> | |
4 | #include <unistd.h> | |
5 | ||
6 | #include "fd-util.h" | |
7 | #include "format-util.h" | |
8 | #include "oomd-util.h" | |
9 | #include "parse-util.h" | |
10 | #include "path-util.h" | |
11 | #include "procfs-util.h" | |
12 | #include "signal-util.h" | |
13 | #include "sort-util.h" | |
14 | #include "stat-util.h" | |
15 | #include "stdio-util.h" | |
16 | ||
17 | DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( | |
18 | oomd_cgroup_ctx_hash_ops, | |
19 | char, | |
20 | string_hash_func, | |
21 | string_compare_func, | |
22 | OomdCGroupContext, | |
23 | oomd_cgroup_context_free); | |
24 | ||
25 | static int log_kill(pid_t pid, int sig, void *userdata) { | |
26 | log_debug("oomd attempting to kill " PID_FMT " with %s", pid, signal_to_string(sig)); | |
27 | return 0; | |
28 | } | |
29 | ||
30 | static int increment_oomd_xattr(const char *path, const char *xattr, uint64_t num_procs_killed) { | |
31 | _cleanup_free_ char *value = NULL; | |
32 | char buf[DECIMAL_STR_MAX(uint64_t) + 1]; | |
33 | uint64_t curr_count = 0; | |
34 | int r; | |
35 | ||
36 | assert(path); | |
37 | assert(xattr); | |
38 | ||
39 | r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, xattr, &value); | |
40 | if (r < 0 && r != -ENODATA) | |
41 | return r; | |
42 | ||
43 | if (!isempty(value)) { | |
44 | r = safe_atou64(value, &curr_count); | |
45 | if (r < 0) | |
46 | return r; | |
47 | } | |
48 | ||
49 | if (curr_count > UINT64_MAX - num_procs_killed) | |
50 | return -EOVERFLOW; | |
51 | ||
52 | xsprintf(buf, "%"PRIu64, curr_count + num_procs_killed); | |
53 | r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, path, xattr, buf, strlen(buf), 0); | |
54 | if (r < 0) | |
55 | return r; | |
56 | ||
57 | return 0; | |
58 | } | |
59 | ||
60 | OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) { | |
61 | if (!ctx) | |
62 | return NULL; | |
63 | ||
64 | free(ctx->path); | |
65 | return mfree(ctx); | |
66 | } | |
67 | ||
68 | int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) { | |
69 | _cleanup_set_free_ Set *targets = NULL; | |
70 | OomdCGroupContext *ctx; | |
71 | char *key; | |
72 | int r; | |
73 | ||
74 | assert(h); | |
75 | assert(ret); | |
76 | ||
77 | targets = set_new(NULL); | |
78 | if (!targets) | |
79 | return -ENOMEM; | |
80 | ||
81 | HASHMAP_FOREACH_KEY(ctx, key, h) { | |
82 | if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) { | |
83 | usec_t diff; | |
84 | ||
85 | if (ctx->last_hit_mem_pressure_limit == 0) | |
86 | ctx->last_hit_mem_pressure_limit = now(CLOCK_MONOTONIC); | |
87 | ||
88 | diff = now(CLOCK_MONOTONIC) - ctx->last_hit_mem_pressure_limit; | |
89 | if (diff >= duration) { | |
90 | r = set_put(targets, ctx); | |
91 | if (r < 0) | |
92 | return -ENOMEM; | |
93 | } | |
94 | } else | |
95 | ctx->last_hit_mem_pressure_limit = 0; | |
96 | } | |
97 | ||
98 | if (!set_isempty(targets)) { | |
99 | *ret = TAKE_PTR(targets); | |
100 | return 1; | |
101 | } | |
102 | ||
103 | *ret = NULL; | |
104 | return 0; | |
105 | } | |
106 | ||
107 | bool oomd_memory_reclaim(Hashmap *h) { | |
108 | uint64_t pgscan = 0, pgscan_of = 0, last_pgscan = 0, last_pgscan_of = 0; | |
109 | OomdCGroupContext *ctx; | |
110 | ||
111 | assert(h); | |
112 | ||
113 | /* If sum of all the current pgscan values are greater than the sum of all the last_pgscan values, | |
114 | * there was reclaim activity. Used along with pressure checks to decide whether to take action. */ | |
115 | ||
116 | HASHMAP_FOREACH(ctx, h) { | |
117 | uint64_t sum; | |
118 | ||
119 | sum = pgscan + ctx->pgscan; | |
120 | if (sum < pgscan || sum < ctx->pgscan) | |
121 | pgscan_of++; /* count overflows */ | |
122 | pgscan = sum; | |
123 | ||
124 | sum = last_pgscan + ctx->last_pgscan; | |
125 | if (sum < last_pgscan || sum < ctx->last_pgscan) | |
126 | last_pgscan_of++; /* count overflows */ | |
127 | last_pgscan = sum; | |
128 | } | |
129 | ||
130 | /* overflow counts are the same, return sums comparison */ | |
131 | if (last_pgscan_of == pgscan_of) | |
132 | return pgscan > last_pgscan; | |
133 | ||
134 | return pgscan_of > last_pgscan_of; | |
135 | } | |
136 | ||
137 | bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent) { | |
138 | uint64_t swap_threshold; | |
139 | ||
140 | assert(ctx); | |
141 | assert(threshold_percent <= 100); | |
142 | ||
143 | swap_threshold = ctx->swap_total * threshold_percent / ((uint64_t) 100); | |
144 | return (ctx->swap_total - ctx->swap_used) < swap_threshold; | |
145 | } | |
146 | ||
147 | int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret) { | |
148 | _cleanup_free_ OomdCGroupContext **sorted = NULL; | |
149 | OomdCGroupContext *item; | |
150 | size_t k = 0; | |
151 | ||
152 | assert(h); | |
153 | assert(compare_func); | |
154 | assert(ret); | |
155 | ||
156 | sorted = new0(OomdCGroupContext*, hashmap_size(h)); | |
157 | if (!sorted) | |
158 | return -ENOMEM; | |
159 | ||
160 | HASHMAP_FOREACH(item, h) { | |
59331b8e AZ |
161 | /* Skip over cgroups that are not valid candidates or are explicitly marked for omission */ |
162 | if ((item->path && prefix && !path_startswith(item->path, prefix)) || item->preference == MANAGED_OOM_PREFERENCE_OMIT) | |
61ff7397 AZ |
163 | continue; |
164 | ||
165 | sorted[k++] = item; | |
166 | } | |
167 | ||
168 | typesafe_qsort(sorted, k, compare_func); | |
169 | ||
170 | *ret = TAKE_PTR(sorted); | |
171 | ||
172 | assert(k <= INT_MAX); | |
173 | return (int) k; | |
174 | } | |
175 | ||
176 | int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) { | |
177 | _cleanup_set_free_ Set *pids_killed = NULL; | |
178 | int r; | |
179 | ||
180 | assert(path); | |
181 | ||
182 | if (dry_run) { | |
183 | _cleanup_free_ char *cg_path = NULL; | |
184 | ||
185 | r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &cg_path); | |
186 | if (r < 0) | |
187 | return r; | |
188 | ||
189 | log_debug("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path, true_false(recurse)); | |
190 | return 0; | |
191 | } | |
192 | ||
193 | pids_killed = set_new(NULL); | |
194 | if (!pids_killed) | |
195 | return -ENOMEM; | |
196 | ||
197 | if (recurse) | |
198 | r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL); | |
199 | else | |
200 | r = cg_kill(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL); | |
201 | if (r < 0) | |
202 | return r; | |
203 | ||
e3038333 | 204 | r = increment_oomd_xattr(path, "user.oomd_kill", set_size(pids_killed)); |
61ff7397 | 205 | if (r < 0) |
e3038333 | 206 | log_debug_errno(r, "Failed to set user.oomd_kill on kill: %m"); |
61ff7397 AZ |
207 | |
208 | return set_size(pids_killed) != 0; | |
209 | } | |
210 | ||
211 | int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) { | |
212 | _cleanup_free_ OomdCGroupContext **sorted = NULL; | |
213 | int r; | |
214 | ||
215 | assert(h); | |
216 | ||
1f76411b | 217 | r = oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, prefix, &sorted); |
61ff7397 AZ |
218 | if (r < 0) |
219 | return r; | |
220 | ||
221 | for (int i = 0; i < r; i++) { | |
59331b8e AZ |
222 | /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure. */ |
223 | /* Don't break since there might be "avoid" cgroups at the end. */ | |
74f834e9 | 224 | if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0) |
59331b8e | 225 | continue; |
61ff7397 AZ |
226 | |
227 | r = oomd_cgroup_kill(sorted[i]->path, true, dry_run); | |
228 | if (r > 0 || r == -ENOMEM) | |
229 | break; | |
230 | } | |
231 | ||
232 | return r; | |
233 | } | |
234 | ||
235 | int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) { | |
236 | _cleanup_free_ OomdCGroupContext **sorted = NULL; | |
237 | int r; | |
238 | ||
239 | assert(h); | |
240 | ||
241 | r = oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted); | |
242 | if (r < 0) | |
243 | return r; | |
244 | ||
245 | /* Try to kill cgroups with non-zero swap usage until we either succeed in | |
246 | * killing or we get to a cgroup with no swap usage. */ | |
247 | for (int i = 0; i < r; i++) { | |
59331b8e AZ |
248 | /* Skip over cgroups with no resource usage. Don't break since there might be "avoid" |
249 | * cgroups at the end. */ | |
61ff7397 | 250 | if (sorted[i]->swap_usage == 0) |
59331b8e | 251 | continue; |
61ff7397 AZ |
252 | |
253 | r = oomd_cgroup_kill(sorted[i]->path, true, dry_run); | |
254 | if (r > 0 || r == -ENOMEM) | |
255 | break; | |
256 | } | |
257 | ||
258 | return r; | |
259 | } | |
260 | ||
261 | int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) { | |
262 | _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL; | |
263 | _cleanup_free_ char *p = NULL, *val = NULL; | |
264 | bool is_root; | |
59331b8e | 265 | uid_t uid; |
61ff7397 AZ |
266 | int r; |
267 | ||
268 | assert(path); | |
269 | assert(ret); | |
270 | ||
271 | ctx = new0(OomdCGroupContext, 1); | |
272 | if (!ctx) | |
273 | return -ENOMEM; | |
274 | ||
275 | is_root = empty_or_root(path); | |
59331b8e | 276 | ctx->preference = MANAGED_OOM_PREFERENCE_NONE; |
61ff7397 AZ |
277 | |
278 | r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p); | |
279 | if (r < 0) | |
280 | return log_debug_errno(r, "Error getting cgroup memory pressure path from %s: %m", path); | |
281 | ||
282 | r = read_resource_pressure(p, PRESSURE_TYPE_FULL, &ctx->memory_pressure); | |
283 | if (r < 0) | |
284 | return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p); | |
285 | ||
59331b8e AZ |
286 | r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, path, &uid); |
287 | if (r < 0) | |
288 | log_debug_errno(r, "Failed to get owner/group from %s: %m", path); | |
289 | else if (uid == 0) { | |
290 | /* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used | |
291 | * as an optional feature of systemd-oomd (and the system might not even support them). */ | |
292 | r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_avoid"); | |
293 | if (r == -ENOMEM) | |
294 | return r; | |
295 | ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_AVOID : ctx->preference; | |
296 | ||
297 | r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_omit"); | |
298 | if (r == -ENOMEM) | |
299 | return r; | |
300 | ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_OMIT : ctx->preference; | |
301 | } | |
302 | ||
61ff7397 AZ |
303 | if (is_root) { |
304 | r = procfs_memory_get_used(&ctx->current_memory_usage); | |
305 | if (r < 0) | |
306 | return log_debug_errno(r, "Error getting memory used from procfs: %m"); | |
307 | } else { | |
308 | r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.current", &ctx->current_memory_usage); | |
309 | if (r < 0) | |
310 | return log_debug_errno(r, "Error getting memory.current from %s: %m", path); | |
311 | ||
312 | r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.min", &ctx->memory_min); | |
313 | if (r < 0) | |
314 | return log_debug_errno(r, "Error getting memory.min from %s: %m", path); | |
315 | ||
316 | r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.low", &ctx->memory_low); | |
317 | if (r < 0) | |
318 | return log_debug_errno(r, "Error getting memory.low from %s: %m", path); | |
319 | ||
320 | r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.swap.current", &ctx->swap_usage); | |
321 | if (r < 0) | |
322 | return log_debug_errno(r, "Error getting memory.swap.current from %s: %m", path); | |
323 | ||
324 | r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, path, "memory.stat", STRV_MAKE("pgscan"), &val); | |
325 | if (r < 0) | |
326 | return log_debug_errno(r, "Error getting pgscan from memory.stat under %s: %m", path); | |
327 | ||
328 | r = safe_atou64(val, &ctx->pgscan); | |
329 | if (r < 0) | |
330 | return log_debug_errno(r, "Error converting pgscan value to uint64_t: %m"); | |
331 | } | |
332 | ||
333 | ctx->path = strdup(empty_to_root(path)); | |
334 | if (!ctx->path) | |
335 | return -ENOMEM; | |
336 | ||
337 | *ret = TAKE_PTR(ctx); | |
338 | return 0; | |
339 | } | |
340 | ||
341 | int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret) { | |
342 | _cleanup_fclose_ FILE *f = NULL; | |
343 | OomdSystemContext ctx = {}; | |
344 | int r; | |
345 | ||
346 | assert(proc_swaps_path); | |
347 | assert(ret); | |
348 | ||
349 | f = fopen(proc_swaps_path, "re"); | |
350 | if (!f) | |
351 | return -errno; | |
352 | ||
353 | (void) fscanf(f, "%*s %*s %*s %*s %*s\n"); | |
354 | ||
355 | for (;;) { | |
356 | uint64_t total, used; | |
357 | ||
358 | r = fscanf(f, | |
359 | "%*s " /* device/file */ | |
360 | "%*s " /* type of swap */ | |
361 | "%" PRIu64 " " /* swap size */ | |
362 | "%" PRIu64 " " /* used */ | |
363 | "%*s\n", /* priority */ | |
364 | &total, &used); | |
365 | ||
366 | if (r == EOF && feof(f)) | |
367 | break; | |
368 | ||
369 | if (r != 2) { | |
370 | if (ferror(f)) | |
371 | return log_debug_errno(errno, "Error reading from %s: %m", proc_swaps_path); | |
372 | ||
373 | return log_debug_errno(SYNTHETIC_ERRNO(EIO), | |
374 | "Failed to parse values from %s: %m", proc_swaps_path); | |
375 | } | |
376 | ||
377 | ctx.swap_total += total * 1024U; | |
378 | ctx.swap_used += used * 1024U; | |
379 | } | |
380 | ||
381 | *ret = ctx; | |
382 | return 0; | |
383 | } | |
384 | ||
385 | int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) { | |
386 | _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *curr_ctx = NULL; | |
387 | OomdCGroupContext *old_ctx, *ctx; | |
388 | int r; | |
389 | ||
390 | assert(new_h); | |
391 | assert(path); | |
392 | ||
393 | r = oomd_cgroup_context_acquire(path, &curr_ctx); | |
394 | if (r < 0) | |
395 | return log_debug_errno(r, "Failed to get OomdCGroupContext for %s: %m", path); | |
396 | ||
397 | old_ctx = hashmap_get(old_h, path); | |
398 | if (old_ctx) { | |
399 | curr_ctx->last_pgscan = old_ctx->pgscan; | |
400 | curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit; | |
401 | curr_ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit; | |
402 | } | |
403 | ||
404 | ctx = TAKE_PTR(curr_ctx); | |
405 | r = hashmap_put(new_h, ctx->path, ctx); | |
406 | if (r < 0) | |
407 | return r; | |
408 | ||
409 | return 0; | |
410 | } | |
5c616ecf AZ |
411 | |
412 | void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) { | |
413 | char swap[FORMAT_BYTES_MAX]; | |
414 | ||
415 | assert(ctx); | |
416 | assert(f); | |
417 | ||
418 | if (!empty_or_root(ctx->path)) | |
419 | fprintf(f, | |
420 | "%sPath: %s\n" | |
421 | "%s\tSwap Usage: %s\n", | |
422 | strempty(prefix), ctx->path, | |
423 | strempty(prefix), format_bytes(swap, sizeof(swap), ctx->swap_usage)); | |
424 | else | |
425 | fprintf(f, | |
426 | "%sPath: %s\n" | |
427 | "%s\tSwap Usage: (see System Context)\n", | |
428 | strempty(prefix), ctx->path, | |
429 | strempty(prefix)); | |
430 | } | |
431 | ||
432 | void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) { | |
433 | char tbuf[FORMAT_TIMESPAN_MAX], mem_use[FORMAT_BYTES_MAX]; | |
434 | char mem_min[FORMAT_BYTES_MAX], mem_low[FORMAT_BYTES_MAX]; | |
435 | ||
436 | assert(ctx); | |
437 | assert(f); | |
438 | ||
439 | fprintf(f, | |
440 | "%sPath: %s\n" | |
0a9f9344 | 441 | "%s\tMemory Pressure Limit: %lu.%02lu%%\n" |
5c616ecf AZ |
442 | "%s\tPressure: Avg10: %lu.%02lu Avg60: %lu.%02lu Avg300: %lu.%02lu Total: %s\n" |
443 | "%s\tCurrent Memory Usage: %s\n", | |
444 | strempty(prefix), ctx->path, | |
0a9f9344 | 445 | strempty(prefix), LOAD_INT(ctx->mem_pressure_limit), LOAD_FRAC(ctx->mem_pressure_limit), |
5c616ecf AZ |
446 | strempty(prefix), |
447 | LOAD_INT(ctx->memory_pressure.avg10), LOAD_FRAC(ctx->memory_pressure.avg10), | |
448 | LOAD_INT(ctx->memory_pressure.avg60), LOAD_FRAC(ctx->memory_pressure.avg60), | |
449 | LOAD_INT(ctx->memory_pressure.avg300), LOAD_FRAC(ctx->memory_pressure.avg300), | |
450 | format_timespan(tbuf, sizeof(tbuf), ctx->memory_pressure.total, USEC_PER_SEC), | |
451 | strempty(prefix), format_bytes(mem_use, sizeof(mem_use), ctx->current_memory_usage)); | |
452 | ||
453 | if (!empty_or_root(ctx->path)) | |
454 | fprintf(f, | |
455 | "%s\tMemory Min: %s\n" | |
456 | "%s\tMemory Low: %s\n" | |
457 | "%s\tPgscan: %" PRIu64 "\n", | |
458 | strempty(prefix), format_bytes_cgroup_protection(mem_min, sizeof(mem_min), ctx->memory_min), | |
459 | strempty(prefix), format_bytes_cgroup_protection(mem_low, sizeof(mem_low), ctx->memory_low), | |
460 | strempty(prefix), ctx->pgscan); | |
461 | } | |
462 | ||
463 | void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix) { | |
464 | char used[FORMAT_BYTES_MAX], total[FORMAT_BYTES_MAX]; | |
465 | ||
466 | assert(ctx); | |
467 | assert(f); | |
468 | ||
469 | fprintf(f, | |
470 | "%sSwap: Used: %s Total: %s\n", | |
471 | strempty(prefix), | |
472 | format_bytes(used, sizeof(used), ctx->swap_used), | |
473 | format_bytes(total, sizeof(total), ctx->swap_total)); | |
474 | } |