]>
Commit | Line | Data |
---|---|---|
db9ecf05 | 1 | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
fdb3deca | 2 | |
fdb3deca ZJS |
3 | #include <unistd.h> |
4 | ||
5 | #include "cgroup-setup.h" | |
6 | #include "cgroup-util.h" | |
7 | #include "errno-util.h" | |
8e5aba7a LP |
8 | #include "fd-util.h" |
9 | #include "fileio.h" | |
10 | #include "fs-util.h" | |
5545f336 | 11 | #include "missing_threads.h" |
8e5aba7a | 12 | #include "mkdir.h" |
fdb3deca ZJS |
13 | #include "parse-util.h" |
14 | #include "path-util.h" | |
15 | #include "proc-cmdline.h" | |
8e5aba7a LP |
16 | #include "process-util.h" |
17 | #include "recurse-dir.h" | |
fdb3deca ZJS |
18 | #include "stdio-util.h" |
19 | #include "string-util.h" | |
fdb3deca | 20 | #include "user-util.h" |
22eeada9 DS |
21 | #include "virt.h" |
22 | ||
/* Checks /proc/cgroups whether any enabled controller we care about is currently attached to a cgroup v1
 * hierarchy. Returns > 0 if at least one is, 0 if none are, and a negative errno-style value if
 * /proc/cgroups cannot be read or a line cannot be parsed. */
static int cg_any_controller_used_for_v1(void) {
        _cleanup_free_ char *buf = NULL;
        _cleanup_strv_free_ char **lines = NULL;
        int r;

        r = read_full_virtual_file("/proc/cgroups", &buf, NULL);
        if (r < 0)
                return log_debug_errno(r, "Could not read /proc/cgroups, ignoring: %m");

        r = strv_split_newlines_full(&lines, buf, 0);
        if (r < 0)
                return r;

        /* The intention of this is to check if the fully unified cgroup tree setup is possible, meaning all
         * enabled kernel cgroup controllers are currently not in use by cgroup1. For reference:
         * https://systemd.io/CGROUP_DELEGATION/#three-different-tree-setups-
         *
         * Note that this is typically only useful to check inside a container where we don't know what
         * cgroup tree setup is in use by the host; if the host is using legacy or hybrid, we can't use
         * unified since some or all controllers would be missing. This is not the best way to detect this,
         * as whatever container manager created our container should have mounted /sys/fs/cgroup
         * appropriately, but in case that wasn't done, we try to detect if it's possible for us to use
         * unified cgroups. */
        STRV_FOREACH(line, lines) {
                /* Each data line consists of four whitespace-separated fields, extracted below. */
                _cleanup_free_ char *name = NULL, *hierarchy_id = NULL, *num = NULL, *enabled = NULL;

                /* Skip header line */
                if (startswith(*line, "#"))
                        continue;

                const char *p = *line;
                r = extract_many_words(&p, NULL, 0, &name, &hierarchy_id, &num, &enabled);
                if (r < 0)
                        return log_debug_errno(r, "Error parsing /proc/cgroups line, ignoring: %m");
                else if (r < 4) {
                        /* Fewer than four fields: malformed line, skip it rather than fail. */
                        log_debug("Invalid /proc/cgroups line, ignoring.");
                        continue;
                }

                /* Ignore disabled controllers. */
                if (streq(enabled, "0"))
                        continue;

                /* Ignore controllers we don't care about. */
                if (cgroup_controller_from_string(name) < 0)
                        continue;

                /* Since the unified cgroup doesn't use multiple hierarchies, if any controller has a
                 * non-zero hierarchy_id that means it's in use already in a legacy (or hybrid) cgroup v1
                 * hierarchy, and can't be used in a unified cgroup. */
                if (!streq(hierarchy_id, "0")) {
                        log_debug("Cgroup controller %s in use by legacy v1 hierarchy.", name);
                        return 1;
                }
        }

        return 0;
}
fdb3deca ZJS |
81 | |
/* Decides whether the fully unified (cgroup v2) hierarchy should be used. The decision is cached per
 * thread; sources are consulted in order: an already-mounted hierarchy, the systemd.unified_cgroup_hierarchy
 * kernel command line switch, cgroup_no_v1=all, and finally whether any controller is already used by v1. */
bool cg_is_unified_wanted(void) {
        static thread_local int wanted = -1;
        int r;

        /* If we have a cached value, return that. */
        if (wanted >= 0)
                return wanted;

        /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
        r = cg_unified_cached(true);
        if (r >= 0)
                return (wanted = r >= CGROUP_UNIFIED_ALL);

        /* If we were explicitly passed systemd.unified_cgroup_hierarchy, respect that. */
        bool b;
        r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", /* flags = */ 0, &b);
        if (r > 0)
                return (wanted = b);

        /* If we passed cgroup_no_v1=all with no other instructions, it seems highly unlikely that we want to
         * use hybrid or legacy hierarchy. */
        _cleanup_free_ char *c = NULL;
        r = proc_cmdline_get_key("cgroup_no_v1", 0, &c);
        if (r > 0 && streq_ptr(c, "all"))
                return (wanted = true);

        /* If any controller is in use as v1, don't use unified. */
        return (wanted = (cg_any_controller_used_for_v1() <= 0));
}
111 | ||
112 | bool cg_is_legacy_wanted(void) { | |
113 | static thread_local int wanted = -1; | |
114 | ||
115 | /* If we have a cached value, return that. */ | |
116 | if (wanted >= 0) | |
117 | return wanted; | |
118 | ||
119 | /* Check if we have cgroup v2 already mounted. */ | |
120 | if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL) | |
121 | return (wanted = false); | |
122 | ||
123 | /* Otherwise, assume that at least partial legacy is wanted, | |
124 | * since cgroup v2 should already be mounted at this point. */ | |
125 | return (wanted = true); | |
126 | } | |
127 | ||
/* Decides whether the hybrid hierarchy (v2 for systemd itself, v1 for controllers) should be used.
 * The result is cached per thread. */
bool cg_is_hybrid_wanted(void) {
        static thread_local int wanted = -1;
        int r;

        /* If we have a cached value, return that. */
        if (wanted >= 0)
                return wanted;

        /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
        if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL)
                return (wanted = false);

        /* Otherwise, let's see what the kernel command line has to say. Since checking is expensive, cache
         * a non-error result.
         * The meaning of the kernel option is reversed wrt. to the return value of this function, hence the
         * negation. */
        bool b;
        r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", /* flags = */ 0, &b);
        if (r > 0)
                return (wanted = !b);

        /* The default hierarchy is "unified". But if this is reached, it means that unified hierarchy was
         * not mounted, so return true too. */
        return (wanted = true);
}
153 | ||
f4f3efa4 MY |
/* Decides whether legacy cgroup v1 support shall be force-enabled, either because the container host
 * dictates it or because SYSTEMD_CGROUP_ENABLE_LEGACY_FORCE was set on the kernel command line. */
bool cg_is_legacy_force_enabled(void) {
        /* Forcing legacy only makes sense if legacy is wanted in the first place. */
        if (!cg_is_legacy_wanted())
                return false;

        /* If in container, we have to follow host's cgroup hierarchy. */
        if (detect_container() > 0)
                return true;

        bool b;
        if (proc_cmdline_get_bool("SYSTEMD_CGROUP_ENABLE_LEGACY_FORCE", /* flags = */ 0, &b) < 0)
                return false;

        return b;
}
169 | ||
fdb3deca ZJS |
170 | int cg_weight_parse(const char *s, uint64_t *ret) { |
171 | uint64_t u; | |
172 | int r; | |
173 | ||
174 | if (isempty(s)) { | |
175 | *ret = CGROUP_WEIGHT_INVALID; | |
176 | return 0; | |
177 | } | |
178 | ||
179 | r = safe_atou64(s, &u); | |
180 | if (r < 0) | |
181 | return r; | |
182 | ||
183 | if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX) | |
184 | return -ERANGE; | |
185 | ||
186 | *ret = u; | |
187 | return 0; | |
188 | } | |
189 | ||
c8340822 | 190 | int cg_cpu_weight_parse(const char *s, uint64_t *ret) { |
191 | if (streq_ptr(s, "idle")) | |
192 | return *ret = CGROUP_WEIGHT_IDLE; | |
193 | return cg_weight_parse(s, ret); | |
194 | } | |
195 | ||
fdb3deca ZJS |
196 | int cg_cpu_shares_parse(const char *s, uint64_t *ret) { |
197 | uint64_t u; | |
198 | int r; | |
199 | ||
200 | if (isempty(s)) { | |
201 | *ret = CGROUP_CPU_SHARES_INVALID; | |
202 | return 0; | |
203 | } | |
204 | ||
205 | r = safe_atou64(s, &u); | |
206 | if (r < 0) | |
207 | return r; | |
208 | ||
209 | if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX) | |
210 | return -ERANGE; | |
211 | ||
212 | *ret = u; | |
213 | return 0; | |
214 | } | |
215 | ||
216 | int cg_blkio_weight_parse(const char *s, uint64_t *ret) { | |
217 | uint64_t u; | |
218 | int r; | |
219 | ||
220 | if (isempty(s)) { | |
221 | *ret = CGROUP_BLKIO_WEIGHT_INVALID; | |
222 | return 0; | |
223 | } | |
224 | ||
225 | r = safe_atou64(s, &u); | |
226 | if (r < 0) | |
227 | return r; | |
228 | ||
229 | if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX) | |
230 | return -ERANGE; | |
231 | ||
232 | *ret = u; | |
233 | return 0; | |
234 | } | |
235 | ||
8e5aba7a LP |
/* recurse_dir() callback for cg_trim(): removes each sub-directory on the way out of it (i.e. depth-first,
 * children before parents). Always continues the iteration, regardless of rmdir outcome. */
static int trim_cb(
                RecurseDirEvent event,
                const char *path,
                int dir_fd,
                int inode_fd,
                const struct dirent *de,
                const struct statx *sx,
                void *userdata) {

        /* Failures to delete inner cgroup we ignore (but debug log in case error code is unexpected) */
        if (event == RECURSE_DIR_LEAVE &&
            de->d_type == DT_DIR &&
            unlinkat(dir_fd, de->d_name, AT_REMOVEDIR) < 0 &&
            !IN_SET(errno, ENOENT, ENOTEMPTY, EBUSY))
                log_debug_errno(errno, "Failed to trim inner cgroup %s, ignoring: %m", path);

        return RECURSE_DIR_CONTINUE;
}
254 | ||
/* Recursively removes all empty descendant cgroups of 'path' in the hierarchy of 'controller', and — if
 * 'delete_root' is true and 'path' is not the root — the cgroup itself. In hybrid mode the trim is
 * mirrored on the legacy systemd hierarchy (best-effort). Errors while removing inner cgroups are ignored;
 * a failure to remove the top-level cgroup is propagated. */
int cg_trim(const char *controller, const char *path, bool delete_root) {
        _cleanup_free_ char *fs = NULL;
        int r, q;

        assert(path);
        assert(controller);

        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        r = recurse_dir_at(
                        AT_FDCWD,
                        fs,
                        /* statx_mask= */ 0,
                        /* n_depth_max= */ UINT_MAX,
                        RECURSE_DIR_ENSURE_TYPE,
                        trim_cb,
                        NULL);
        if (r == -ENOENT) /* non-existing is the ultimate trimming, hence no error */
                r = 0;
        else if (r < 0)
                log_debug_errno(r, "Failed to iterate through cgroup %s: %m", path);

        /* If we shall delete the top-level cgroup, then propagate the failure to do so (except if it is
         * already gone anyway). Also, let's debug log about this failure, except if the error code is an
         * expected one. */
        if (delete_root && !empty_or_root(path) &&
            rmdir(fs) < 0 && errno != ENOENT) {
                if (!IN_SET(errno, ENOTEMPTY, EBUSY))
                        log_debug_errno(errno, "Failed to trim cgroup %s: %m", path);
                if (r >= 0)
                        r = -errno;
        }

        /* In hybrid mode, also trim the same path in the legacy systemd hierarchy (errors ignored). */
        q = cg_hybrid_unified();
        if (q < 0)
                return q;
        if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root);

        return r;
}
298 | ||
/* Create a cgroup in the hierarchy of controller.
 * Returns 0 if the group already existed, 1 on success, negative otherwise.
 *
 * Missing parent cgroups are created as needed. In hybrid mode, creation is mirrored on the legacy
 * systemd hierarchy (failure there is logged but non-fatal).
 */
int cg_create(const char *controller, const char *path) {
        _cleanup_free_ char *fs = NULL;
        int r;

        r = cg_get_path_and_check(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        r = mkdir_parents(fs, 0755);
        if (r < 0)
                return r;

        r = RET_NERRNO(mkdir(fs, 0755));
        if (r == -EEXIST)
                return 0;
        if (r < 0)
                return r;

        r = cg_hybrid_unified();
        if (r < 0)
                return r;

        if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
                if (r < 0)
                        log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path);
        }

        return 1;
}
332 | ||
/* Convenience helper: creates the cgroup 'path' in 'controller' (if missing) and then attaches 'pid'
 * (0 means the calling process) to it. Returns cg_create()'s result (0 if the group already existed,
 * 1 if freshly created) on success, negative on error. */
int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
        int created, r;

        assert(pid >= 0);

        created = cg_create(controller, path);
        if (created < 0)
                return created;

        r = cg_attach(controller, path, pid);
        if (r < 0)
                /* Note: the cgroup created above is intentionally left in place on failure. */
                return r;

        return created;
}
349 | ||
/* Attaches the process 'pid' (0 means the calling process) to the cgroup 'path' in the hierarchy of
 * 'controller' by writing its PID to cgroup.procs. Returns 0 on success, -EUCLEAN if the cgroup is in
 * threaded mode (where cgroup.procs is not writable), other negative errno-style values on failure.
 * In hybrid mode the attachment is mirrored on the legacy systemd hierarchy (best-effort). */
int cg_attach(const char *controller, const char *path, pid_t pid) {
        _cleanup_free_ char *fs = NULL;
        char c[DECIMAL_STR_MAX(pid_t) + 2];
        int r;

        assert(path);
        assert(pid >= 0);

        r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs);
        if (r < 0)
                return r;

        if (pid == 0)
                pid = getpid_cached();

        xsprintf(c, PID_FMT "\n", pid);

        r = write_string_file(fs, c, WRITE_STRING_FILE_DISABLE_BUFFER);
        if (r == -EOPNOTSUPP && cg_is_threaded(path) > 0)
                /* When the threaded mode is used, we cannot read/write the file. Let's return recognizable error. */
                return -EUCLEAN;
        if (r < 0)
                return r;

        r = cg_hybrid_unified();
        if (r < 0)
                return r;

        if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                /* Mirror the attachment into the legacy hierarchy; failure is logged but not propagated. */
                r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid);
                if (r < 0)
                        log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path);
        }

        return 0;
}
386 | ||
b2dcfd8e LP |
387 | int cg_fd_attach(int fd, pid_t pid) { |
388 | char c[DECIMAL_STR_MAX(pid_t) + 2]; | |
389 | ||
390 | assert(fd >= 0); | |
391 | assert(pid >= 0); | |
392 | ||
393 | if (pid == 0) | |
394 | pid = getpid_cached(); | |
395 | ||
396 | xsprintf(c, PID_FMT "\n", pid); | |
397 | ||
398 | return write_string_file_at(fd, "cgroup.procs", c, WRITE_STRING_FILE_DISABLE_BUFFER); | |
399 | } | |
400 | ||
fdb3deca ZJS |
401 | int cg_attach_fallback(const char *controller, const char *path, pid_t pid) { |
402 | int r; | |
403 | ||
404 | assert(controller); | |
405 | assert(path); | |
406 | assert(pid >= 0); | |
407 | ||
408 | r = cg_attach(controller, path, pid); | |
409 | if (r < 0) { | |
410 | char prefix[strlen(path) + 1]; | |
411 | ||
412 | /* This didn't work? Then let's try all prefixes of | |
413 | * the destination */ | |
414 | ||
415 | PATH_FOREACH_PREFIX(prefix, path) { | |
416 | int q; | |
417 | ||
418 | q = cg_attach(controller, prefix, pid); | |
419 | if (q >= 0) | |
420 | return q; | |
421 | } | |
422 | } | |
423 | ||
424 | return r; | |
425 | } | |
426 | ||
/* Changes ownership (to uid/gid) and access mode of the cgroup 'path' in the hierarchy of 'controller',
 * plus a fixed allowlist of attribute files needed for delegation. The cgroup directory itself gets mode
 * 0755, the attribute files 0644. Attributes marked non-fatal below may be missing (e.g. on older
 * kernels); failures on those are only debug-logged. In hybrid mode the change is propagated to the
 * legacy systemd hierarchy (best-effort). A no-op if both uid and gid are invalid. */
int cg_set_access(
                const char *controller,
                const char *path,
                uid_t uid,
                gid_t gid) {

        struct Attribute {
                const char *name;   /* attribute file name relative to the cgroup directory */
                bool fatal;         /* if true, a chmod/chown failure aborts the whole operation */
        };

        /* cgroup v1, aka legacy/non-unified */
        static const struct Attribute legacy_attributes[] = {
                { "cgroup.procs",           true  },
                { "tasks",                  false },
                { "cgroup.clone_children",  false },
                {},
        };

        /* cgroup v2, aka unified */
        static const struct Attribute unified_attributes[] = {
                { "cgroup.procs",           true  },
                { "cgroup.subtree_control", true  },
                { "cgroup.threads",         false },
                { "memory.oom.group",       false },
                { "memory.reclaim",         false },
                {},
        };

        /* Indexed by the boolean result of cg_unified_controller(). */
        static const struct Attribute* const attributes[] = {
                [false] = legacy_attributes,
                [true]  = unified_attributes,
        };

        _cleanup_free_ char *fs = NULL;
        const struct Attribute *i;
        int r, unified;

        assert(path);

        if (uid == UID_INVALID && gid == GID_INVALID)
                return 0;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;

        /* Configure access to the cgroup itself */
        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        r = chmod_and_chown(fs, 0755, uid, gid);
        if (r < 0)
                return r;

        /* Configure access to the cgroup's attributes */
        for (i = attributes[unified]; i->name; i++) {
                fs = mfree(fs);

                r = cg_get_path(controller, path, i->name, &fs);
                if (r < 0)
                        return r;

                r = chmod_and_chown(fs, 0644, uid, gid);
                if (r < 0) {
                        if (i->fatal)
                                return r;

                        log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs);
                }
        }

        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_hybrid_unified();
                if (r < 0)
                        return r;
                if (r > 0) {
                        /* Always propagate access mode from unified to legacy controller */
                        r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid);
                        if (r < 0)
                                log_debug_errno(r, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path);
                }
        }

        return 0;
}
514 | ||
bcd9b981 LP |
/* State shared with access_callback() while recursing: target ownership plus the last chown() errno. */
struct access_callback_data {
        uid_t uid;
        gid_t gid;
        int error;   /* last chown() errno seen, 0 if none; reported to the caller at the end */
};

/* recurse_dir() callback for cg_set_access_recursive(): chowns every directory entered and every entry
 * visited to the requested uid/gid. Failures are debug-logged and remembered, but never abort the walk. */
static int access_callback(
                RecurseDirEvent event,
                const char *path,
                int dir_fd,
                int inode_fd,
                const struct dirent *de,
                const struct statx *sx,
                void *userdata) {

        struct access_callback_data *d = ASSERT_PTR(userdata);

        if (!IN_SET(event, RECURSE_DIR_ENTER, RECURSE_DIR_ENTRY))
                return RECURSE_DIR_CONTINUE;

        assert(inode_fd >= 0);

        /* fchown() doesn't support O_PATH fds, hence we use the /proc/self/fd/ trick */
        if (chown(FORMAT_PROC_FD_PATH(inode_fd), d->uid, d->gid) < 0) {
                log_debug_errno(errno, "Failed to change ownership of '%s', ignoring: %m", ASSERT_PTR(path));

                if (d->error == 0) /* Return last error to caller */
                        d->error = errno;
        }

        return RECURSE_DIR_CONTINUE;
}
547 | ||
/* Recursively changes ownership of the cgroup 'path' (in the hierarchy of 'controller') and everything
 * below it to uid/gid. Returns 0 on success, a negative errno-style value on setup failure, or the
 * negated errno of the first chown() failure encountered during the walk. A no-op if both uid and gid
 * are invalid. */
int cg_set_access_recursive(
                const char *controller,
                const char *path,
                uid_t uid,
                gid_t gid) {

        _cleanup_close_ int fd = -EBADF;
        _cleanup_free_ char *fs = NULL;
        int r;

        /* A recursive version of cg_set_access(). But note that this one changes ownership of *all* files,
         * not just the allowlist that cg_set_access() uses. Use cg_set_access() on the cgroup you want to
         * delegate, and cg_set_access_recursive() for any subcgroups you might want to create below it. */

        if (!uid_is_valid(uid) && !gid_is_valid(gid))
                return 0;

        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        fd = open(fs, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
        if (fd < 0)
                return -errno;

        struct access_callback_data d = {
                .uid = uid,
                .gid = gid,
        };

        r = recurse_dir(fd,
                        fs,
                        /* statx_mask= */ 0,
                        /* n_depth_max= */ UINT_MAX,
                        RECURSE_DIR_SAME_MOUNT|RECURSE_DIR_INODE_FD|RECURSE_DIR_TOPLEVEL,
                        access_callback,
                        &d);
        if (r < 0)
                return r;

        /* d.error holds the errno of the last failed chown(), or 0 if all succeeded. */
        return -d.error;
}
590 | ||
fdb3deca ZJS |
/* Moves all processes from the cgroup pfrom (in hierarchy cfrom) into pto (in hierarchy cto).
 * Repeats the enumeration until a full pass moves nothing, since processes may fork while we work.
 * Returns > 0 if at least one process was moved, 0 if none were, or the first error gathered.
 * With CGROUP_IGNORE_SELF the calling process is skipped; kernel threads in the root cgroup are
 * always skipped. */
int cg_migrate(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        bool done = false;
        _cleanup_set_free_ Set *s = NULL;   /* PIDs we already (attempted to) move, to avoid ping-ponging */
        int r, ret = 0;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        do {
                _cleanup_fclose_ FILE *f = NULL;
                pid_t pid;

                done = true;

                r = cg_enumerate_processes(cfrom, pfrom, &f);
                if (r < 0)
                        return RET_GATHER(ret, r);

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        /* This might do weird stuff if we aren't a single-threaded program. However, we
                         * luckily know we are. */
                        if (FLAGS_SET(flags, CGROUP_IGNORE_SELF) && pid == getpid_cached())
                                continue;

                        if (set_contains(s, PID_TO_PTR(pid)))
                                continue;

                        /* Ignore kernel threads. Since they can only exist in the root cgroup, we only
                         * check for them there. */
                        if (cfrom && empty_or_root(pfrom) &&
                            pid_is_kernel_thread(pid) > 0)
                                continue;

                        r = cg_attach(cto, pto, pid);
                        if (r < 0) {
                                /* The process may have exited in the meantime (-ESRCH); that's fine. */
                                if (r != -ESRCH)
                                        RET_GATHER(ret, r);
                        } else if (ret == 0)
                                ret = 1;

                        /* We touched something, so do another pass to catch freshly forked children. */
                        done = false;

                        r = set_ensure_put(&s, /* hash_ops = */ NULL, PID_TO_PTR(pid));
                        if (r < 0)
                                return RET_GATHER(ret, r);
                }
                if (r < 0)
                        return RET_GATHER(ret, r);
        } while (!done);

        return ret;
}
651 | ||
/* Like cg_migrate(), but also walks all subgroups of pfrom recursively, migrating their processes into
 * the single destination pto. With CGROUP_REMOVE, the source cgroup is removed afterwards (ENOENT/EBUSY
 * tolerated). Returns > 0 if anything was moved, 0 if nothing was, or the first error encountered. */
int cg_migrate_recursive(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        _cleanup_closedir_ DIR *d = NULL;
        int r, ret = 0;
        char *fn;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        ret = cg_migrate(cfrom, pfrom, cto, pto, flags);

        r = cg_enumerate_subgroups(cfrom, pfrom, &d);
        if (r < 0) {
                /* A missing source group counts as "nothing left to migrate", not as an error. */
                if (ret >= 0 && r != -ENOENT)
                        return r;

                return ret;
        }

        while ((r = cg_read_subgroup(d, &fn)) > 0) {
                _cleanup_free_ char *p = NULL;

                p = path_join(empty_to_root(pfrom), fn);
                free(fn);
                if (!p)
                        return -ENOMEM;

                /* Recurse into the subgroup; remember the first error but keep going. */
                r = cg_migrate_recursive(cfrom, p, cto, pto, flags);
                if (r != 0 && ret >= 0)
                        ret = r;
        }

        if (r < 0 && ret >= 0)
                ret = r;

        if (flags & CGROUP_REMOVE) {
                r = cg_rmdir(cfrom, pfrom);
                if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
                        return r;
        }

        return ret;
}
702 | ||
703 | int cg_migrate_recursive_fallback( | |
704 | const char *cfrom, | |
705 | const char *pfrom, | |
706 | const char *cto, | |
707 | const char *pto, | |
708 | CGroupFlags flags) { | |
709 | ||
710 | int r; | |
711 | ||
712 | assert(cfrom); | |
713 | assert(pfrom); | |
714 | assert(cto); | |
715 | assert(pto); | |
716 | ||
717 | r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags); | |
718 | if (r < 0) { | |
719 | char prefix[strlen(pto) + 1]; | |
720 | ||
721 | /* This didn't work? Then let's try all prefixes of the destination */ | |
722 | ||
723 | PATH_FOREACH_PREFIX(prefix, pto) { | |
724 | int q; | |
725 | ||
726 | q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags); | |
727 | if (q >= 0) | |
728 | return q; | |
729 | } | |
730 | } | |
731 | ||
732 | return r; | |
733 | } | |
734 | ||
int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
        CGroupController c;
        CGroupMask done;
        bool created;
        int r;

        /* This one will create a cgroup in our private tree, but also
         * duplicate it in the trees specified in mask, and remove it
         * in all others.
         *
         * Returns 0 if the group already existed in the systemd hierarchy,
         * 1 on success, negative otherwise.
         */

        /* First create the cgroup in our own hierarchy. */
        r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
        if (r < 0)
                return r;
        created = r;

        /* If we are in the unified hierarchy, we are done now */
        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                return created;

        /* Only v1 controllers need per-hierarchy duplication; joined controllers share a hierarchy,
         * so mark all of a controller's joined siblings done in one go. */
        supported &= CGROUP_MASK_V1;
        mask = CGROUP_MASK_EXTEND_JOINED(mask);
        done = 0;

        /* Otherwise, do the same in the other hierarchies */
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *n;

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                n = cgroup_controller_to_string(c);
                if (FLAGS_SET(mask, bit))
                        (void) cg_create(n, path);   /* best-effort duplication */

                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return created;
}
786 | ||
/* Attaches 'pid' to the cgroup 'path' in the systemd hierarchy and — on non-unified setups — in every
 * supported v1 controller hierarchy as well. 'path_callback', if given, may supply a per-controller
 * path override; when it returns NULL, 'path' is used. Per-controller attach failures are ignored
 * (fallback to ancestor cgroups via cg_attach_fallback()). Returns 0 on success, negative only if the
 * attach in the systemd hierarchy itself fails. */
int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
        int r;

        r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
        if (r < 0)
                return r;

        /* On a fully unified setup there are no separate per-controller hierarchies to attach to. */
        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                return 0;

        supported &= CGROUP_MASK_V1;
        CGroupMask done = 0;

        for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *p = NULL;

                if (!FLAGS_SET(supported, bit))
                        continue;

                /* Joined controllers share one hierarchy; only attach once per group. */
                if (FLAGS_SET(done, bit))
                        continue;

                if (path_callback)
                        p = path_callback(bit, userdata);
                if (!p)
                        p = path;

                (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return 0;
}
824 | ||
/* For every supported v1 controller selected in 'mask', recursively migrates all processes from the
 * cgroup 'from' in the systemd hierarchy into the per-controller destination supplied by 'to_callback'.
 * Remembers the first error but keeps going; returns that first error, or the last cg_migrate result. */
int cg_migrate_v1_controllers(CGroupMask supported, CGroupMask mask, const char *from, cg_migrate_callback_t to_callback, void *userdata) {
        CGroupController c;
        CGroupMask done;
        int r = 0, q;

        assert(to_callback);

        supported &= CGROUP_MASK_V1;
        /* Joined controllers share one hierarchy, so extend the selection to all joined siblings. */
        mask = CGROUP_MASK_EXTEND_JOINED(mask);
        done = 0;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *to = NULL;

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                if (!FLAGS_SET(mask, bit))
                        continue;

                to = to_callback(bit, userdata);

                /* Remember first error and try continuing */
                q = cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, from, cgroup_controller_to_string(c), to, 0);
                r = (r < 0) ? r : q;

                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return r;
}
860 | ||
861 | int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) { | |
fdb3deca ZJS |
862 | int r, q; |
863 | ||
864 | r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root); | |
865 | if (r < 0) | |
866 | return r; | |
867 | ||
868 | q = cg_all_unified(); | |
869 | if (q < 0) | |
870 | return q; | |
871 | if (q > 0) | |
872 | return r; | |
873 | ||
7b639614 MK |
874 | return cg_trim_v1_controllers(supported, _CGROUP_MASK_ALL, path, delete_root); |
875 | } | |
876 | ||
/* Trims the cgroup 'path' in every supported v1 controller hierarchy selected in 'mask'. Remembers the
 * first error but keeps going; returns that first error, or the last trim result. */
int cg_trim_v1_controllers(CGroupMask supported, CGroupMask mask, const char *path, bool delete_root) {
        CGroupController c;
        CGroupMask done;
        int r = 0, q;

        supported &= CGROUP_MASK_V1;
        /* Joined controllers share one hierarchy, so extend the selection to all joined siblings. */
        mask = CGROUP_MASK_EXTEND_JOINED(mask);
        done = 0;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                if (FLAGS_SET(mask, bit)) {
                        /* Remember first error and try continuing */
                        q = cg_trim(cgroup_controller_to_string(c), path, delete_root);
                        r = (r < 0) ? r : q;
                }
                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return r;
}
905 | ||
/* Enables (for bits set in 'mask') or disables (for bits cleared) the supported v2 controllers for the
 * cgroup 'p' by writing "+name"/"-name" tokens to its cgroup.subtree_control file. On return,
 * '*ret_result_mask' (if non-NULL) holds the set of controllers believed enabled afterwards. On a
 * non-unified setup this is a no-op that claims success (see comment below for why). */
int cg_enable_everywhere(
                CGroupMask supported,
                CGroupMask mask,
                const char *p,
                CGroupMask *ret_result_mask) {

        _cleanup_fclose_ FILE *f = NULL;   /* opened lazily, on the first controller we actually toggle */
        _cleanup_free_ char *fs = NULL;
        CGroupController c;
        CGroupMask ret = 0;
        int r;

        assert(p);

        if (supported == 0) {
                if (ret_result_mask)
                        *ret_result_mask = 0;
                return 0;
        }

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r == 0) {
                /* On the legacy hierarchy there's no concept of "enabling" controllers in cgroups defined. Let's claim
                 * complete success right away. (If you wonder why we return the full mask here, rather than zero: the
                 * caller tends to use the returned mask later on to compare if all controllers where properly joined,
                 * and if not requeues realization. This use is the primary purpose of the return value, hence let's
                 * minimize surprises here and reduce triggers for re-realization by always saying we fully
                 * succeeded.) */
                if (ret_result_mask)
                        *ret_result_mask = mask & supported & CGROUP_MASK_V2; /* If you wonder why we mask this with
                                                                               * CGROUP_MASK_V2: The 'supported' mask
                                                                               * might contain pure-V1 or BPF
                                                                               * controllers, and we never want to
                                                                               * claim that we could enable those with
                                                                               * cgroup.subtree_control */
                return 0;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
        if (r < 0)
                return r;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *n;

                /* Only real v2 controllers can be toggled via cgroup.subtree_control. */
                if (!FLAGS_SET(CGROUP_MASK_V2, bit))
                        continue;

                if (!FLAGS_SET(supported, bit))
                        continue;

                n = cgroup_controller_to_string(c);
                {
                        /* Build "+<name>" or "-<name>" depending on whether we enable or disable. */
                        char s[1 + strlen(n) + 1];

                        s[0] = FLAGS_SET(mask, bit) ? '+' : '-';
                        strcpy(s + 1, n);

                        if (!f) {
                                f = fopen(fs, "we");
                                if (!f)
                                        return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
                        }

                        r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER);
                        if (r < 0) {
                                log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m",
                                                FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs);
                                clearerr(f);

                                /* If we can't turn off a controller, leave it on in the reported resulting mask. This
                                 * happens for example when we attempt to turn off a controller up in the tree that is
                                 * used down in the tree. */
                                if (!FLAGS_SET(mask, bit) && r == -EBUSY) /* You might wonder why we check for EBUSY
                                                                           * only here, and not follow the same logic
                                                                           * for other errors such as EINVAL or
                                                                           * EOPNOTSUPP or anything else. That's
                                                                           * because EBUSY indicates that the
                                                                           * controllers is currently enabled and
                                                                           * cannot be disabled because something down
                                                                           * the hierarchy is still using it. Any other
                                                                           * error most likely means something like "I
                                                                           * never heard of this controller" or
                                                                           * similar. In the former case it's hence
                                                                           * safe to assume the controller is still on
                                                                           * after the failed operation, while in the
                                                                           * latter case it's safer to assume the
                                                                           * controller is unknown and hence certainly
                                                                           * not enabled. */
                                        ret |= bit;
                        } else {
                                /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
                                if (FLAGS_SET(mask, bit))
                                        ret |= bit;
                        }
                }
        }

        /* Let's return the precise set of controllers now enabled for the cgroup. */
        if (ret_result_mask)
                *ret_result_mask = ret;

        return 0;
}