/* SPDX-License-Identifier: LGPL-2.1+ */

#include <dirent.h>
#include <errno.h>
#include <ftw.h>
#include <limits.h>
#include <signal.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/statfs.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/xattr.h>
#include <unistd.h>

#include "alloc-util.h"
#include "cgroup-util.h"
#include "def.h"
#include "dirent-util.h"
#include "extract-word.h"
#include "fd-util.h"
#include "fileio.h"
#include "format-util.h"
#include "fs-util.h"
#include "log.h"
#include "login-util.h"
#include "macro.h"
#include "missing.h"
#include "mkdir.h"
#include "parse-util.h"
#include "path-util.h"
#include "proc-cmdline.h"
#include "process-util.h"
#include "set.h"
#include "special.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strv.h"
#include "unit-name.h"
#include "user-util.h"

static int cg_enumerate_items(const char *controller, const char *path, FILE **_f, const char *item) {
        _cleanup_free_ char *fs = NULL;
        FILE *f;
        int r;

        assert(_f);

        r = cg_get_path(controller, path, item, &fs);
        if (r < 0)
                return r;

        f = fopen(fs, "re");
        if (!f)
                return -errno;

        *_f = f;
        return 0;
}

int cg_enumerate_processes(const char *controller, const char *path, FILE **_f) {
        return cg_enumerate_items(controller, path, _f, "cgroup.procs");
}

int cg_read_pid(FILE *f, pid_t *_pid) {
        unsigned long ul;

        /* Note that the cgroup.procs might contain duplicates! See
         * cgroups.txt for details. */

        assert(f);
        assert(_pid);

        errno = 0;
        if (fscanf(f, "%lu", &ul) != 1) {

                if (feof(f))
                        return 0;

                return errno_or_else(EIO);
        }

        if (ul <= 0)
                return -EIO;

        *_pid = (pid_t) ul;
        return 1;
}
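
/* Illustrative sketch (not part of the original file): how cg_enumerate_processes() and
 * cg_read_pid() are typically combined to walk the PIDs of a single cgroup. The helper
 * name and the idea of counting PIDs are made up for illustration; the #if 0 guard keeps
 * the sketch out of the build. */
#if 0
static int example_count_pids(const char *cgroup_path) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid;
        int r, n = 0;

        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, &f);
        if (r < 0)
                return r;

        /* cg_read_pid() returns 1 per PID, 0 at EOF, negative on error; note that
         * duplicates may appear in cgroup.procs. */
        while ((r = cg_read_pid(f, &pid)) > 0)
                n++;
        if (r < 0)
                return r;

        return n;
}
#endif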

int cg_read_event(
                const char *controller,
                const char *path,
                const char *event,
                char **ret) {

        _cleanup_free_ char *events = NULL, *content = NULL;
        int r;

        r = cg_get_path(controller, path, "cgroup.events", &events);
        if (r < 0)
                return r;

        r = read_full_file(events, &content, NULL);
        if (r < 0)
                return r;

        for (const char *p = content;;) {
                _cleanup_free_ char *line = NULL, *key = NULL, *val = NULL;
                const char *q;

                r = extract_first_word(&p, &line, "\n", 0);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -ENOENT;

                q = line;
                r = extract_first_word(&q, &key, " ", 0);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EINVAL;

                if (!streq(key, event))
                        continue;

                val = strdup(q);
                if (!val)
                        return -ENOMEM;

                *ret = TAKE_PTR(val);
                return 0;
        }
}
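
/* Illustrative sketch (not part of the original file): reading the "populated" key of
 * cgroup.events via cg_read_event(), the same pattern cg_is_empty_recursive() uses further
 * below on the unified hierarchy. The helper name is hypothetical; #if 0 keeps it out of
 * the build. */
#if 0
static int example_is_populated(const char *cgroup_path) {
        _cleanup_free_ char *t = NULL;
        int r;

        r = cg_read_event(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "populated", &t);
        if (r < 0)
                return r;

        /* The value is "1" if the subtree contains any processes, "0" otherwise. */
        return streq(t, "1");
}
#endif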

bool cg_ns_supported(void) {
        static thread_local int enabled = -1;

        if (enabled >= 0)
                return enabled;

        if (access("/proc/self/ns/cgroup", F_OK) < 0) {
                if (errno != ENOENT)
                        log_debug_errno(errno, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m");
                enabled = false;
        } else
                enabled = true;

        return enabled;
}

int cg_enumerate_subgroups(const char *controller, const char *path, DIR **_d) {
        _cleanup_free_ char *fs = NULL;
        int r;
        DIR *d;

        assert(_d);

        /* This is not recursive! */

        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        d = opendir(fs);
        if (!d)
                return -errno;

        *_d = d;
        return 0;
}

int cg_read_subgroup(DIR *d, char **fn) {
        struct dirent *de;

        assert(d);
        assert(fn);

        FOREACH_DIRENT_ALL(de, d, return -errno) {
                char *b;

                if (de->d_type != DT_DIR)
                        continue;

                if (dot_or_dot_dot(de->d_name))
                        continue;

                b = strdup(de->d_name);
                if (!b)
                        return -ENOMEM;

                *fn = b;
                return 1;
        }

        return 0;
}
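
/* Illustrative sketch (not part of the original file): the enumerate/read pair above is
 * normally used as a loop over a cgroup's direct children, which is exactly the pattern
 * cg_kill_recursive() and cg_migrate_recursive() follow below. The helper name is
 * hypothetical; #if 0 keeps it out of the build. */
#if 0
static int example_list_children(const char *cgroup_path) {
        _cleanup_closedir_ DIR *d = NULL;
        char *fn;
        int r;

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, &d);
        if (r < 0)
                return r;

        /* cg_read_subgroup() returns 1 with a freshly allocated name (caller frees),
         * 0 when done, negative on error. */
        while ((r = cg_read_subgroup(d, &fn)) > 0) {
                log_info("Child cgroup: %s", fn);
                free(fn);
        }

        return r;
}
#endif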

int cg_rmdir(const char *controller, const char *path) {
        _cleanup_free_ char *p = NULL;
        int r;

        r = cg_get_path(controller, path, NULL, &p);
        if (r < 0)
                return r;

        r = rmdir(p);
        if (r < 0 && errno != ENOENT)
                return -errno;

        r = cg_hybrid_unified();
        if (r <= 0)
                return r;

        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
                if (r < 0)
                        log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
        }

        return 0;
}

static int cg_kill_items(
                const char *controller,
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata,
                const char *item) {

        _cleanup_set_free_ Set *allocated_set = NULL;
        bool done = false;
        int r, ret = 0, ret_log_kill = 0;
        pid_t my_pid;

        assert(sig >= 0);

        /* Don't send SIGCONT twice. Also, SIGKILL always works even when the process is suspended, hence don't send
         * SIGCONT on SIGKILL. */
        if (IN_SET(sig, SIGCONT, SIGKILL))
                flags &= ~CGROUP_SIGCONT;

        /* This goes through the tasks list and kills them all. This
         * is repeated until no further processes are added to the
         * tasks list, to properly handle forking processes */

        if (!s) {
                s = allocated_set = set_new(NULL);
                if (!s)
                        return -ENOMEM;
        }

        my_pid = getpid_cached();

        do {
                _cleanup_fclose_ FILE *f = NULL;
                pid_t pid = 0;
                done = true;

                r = cg_enumerate_items(controller, path, &f, item);
                if (r < 0) {
                        if (ret >= 0 && r != -ENOENT)
                                return r;

                        return ret;
                }

                while ((r = cg_read_pid(f, &pid)) > 0) {

                        if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
                                continue;

                        if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
                                continue;

                        if (log_kill)
                                ret_log_kill = log_kill(pid, sig, userdata);

                        /* If we haven't killed this process yet, kill
                         * it */
                        if (kill(pid, sig) < 0) {
                                if (ret >= 0 && errno != ESRCH)
                                        ret = -errno;
                        } else {
                                if (flags & CGROUP_SIGCONT)
                                        (void) kill(pid, SIGCONT);

                                if (ret == 0) {
                                        if (log_kill)
                                                ret = ret_log_kill;
                                        else
                                                ret = 1;
                                }
                        }

                        done = false;

                        r = set_put(s, PID_TO_PTR(pid));
                        if (r < 0) {
                                if (ret >= 0)
                                        return r;

                                return ret;
                        }
                }

                if (r < 0) {
                        if (ret >= 0)
                                return r;

                        return ret;
                }

                /* To avoid racing against processes which fork
                 * quicker than we can kill them we repeat this until
                 * no new pids need to be killed. */

        } while (!done);

        return ret;
}

int cg_kill(
                const char *controller,
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata) {
        int r;

        r = cg_kill_items(controller, path, sig, flags, s, log_kill, userdata, "cgroup.procs");
        if (r < 0 || sig != SIGKILL)
                return r;

        /* Only in case of killing with SIGKILL and when using cgroup v2, kill remaining threads manually as
           a workaround for a kernel bug. It was fixed in 5.2-rc5 (c03cd7738a83). */
        r = cg_unified_controller(controller);
        if (r < 0)
                return r;
        if (r == 0) /* doesn't apply to legacy hierarchy */
                return 0;

        return cg_kill_items(controller, path, sig, flags, s, log_kill, userdata, "cgroup.threads");
}

int cg_kill_recursive(
                const char *controller,
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata) {

        _cleanup_set_free_ Set *allocated_set = NULL;
        _cleanup_closedir_ DIR *d = NULL;
        int r, ret;
        char *fn;

        assert(path);
        assert(sig >= 0);

        if (!s) {
                s = allocated_set = set_new(NULL);
                if (!s)
                        return -ENOMEM;
        }

        ret = cg_kill(controller, path, sig, flags, s, log_kill, userdata);

        r = cg_enumerate_subgroups(controller, path, &d);
        if (r < 0) {
                if (ret >= 0 && r != -ENOENT)
                        return r;

                return ret;
        }

        while ((r = cg_read_subgroup(d, &fn)) > 0) {
                _cleanup_free_ char *p = NULL;

                p = path_join(path, fn);
                free(fn);
                if (!p)
                        return -ENOMEM;

                r = cg_kill_recursive(controller, p, sig, flags, s, log_kill, userdata);
                if (r != 0 && ret >= 0)
                        ret = r;
        }
        if (ret >= 0 && r < 0)
                ret = r;

        if (flags & CGROUP_REMOVE) {
                r = cg_rmdir(controller, path);
                if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
                        return r;
        }

        return ret;
}
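
/* Illustrative sketch (not part of the original file): a typical cg_kill_recursive() call,
 * sending SIGTERM followed by SIGCONT to a whole subtree while skipping the calling
 * process. The callback and helper names are made-up examples; the callback signature
 * follows the cg_kill_log_func_t usage above. #if 0 keeps the sketch out of the build. */
#if 0
static int example_log_kill(pid_t pid, int sig, void *userdata) {
        log_info("Sending signal %i to PID "PID_FMT".", sig, pid);
        return 1;
}

static int example_terminate_subtree(const char *cgroup_path) {
        return cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, SIGTERM,
                                 CGROUP_SIGCONT|CGROUP_IGNORE_SELF,
                                 NULL, example_log_kill, NULL);
}
#endif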

int cg_migrate(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        bool done = false;
        _cleanup_set_free_ Set *s = NULL;
        int r, ret = 0;
        pid_t my_pid;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        s = set_new(NULL);
        if (!s)
                return -ENOMEM;

        my_pid = getpid_cached();

        do {
                _cleanup_fclose_ FILE *f = NULL;
                pid_t pid = 0;
                done = true;

                r = cg_enumerate_processes(cfrom, pfrom, &f);
                if (r < 0) {
                        if (ret >= 0 && r != -ENOENT)
                                return r;

                        return ret;
                }

                while ((r = cg_read_pid(f, &pid)) > 0) {

                        /* This might do weird stuff if we aren't a
                         * single-threaded program. However, we
                         * luckily know we are not */
                        if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
                                continue;

                        if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
                                continue;

                        /* Ignore kernel threads. Since they can only
                         * exist in the root cgroup, we only check for
                         * them there. */
                        if (cfrom &&
                            empty_or_root(pfrom) &&
                            is_kernel_thread(pid) > 0)
                                continue;

                        r = cg_attach(cto, pto, pid);
                        if (r < 0) {
                                if (ret >= 0 && r != -ESRCH)
                                        ret = r;
                        } else if (ret == 0)
                                ret = 1;

                        done = false;

                        r = set_put(s, PID_TO_PTR(pid));
                        if (r < 0) {
                                if (ret >= 0)
                                        return r;

                                return ret;
                        }
                }

                if (r < 0) {
                        if (ret >= 0)
                                return r;

                        return ret;
                }
        } while (!done);

        return ret;
}

int cg_migrate_recursive(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        _cleanup_closedir_ DIR *d = NULL;
        int r, ret = 0;
        char *fn;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        ret = cg_migrate(cfrom, pfrom, cto, pto, flags);

        r = cg_enumerate_subgroups(cfrom, pfrom, &d);
        if (r < 0) {
                if (ret >= 0 && r != -ENOENT)
                        return r;

                return ret;
        }

        while ((r = cg_read_subgroup(d, &fn)) > 0) {
                _cleanup_free_ char *p = NULL;

                p = path_join(pfrom, fn);
                free(fn);
                if (!p)
                        return -ENOMEM;

                r = cg_migrate_recursive(cfrom, p, cto, pto, flags);
                if (r != 0 && ret >= 0)
                        ret = r;
        }

        if (r < 0 && ret >= 0)
                ret = r;

        if (flags & CGROUP_REMOVE) {
                r = cg_rmdir(cfrom, pfrom);
                if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
                        return r;
        }

        return ret;
}

int cg_migrate_recursive_fallback(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        int r;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags);
        if (r < 0) {
                char prefix[strlen(pto) + 1];

                /* This didn't work? Then let's try all prefixes of the destination */

                PATH_FOREACH_PREFIX(prefix, pto) {
                        int q;

                        q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags);
                        if (q >= 0)
                                return q;
                }
        }

        return r;
}

static const char *controller_to_dirname(const char *controller) {
        const char *e;

        assert(controller);

        /* Converts a controller name to the directory name below
         * /sys/fs/cgroup/ we want to mount it to. Effectively, this
         * just cuts off the name= prefix used for named
         * hierarchies, if it is specified. */

        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                if (cg_hybrid_unified() > 0)
                        controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
                else
                        controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
        }

        e = startswith(controller, "name=");
        if (e)
                return e;

        return controller;
}

static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **fs) {
        const char *dn;
        char *t = NULL;

        assert(fs);
        assert(controller);

        dn = controller_to_dirname(controller);

        if (isempty(path) && isempty(suffix))
                t = path_join("/sys/fs/cgroup", dn);
        else if (isempty(path))
                t = path_join("/sys/fs/cgroup", dn, suffix);
        else if (isempty(suffix))
                t = path_join("/sys/fs/cgroup", dn, path);
        else
                t = path_join("/sys/fs/cgroup", dn, path, suffix);
        if (!t)
                return -ENOMEM;

        *fs = t;
        return 0;
}

static int join_path_unified(const char *path, const char *suffix, char **fs) {
        char *t;

        assert(fs);

        if (isempty(path) && isempty(suffix))
                t = strdup("/sys/fs/cgroup");
        else if (isempty(path))
                t = path_join("/sys/fs/cgroup", suffix);
        else if (isempty(suffix))
                t = path_join("/sys/fs/cgroup", path);
        else
                t = path_join("/sys/fs/cgroup", path, suffix);
        if (!t)
                return -ENOMEM;

        *fs = t;
        return 0;
}

int cg_get_path(const char *controller, const char *path, const char *suffix, char **fs) {
        int r;

        assert(fs);

        if (!controller) {
                char *t;

                /* If no controller is specified, we return the path
                 * *below* the controllers, without any prefix. */

                if (!path && !suffix)
                        return -EINVAL;

                if (!suffix)
                        t = strdup(path);
                else if (!path)
                        t = strdup(suffix);
                else
                        t = path_join(path, suffix);
                if (!t)
                        return -ENOMEM;

                *fs = path_simplify(t, false);
                return 0;
        }

        if (!cg_controller_is_valid(controller))
                return -EINVAL;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                r = join_path_unified(path, suffix, fs);
        else
                r = join_path_legacy(controller, path, suffix, fs);
        if (r < 0)
                return r;

        path_simplify(*fs, false);
        return 0;
}
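
/* Illustrative sketch (not part of the original file): what cg_get_path() composes. With a
 * NULL controller it merely joins and simplifies path and suffix; with a controller it
 * prepends the matching /sys/fs/cgroup/... location, picking the unified or legacy layout
 * via cg_all_unified(). The cgroup path below is a made-up example and the exact result
 * depends on the host's hierarchy setup; #if 0 keeps this out of the build. */
#if 0
static int example_procs_path(void) {
        _cleanup_free_ char *fs = NULL;
        int r;

        /* On a unified-hierarchy host this yields something like
         * "/sys/fs/cgroup/foo.slice/cgroup.procs"; on a legacy or hybrid host the
         * directory of the systemd named hierarchy is inserted instead. */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, "/foo.slice", "cgroup.procs", &fs);
        if (r < 0)
                return r;

        log_info("cgroup.procs lives at %s", fs);
        return 0;
}
#endif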
dbd821ac 690
efdb0237 691static int controller_is_accessible(const char *controller) {
b4cccbc1 692 int r;
37099707 693
efdb0237 694 assert(controller);
37099707 695
efdb0237
LP
696 /* Checks whether a specific controller is accessible,
697 * i.e. its hierarchy mounted. In the unified hierarchy all
698 * controllers are considered accessible, except for the named
699 * hierarchies */
b12afc8c 700
efdb0237
LP
701 if (!cg_controller_is_valid(controller))
702 return -EINVAL;
703
b4cccbc1
LP
704 r = cg_all_unified();
705 if (r < 0)
706 return r;
707 if (r > 0) {
efdb0237
LP
708 /* We don't support named hierarchies if we are using
709 * the unified hierarchy. */
710
711 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
712 return 0;
713
714 if (startswith(controller, "name="))
715 return -EOPNOTSUPP;
716
717 } else {
718 const char *cc, *dn;
719
720 dn = controller_to_dirname(controller);
721 cc = strjoina("/sys/fs/cgroup/", dn);
722
723 if (laccess(cc, F_OK) < 0)
724 return -errno;
725 }
37099707
LP
726
727 return 0;
728}
729
3474ae3c 730int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs) {
37099707 731 int r;
dbd821ac 732
efdb0237 733 assert(controller);
3474ae3c 734 assert(fs);
70132bd0 735
efdb0237
LP
736 /* Check if the specified controller is actually accessible */
737 r = controller_is_accessible(controller);
37099707
LP
738 if (r < 0)
739 return r;
3474ae3c 740
efdb0237 741 return cg_get_path(controller, path, suffix, fs);
8c6db833
LP
742}
743
e27796a0 744static int trim_cb(const char *path, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
4ad49000
LP
745 assert(path);
746 assert(sb);
747 assert(ftwbuf);
e27796a0
LP
748
749 if (typeflag != FTW_DP)
750 return 0;
751
752 if (ftwbuf->level < 1)
753 return 0;
754
e155a0aa 755 (void) rmdir(path);
e27796a0
LP
756 return 0;
757}
758
8c6db833 759int cg_trim(const char *controller, const char *path, bool delete_root) {
7027ff61 760 _cleanup_free_ char *fs = NULL;
2977724b 761 int r = 0, q;
8c6db833 762
8c6db833
LP
763 assert(path);
764
e27796a0
LP
765 r = cg_get_path(controller, path, NULL, &fs);
766 if (r < 0)
8c6db833
LP
767 return r;
768
e27796a0 769 errno = 0;
e155a0aa
LP
770 if (nftw(fs, trim_cb, 64, FTW_DEPTH|FTW_MOUNT|FTW_PHYS) != 0) {
771 if (errno == ENOENT)
772 r = 0;
e155a0aa 773 else
66855de7 774 r = errno_or_else(EIO);
e155a0aa 775 }
e27796a0
LP
776
777 if (delete_root) {
4ad49000
LP
778 if (rmdir(fs) < 0 && errno != ENOENT)
779 return -errno;
e27796a0
LP
780 }
781
b4cccbc1
LP
782 q = cg_hybrid_unified();
783 if (q < 0)
784 return q;
785 if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
2977724b
TH
786 q = cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root);
787 if (q < 0)
788 log_warning_errno(q, "Failed to trim compat systemd cgroup %s: %m", path);
789 }
790
e27796a0 791 return r;
8c6db833
LP
792}
793
65be7e06
ZJS
794/* Create a cgroup in the hierarchy of controller.
795 * Returns 0 if the group already existed, 1 on success, negative otherwise.
796 */
1434ae6f
LP
797int cg_create(const char *controller, const char *path) {
798 _cleanup_free_ char *fs = NULL;
799 int r;
800
801 r = cg_get_path_and_check(controller, path, NULL, &fs);
802 if (r < 0)
803 return r;
804
805 r = mkdir_parents(fs, 0755);
806 if (r < 0)
807 return r;
808
dae8b82e
ZJS
809 r = mkdir_errno_wrapper(fs, 0755);
810 if (r == -EEXIST)
811 return 0;
812 if (r < 0)
813 return r;
1434ae6f 814
b4cccbc1
LP
815 r = cg_hybrid_unified();
816 if (r < 0)
817 return r;
818
819 if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
2977724b
TH
820 r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
821 if (r < 0)
822 log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path);
823 }
824
1434ae6f
LP
825 return 1;
826}
827
828int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
829 int r, q;
830
831 assert(pid >= 0);
832
833 r = cg_create(controller, path);
834 if (r < 0)
835 return r;
836
837 q = cg_attach(controller, path, pid);
838 if (q < 0)
839 return q;
840
841 /* This does not remove the cgroup on failure */
842 return r;
843}
844
8c6db833 845int cg_attach(const char *controller, const char *path, pid_t pid) {
574d5f2d
LP
846 _cleanup_free_ char *fs = NULL;
847 char c[DECIMAL_STR_MAX(pid_t) + 2];
8c6db833
LP
848 int r;
849
8c6db833
LP
850 assert(path);
851 assert(pid >= 0);
852
b043cd0b 853 r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs);
3474ae3c 854 if (r < 0)
c6c18be3 855 return r;
8c6db833
LP
856
857 if (pid == 0)
df0ff127 858 pid = getpid_cached();
8c6db833 859
d054f0a4 860 xsprintf(c, PID_FMT "\n", pid);
8c6db833 861
604028de 862 r = write_string_file(fs, c, WRITE_STRING_FILE_DISABLE_BUFFER);
2977724b
TH
863 if (r < 0)
864 return r;
865
b4cccbc1
LP
866 r = cg_hybrid_unified();
867 if (r < 0)
868 return r;
869
870 if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
2977724b
TH
871 r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid);
872 if (r < 0)
bd68e99b 873 log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path);
2977724b
TH
874 }
875
876 return 0;
8c6db833
LP
877}
878
13b84ec7
LP
879int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
880 int r;
881
882 assert(controller);
883 assert(path);
884 assert(pid >= 0);
885
886 r = cg_attach(controller, path, pid);
887 if (r < 0) {
888 char prefix[strlen(path) + 1];
889
890 /* This didn't work? Then let's try all prefixes of
891 * the destination */
892
fecffe5d 893 PATH_FOREACH_PREFIX(prefix, path) {
e155a0aa
LP
894 int q;
895
896 q = cg_attach(controller, prefix, pid);
897 if (q >= 0)
898 return q;
13b84ec7
LP
899 }
900 }
901
e155a0aa 902 return r;
13b84ec7
LP
903}
904
62b9bb26 905int cg_set_access(
2d76d14e
LP
906 const char *controller,
907 const char *path,
2d76d14e
LP
908 uid_t uid,
909 gid_t gid) {
910
62b9bb26
LP
911 struct Attribute {
912 const char *name;
913 bool fatal;
914 };
915
4e1dfa45 916 /* cgroup v1, aka legacy/non-unified */
62b9bb26
LP
917 static const struct Attribute legacy_attributes[] = {
918 { "cgroup.procs", true },
919 { "tasks", false },
920 { "cgroup.clone_children", false },
921 {},
922 };
923
4e1dfa45 924 /* cgroup v2, aka unified */
62b9bb26
LP
925 static const struct Attribute unified_attributes[] = {
926 { "cgroup.procs", true },
927 { "cgroup.subtree_control", true },
928 { "cgroup.threads", false },
929 {},
930 };
931
932 static const struct Attribute* const attributes[] = {
933 [false] = legacy_attributes,
934 [true] = unified_attributes,
935 };
974efc46 936
40853aa5 937 _cleanup_free_ char *fs = NULL;
62b9bb26
LP
938 const struct Attribute *i;
939 int r, unified;
8c6db833 940
8c6db833
LP
941 assert(path);
942
62b9bb26 943 if (uid == UID_INVALID && gid == GID_INVALID)
8d53b453
LP
944 return 0;
945
62b9bb26
LP
946 unified = cg_unified_controller(controller);
947 if (unified < 0)
948 return unified;
8c6db833 949
62b9bb26
LP
950 /* Configure access to the cgroup itself */
951 r = cg_get_path(controller, path, NULL, &fs);
974efc46
LP
952 if (r < 0)
953 return r;
8c6db833 954
62b9bb26 955 r = chmod_and_chown(fs, 0755, uid, gid);
b4cccbc1
LP
956 if (r < 0)
957 return r;
40853aa5 958
62b9bb26
LP
959 /* Configure access to the cgroup's attributes */
960 for (i = attributes[unified]; i->name; i++) {
40853aa5 961 fs = mfree(fs);
40853aa5 962
62b9bb26 963 r = cg_get_path(controller, path, i->name, &fs);
40853aa5
LP
964 if (r < 0)
965 return r;
efdb0237 966
62b9bb26
LP
967 r = chmod_and_chown(fs, 0644, uid, gid);
968 if (r < 0) {
969 if (i->fatal)
970 return r;
5beac75e 971
62b9bb26
LP
972 log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs);
973 }
974 }
975
976 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
977 r = cg_hybrid_unified();
2977724b 978 if (r < 0)
62b9bb26
LP
979 return r;
980 if (r > 0) {
981 /* Always propagate access mode from unified to legacy controller */
982 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid);
983 if (r < 0)
984 log_debug_errno(r, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path);
985 }
2977724b 986 }
974efc46 987
efdb0237 988 return 0;
8c6db833
LP
989}
990
4b58153d
LP
991int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags) {
992 _cleanup_free_ char *fs = NULL;
993 int r;
994
995 assert(path);
996 assert(name);
997 assert(value || size <= 0);
998
999 r = cg_get_path(controller, path, NULL, &fs);
1000 if (r < 0)
1001 return r;
1002
1003 if (setxattr(fs, name, value, size, flags) < 0)
1004 return -errno;
1005
1006 return 0;
1007}
1008
1009int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size) {
1010 _cleanup_free_ char *fs = NULL;
1011 ssize_t n;
1012 int r;
1013
1014 assert(path);
1015 assert(name);
1016
1017 r = cg_get_path(controller, path, NULL, &fs);
1018 if (r < 0)
1019 return r;
1020
1021 n = getxattr(fs, name, value, size);
1022 if (n < 0)
1023 return -errno;
1024
1025 return (int) n;
1026}
1027
7027ff61 1028int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
7027ff61 1029 _cleanup_fclose_ FILE *f = NULL;
b6629c4b 1030 const char *fs, *controller_str;
d2b39cb6 1031 int unified, r;
efdb0237 1032 size_t cs = 0;
8c6db833 1033
8c6db833 1034 assert(path);
c6c18be3 1035 assert(pid >= 0);
8c6db833 1036
5da38d07
TH
1037 if (controller) {
1038 if (!cg_controller_is_valid(controller))
1039 return -EINVAL;
1040 } else
1041 controller = SYSTEMD_CGROUP_CONTROLLER;
1042
c22800e4 1043 unified = cg_unified_controller(controller);
b4cccbc1
LP
1044 if (unified < 0)
1045 return unified;
1046 if (unified == 0) {
b6629c4b
TH
1047 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
1048 controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
1049 else
1050 controller_str = controller;
1051
1052 cs = strlen(controller_str);
1053 }
7027ff61 1054
b68fa010 1055 fs = procfs_file_alloca(pid, "cgroup");
fdeea3f4
ZJS
1056 r = fopen_unlocked(fs, "re", &f);
1057 if (r == -ENOENT)
1058 return -ESRCH;
1059 if (r < 0)
1060 return r;
35bbbf85 1061
d2b39cb6
LP
1062 for (;;) {
1063 _cleanup_free_ char *line = NULL;
efdb0237 1064 char *e, *p;
c6c18be3 1065
d2b39cb6
LP
1066 r = read_line(f, LONG_LINE_MAX, &line);
1067 if (r < 0)
1068 return r;
1069 if (r == 0)
1070 break;
c6c18be3 1071
efdb0237
LP
1072 if (unified) {
1073 e = startswith(line, "0:");
1074 if (!e)
1075 continue;
c6c18be3 1076
efdb0237
LP
1077 e = strchr(e, ':');
1078 if (!e)
1079 continue;
1080 } else {
1081 char *l;
1082 size_t k;
1083 const char *word, *state;
1084 bool found = false;
1085
1086 l = strchr(line, ':');
1087 if (!l)
1088 continue;
8af8afd6 1089
efdb0237
LP
1090 l++;
1091 e = strchr(l, ':');
1092 if (!e)
1093 continue;
8af8afd6 1094
efdb0237 1095 *e = 0;
00d4b1e6 1096 FOREACH_WORD_SEPARATOR(word, k, l, ",", state)
b6629c4b 1097 if (k == cs && memcmp(word, controller_str, cs) == 0) {
efdb0237
LP
1098 found = true;
1099 break;
1100 }
efdb0237
LP
1101 if (!found)
1102 continue;
8af8afd6
LP
1103 }
1104
8af8afd6 1105 p = strdup(e + 1);
7027ff61
LP
1106 if (!p)
1107 return -ENOMEM;
c6c18be3 1108
5e20b0a4
LP
1109 /* Truncate suffix indicating the process is a zombie */
1110 e = endswith(p, " (deleted)");
1111 if (e)
1112 *e = 0;
1113
c6c18be3 1114 *path = p;
7027ff61 1115 return 0;
c6c18be3
LP
1116 }
1117
1c80e425 1118 return -ENODATA;
8c6db833
LP
1119}
1120
1121int cg_install_release_agent(const char *controller, const char *agent) {
7027ff61 1122 _cleanup_free_ char *fs = NULL, *contents = NULL;
efdb0237 1123 const char *sc;
415fc41c 1124 int r;
8c6db833 1125
8c6db833
LP
1126 assert(agent);
1127
c22800e4 1128 r = cg_unified_controller(controller);
b4cccbc1
LP
1129 if (r < 0)
1130 return r;
1131 if (r > 0) /* doesn't apply to unified hierarchy */
efdb0237
LP
1132 return -EOPNOTSUPP;
1133
7027ff61
LP
1134 r = cg_get_path(controller, NULL, "release_agent", &fs);
1135 if (r < 0)
c6c18be3 1136 return r;
8c6db833 1137
7027ff61
LP
1138 r = read_one_line_file(fs, &contents);
1139 if (r < 0)
1140 return r;
8c6db833
LP
1141
1142 sc = strstrip(contents);
e155a0aa 1143 if (isempty(sc)) {
604028de 1144 r = write_string_file(fs, agent, WRITE_STRING_FILE_DISABLE_BUFFER);
574d5f2d 1145 if (r < 0)
7027ff61 1146 return r;
b8725df8 1147 } else if (!path_equal(sc, agent))
7027ff61 1148 return -EEXIST;
8c6db833 1149
0da16248 1150 fs = mfree(fs);
7027ff61
LP
1151 r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1152 if (r < 0)
1153 return r;
8c6db833 1154
0da16248 1155 contents = mfree(contents);
7027ff61
LP
1156 r = read_one_line_file(fs, &contents);
1157 if (r < 0)
1158 return r;
8c6db833
LP
1159
1160 sc = strstrip(contents);
8c6db833 1161 if (streq(sc, "0")) {
604028de 1162 r = write_string_file(fs, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
7027ff61
LP
1163 if (r < 0)
1164 return r;
c6c18be3 1165
7027ff61
LP
1166 return 1;
1167 }
8c6db833 1168
7027ff61
LP
1169 if (!streq(sc, "1"))
1170 return -EIO;
8c6db833 1171
7027ff61 1172 return 0;
8c6db833
LP
1173}
1174
ad929bcc
KS
1175int cg_uninstall_release_agent(const char *controller) {
1176 _cleanup_free_ char *fs = NULL;
415fc41c 1177 int r;
efdb0237 1178
c22800e4 1179 r = cg_unified_controller(controller);
b4cccbc1
LP
1180 if (r < 0)
1181 return r;
1182 if (r > 0) /* Doesn't apply to unified hierarchy */
efdb0237 1183 return -EOPNOTSUPP;
ad929bcc 1184
ac9ef333
LP
1185 r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1186 if (r < 0)
1187 return r;
1188
604028de 1189 r = write_string_file(fs, "0", WRITE_STRING_FILE_DISABLE_BUFFER);
ac9ef333
LP
1190 if (r < 0)
1191 return r;
1192
0da16248 1193 fs = mfree(fs);
ac9ef333 1194
ad929bcc
KS
1195 r = cg_get_path(controller, NULL, "release_agent", &fs);
1196 if (r < 0)
1197 return r;
1198
604028de 1199 r = write_string_file(fs, "", WRITE_STRING_FILE_DISABLE_BUFFER);
ad929bcc
KS
1200 if (r < 0)
1201 return r;
1202
ac9ef333 1203 return 0;
ad929bcc
KS
1204}
1205
6f883237 1206int cg_is_empty(const char *controller, const char *path) {
7027ff61 1207 _cleanup_fclose_ FILE *f = NULL;
efdb0237 1208 pid_t pid;
7027ff61 1209 int r;
8c6db833 1210
8c6db833
LP
1211 assert(path);
1212
b043cd0b 1213 r = cg_enumerate_processes(controller, path, &f);
6f883237 1214 if (r == -ENOENT)
1bcf3fc6 1215 return true;
c3175a7f 1216 if (r < 0)
6f883237 1217 return r;
8c6db833 1218
6f883237 1219 r = cg_read_pid(f, &pid);
c6c18be3
LP
1220 if (r < 0)
1221 return r;
8c6db833 1222
6f883237 1223 return r == 0;
8c6db833
LP
1224}
1225
6f883237 1226int cg_is_empty_recursive(const char *controller, const char *path) {
415fc41c 1227 int r;
8c6db833 1228
8c6db833
LP
1229 assert(path);
1230
6fd66507 1231 /* The root cgroup is always populated */
57ea45e1 1232 if (controller && empty_or_root(path))
efdb0237 1233 return false;
6fd66507 1234
c22800e4 1235 r = cg_unified_controller(controller);
b4cccbc1
LP
1236 if (r < 0)
1237 return r;
1238 if (r > 0) {
ab2c3861 1239 _cleanup_free_ char *t = NULL;
8c6db833 1240
efdb0237 1241 /* On the unified hierarchy we can check empty state
ab2c3861 1242 * via the "populated" attribute of "cgroup.events". */
8c6db833 1243
ab2c3861 1244 r = cg_read_event(controller, path, "populated", &t);
1bcf3fc6
ZJS
1245 if (r == -ENOENT)
1246 return true;
efdb0237
LP
1247 if (r < 0)
1248 return r;
1249
1250 return streq(t, "0");
1251 } else {
1252 _cleanup_closedir_ DIR *d = NULL;
1253 char *fn;
8c6db833 1254
efdb0237 1255 r = cg_is_empty(controller, path);
35d2e7ec 1256 if (r <= 0)
7027ff61 1257 return r;
35d2e7ec 1258
efdb0237
LP
1259 r = cg_enumerate_subgroups(controller, path, &d);
1260 if (r == -ENOENT)
1bcf3fc6 1261 return true;
efdb0237
LP
1262 if (r < 0)
1263 return r;
35d2e7ec 1264
efdb0237
LP
1265 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1266 _cleanup_free_ char *p = NULL;
1267
657ee2d8 1268 p = path_join(path, fn);
efdb0237
LP
1269 free(fn);
1270 if (!p)
1271 return -ENOMEM;
1272
1273 r = cg_is_empty_recursive(controller, p);
1274 if (r <= 0)
1275 return r;
1276 }
1277 if (r < 0)
1278 return r;
1279
1280 return true;
1281 }
35d2e7ec
LP
1282}
1283
1284int cg_split_spec(const char *spec, char **controller, char **path) {
35d2e7ec 1285 char *t = NULL, *u = NULL;
efdb0237 1286 const char *e;
35d2e7ec
LP
1287
1288 assert(spec);
35d2e7ec
LP
1289
1290 if (*spec == '/') {
99be45a4 1291 if (!path_is_normalized(spec))
e884315e 1292 return -EINVAL;
35d2e7ec
LP
1293
1294 if (path) {
246aa6dd
LP
1295 t = strdup(spec);
1296 if (!t)
35d2e7ec
LP
1297 return -ENOMEM;
1298
858d36c1 1299 *path = path_simplify(t, false);
8c6db833
LP
1300 }
1301
35d2e7ec
LP
1302 if (controller)
1303 *controller = NULL;
1304
1305 return 0;
8c6db833
LP
1306 }
1307
246aa6dd
LP
1308 e = strchr(spec, ':');
1309 if (!e) {
185a0874 1310 if (!cg_controller_is_valid(spec))
35d2e7ec
LP
1311 return -EINVAL;
1312
1313 if (controller) {
efdb0237 1314 t = strdup(spec);
246aa6dd 1315 if (!t)
35d2e7ec
LP
1316 return -ENOMEM;
1317
1318 *controller = t;
1319 }
1320
1321 if (path)
1322 *path = NULL;
1323
1324 return 0;
8c6db833
LP
1325 }
1326
efdb0237 1327 t = strndup(spec, e-spec);
e884315e
LP
1328 if (!t)
1329 return -ENOMEM;
185a0874 1330 if (!cg_controller_is_valid(t)) {
e884315e 1331 free(t);
35d2e7ec 1332 return -EINVAL;
246aa6dd
LP
1333 }
1334
efdb0237
LP
1335 if (isempty(e+1))
1336 u = NULL;
1337 else {
baa89da4
LP
1338 u = strdup(e+1);
1339 if (!u) {
1340 free(t);
1341 return -ENOMEM;
1342 }
35d2e7ec 1343
99be45a4 1344 if (!path_is_normalized(u) ||
baa89da4
LP
1345 !path_is_absolute(u)) {
1346 free(t);
1347 free(u);
1348 return -EINVAL;
1349 }
1350
858d36c1 1351 path_simplify(u, false);
baa89da4 1352 }
5954c074 1353
35d2e7ec
LP
1354 if (controller)
1355 *controller = t;
e884315e
LP
1356 else
1357 free(t);
35d2e7ec
LP
1358
1359 if (path)
1360 *path = u;
e884315e
LP
1361 else
1362 free(u);
35d2e7ec
LP
1363
1364 return 0;
8c6db833 1365}
c6c18be3 1366
7027ff61 1367int cg_mangle_path(const char *path, char **result) {
78edb35a
LP
1368 _cleanup_free_ char *c = NULL, *p = NULL;
1369 char *t;
35d2e7ec
LP
1370 int r;
1371
1372 assert(path);
1373 assert(result);
1374
73e231ab 1375 /* First, check if it already is a filesystem path */
7027ff61 1376 if (path_startswith(path, "/sys/fs/cgroup")) {
35d2e7ec 1377
b69d29ce
LP
1378 t = strdup(path);
1379 if (!t)
35d2e7ec
LP
1380 return -ENOMEM;
1381
858d36c1 1382 *result = path_simplify(t, false);
35d2e7ec
LP
1383 return 0;
1384 }
1385
73e231ab 1386 /* Otherwise, treat it as cg spec */
b69d29ce
LP
1387 r = cg_split_spec(path, &c, &p);
1388 if (r < 0)
35d2e7ec
LP
1389 return r;
1390
efdb0237 1391 return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, result);
35d2e7ec 1392}
1f73f0f1 1393
7027ff61 1394int cg_get_root_path(char **path) {
9444b1f2 1395 char *p, *e;
7027ff61
LP
1396 int r;
1397
1398 assert(path);
1399
9444b1f2 1400 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
7027ff61
LP
1401 if (r < 0)
1402 return r;
1403
efdb0237
LP
1404 e = endswith(p, "/" SPECIAL_INIT_SCOPE);
1405 if (!e)
1406 e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
1407 if (!e)
1408 e = endswith(p, "/system"); /* even more legacy */
9444b1f2 1409 if (e)
7027ff61
LP
1410 *e = 0;
1411
1f73f0f1
LP
1412 *path = p;
1413 return 0;
1414}
b59e2465 1415
751bc6ac
LP
1416int cg_shift_path(const char *cgroup, const char *root, const char **shifted) {
1417 _cleanup_free_ char *rt = NULL;
1418 char *p;
ba1261bc
LP
1419 int r;
1420
e9174f29 1421 assert(cgroup);
751bc6ac 1422 assert(shifted);
e9174f29
LP
1423
1424 if (!root) {
1425 /* If the root was specified let's use that, otherwise
1426 * let's determine it from PID 1 */
1427
751bc6ac 1428 r = cg_get_root_path(&rt);
e9174f29
LP
1429 if (r < 0)
1430 return r;
1431
751bc6ac 1432 root = rt;
e9174f29 1433 }
ba1261bc 1434
751bc6ac 1435 p = path_startswith(cgroup, root);
efdb0237 1436 if (p && p > cgroup)
751bc6ac
LP
1437 *shifted = p - 1;
1438 else
1439 *shifted = cgroup;
1440
1441 return 0;
1442}
1443
1444int cg_pid_get_path_shifted(pid_t pid, const char *root, char **cgroup) {
1445 _cleanup_free_ char *raw = NULL;
1446 const char *c;
1447 int r;
1448
1449 assert(pid >= 0);
1450 assert(cgroup);
1451
1452 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
7027ff61 1453 if (r < 0)
ba1261bc 1454 return r;
ba1261bc 1455
751bc6ac
LP
1456 r = cg_shift_path(raw, root, &c);
1457 if (r < 0)
1458 return r;
ba1261bc 1459
ae2a15bc
LP
1460 if (c == raw)
1461 *cgroup = TAKE_PTR(raw);
1462 else {
751bc6ac 1463 char *n;
ba1261bc 1464
751bc6ac
LP
1465 n = strdup(c);
1466 if (!n)
ba1261bc 1467 return -ENOMEM;
ba1261bc 1468
751bc6ac
LP
1469 *cgroup = n;
1470 }
ba1261bc
LP
1471
1472 return 0;
1473}
1474
9ed794a3 1475int cg_path_decode_unit(const char *cgroup, char **unit) {
8b0849e9
LP
1476 char *c, *s;
1477 size_t n;
ef1673d1
MT
1478
1479 assert(cgroup);
6c03089c 1480 assert(unit);
ef1673d1 1481
8b0849e9
LP
1482 n = strcspn(cgroup, "/");
1483 if (n < 3)
1484 return -ENXIO;
1485
1486 c = strndupa(cgroup, n);
ae018d9b 1487 c = cg_unescape(c);
ef1673d1 1488
7410616c 1489 if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
cfeaa44a 1490 return -ENXIO;
ef1673d1 1491
d7bd3de0 1492 s = strdup(c);
6c03089c
LP
1493 if (!s)
1494 return -ENOMEM;
1495
1496 *unit = s;
ef1673d1
MT
1497 return 0;
1498}
1499
8b0849e9
LP
1500static bool valid_slice_name(const char *p, size_t n) {
1501
1502 if (!p)
1503 return false;
1504
fbd0b64f 1505 if (n < STRLEN("x.slice"))
8b0849e9
LP
1506 return false;
1507
1508 if (memcmp(p + n - 6, ".slice", 6) == 0) {
1509 char buf[n+1], *c;
1510
1511 memcpy(buf, p, n);
1512 buf[n] = 0;
1513
1514 c = cg_unescape(buf);
1515
7410616c 1516 return unit_name_is_valid(c, UNIT_NAME_PLAIN);
8b0849e9
LP
1517 }
1518
1519 return false;
1520}
1521
9444b1f2 1522static const char *skip_slices(const char *p) {
8b0849e9
LP
1523 assert(p);
1524
9444b1f2
LP
1525 /* Skips over all slice assignments */
1526
1527 for (;;) {
1021b21b
LP
1528 size_t n;
1529
9444b1f2
LP
1530 p += strspn(p, "/");
1531
1532 n = strcspn(p, "/");
8b0849e9 1533 if (!valid_slice_name(p, n))
9444b1f2
LP
1534 return p;
1535
1536 p += n;
1537 }
1538}
1539
8b0849e9 1540int cg_path_get_unit(const char *path, char **ret) {
6c03089c 1541 const char *e;
8b0849e9
LP
1542 char *unit;
1543 int r;
6c03089c
LP
1544
1545 assert(path);
8b0849e9 1546 assert(ret);
6c03089c 1547
9444b1f2 1548 e = skip_slices(path);
6c03089c 1549
8b0849e9
LP
1550 r = cg_path_decode_unit(e, &unit);
1551 if (r < 0)
1552 return r;
1553
1554 /* We skipped over the slices, don't accept any now */
1555 if (endswith(unit, ".slice")) {
1556 free(unit);
1557 return -ENXIO;
1558 }
1559
1560 *ret = unit;
1561 return 0;
6c03089c
LP
1562}
1563
1564int cg_pid_get_unit(pid_t pid, char **unit) {
7fd1b19b 1565 _cleanup_free_ char *cgroup = NULL;
ba1261bc 1566 int r;
ba1261bc 1567
ef1673d1
MT
1568 assert(unit);
1569
7027ff61 1570 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
ef1673d1
MT
1571 if (r < 0)
1572 return r;
1573
6c03089c
LP
1574 return cg_path_get_unit(cgroup, unit);
1575}
ef1673d1 1576
d4fffc4b
ZJS
1577/**
1578 * Skip session-*.scope, but require it to be there.
1579 */
9444b1f2
LP
1580static const char *skip_session(const char *p) {
1581 size_t n;
1582
8b0849e9
LP
1583 if (isempty(p))
1584 return NULL;
9444b1f2
LP
1585
1586 p += strspn(p, "/");
1587
1588 n = strcspn(p, "/");
fbd0b64f 1589 if (n < STRLEN("session-x.scope"))
d4fffc4b
ZJS
1590 return NULL;
1591
8b0849e9
LP
1592 if (memcmp(p, "session-", 8) == 0 && memcmp(p + n - 6, ".scope", 6) == 0) {
1593 char buf[n - 8 - 6 + 1];
1594
1595 memcpy(buf, p + 8, n - 8 - 6);
1596 buf[n - 8 - 6] = 0;
d4fffc4b 1597
8b0849e9
LP
1598 /* Note that session scopes never need unescaping,
1599 * since they cannot conflict with the kernel's own
1600 * names, hence we don't need to call cg_unescape()
1601 * here. */
1602
1603 if (!session_id_valid(buf))
1604 return false;
1605
1606 p += n;
1607 p += strspn(p, "/");
1608 return p;
1609 }
1610
1611 return NULL;
d4fffc4b
ZJS
1612}
1613
1614/**
1615 * Skip user@*.service, but require it to be there.
1616 */
1617static const char *skip_user_manager(const char *p) {
1618 size_t n;
1619
8b0849e9
LP
1620 if (isempty(p))
1621 return NULL;
d4fffc4b
ZJS
1622
1623 p += strspn(p, "/");
1624
1625 n = strcspn(p, "/");
fbd0b64f 1626 if (n < STRLEN("user@x.service"))
6c03089c 1627 return NULL;
ef1673d1 1628
8b0849e9
LP
1629 if (memcmp(p, "user@", 5) == 0 && memcmp(p + n - 8, ".service", 8) == 0) {
1630 char buf[n - 5 - 8 + 1];
9444b1f2 1631
8b0849e9
LP
1632 memcpy(buf, p + 5, n - 5 - 8);
1633 buf[n - 5 - 8] = 0;
1634
1635 /* Note that user manager services never need unescaping,
1636 * since they cannot conflict with the kernel's own
1637 * names, hence we don't need to call cg_unescape()
1638 * here. */
1639
1640 if (parse_uid(buf, NULL) < 0)
1641 return NULL;
1642
1643 p += n;
1644 p += strspn(p, "/");
1645
1646 return p;
1647 }
1648
1649 return NULL;
9444b1f2
LP
1650}
1651
329ac4bc 1652static const char *skip_user_prefix(const char *path) {
d4fffc4b 1653 const char *e, *t;
ef1673d1 1654
6c03089c 1655 assert(path);
ba1261bc 1656
9444b1f2
LP
1657 /* Skip slices, if there are any */
1658 e = skip_slices(path);
ba1261bc 1659
329ac4bc 1660 /* Skip the user manager, if it's in the path now... */
8b0849e9 1661 t = skip_user_manager(e);
329ac4bc
LP
1662 if (t)
1663 return t;
8b0849e9 1664
329ac4bc
LP
1665 /* Alternatively skip the user session if it is in the path... */
1666 return skip_session(e);
1667}
32081481 1668
329ac4bc
LP
1669int cg_path_get_user_unit(const char *path, char **ret) {
1670 const char *t;
6c03089c 1671
329ac4bc
LP
1672 assert(path);
1673 assert(ret);
8b0849e9 1674
329ac4bc
LP
1675 t = skip_user_prefix(path);
1676 if (!t)
8b0849e9 1677 return -ENXIO;
8b0849e9 1678
329ac4bc
LP
1679 /* And from here on it looks pretty much the same as for a
1680 * system unit, hence let's use the same parser from here
1681 * on. */
1682 return cg_path_get_unit(t, ret);
ef1673d1 1683}
ba1261bc 1684
ef1673d1 1685int cg_pid_get_user_unit(pid_t pid, char **unit) {
7fd1b19b 1686 _cleanup_free_ char *cgroup = NULL;
6c03089c
LP
1687 int r;
1688
1689 assert(unit);
1690
7027ff61 1691 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
6c03089c
LP
1692 if (r < 0)
1693 return r;
1694
1695 return cg_path_get_user_unit(cgroup, unit);
ba1261bc 1696}
e884315e 1697
7027ff61 1698int cg_path_get_machine_name(const char *path, char **machine) {
efdb0237
LP
1699 _cleanup_free_ char *u = NULL;
1700 const char *sl;
89f7c846 1701 int r;
374ec6ab 1702
89f7c846
LP
1703 r = cg_path_get_unit(path, &u);
1704 if (r < 0)
1705 return r;
7027ff61 1706
efdb0237 1707 sl = strjoina("/run/systemd/machines/unit:", u);
89f7c846 1708 return readlink_malloc(sl, machine);
7027ff61
LP
1709}
1710
1711int cg_pid_get_machine_name(pid_t pid, char **machine) {
7fd1b19b 1712 _cleanup_free_ char *cgroup = NULL;
7027ff61
LP
1713 int r;
1714
1715 assert(machine);
1716
1717 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1718 if (r < 0)
1719 return r;
1720
1721 return cg_path_get_machine_name(cgroup, machine);
1722}
1723
1724int cg_path_get_session(const char *path, char **session) {
8b0849e9
LP
1725 _cleanup_free_ char *unit = NULL;
1726 char *start, *end;
1727 int r;
7027ff61
LP
1728
1729 assert(path);
7027ff61 1730
8b0849e9
LP
1731 r = cg_path_get_unit(path, &unit);
1732 if (r < 0)
1733 return r;
7027ff61 1734
8b0849e9
LP
1735 start = startswith(unit, "session-");
1736 if (!start)
cfeaa44a 1737 return -ENXIO;
8b0849e9
LP
1738 end = endswith(start, ".scope");
1739 if (!end)
cfeaa44a 1740 return -ENXIO;
8b0849e9
LP
1741
1742 *end = 0;
1743 if (!session_id_valid(start))
cfeaa44a 1744 return -ENXIO;
374ec6ab 1745
af08d2f9 1746 if (session) {
8b0849e9 1747 char *rr;
af08d2f9 1748
8b0849e9
LP
1749 rr = strdup(start);
1750 if (!rr)
af08d2f9
LP
1751 return -ENOMEM;
1752
8b0849e9 1753 *session = rr;
af08d2f9 1754 }
7027ff61 1755
7027ff61
LP
1756 return 0;
1757}
1758
1759int cg_pid_get_session(pid_t pid, char **session) {
7fd1b19b 1760 _cleanup_free_ char *cgroup = NULL;
7027ff61
LP
1761 int r;
1762
7027ff61
LP
1763 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1764 if (r < 0)
1765 return r;
1766
1767 return cg_path_get_session(cgroup, session);
1768}
1769
ae018d9b 1770int cg_path_get_owner_uid(const char *path, uid_t *uid) {
374ec6ab 1771 _cleanup_free_ char *slice = NULL;
8b0849e9 1772 char *start, *end;
374ec6ab 1773 int r;
ae018d9b
LP
1774
1775 assert(path);
ae018d9b 1776
374ec6ab
LP
1777 r = cg_path_get_slice(path, &slice);
1778 if (r < 0)
1779 return r;
ae018d9b 1780
674eb685
LP
1781 start = startswith(slice, "user-");
1782 if (!start)
cfeaa44a 1783 return -ENXIO;
8b0849e9 1784 end = endswith(start, ".slice");
674eb685 1785 if (!end)
cfeaa44a 1786 return -ENXIO;
ae018d9b 1787
8b0849e9
LP
1788 *end = 0;
1789 if (parse_uid(start, uid) < 0)
cfeaa44a 1790 return -ENXIO;
674eb685 1791
674eb685 1792 return 0;
ae018d9b
LP
1793}
1794
1795int cg_pid_get_owner_uid(pid_t pid, uid_t *uid) {
1796 _cleanup_free_ char *cgroup = NULL;
1797 int r;
1798
ae018d9b
LP
1799 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1800 if (r < 0)
1801 return r;
1802
1803 return cg_path_get_owner_uid(cgroup, uid);
1804}
1805
1021b21b
LP
1806int cg_path_get_slice(const char *p, char **slice) {
1807 const char *e = NULL;
1021b21b
LP
1808
1809 assert(p);
1810 assert(slice);
1811
329ac4bc
LP
1812 /* Finds the right-most slice unit from the beginning, but
1813 * stops before we come to the first non-slice unit. */
1814
1021b21b
LP
1815 for (;;) {
1816 size_t n;
1817
1818 p += strspn(p, "/");
1819
1820 n = strcspn(p, "/");
8b0849e9 1821 if (!valid_slice_name(p, n)) {
1021b21b 1822
8b0849e9
LP
1823 if (!e) {
1824 char *s;
1021b21b 1825
e5d855d3 1826 s = strdup(SPECIAL_ROOT_SLICE);
8b0849e9
LP
1827 if (!s)
1828 return -ENOMEM;
1021b21b 1829
8b0849e9
LP
1830 *slice = s;
1831 return 0;
1832 }
1833
1834 return cg_path_decode_unit(e, slice);
1021b21b
LP
1835 }
1836
1837 e = p;
1021b21b
LP
1838 p += n;
1839 }
1840}
1841
1842int cg_pid_get_slice(pid_t pid, char **slice) {
1843 _cleanup_free_ char *cgroup = NULL;
1844 int r;
1845
1846 assert(slice);
1847
1848 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1849 if (r < 0)
1850 return r;
1851
1852 return cg_path_get_slice(cgroup, slice);
1853}
1854
329ac4bc
LP
1855int cg_path_get_user_slice(const char *p, char **slice) {
1856 const char *t;
1857 assert(p);
1858 assert(slice);
1859
1860 t = skip_user_prefix(p);
1861 if (!t)
1862 return -ENXIO;
1863
1864 /* And now it looks pretty much the same as for a system
1865 * slice, so let's just use the same parser from here on. */
1866 return cg_path_get_slice(t, slice);
1867}
1868
1869int cg_pid_get_user_slice(pid_t pid, char **slice) {
1870 _cleanup_free_ char *cgroup = NULL;
1871 int r;
1872
1873 assert(slice);
1874
1875 r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1876 if (r < 0)
1877 return r;
1878
1879 return cg_path_get_user_slice(cgroup, slice);
1880}
1881
ae018d9b
LP
1882char *cg_escape(const char *p) {
1883 bool need_prefix = false;
1884
1885 /* This implements very minimal escaping for names to be used
1886 * as file names in the cgroup tree: any name which might
1887 * conflict with a kernel name or is prefixed with '_' is
1888 * prefixed with a '_'. That way, when reading cgroup names it
1889 * is sufficient to remove a single prefixing underscore if
1890 * there is one. */
1891
1892 /* The return value of this function (unlike cg_unescape())
1893 * needs free()! */
1894
4c701096 1895 if (IN_SET(p[0], 0, '_', '.') ||
0cbd293e 1896 STR_IN_SET(p, "notify_on_release", "release_agent", "tasks") ||
efdb0237 1897 startswith(p, "cgroup."))
ae018d9b
LP
1898 need_prefix = true;
1899 else {
1900 const char *dot;
1901
1902 dot = strrchr(p, '.');
1903 if (dot) {
efdb0237
LP
1904 CGroupController c;
1905 size_t l = dot - p;
ae018d9b 1906
efdb0237
LP
1907 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1908 const char *n;
1909
1910 n = cgroup_controller_to_string(c);
ae018d9b 1911
efdb0237
LP
1912 if (l != strlen(n))
1913 continue;
ae018d9b 1914
efdb0237
LP
1915 if (memcmp(p, n, l) != 0)
1916 continue;
1917
1918 need_prefix = true;
1919 break;
ae018d9b
LP
1920 }
1921 }
1922 }
1923
1924 if (need_prefix)
b910cc72 1925 return strjoin("_", p);
efdb0237
LP
1926
1927 return strdup(p);
ae018d9b
LP
1928}
1929
1930char *cg_unescape(const char *p) {
1931 assert(p);
1932
1933 /* The return value of this function (unlike cg_escape())
1934 * doesn't need free()! */
1935
1936 if (p[0] == '_')
1937 return (char*) p+1;
1938
1939 return (char*) p;
1940}
78edb35a
LP
1941
1942#define CONTROLLER_VALID \
4b549144 1943 DIGITS LETTERS \
78edb35a
LP
1944 "_"
1945
185a0874 1946bool cg_controller_is_valid(const char *p) {
78edb35a
LP
1947 const char *t, *s;
1948
1949 if (!p)
1950 return false;
1951
b6629c4b
TH
1952 if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
1953 return true;
1954
185a0874
DJL
1955 s = startswith(p, "name=");
1956 if (s)
1957 p = s;
78edb35a 1958
4c701096 1959 if (IN_SET(*p, 0, '_'))
78edb35a
LP
1960 return false;
1961
1962 for (t = p; *t; t++)
1963 if (!strchr(CONTROLLER_VALID, *t))
1964 return false;
1965
1966 if (t - p > FILENAME_MAX)
1967 return false;
1968
1969 return true;
1970}
a016b922
LP
1971
1972int cg_slice_to_path(const char *unit, char **ret) {
1973 _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
1974 const char *dash;
7410616c 1975 int r;
a016b922
LP
1976
1977 assert(unit);
1978 assert(ret);
1979
e5d855d3 1980 if (streq(unit, SPECIAL_ROOT_SLICE)) {
c96cc582
LP
1981 char *x;
1982
1983 x = strdup("");
1984 if (!x)
1985 return -ENOMEM;
1986 *ret = x;
1987 return 0;
1988 }
1989
7410616c 1990 if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
a016b922
LP
1991 return -EINVAL;
1992
1993 if (!endswith(unit, ".slice"))
1994 return -EINVAL;
1995
7410616c
LP
1996 r = unit_name_to_prefix(unit, &p);
1997 if (r < 0)
1998 return r;
a016b922
LP
1999
2000 dash = strchr(p, '-');
e66e5b61
LP
2001
2002 /* Don't allow initial dashes */
2003 if (dash == p)
2004 return -EINVAL;
2005
a016b922
LP
2006 while (dash) {
2007 _cleanup_free_ char *escaped = NULL;
2008 char n[dash - p + sizeof(".slice")];
2009
989290db 2010#if HAS_FEATURE_MEMORY_SANITIZER
1c56d501 2011 /* msan doesn't instrument stpncpy, so it thinks
5238e957 2012 * n is later used uninitialized:
1c56d501
ZJS
2013 * https://github.com/google/sanitizers/issues/926
2014 */
2015 zero(n);
2016#endif
2017
e66e5b61 2018 /* Don't allow trailing or double dashes */
4c701096 2019 if (IN_SET(dash[1], 0, '-'))
c96cc582 2020 return -EINVAL;
a016b922 2021
c96cc582 2022 strcpy(stpncpy(n, p, dash - p), ".slice");
7410616c 2023 if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
a016b922
LP
2024 return -EINVAL;
2025
2026 escaped = cg_escape(n);
2027 if (!escaped)
2028 return -ENOMEM;
2029
2030 if (!strextend(&s, escaped, "/", NULL))
2031 return -ENOMEM;
2032
2033 dash = strchr(dash+1, '-');
2034 }
2035
2036 e = cg_escape(unit);
2037 if (!e)
2038 return -ENOMEM;
2039
2040 if (!strextend(&s, e, NULL))
2041 return -ENOMEM;
2042
ae2a15bc 2043 *ret = TAKE_PTR(s);
a016b922
LP
2044
2045 return 0;
2046}
4ad49000
LP
2047
2048int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
2049 _cleanup_free_ char *p = NULL;
2050 int r;
2051
2052 r = cg_get_path(controller, path, attribute, &p);
2053 if (r < 0)
2054 return r;
2055
604028de 2056 return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER);
4ad49000
LP
2057}
2058
934277fe
LP
2059int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
2060 _cleanup_free_ char *p = NULL;
2061 int r;
2062
2063 r = cg_get_path(controller, path, attribute, &p);
2064 if (r < 0)
2065 return r;
2066
2067 return read_one_line_file(p, ret);
2068}
2069
b734a4ff
LP
2070int cg_get_keyed_attribute(
2071 const char *controller,
2072 const char *path,
2073 const char *attribute,
2074 char **keys,
2075 char **ret_values) {
66ebf6c0 2076
b734a4ff 2077 _cleanup_free_ char *filename = NULL, *contents = NULL;
b734a4ff 2078 const char *p;
9177fa9f 2079 size_t n, i, n_done = 0;
b734a4ff
LP
2080 char **v;
2081 int r;
2082
4e1dfa45 2083 /* Reads one or more fields of a cgroup v2 keyed attribute file. The 'keys' parameter should be an strv with
b734a4ff
LP
2084 * all keys to retrieve. The 'ret_values' parameter should be passed as string size with the same number of
2085 * entries as 'keys'. On success each entry will be set to the value of the matching key.
2086 *
2087 * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. */
66ebf6c0
TH
2088
2089 r = cg_get_path(controller, path, attribute, &filename);
2090 if (r < 0)
2091 return r;
2092
b734a4ff 2093 r = read_full_file(filename, &contents, NULL);
66ebf6c0
TH
2094 if (r < 0)
2095 return r;
2096
b734a4ff
LP
2097 n = strv_length(keys);
2098 if (n == 0) /* No keys to retrieve? That's easy, we are done then */
2099 return 0;
66ebf6c0 2100
b734a4ff
LP
2101 /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
2102 v = newa0(char*, n);
66ebf6c0 2103
b734a4ff
LP
2104 for (p = contents; *p;) {
2105 const char *w = NULL;
b734a4ff 2106
9177fa9f
ZJS
2107 for (i = 0; i < n; i++)
2108 if (!v[i]) {
b734a4ff
LP
2109 w = first_word(p, keys[i]);
2110 if (w)
2111 break;
66ebf6c0 2112 }
66ebf6c0 2113
b734a4ff 2114 if (w) {
b734a4ff
LP
2115 size_t l;
2116
2117 l = strcspn(w, NEWLINE);
9177fa9f
ZJS
2118 v[i] = strndup(w, l);
2119 if (!v[i]) {
b734a4ff
LP
2120 r = -ENOMEM;
2121 goto fail;
66ebf6c0 2122 }
b734a4ff 2123
b734a4ff 2124 n_done++;
b734a4ff
LP
2125 if (n_done >= n)
2126 goto done;
2127
2128 p = w + l;
9177fa9f 2129 } else
b734a4ff 2130 p += strcspn(p, NEWLINE);
b734a4ff
LP
2131
2132 p += strspn(p, NEWLINE);
66ebf6c0
TH
2133 }
2134
b734a4ff
LP
2135 r = -ENXIO;
2136
2137fail:
2138 for (i = 0; i < n; i++)
2139 free(v[i]);
2140
2141 return r;
2142
2143done:
2144 memcpy(ret_values, v, sizeof(char*) * n);
66ebf6c0 2145 return 0;
b734a4ff 2146
66ebf6c0
TH
2147}
2148
efdb0237
LP
2149int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
2150 CGroupController c;
e353faa0 2151 CGroupMask done;
65be7e06 2152 bool created;
415fc41c 2153 int r;
4ad49000
LP
2154
2155 /* This one will create a cgroup in our private tree, but also
2156 * duplicate it in the trees specified in mask, and remove it
65be7e06
ZJS
2157 * in all others.
2158 *
2159 * Returns 0 if the group already existed in the systemd hierarchy,
2160 * 1 on success, negative otherwise.
2161 */
4ad49000
LP
2162
2163 /* First create the cgroup in our own hierarchy. */
2164 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
2165 if (r < 0)
2166 return r;
490c5a37 2167 created = r;
4ad49000 2168
efdb0237 2169 /* If we are in the unified hierarchy, we are done now */
b4cccbc1
LP
2170 r = cg_all_unified();
2171 if (r < 0)
2172 return r;
2173 if (r > 0)
65be7e06 2174 return created;
efdb0237 2175
e353faa0
LP
2176 supported &= CGROUP_MASK_V1;
2177 mask = CGROUP_MASK_EXTEND_JOINED(mask);
2178 done = 0;
2179
efdb0237
LP
2180 /* Otherwise, do the same in the other hierarchies */
2181 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2182 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2183 const char *n;
2184
e353faa0 2185 if (!FLAGS_SET(supported, bit))
ab275f23
LP
2186 continue;
2187
e353faa0
LP
2188 if (FLAGS_SET(done, bit))
2189 continue;
efdb0237 2190
e353faa0 2191 n = cgroup_controller_to_string(c);
f99850a0 2192 if (FLAGS_SET(mask, bit))
efdb0237 2193 (void) cg_create(n, path);
e353faa0 2194 else
efdb0237 2195 (void) cg_trim(n, path, true);
e353faa0
LP
2196
2197 done |= CGROUP_MASK_EXTEND_JOINED(bit);
4ad49000
LP
2198 }
2199
65be7e06 2200 return created;
4ad49000
LP
2201}
2202
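/* Illustrative sketch, not from the original source: how cg_create_everywhere() is typically combined with
 * cg_mask_supported() and cg_attach_everywhere() (both defined elsewhere in this file). The unit path is a
 * hypothetical example; CGROUP_MASK_MEMORY is assumed to be defined in cgroup-util.h next to CGROUP_MASK_CPU. */
static int example_create_and_attach(pid_t pid) {
        static const char path[] = "/example.slice/example.service";
        CGroupMask supported;
        int r;

        r = cg_mask_supported(&supported);
        if (r < 0)
                return r;

        /* Create the cgroup in the systemd hierarchy, duplicate it for cpu+memory, trim it elsewhere. */
        r = cg_create_everywhere(supported, CGROUP_MASK_CPU|CGROUP_MASK_MEMORY, path);
        if (r < 0)
                return r;

        /* Move the process into the new cgroup in all relevant hierarchies. */
        return cg_attach_everywhere(supported, path, pid, NULL, NULL);
}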
efdb0237
LP
2203int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
2204 CGroupController c;
e353faa0 2205 CGroupMask done;
415fc41c 2206 int r;
4ad49000
LP
2207
2208 r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
13b84ec7
LP
2209 if (r < 0)
2210 return r;
4ad49000 2211
b4cccbc1
LP
2212 r = cg_all_unified();
2213 if (r < 0)
2214 return r;
2215 if (r > 0)
efdb0237 2216 return 0;
7b3fd631 2217
e353faa0
LP
2218 supported &= CGROUP_MASK_V1;
2219 done = 0;
2220
efdb0237
LP
2221 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2222 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2223 const char *p = NULL;
7b3fd631 2224
e353faa0 2225 if (!FLAGS_SET(supported, bit))
ab275f23
LP
2226 continue;
2227
e353faa0 2228 if (FLAGS_SET(done, bit))
efdb0237 2229 continue;
7b3fd631 2230
efdb0237
LP
2231 if (path_callback)
2232 p = path_callback(bit, userdata);
efdb0237
LP
2233 if (!p)
2234 p = path;
4ad49000 2235
efdb0237 2236 (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
e353faa0 2237 done |= CGROUP_MASK_EXTEND_JOINED(bit);
4ad49000
LP
2238 }
2239
13b84ec7 2240 return 0;
4ad49000
LP
2241}
2242
efdb0237 2243int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, cg_migrate_callback_t path_callback, void *userdata) {
6c12b52e
LP
2244 Iterator i;
2245 void *pidp;
2246 int r = 0;
2247
2248 SET_FOREACH(pidp, pids, i) {
fea72cc0 2249 pid_t pid = PTR_TO_PID(pidp);
13b84ec7 2250 int q;
6c12b52e 2251
7b3fd631 2252 q = cg_attach_everywhere(supported, path, pid, path_callback, userdata);
efdb0237 2253 if (q < 0 && r >= 0)
13b84ec7 2254 r = q;
6c12b52e
LP
2255 }
2256
2257 return r;
2258}
2259
efdb0237 2260int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t to_callback, void *userdata) {
b3c5bad3 2261 CGroupController c;
e353faa0 2262 CGroupMask done;
b4cccbc1 2263 int r = 0, q;
4ad49000 2264
13b84ec7 2265 if (!path_equal(from, to)) {
1d98fef1 2266 r = cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER, from, SYSTEMD_CGROUP_CONTROLLER, to, CGROUP_REMOVE);
13b84ec7
LP
2267 if (r < 0)
2268 return r;
2269 }
4ad49000 2270
b4cccbc1
LP
2271 q = cg_all_unified();
2272 if (q < 0)
2273 return q;
2274 if (q > 0)
efdb0237 2275 return r;
03b90d4b 2276
e353faa0
LP
2277 supported &= CGROUP_MASK_V1;
2278 done = 0;
2279
efdb0237
LP
2280 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2281 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2282 const char *p = NULL;
03b90d4b 2283
e353faa0 2284 if (!FLAGS_SET(supported, bit))
ab275f23
LP
2285 continue;
2286
e353faa0 2287 if (FLAGS_SET(done, bit))
efdb0237 2288 continue;
03b90d4b 2289
efdb0237
LP
2290 if (to_callback)
2291 p = to_callback(bit, userdata);
efdb0237
LP
2292 if (!p)
2293 p = to;
2294
1d98fef1 2295 (void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, to, cgroup_controller_to_string(c), p, 0);
e353faa0 2296 done |= CGROUP_MASK_EXTEND_JOINED(bit);
4ad49000
LP
2297 }
2298
e353faa0 2299 return r;
4ad49000
LP
2300}
2301
efdb0237
LP
2302int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
2303 CGroupController c;
e353faa0 2304 CGroupMask done;
b4cccbc1 2305 int r, q;
4ad49000
LP
2306
2307 r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
2308 if (r < 0)
2309 return r;
2310
b4cccbc1
LP
2311 q = cg_all_unified();
2312 if (q < 0)
2313 return q;
2314 if (q > 0)
efdb0237
LP
2315 return r;
2316
e353faa0
LP
2317 supported &= CGROUP_MASK_V1;
2318 done = 0;
2319
efdb0237
LP
2320 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2321 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2322
e353faa0 2323 if (!FLAGS_SET(supported, bit))
ab275f23
LP
2324 continue;
2325
e353faa0 2326 if (FLAGS_SET(done, bit))
efdb0237 2327 continue;
4ad49000 2328
efdb0237 2329 (void) cg_trim(cgroup_controller_to_string(c), path, delete_root);
e353faa0 2330 done |= CGROUP_MASK_EXTEND_JOINED(bit);
4ad49000
LP
2331 }
2332
e353faa0 2333 return r;
4ad49000
LP
2334}
2335
aae7e17f 2336int cg_mask_to_string(CGroupMask mask, char **ret) {
ec635a2d
LP
2337 _cleanup_free_ char *s = NULL;
2338 size_t n = 0, allocated = 0;
2339 bool space = false;
aae7e17f 2340 CGroupController c;
aae7e17f
FB
2341
2342 assert(ret);
2343
2344 if (mask == 0) {
2345 *ret = NULL;
2346 return 0;
2347 }
2348
2349 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
ec635a2d
LP
2350 const char *k;
2351 size_t l;
aae7e17f 2352
f99850a0 2353 if (!FLAGS_SET(mask, CGROUP_CONTROLLER_TO_MASK(c)))
aae7e17f
FB
2354 continue;
2355
ec635a2d
LP
2356 k = cgroup_controller_to_string(c);
2357 l = strlen(k);
2358
2359 if (!GREEDY_REALLOC(s, allocated, n + space + l + 1))
2360 return -ENOMEM;
2361
2362 if (space)
2363 s[n] = ' ';
2364 memcpy(s + n + space, k, l);
2365 n += space + l;
2366
2367 space = true;
aae7e17f
FB
2368 }
2369
ec635a2d 2370 assert(s);
aae7e17f 2371
ec635a2d 2372 s[n] = 0;
ae2a15bc 2373 *ret = TAKE_PTR(s);
ec635a2d 2374
aae7e17f
FB
2375 return 0;
2376}
2377
38a90d45
LP
2378int cg_mask_from_string(const char *value, CGroupMask *ret) {
2379 CGroupMask m = 0;
2380
2381 assert(ret);
aae7e17f
FB
2382 assert(value);
2383
2384 for (;;) {
2385 _cleanup_free_ char *n = NULL;
2386 CGroupController v;
2387 int r;
2388
2389 r = extract_first_word(&value, &n, NULL, 0);
2390 if (r < 0)
2391 return r;
2392 if (r == 0)
2393 break;
2394
2395 v = cgroup_controller_from_string(n);
2396 if (v < 0)
2397 continue;
2398
38a90d45 2399 m |= CGROUP_CONTROLLER_TO_MASK(v);
aae7e17f 2400 }
38a90d45
LP
2401
2402 *ret = m;
aae7e17f
FB
2403 return 0;
2404}
2405
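/* Illustrative sketch, not from the original source: cg_mask_from_string() above skips unknown controller
 * names instead of failing, which the example below relies on. CGROUP_MASK_MEMORY is assumed to be defined
 * in cgroup-util.h like the CGROUP_MASK_CPU constant used elsewhere in this file. */
static void example_mask_from_string(void) {
        CGroupMask m = 0;

        if (cg_mask_from_string("cpu memory no-such-controller", &m) >= 0)
                log_debug("parsed mask contains cpu: %s, memory: %s",
                          FLAGS_SET(m, CGROUP_MASK_CPU) ? "yes" : "no",
                          FLAGS_SET(m, CGROUP_MASK_MEMORY) ? "yes" : "no");
}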
efdb0237 2406int cg_mask_supported(CGroupMask *ret) {
38a90d45 2407 CGroupMask mask;
415fc41c 2408 int r;
efdb0237 2409
67558d15
LP
2410 /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that
2411 * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz
2412 * pseudo-controllers. */
4ad49000 2413
b4cccbc1
LP
2414 r = cg_all_unified();
2415 if (r < 0)
2416 return r;
2417 if (r > 0) {
5f4c5fef 2418 _cleanup_free_ char *root = NULL, *controllers = NULL, *path = NULL;
efdb0237
LP
2419
2420 /* In the unified hierarchy we can read the supported
 2421                  * and accessible controllers from the top-level
2422 * cgroup attribute */
2423
5f4c5fef
LP
2424 r = cg_get_root_path(&root);
2425 if (r < 0)
2426 return r;
2427
2428 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
2429 if (r < 0)
2430 return r;
2431
2432 r = read_one_line_file(path, &controllers);
efdb0237
LP
2433 if (r < 0)
2434 return r;
4ad49000 2435
aae7e17f
FB
2436 r = cg_mask_from_string(controllers, &mask);
2437 if (r < 0)
2438 return r;
efdb0237 2439
03afd780 2440                 /* Currently, we support the cpu, memory, io and pids controllers in the unified hierarchy, mask
03a7b521 2441 * everything else off. */
03afd780 2442 mask &= CGROUP_MASK_V2;
efdb0237
LP
2443
2444 } else {
2445 CGroupController c;
2446
03afd780 2447 /* In the legacy hierarchy, we check which hierarchies are mounted. */
efdb0237 2448
38a90d45 2449 mask = 0;
efdb0237 2450 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
03afd780 2451 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
efdb0237
LP
2452 const char *n;
2453
03afd780
LP
2454 if (!FLAGS_SET(CGROUP_MASK_V1, bit))
2455 continue;
2456
efdb0237
LP
2457 n = cgroup_controller_to_string(c);
2458 if (controller_is_accessible(n) >= 0)
03afd780 2459 mask |= bit;
efdb0237 2460 }
4ad49000
LP
2461 }
2462
efdb0237
LP
2463 *ret = mask;
2464 return 0;
4ad49000 2465}
b12afc8c 2466
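/* Illustrative sketch, not from the original source: rendering the mask computed by cg_mask_supported()
 * above as a human-readable list via cg_mask_to_string(); strnull() is assumed from string-util.h. */
static void example_log_supported_controllers(void) {
        _cleanup_free_ char *s = NULL;
        CGroupMask supported;
        int r;

        r = cg_mask_supported(&supported);
        if (r < 0) {
                log_debug_errno(r, "Failed to determine supported controllers: %m");
                return;
        }

        (void) cg_mask_to_string(supported, &s);
        log_debug("Supported controllers: %s", strnull(s)); /* s stays NULL if no controller is supported */
}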
6925a0de
LP
2467int cg_kernel_controllers(Set **ret) {
2468 _cleanup_set_free_free_ Set *controllers = NULL;
b12afc8c 2469 _cleanup_fclose_ FILE *f = NULL;
b12afc8c
LP
2470 int r;
2471
6925a0de 2472 assert(ret);
b12afc8c 2473
f09e86bc
LS
2474 /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support
2475 * and controllers that aren't currently accessible (because not mounted). This does not include "name="
2476 * pseudo-controllers. */
e155a0aa 2477
6925a0de
LP
2478 controllers = set_new(&string_hash_ops);
2479 if (!controllers)
2480 return -ENOMEM;
2481
fdeea3f4
ZJS
2482 r = fopen_unlocked("/proc/cgroups", "re", &f);
2483 if (r == -ENOENT) {
2484 *ret = NULL;
2485 return 0;
b12afc8c 2486 }
fdeea3f4
ZJS
2487 if (r < 0)
2488 return r;
35bbbf85 2489
b12afc8c 2490 /* Ignore the header line */
2351e44d 2491 (void) read_line(f, (size_t) -1, NULL);
b12afc8c
LP
2492
2493 for (;;) {
2494 char *controller;
2495 int enabled = 0;
2496
2497 errno = 0;
2498 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
2499
2500 if (feof(f))
2501 break;
2502
66855de7
LP
2503 if (ferror(f))
2504 return errno_or_else(EIO);
b12afc8c
LP
2505
2506 return -EBADMSG;
2507 }
2508
2509 if (!enabled) {
2510 free(controller);
2511 continue;
2512 }
2513
efdb0237 2514 if (!cg_controller_is_valid(controller)) {
b12afc8c
LP
2515 free(controller);
2516 return -EBADMSG;
2517 }
2518
2519 r = set_consume(controllers, controller);
2520 if (r < 0)
2521 return r;
2522 }
2523
1cc6c93a 2524 *ret = TAKE_PTR(controllers);
6925a0de 2525
b12afc8c
LP
2526 return 0;
2527}
efdb0237 2528
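/* Illustrative sketch, not from the original source: /proc/cgroups, parsed above, consists of a header line
 * followed by "<name> <hierarchy-id> <num-cgroups> <enabled>" rows; the sketch merely counts the enabled
 * controllers the kernel reports. set_size() is assumed from set.h. */
static void example_count_kernel_controllers(void) {
        _cleanup_set_free_free_ Set *controllers = NULL;
        int r;

        r = cg_kernel_controllers(&controllers);
        if (r < 0) {
                log_debug_errno(r, "Failed to enumerate kernel controllers: %m");
                return;
        }

        /* 'controllers' may be NULL if /proc/cgroups does not exist; set_size(NULL) is 0. */
        log_debug("Kernel knows %u enabled cgroup controllers.", set_size(controllers));
}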
5da38d07
TH
2529static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;
2530
4e1dfa45 2531/* The hybrid mode was initially implemented in v232 and simply mounted cgroup2 on /sys/fs/cgroup/systemd. This
c22800e4
LP
2532 * unfortunately broke other tools (such as docker) which expected the v1 "name=systemd" hierarchy on
2533 * /sys/fs/cgroup/systemd. From v233 and on, the hybrid mode mountnbs v2 on /sys/fs/cgroup/unified and maintains
 2534  * /sys/fs/cgroup/systemd. From v233 on, the hybrid mode mounts v2 on /sys/fs/cgroup/unified and maintains
 2535  * the "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility with other tools.
c22800e4
LP
2536 * To keep live upgrade working, we detect and support v232 layout. When v232 layout is detected, to keep cgroup v2
 2537  * To keep live upgrades working, we detect and support the v232 layout. When the v232 layout is detected, to keep cgroup v2
2538 * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and %false on cg_hybrid_unified().
f08e9287
TH
2539 */
2540static thread_local bool unified_systemd_v232;
2541
1fcca10e 2542static int cg_unified_update(void) {
efdb0237 2543
efdb0237
LP
2544 struct statfs fs;
2545
2546 /* Checks if we support the unified hierarchy. Returns an
2547 * error when the cgroup hierarchies aren't mounted yet or we
2548 * have any other trouble determining if the unified hierarchy
2549 * is supported. */
2550
5da38d07
TH
2551 if (unified_cache >= CGROUP_UNIFIED_NONE)
2552 return 0;
efdb0237
LP
2553
2554 if (statfs("/sys/fs/cgroup/", &fs) < 0)
c028bed1 2555 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\") failed: %m");
efdb0237 2556
9aa21133
ZJS
2557 if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2558 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
5da38d07 2559 unified_cache = CGROUP_UNIFIED_ALL;
9aa21133 2560 } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2977724b 2561 if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
f08e9287 2562 F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
9aa21133 2563 log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2977724b 2564 unified_cache = CGROUP_UNIFIED_SYSTEMD;
f08e9287 2565 unified_systemd_v232 = false;
f08e9287 2566 } else {
2977724b 2567 if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0)
9aa21133 2568                                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd/\") failed: %m");
5535d8f7
EV
2569
2570 if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2571 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2572 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2573 unified_systemd_v232 = true;
2574 } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) {
2575 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2576 unified_cache = CGROUP_UNIFIED_NONE;
2577 } else {
2578 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
9aa21133 2579 (unsigned long long) fs.f_type);
5535d8f7 2580 unified_cache = CGROUP_UNIFIED_NONE;
9aa21133 2581 }
2977724b 2582 }
baaa35ad
ZJS
2583 } else
2584 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2585 "Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
 2586                                        (unsigned long long) fs.f_type);
efdb0237 2587
5da38d07
TH
2588 return 0;
2589}
2590
c22800e4 2591int cg_unified_controller(const char *controller) {
b4cccbc1 2592 int r;
5da38d07 2593
1fcca10e 2594 r = cg_unified_update();
b4cccbc1
LP
2595 if (r < 0)
2596 return r;
5da38d07 2597
fc9ae717
LP
2598 if (unified_cache == CGROUP_UNIFIED_NONE)
2599 return false;
2600
2601 if (unified_cache >= CGROUP_UNIFIED_ALL)
2602 return true;
2603
2604 return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER);
5da38d07
TH
2605}
2606
b4cccbc1 2607int cg_all_unified(void) {
4bb652ac
LP
2608 int r;
2609
2610 r = cg_unified_update();
2611 if (r < 0)
2612 return r;
2613
2614 return unified_cache >= CGROUP_UNIFIED_ALL;
efdb0237
LP
2615}
2616
b4cccbc1
LP
2617int cg_hybrid_unified(void) {
2618 int r;
2977724b 2619
1fcca10e 2620 r = cg_unified_update();
b4cccbc1
LP
2621 if (r < 0)
2622 return r;
2977724b 2623
f08e9287 2624 return unified_cache == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232;
2977724b
TH
2625}
2626
415fc41c 2627int cg_unified_flush(void) {
5da38d07 2628 unified_cache = CGROUP_UNIFIED_UNKNOWN;
415fc41c 2629
1fcca10e 2630 return cg_unified_update();
efdb0237
LP
2631}
2632
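/* Illustrative sketch, not from the original source: how callers tell apart the layouts detected by
 * cg_unified_update() above, i.e. full unified, hybrid (v233+ or the v232 variant) and legacy. */
static void example_log_cgroup_layout(void) {
        int r;

        r = cg_all_unified();
        if (r < 0) {
                log_debug_errno(r, "Failed to detect cgroup hierarchy: %m");
                return;
        }
        if (r > 0) {
                log_debug("Running on the full unified hierarchy (cgroup v2 everywhere).");
                return;
        }

        if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
                /* cgroup v2 drives process management; cg_hybrid_unified() tells the two hybrid layouts apart. */
                log_debug("Running on a hybrid layout (%s).",
                          cg_hybrid_unified() > 0 ? "v233+, cgroup2 on /sys/fs/cgroup/unified" : "v232 variant");
        else
                log_debug("Running on the legacy (cgroup v1) hierarchy.");
}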
27adcc97
LP
2633int cg_enable_everywhere(
2634 CGroupMask supported,
2635 CGroupMask mask,
2636 const char *p,
2637 CGroupMask *ret_result_mask) {
2638
77fa610b 2639 _cleanup_fclose_ FILE *f = NULL;
efdb0237
LP
2640 _cleanup_free_ char *fs = NULL;
2641 CGroupController c;
27adcc97 2642 CGroupMask ret = 0;
415fc41c 2643 int r;
efdb0237
LP
2644
2645 assert(p);
2646
27adcc97
LP
2647 if (supported == 0) {
2648 if (ret_result_mask)
2649 *ret_result_mask = 0;
efdb0237 2650 return 0;
27adcc97 2651 }
efdb0237 2652
b4cccbc1
LP
2653 r = cg_all_unified();
2654 if (r < 0)
2655 return r;
27adcc97 2656 if (r == 0) {
5238e957 2657                 /* On the legacy hierarchy there's no concept of "enabling" controllers in cgroups. Let's claim
27adcc97
LP
2658 * complete success right away. (If you wonder why we return the full mask here, rather than zero: the
 2659          * caller tends to use the returned mask later on to check whether all controllers were properly joined,
2660 * and if not requeues realization. This use is the primary purpose of the return value, hence let's
2661 * minimize surprises here and reduce triggers for re-realization by always saying we fully
2662 * succeeded.) */
2663 if (ret_result_mask)
2664 *ret_result_mask = mask & supported & CGROUP_MASK_V2; /* If you wonder why we mask this with
2665 * CGROUP_MASK_V2: The 'supported' mask
2666 * might contain pure-V1 or BPF
2667 * controllers, and we never want to
2668 * claim that we could enable those with
2669 * cgroup.subtree_control */
efdb0237 2670 return 0;
27adcc97 2671 }
efdb0237
LP
2672
2673 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
2674 if (r < 0)
2675 return r;
2676
2677 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2678 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2679 const char *n;
2680
ab275f23
LP
2681 if (!FLAGS_SET(CGROUP_MASK_V2, bit))
2682 continue;
2683
f99850a0 2684 if (!FLAGS_SET(supported, bit))
efdb0237
LP
2685 continue;
2686
2687 n = cgroup_controller_to_string(c);
2688 {
2689 char s[1 + strlen(n) + 1];
2690
f99850a0 2691 s[0] = FLAGS_SET(mask, bit) ? '+' : '-';
efdb0237
LP
2692 strcpy(s + 1, n);
2693
77fa610b
LP
2694 if (!f) {
2695 f = fopen(fs, "we");
54b5ba1d
LP
2696 if (!f)
2697 return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
77fa610b
LP
2698 }
2699
604028de 2700 r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER);
96aa6591 2701 if (r < 0) {
94f344fb
LP
2702 log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m",
2703 FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs);
96aa6591 2704 clearerr(f);
27adcc97
LP
2705
2706 /* If we can't turn off a controller, leave it on in the reported resulting mask. This
2707 * happens for example when we attempt to turn off a controller up in the tree that is
2708 * used down in the tree. */
2709 if (!FLAGS_SET(mask, bit) && r == -EBUSY) /* You might wonder why we check for EBUSY
2710 * only here, and not follow the same logic
2711 * for other errors such as EINVAL or
2712 * EOPNOTSUPP or anything else. That's
2713 * because EBUSY indicates that the
 2714                                          * controller is currently enabled and
2715 * cannot be disabled because something down
2716 * the hierarchy is still using it. Any other
2717 * error most likely means something like "I
2718 * never heard of this controller" or
2719 * similar. In the former case it's hence
2720 * safe to assume the controller is still on
2721 * after the failed operation, while in the
2722 * latter case it's safer to assume the
2723 * controller is unknown and hence certainly
2724 * not enabled. */
2725 ret |= bit;
2726 } else {
2727 /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
2728 if (FLAGS_SET(mask, bit))
2729 ret |= bit;
96aa6591 2730 }
efdb0237
LP
2731 }
2732 }
2733
27adcc97
LP
2734 /* Let's return the precise set of controllers now enabled for the cgroup. */
2735 if (ret_result_mask)
2736 *ret_result_mask = ret;
2737
efdb0237
LP
2738 return 0;
2739}
2740
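/* Illustrative sketch, not from the original source: a caller of cg_enable_everywhere() above. The slice
 * path is a hypothetical example; CGROUP_MASK_MEMORY and CGROUP_MASK_PIDS are assumed to be defined in
 * cgroup-util.h next to the CGROUP_MASK_V1/V2 constants used in this file. */
static int example_enable_controllers(void) {
        CGroupMask supported, enabled = 0;
        int r;

        r = cg_mask_supported(&supported);
        if (r < 0)
                return r;

        /* Request memory+pids below /example.slice; all other v2 controllers are turned off if possible. */
        r = cg_enable_everywhere(supported, CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS, "/example.slice", &enabled);
        if (r < 0)
                return r;

        /* 'enabled' reflects what could actually be toggled; controllers that failed with EBUSY remain set. */
        return FLAGS_SET(enabled, CGROUP_MASK_MEMORY);
}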
2741bool cg_is_unified_wanted(void) {
2742 static thread_local int wanted = -1;
415fc41c 2743 int r;
1d84ad94 2744 bool b;
77fab2a9 2745 const bool is_default = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL;
5f086dc7 2746 _cleanup_free_ char *c = NULL;
efdb0237 2747
77fab2a9 2748 /* If we have a cached value, return that. */
efdb0237
LP
2749 if (wanted >= 0)
2750 return wanted;
2751
239a3d09
ZJS
2752 /* If the hierarchy is already mounted, then follow whatever
2753 * was chosen for it. */
2754 if (cg_unified_flush() >= 0)
b4cccbc1 2755 return (wanted = unified_cache >= CGROUP_UNIFIED_ALL);
239a3d09 2756
5f086dc7
CD
2757 /* If we were explicitly passed systemd.unified_cgroup_hierarchy,
2758 * respect that. */
1d84ad94 2759 r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b);
5f086dc7
CD
2760 if (r > 0)
2761 return (wanted = b);
2762
2763 /* If we passed cgroup_no_v1=all with no other instructions, it seems
2764 * highly unlikely that we want to use hybrid or legacy hierarchy. */
2765 r = proc_cmdline_get_key("cgroup_no_v1", 0, &c);
2766 if (r > 0 && streq_ptr(c, "all"))
2767 return (wanted = true);
efdb0237 2768
5f086dc7 2769 return (wanted = is_default);
efdb0237
LP
2770}
2771
2772bool cg_is_legacy_wanted(void) {
239a3d09
ZJS
2773 static thread_local int wanted = -1;
2774
2775 /* If we have a cached value, return that. */
2776 if (wanted >= 0)
2777 return wanted;
2778
4e1dfa45 2779 /* Check if we have cgroup v2 already mounted. */
1b59cf04
ZJS
2780 if (cg_unified_flush() >= 0 &&
2781 unified_cache == CGROUP_UNIFIED_ALL)
239a3d09 2782 return (wanted = false);
1b59cf04
ZJS
2783
2784 /* Otherwise, assume that at least partial legacy is wanted,
4e1dfa45 2785 * since cgroup v2 should already be mounted at this point. */
239a3d09 2786 return (wanted = true);
efdb0237
LP
2787}
2788
a4464b95 2789bool cg_is_hybrid_wanted(void) {
5da38d07 2790 static thread_local int wanted = -1;
415fc41c 2791 int r;
1d84ad94 2792 bool b;
c19739db
ZJS
2793 const bool is_default = DEFAULT_HIERARCHY >= CGROUP_UNIFIED_SYSTEMD;
2794 /* We default to true if the default is "hybrid", obviously,
2795 * but also when the default is "unified", because if we get
2796 * called, it means that unified hierarchy was not mounted. */
5da38d07 2797
77fab2a9 2798 /* If we have a cached value, return that. */
5da38d07
TH
2799 if (wanted >= 0)
2800 return wanted;
2801
239a3d09
ZJS
2802 /* If the hierarchy is already mounted, then follow whatever
2803 * was chosen for it. */
2804 if (cg_unified_flush() >= 0 &&
2805 unified_cache == CGROUP_UNIFIED_ALL)
2806 return (wanted = false);
2807
77fab2a9
ZJS
2808 /* Otherwise, let's see what the kernel command line has to say.
2809 * Since checking is expensive, cache a non-error result. */
1d84ad94 2810 r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b);
5da38d07 2811
2dcb526d
ZJS
 2812         /* The meaning of the kernel option is reversed wrt. the return value
2813 * of this function, hence the negation. */
77fab2a9 2814 return (wanted = r > 0 ? !b : is_default);
5da38d07
TH
2815}
2816
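/* Illustrative sketch, not from the original source: how the three *_wanted() predicates above combine
 * during early boot, before any cgroup hierarchy has been mounted. */
static void example_log_wanted_layout(void) {
        if (cg_is_unified_wanted())
                log_debug("Will mount the full unified hierarchy (cgroup v2 only).");
        else if (cg_is_hybrid_wanted())
                log_debug("Will mount the hybrid layout: v1 controller hierarchies plus cgroup v2 on /sys/fs/cgroup/unified.");
        else
                log_debug("Will mount the legacy (cgroup v1 only) layout.");
}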
13c31542
TH
2817int cg_weight_parse(const char *s, uint64_t *ret) {
2818 uint64_t u;
2819 int r;
2820
2821 if (isempty(s)) {
2822 *ret = CGROUP_WEIGHT_INVALID;
2823 return 0;
2824 }
2825
2826 r = safe_atou64(s, &u);
2827 if (r < 0)
2828 return r;
2829
2830 if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX)
2831 return -ERANGE;
2832
2833 *ret = u;
2834 return 0;
2835}
2836
9be57249
TH
2837const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2838 [CGROUP_IO_RBPS_MAX] = CGROUP_LIMIT_MAX,
2839 [CGROUP_IO_WBPS_MAX] = CGROUP_LIMIT_MAX,
ac06a0cf
TH
2840 [CGROUP_IO_RIOPS_MAX] = CGROUP_LIMIT_MAX,
2841 [CGROUP_IO_WIOPS_MAX] = CGROUP_LIMIT_MAX,
9be57249
TH
2842};
2843
2844static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2845 [CGROUP_IO_RBPS_MAX] = "IOReadBandwidthMax",
2846 [CGROUP_IO_WBPS_MAX] = "IOWriteBandwidthMax",
ac06a0cf
TH
2847 [CGROUP_IO_RIOPS_MAX] = "IOReadIOPSMax",
2848 [CGROUP_IO_WIOPS_MAX] = "IOWriteIOPSMax",
9be57249
TH
2849};
2850
2851DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2852
d53d9474
LP
2853int cg_cpu_shares_parse(const char *s, uint64_t *ret) {
2854 uint64_t u;
2855 int r;
2856
2857 if (isempty(s)) {
2858 *ret = CGROUP_CPU_SHARES_INVALID;
2859 return 0;
2860 }
2861
2862 r = safe_atou64(s, &u);
2863 if (r < 0)
2864 return r;
2865
2866 if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX)
2867 return -ERANGE;
2868
2869 *ret = u;
2870 return 0;
2871}
2872
2873int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
2874 uint64_t u;
2875 int r;
2876
2877 if (isempty(s)) {
2878 *ret = CGROUP_BLKIO_WEIGHT_INVALID;
2879 return 0;
2880 }
2881
2882 r = safe_atou64(s, &u);
2883 if (r < 0)
2884 return r;
2885
2886 if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX)
2887 return -ERANGE;
2888
2889 *ret = u;
2890 return 0;
2891}
2892
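/* Illustrative sketch, not from the original source: the shared contract of the three parsers above. An
 * empty string selects the "unset" sentinel, in-range values are passed through, and out-of-range values
 * are rejected with -ERANGE; "99999999" is assumed to lie above CGROUP_WEIGHT_MAX. */
static void example_parse_weights(void) {
        uint64_t w;

        assert_se(cg_weight_parse("", &w) == 0 && w == CGROUP_WEIGHT_INVALID);
        assert_se(cg_weight_parse("100", &w) == 0 && w == 100);
        assert_se(cg_weight_parse("99999999", &w) == -ERANGE);
}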
f0bef277
EV
2893bool is_cgroup_fs(const struct statfs *s) {
2894 return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2895 is_fs_type(s, CGROUP2_SUPER_MAGIC);
2896}
2897
2898bool fd_is_cgroup_fs(int fd) {
2899 struct statfs s;
2900
2901 if (fstatfs(fd, &s) < 0)
 2902                 return false; /* this function returns bool, hence don't propagate -errno (which would read as "true") */
2903
2904 return is_cgroup_fs(&s);
2905}
2906
b82f71c7 2907static const char *const cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
efdb0237
LP
2908 [CGROUP_CONTROLLER_CPU] = "cpu",
2909 [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
13c31542 2910 [CGROUP_CONTROLLER_IO] = "io",
efdb0237
LP
2911 [CGROUP_CONTROLLER_BLKIO] = "blkio",
2912 [CGROUP_CONTROLLER_MEMORY] = "memory",
3905f127 2913 [CGROUP_CONTROLLER_DEVICES] = "devices",
03a7b521 2914 [CGROUP_CONTROLLER_PIDS] = "pids",
17f14955 2915 [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall",
084c7007 2916 [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices",
efdb0237
LP
2917};
2918
2919DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);
f98c2585
CD
2920
2921CGroupMask get_cpu_accounting_mask(void) {
2922 static CGroupMask needed_mask = (CGroupMask) -1;
2923
2924 /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is
2925 * provided externally from the CPU controller, which means we don't
2926 * need to enable the CPU controller just to get metrics. This is good,
2927 * because enabling the CPU controller comes at a minor performance
2928 * hit, especially when it's propagated deep into large hierarchies.
2929 * There's also no separate CPU accounting controller available within
2930 * a unified hierarchy.
2931 *
2932 * This combination of factors results in the desired cgroup mask to
2933 * enable for CPU accounting varying as follows:
2934 *
2935 * ╔═════════════════════╤═════════════════════╗
2936 * ║ Linux ≥4.15 │ Linux <4.15 ║
2937 * ╔═══════════════╬═════════════════════╪═════════════════════╣
2938 * ║ Unified ║ nothing │ CGROUP_MASK_CPU ║
2939 * ╟───────────────╫─────────────────────┼─────────────────────╢
2940 * ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║
2941 * ╚═══════════════╩═════════════════════╧═════════════════════╝
2942 *
2943 * We check kernel version here instead of manually checking whether
2944 * cpu.stat is present for every cgroup, as that check in itself would
2945 * already be fairly expensive.
2946 *
2947 * Kernels where this patch has been backported will therefore have the
2948 * CPU controller enabled unnecessarily. This is more expensive than
2949 * necessary, but harmless. ☺️
2950 */
2951
2952 if (needed_mask == (CGroupMask) -1) {
2953 if (cg_all_unified()) {
2954 struct utsname u;
2955 assert_se(uname(&u) >= 0);
2956
2957 if (str_verscmp(u.release, "4.15") < 0)
2958 needed_mask = CGROUP_MASK_CPU;
2959 else
2960 needed_mask = 0;
2961 } else
2962 needed_mask = CGROUP_MASK_CPUACCT;
2963 }
2964
2965 return needed_mask;
2966}
2967
2968bool cpu_accounting_is_cheap(void) {
2969 return get_cpu_accounting_mask() == 0;
2970}
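/* Illustrative sketch, not from the original source: how the two helpers above are meant to be consumed
 * when deciding which controllers to request for a unit that only needs CPU accounting. */
static CGroupMask example_cpu_accounting_mask(void) {
        if (cpu_accounting_is_cheap())
                /* Unified hierarchy on kernel >= 4.15: usage_usec comes from cpu.stat for free,
                 * no controller needs to be enabled. */
                return 0;

        /* Otherwise this is CGROUP_MASK_CPU (unified, old kernel) or CGROUP_MASK_CPUACCT (hybrid/legacy). */
        return get_cpu_accounting_mask();
}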