src/basic/cgroup-util.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <limits.h>
   5 #include <signal.h>
   6 #include <stddef.h>
   7 #include <stdlib.h>
   8 #include <sys/types.h>
   9 #include <sys/utsname.h>
  10 #include <sys/xattr.h>
  11 #include <unistd.h>
  12
  13 #include "alloc-util.h"
  14 #include "cgroup-util.h"
  15 #include "constants.h"
  16 #include "dirent-util.h"
  17 #include "extract-word.h"
  18 #include "fd-util.h"
  19 #include "fileio.h"
  20 #include "format-util.h"
  21 #include "fs-util.h"
  22 #include "log.h"
  23 #include "login-util.h"
  24 #include "macro.h"
  25 #include "missing_fs.h"
  26 #include "missing_magic.h"
  27 #include "missing_threads.h"
  28 #include "mkdir.h"
  29 #include "parse-util.h"
  30 #include "path-util.h"
  31 #include "process-util.h"
  32 #include "set.h"
  33 #include "special.h"
  34 #include "stat-util.h"
  35 #include "stdio-util.h"
  36 #include "string-table.h"
  37 #include "string-util.h"
  38 #include "strv.h"
  39 #include "unit-name.h"
  40 #include "user-util.h"
  41 #include "xattr-util.h"
  42
  43 int cg_path_open(const char *controller, const char *path) {
  44         _cleanup_free_ char *fs = NULL;
  45         int r;
  46
  47         r = cg_get_path(controller, path, /* item=*/ NULL, &fs);
  48         if (r < 0)
  49                 return r;
  50
  51         return RET_NERRNO(open(fs, O_DIRECTORY|O_CLOEXEC));
  52 }
  53
  54 int cg_cgroupid_open(int cgroupfs_fd, uint64_t id) {
  55         _cleanup_close_ int fsfd = -EBADF;
  56
  57         if (cgroupfs_fd < 0) {
  58                 fsfd = open("/sys/fs/cgroup", O_CLOEXEC|O_DIRECTORY);
  59                 if (fsfd < 0)
  60                         return -errno;
  61
  62                 cgroupfs_fd = fsfd;
  63         }
  64
  65         cg_file_handle fh = CG_FILE_HANDLE_INIT;
  66         CG_FILE_HANDLE_CGROUPID(fh) = id;
  67
  68         int fd = open_by_handle_at(cgroupfs_fd, &fh.file_handle, O_DIRECTORY|O_CLOEXEC);
  69         if (fd < 0)
  70                 return -errno;
  71
  72         return fd;
  73 }
  74
  75 static int cg_enumerate_items(const char *controller, const char *path, FILE **ret, const char *item) {
  76         _cleanup_free_ char *fs = NULL;
  77         FILE *f;
  78         int r;
  79
  80         assert(ret);
  81
  82         r = cg_get_path(controller, path, item, &fs);
  83         if (r < 0)
  84                 return r;
  85
  86         f = fopen(fs, "re");
  87         if (!f)
  88                 return -errno;
  89
  90         *ret = f;
  91         return 0;
  92 }
  93
  94 int cg_enumerate_processes(const char *controller, const char *path, FILE **ret) {
  95         return cg_enumerate_items(controller, path, ret, "cgroup.procs");
  96 }
  97
  98 int cg_read_pid(FILE *f, pid_t *ret, CGroupFlags flags) {
  99         unsigned long ul;
 100
 101         /* Note that the cgroup.procs might contain duplicates! See cgroups.txt for details. */
 102
 103         assert(f);
 104         assert(ret);
 105
 106         for (;;) {
 107                 errno = 0;
 108                 if (fscanf(f, "%lu", &ul) != 1) {
 109
 110                         if (feof(f)) {
 111                                 *ret = 0;
 112                                 return 0;
 113                         }
 114
 115                         return errno_or_else(EIO);
 116                 }
 117
 118                 if (ul > PID_T_MAX)
 119                         return -EIO;
 120
 121                 /* In some circumstances (e.g. WSL), cgroups might contain unmappable PIDs from other
 122                  * contexts. These show up as zeros, and depending on the caller, can either be plain
 123                  * skipped over, or returned as-is. */
 124                 if (ul == 0 && !FLAGS_SET(flags, CGROUP_DONT_SKIP_UNMAPPED))
 125                         continue;
 126
 127                 *ret = (pid_t) ul;
 128                 return 1;
 129         }
 130 }
 131
 132 int cg_read_pidref(FILE *f, PidRef *ret, CGroupFlags flags) {
 133         int r;
 134
 135         assert(f);
 136         assert(ret);
 137
 138         for (;;) {
 139                 pid_t pid;
 140
 141                 r = cg_read_pid(f, &pid, flags);
 142                 if (r < 0)
 143                         return r;
 144                 if (r == 0) {
 145                         *ret = PIDREF_NULL;
 146                         return 0;
 147                 }
 148
 149                 if (pid == 0)
 150                         return -EREMOTE;
 151
 152                 r = pidref_set_pid(ret, pid);
 153                 if (r >= 0)
 154                         return 1;
 155                 if (r != -ESRCH)
 156                         return r;
 157
 158                 /* ESRCH → gone by now? just skip over it, read the next */
 159         }
 160 }
 161
 162 int cg_read_event(
 163                 const char *controller,
 164                 const char *path,
 165                 const char *event,
 166                 char **ret) {
 167
 168         _cleanup_free_ char *events = NULL, *content = NULL;
 169         int r;
 170
 171         r = cg_get_path(controller, path, "cgroup.events", &events);
 172         if (r < 0)
 173                 return r;
 174
 175         r = read_full_virtual_file(events, &content, NULL);
 176         if (r < 0)
 177                 return r;
 178
 179         for (const char *p = content;;) {
 180                 _cleanup_free_ char *line = NULL, *key = NULL;
 181                 const char *q;
 182
 183                 r = extract_first_word(&p, &line, "\n", 0);
 184                 if (r < 0)
 185                         return r;
 186                 if (r == 0)
 187                         return -ENOENT;
 188
 189                 q = line;
 190                 r = extract_first_word(&q, &key, " ", 0);
 191                 if (r < 0)
 192                         return r;
 193                 if (r == 0)
 194                         return -EINVAL;
 195
 196                 if (!streq(key, event))
 197                         continue;
 198
 199                 return strdup_to(ret, q);
 200         }
 201 }
 202
 203 bool cg_ns_supported(void) {
 204         static thread_local int enabled = -1;
 205
 206         if (enabled >= 0)
 207                 return enabled;
 208
 209         if (access("/proc/self/ns/cgroup", F_OK) < 0) {
 210                 if (errno != ENOENT)
 211                         log_debug_errno(errno, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m");
 212                 enabled = false;
 213         } else
 214                 enabled = true;
 215
 216         return enabled;
 217 }
 218
 219 bool cg_freezer_supported(void) {
 220         static thread_local int supported = -1;
 221
 222         if (supported >= 0)
 223                 return supported;
 224
 225         supported = cg_all_unified() > 0 && access("/sys/fs/cgroup/init.scope/cgroup.freeze", F_OK) == 0;
 226
 227         return supported;
 228 }
 229
 230 bool cg_kill_supported(void) {
 231         static thread_local int supported = -1;
 232
 233         if (supported >= 0)
 234                 return supported;
 235
 236         if (cg_all_unified() <= 0)
 237                 supported = false;
 238         else if (access("/sys/fs/cgroup/init.scope/cgroup.kill", F_OK) < 0) {
 239                 if (errno != ENOENT)
 240                         log_debug_errno(errno, "Failed to check if cgroup.kill is available, assuming not: %m");
 241                 supported = false;
 242         } else
 243                 supported = true;
 244
 245         return supported;
 246 }
 247
 248 int cg_enumerate_subgroups(const char *controller, const char *path, DIR **ret) {
 249         _cleanup_free_ char *fs = NULL;
 250         DIR *d;
 251         int r;
 252
 253         assert(ret);
 254
 255         /* This is not recursive! */
 256
 257         r = cg_get_path(controller, path, NULL, &fs);
 258         if (r < 0)
 259                 return r;
 260
 261         d = opendir(fs);
 262         if (!d)
 263                 return -errno;
 264
 265         *ret = d;
 266         return 0;
 267 }
 268
 269 int cg_read_subgroup(DIR *d, char **ret) {
 270         assert(d);
 271         assert(ret);
 272
 273         FOREACH_DIRENT_ALL(de, d, return -errno) {
 274                 if (de->d_type != DT_DIR)
 275                         continue;
 276
 277                 if (dot_or_dot_dot(de->d_name))
 278                         continue;
 279
 280                 return strdup_to_full(ret, de->d_name);
 281         }
 282
 283         *ret = NULL;
 284         return 0;
 285 }
 286
 287 int cg_rmdir(const char *controller, const char *path) {
 288         _cleanup_free_ char *p = NULL;
 289         int r;
 290
 291         r = cg_get_path(controller, path, NULL, &p);
 292         if (r < 0)
 293                 return r;
 294
 295         r = rmdir(p);
 296         if (r < 0 && errno != ENOENT)
 297                 return -errno;
 298
 299         r = cg_hybrid_unified();
 300         if (r <= 0)
 301                 return r;
 302
 303         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 304                 r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
 305                 if (r < 0)
 306                         log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
 307         }
 308
 309         return 0;
 310 }
 311
 312 static int cg_kill_items(
 313                 const char *path,
 314                 int sig,
 315                 CGroupFlags flags,
 316                 Set *s,
 317                 cg_kill_log_func_t log_kill,
 318                 void *userdata,
 319                 const char *item) {
 320
 321         _cleanup_set_free_ Set *allocated_set = NULL;
 322         bool done = false;
 323         int r, ret = 0, ret_log_kill = 0;
 324
 325         assert(sig >= 0);
 326
 327          /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
 328           * SIGCONT on SIGKILL. */
 329         if (IN_SET(sig, SIGCONT, SIGKILL))
 330                 flags &= ~CGROUP_SIGCONT;
 331
 332         /* This goes through the tasks list and kills them all. This
 333          * is repeated until no further processes are added to the
 334          * tasks list, to properly handle forking processes */
 335
 336         if (!s) {
 337                 s = allocated_set = set_new(NULL);
 338                 if (!s)
 339                         return -ENOMEM;
 340         }
 341
 342         do {
 343                 _cleanup_fclose_ FILE *f = NULL;
 344                 done = true;
 345
 346                 r = cg_enumerate_items(SYSTEMD_CGROUP_CONTROLLER, path, &f, item);
 347                 if (r == -ENOENT)
 348                         break;
 349                 if (r < 0)
 350                         return RET_GATHER(ret, r);
 351
 352                 for (;;) {
 353                         _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
 354
 355                         r = cg_read_pidref(f, &pidref, /* flags = */ 0);
 356                         if (r < 0)
 357                                 return RET_GATHER(ret, r);
 358                         if (r == 0)
 359                                 break;
 360
 361                         if ((flags & CGROUP_IGNORE_SELF) && pidref_is_self(&pidref))
 362                                 continue;
 363
 364                         if (set_get(s, PID_TO_PTR(pidref.pid)) == PID_TO_PTR(pidref.pid))
 365                                 continue;
 366
 367                         if (log_kill)
 368                                 ret_log_kill = log_kill(&pidref, sig, userdata);
 369
 370                         /* If we haven't killed this process yet, kill it */
 371                         r = pidref_kill(&pidref, sig);
 372                         if (r < 0 && r != -ESRCH)
 373                                 RET_GATHER(ret, r);
 374                         if (r >= 0) {
 375                                 if (flags & CGROUP_SIGCONT)
 376                                         (void) pidref_kill(&pidref, SIGCONT);
 377
 378                                 if (ret == 0) {
 379                                         if (log_kill)
 380                                                 ret = ret_log_kill;
 381                                         else
 382                                                 ret = 1;
 383                                 }
 384                         }
 385
 386                         done = false;
 387
 388                         r = set_put(s, PID_TO_PTR(pidref.pid));
 389                         if (r < 0)
 390                                 return RET_GATHER(ret, r);
 391                 }
 392
 393                 /* To avoid racing against processes which fork quicker than we can kill them, we repeat this
 394                  * until no new pids need to be killed. */
 395
 396         } while (!done);
 397
 398         return ret;
 399 }
 400
 401 int cg_kill(
 402                 const char *path,
 403                 int sig,
 404                 CGroupFlags flags,
 405                 Set *s,
 406                 cg_kill_log_func_t log_kill,
 407                 void *userdata) {
 408
 409         int r, ret;
 410
 411         r = cg_kill_items(path, sig, flags, s, log_kill, userdata, "cgroup.procs");
 412         if (r < 0 || sig != SIGKILL)
 413                 return r;
 414
 415         ret = r;
 416
 417         /* Only in case of killing with SIGKILL and when using cgroupsv2, kill remaining threads manually as
 418            a workaround for kernel bug. It was fixed in 5.2-rc5 (c03cd7738a83), backported to 4.19.66
 419            (4340d175b898) and 4.14.138 (feb6b123b7dd). */
 420         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
 421         if (r < 0)
 422                 return r;
 423         if (r == 0)
 424                 return ret;
 425
 426         r = cg_kill_items(path, sig, flags, s, log_kill, userdata, "cgroup.threads");
 427         if (r < 0)
 428                 return r;
 429
 430         return r > 0 || ret > 0;
 431 }
 432
 433 int cg_kill_kernel_sigkill(const char *path) {
 434         /* Kills the cgroup at `path` directly by writing to its cgroup.kill file.  This sends SIGKILL to all
 435          * processes in the cgroup and has the advantage of being completely atomic, unlike cg_kill_items(). */
 436
 437         _cleanup_free_ char *killfile = NULL;
 438         int r;
 439
 440         assert(path);
 441
 442         if (!cg_kill_supported())
 443                 return -EOPNOTSUPP;
 444
 445         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "cgroup.kill", &killfile);
 446         if (r < 0)
 447                 return r;
 448
 449         r = write_string_file(killfile, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
 450         if (r < 0)
 451                 return r;
 452
 453         return 0;
 454 }
 455
 456 int cg_kill_recursive(
 457                 const char *path,
 458                 int sig,
 459                 CGroupFlags flags,
 460                 Set *s,
 461                 cg_kill_log_func_t log_kill,
 462                 void *userdata) {
 463
 464         int r, ret;
 465
 466         assert(path);
 467         assert(sig >= 0);
 468
 469         if (sig == SIGKILL && cg_kill_supported() &&
 470             !FLAGS_SET(flags, CGROUP_IGNORE_SELF) && !s && !log_kill)
 471                 /* ignore CGROUP_SIGCONT, since this is a no-op alongside SIGKILL */
 472                 ret = cg_kill_kernel_sigkill(path);
 473         else {
 474                 _cleanup_set_free_ Set *allocated_set = NULL;
 475                 _cleanup_closedir_ DIR *d = NULL;
 476
 477                 if (!s) {
 478                         s = allocated_set = set_new(NULL);
 479                         if (!s)
 480                                 return -ENOMEM;
 481                 }
 482
 483                 ret = cg_kill(path, sig, flags, s, log_kill, userdata);
 484
 485                 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
 486                 if (r < 0) {
 487                         if (r != -ENOENT)
 488                                 RET_GATHER(ret, r);
 489
 490                         return ret;
 491                 }
 492
 493                 for (;;) {
 494                         _cleanup_free_ char *fn = NULL, *p = NULL;
 495
 496                         r = cg_read_subgroup(d, &fn);
 497                         if (r < 0) {
 498                                 RET_GATHER(ret, r);
 499                                 break;
 500                         }
 501                         if (r == 0)
 502                                 break;
 503
 504                         p = path_join(empty_to_root(path), fn);
 505                         if (!p)
 506                                 return -ENOMEM;
 507
 508                         r = cg_kill_recursive(p, sig, flags, s, log_kill, userdata);
 509                         if (r != 0 && ret >= 0)
 510                                 ret = r;
 511                 }
 512         }
 513
 514         if (FLAGS_SET(flags, CGROUP_REMOVE)) {
 515                 r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER, path);
 516                 if (!IN_SET(r, -ENOENT, -EBUSY))
 517                         RET_GATHER(ret, r);
 518         }
 519
 520         return ret;
 521 }
 522
 523 static const char *controller_to_dirname(const char *controller) {
 524         assert(controller);
 525
 526         /* Converts a controller name to the directory name below /sys/fs/cgroup/ we want to mount it
 527          * to. Effectively, this just cuts off the name= prefixed used for named hierarchies, if it is
 528          * specified. */
 529
 530         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 531                 if (cg_hybrid_unified() > 0)
 532                         controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
 533                 else
 534                         controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
 535         }
 536
 537         return startswith(controller, "name=") ?: controller;
 538 }
 539
 540 static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **ret) {
 541         const char *dn;
 542         char *t = NULL;
 543
 544         assert(ret);
 545         assert(controller);
 546
 547         dn = controller_to_dirname(controller);
 548
 549         if (isempty(path) && isempty(suffix))
 550                 t = path_join("/sys/fs/cgroup", dn);
 551         else if (isempty(path))
 552                 t = path_join("/sys/fs/cgroup", dn, suffix);
 553         else if (isempty(suffix))
 554                 t = path_join("/sys/fs/cgroup", dn, path);
 555         else
 556                 t = path_join("/sys/fs/cgroup", dn, path, suffix);
 557         if (!t)
 558                 return -ENOMEM;
 559
 560         *ret = t;
 561         return 0;
 562 }
 563
 564 static int join_path_unified(const char *path, const char *suffix, char **ret) {
 565         char *t;
 566
 567         assert(ret);
 568
 569         if (isempty(path) && isempty(suffix))
 570                 t = strdup("/sys/fs/cgroup");
 571         else if (isempty(path))
 572                 t = path_join("/sys/fs/cgroup", suffix);
 573         else if (isempty(suffix))
 574                 t = path_join("/sys/fs/cgroup", path);
 575         else
 576                 t = path_join("/sys/fs/cgroup", path, suffix);
 577         if (!t)
 578                 return -ENOMEM;
 579
 580         *ret = t;
 581         return 0;
 582 }
 583
 584 int cg_get_path(const char *controller, const char *path, const char *suffix, char **ret) {
 585         int r;
 586
 587         assert(ret);
 588
 589         if (!controller) {
 590                 char *t;
 591
 592                 /* If no controller is specified, we return the path *below* the controllers, without any
 593                  * prefix. */
 594
 595                 if (isempty(path) && isempty(suffix))
 596                         return -EINVAL;
 597
 598                 if (isempty(suffix))
 599                         t = strdup(path);
 600                 else if (isempty(path))
 601                         t = strdup(suffix);
 602                 else
 603                         t = path_join(path, suffix);
 604                 if (!t)
 605                         return -ENOMEM;
 606
 607                 *ret = path_simplify(t);
 608                 return 0;
 609         }
 610
 611         if (!cg_controller_is_valid(controller))
 612                 return -EINVAL;
 613
 614         r = cg_all_unified();
 615         if (r < 0)
 616                 return r;
 617         if (r > 0)
 618                 r = join_path_unified(path, suffix, ret);
 619         else
 620                 r = join_path_legacy(controller, path, suffix, ret);
 621         if (r < 0)
 622                 return r;
 623
 624         path_simplify(*ret);
 625         return 0;
 626 }
 627
 628 static int controller_is_v1_accessible(const char *root, const char *controller) {
 629         const char *cpath, *dn;
 630
 631         assert(controller);
 632
 633         dn = controller_to_dirname(controller);
 634
 635         /* If root if specified, we check that:
 636          * - possible subcgroup is created at root,
 637          * - we can modify the hierarchy. */
 638
 639         cpath = strjoina("/sys/fs/cgroup/", dn, root, root ? "/cgroup.procs" : NULL);
 640         return laccess(cpath, root ? W_OK : F_OK);
 641 }
 642
 643 int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **ret) {
 644         int r;
 645
 646         assert(controller);
 647         assert(ret);
 648
 649         if (!cg_controller_is_valid(controller))
 650                 return -EINVAL;
 651
 652         r = cg_all_unified();
 653         if (r < 0)
 654                 return r;
 655         if (r > 0) {
 656                 /* In the unified hierarchy all controllers are considered accessible,
 657                  * except for the named hierarchies */
 658                 if (startswith(controller, "name="))
 659                         return -EOPNOTSUPP;
 660         } else {
 661                 /* Check if the specified controller is actually accessible */
 662                 r = controller_is_v1_accessible(NULL, controller);
 663                 if (r < 0)
 664                         return r;
 665         }
 666
 667         return cg_get_path(controller, path, suffix, ret);
 668 }
 669
 670 int cg_set_xattr(const char *path, const char *name, const void *value, size_t size, int flags) {
 671         _cleanup_free_ char *fs = NULL;
 672         int r;
 673
 674         assert(path);
 675         assert(name);
 676         assert(value || size <= 0);
 677
 678         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
 679         if (r < 0)
 680                 return r;
 681
 682         return RET_NERRNO(setxattr(fs, name, value, size, flags));
 683 }
 684
 685 int cg_get_xattr(const char *path, const char *name, void *value, size_t size) {
 686         _cleanup_free_ char *fs = NULL;
 687         ssize_t n;
 688         int r;
 689
 690         assert(path);
 691         assert(name);
 692
 693         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
 694         if (r < 0)
 695                 return r;
 696
 697         n = getxattr(fs, name, value, size);
 698         if (n < 0)
 699                 return -errno;
 700
 701         return (int) n;
 702 }
 703
 704 int cg_get_xattr_malloc(const char *path, const char *name, char **ret) {
 705         _cleanup_free_ char *fs = NULL;
 706         int r;
 707
 708         assert(path);
 709         assert(name);
 710
 711         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
 712         if (r < 0)
 713                 return r;
 714
 715         return lgetxattr_malloc(fs, name, ret);
 716 }
 717
 718 int cg_get_xattr_bool(const char *path, const char *name) {
 719         _cleanup_free_ char *fs = NULL;
 720         int r;
 721
 722         assert(path);
 723         assert(name);
 724
 725         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
 726         if (r < 0)
 727                 return r;
 728
 729         return getxattr_at_bool(AT_FDCWD, fs, name, /* flags= */ 0);
 730 }
 731
 732 int cg_remove_xattr(const char *path, const char *name) {
 733         _cleanup_free_ char *fs = NULL;
 734         int r;
 735
 736         assert(path);
 737         assert(name);
 738
 739         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
 740         if (r < 0)
 741                 return r;
 742
 743         return RET_NERRNO(removexattr(fs, name));
 744 }
 745
 746 int cg_pid_get_path(const char *controller, pid_t pid, char **ret_path) {
 747         _cleanup_fclose_ FILE *f = NULL;
 748         const char *fs, *controller_str = NULL;  /* avoid false maybe-uninitialized warning */
 749         int unified, r;
 750
 751         assert(pid >= 0);
 752         assert(ret_path);
 753
 754         if (controller) {
 755                 if (!cg_controller_is_valid(controller))
 756                         return -EINVAL;
 757         } else
 758                 controller = SYSTEMD_CGROUP_CONTROLLER;
 759
 760         unified = cg_unified_controller(controller);
 761         if (unified < 0)
 762                 return unified;
 763         if (unified == 0) {
 764                 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
 765                         controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
 766                 else
 767                         controller_str = controller;
 768         }
 769
 770         fs = procfs_file_alloca(pid, "cgroup");
 771         r = fopen_unlocked(fs, "re", &f);
 772         if (r == -ENOENT)
 773                 return -ESRCH;
 774         if (r < 0)
 775                 return r;
 776
 777         for (;;) {
 778                 _cleanup_free_ char *line = NULL;
 779                 char *e;
 780
 781                 r = read_line(f, LONG_LINE_MAX, &line);
 782                 if (r < 0)
 783                         return r;
 784                 if (r == 0)
 785                         return -ENODATA;
 786
 787                 if (unified) {
 788                         e = startswith(line, "0:");
 789                         if (!e)
 790                                 continue;
 791
 792                         e = strchr(e, ':');
 793                         if (!e)
 794                                 continue;
 795                 } else {
 796                         char *l;
 797
 798                         l = strchr(line, ':');
 799                         if (!l)
 800                                 continue;
 801
 802                         l++;
 803                         e = strchr(l, ':');
 804                         if (!e)
 805                                 continue;
 806                         *e = 0;
 807
 808                         assert(controller_str);
 809                         r = string_contains_word(l, ",", controller_str);
 810                         if (r < 0)
 811                                 return r;
 812                         if (r == 0)
 813                                 continue;
 814                 }
 815
 816                 char *path = strdup(e + 1);
 817                 if (!path)
 818                         return -ENOMEM;
 819
 820                 /* Truncate suffix indicating the process is a zombie */
 821                 e = endswith(path, " (deleted)");
 822                 if (e)
 823                         *e = 0;
 824
 825                 *ret_path = path;
 826                 return 0;
 827         }
 828 }
 829
 830 int cg_pidref_get_path(const char *controller, const PidRef *pidref, char **ret_path) {
 831         _cleanup_free_ char *path = NULL;
 832         int r;
 833
 834         assert(ret_path);
 835
 836         if (!pidref_is_set(pidref))
 837                 return -ESRCH;
 838
 839         r = cg_pid_get_path(controller, pidref->pid, &path);
 840         if (r < 0)
 841                 return r;
 842
 843         /* Before we return the path, make sure the procfs entry for this pid still matches the pidref */
 844         r = pidref_verify(pidref);
 845         if (r < 0)
 846                 return r;
 847
 848         *ret_path = TAKE_PTR(path);
 849         return 0;
 850 }
 851
 852 int cg_install_release_agent(const char *controller, const char *agent) {
 853         _cleanup_free_ char *fs = NULL, *contents = NULL;
 854         const char *sc;
 855         int r;
 856
 857         assert(agent);
 858
 859         r = cg_unified_controller(controller);
 860         if (r < 0)
 861                 return r;
 862         if (r > 0) /* doesn't apply to unified hierarchy */
 863                 return -EOPNOTSUPP;
 864
 865         r = cg_get_path(controller, NULL, "release_agent", &fs);
 866         if (r < 0)
 867                 return r;
 868
 869         r = read_one_line_file(fs, &contents);
 870         if (r < 0)
 871                 return r;
 872
 873         sc = strstrip(contents);
 874         if (isempty(sc)) {
 875                 r = write_string_file(fs, agent, WRITE_STRING_FILE_DISABLE_BUFFER);
 876                 if (r < 0)
 877                         return r;
 878         } else if (!path_equal(sc, agent))
 879                 return -EEXIST;
 880
 881         fs = mfree(fs);
 882         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
 883         if (r < 0)
 884                 return r;
 885
 886         contents = mfree(contents);
 887         r = read_one_line_file(fs, &contents);
 888         if (r < 0)
 889                 return r;
 890
 891         sc = strstrip(contents);
 892         if (streq(sc, "0")) {
 893                 r = write_string_file(fs, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
 894                 if (r < 0)
 895                         return r;
 896
 897                 return 1;
 898         }
 899
 900         if (!streq(sc, "1"))
 901                 return -EIO;
 902
 903         return 0;
 904 }
 905
 906 int cg_uninstall_release_agent(const char *controller) {
 907         _cleanup_free_ char *fs = NULL;
 908         int r;
 909
 910         r = cg_unified_controller(controller);
 911         if (r < 0)
 912                 return r;
 913         if (r > 0) /* Doesn't apply to unified hierarchy */
 914                 return -EOPNOTSUPP;
 915
 916         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
 917         if (r < 0)
 918                 return r;
 919
 920         r = write_string_file(fs, "0", WRITE_STRING_FILE_DISABLE_BUFFER);
 921         if (r < 0)
 922                 return r;
 923
 924         fs = mfree(fs);
 925
 926         r = cg_get_path(controller, NULL, "release_agent", &fs);
 927         if (r < 0)
 928                 return r;
 929
 930         r = write_string_file(fs, "", WRITE_STRING_FILE_DISABLE_BUFFER);
 931         if (r < 0)
 932                 return r;
 933
 934         return 0;
 935 }
 936
 937 int cg_is_empty(const char *controller, const char *path) {
 938         _cleanup_fclose_ FILE *f = NULL;
 939         pid_t pid;
 940         int r;
 941
 942         assert(path);
 943
 944         r = cg_enumerate_processes(controller, path, &f);
 945         if (r == -ENOENT)
 946                 return true;
 947         if (r < 0)
 948                 return r;
 949
 950         r = cg_read_pid(f, &pid, CGROUP_DONT_SKIP_UNMAPPED);
 951         if (r < 0)
 952                 return r;
 953
 954         return r == 0;
 955 }
 956
 957 int cg_is_empty_recursive(const char *controller, const char *path) {
 958         int r;
 959
 960         assert(path);
 961
 962         /* The root cgroup is always populated */
 963         if (controller && empty_or_root(path))
 964                 return false;
 965
 966         r = cg_unified_controller(controller);
 967         if (r < 0)
 968                 return r;
 969         if (r > 0) {
 970                 _cleanup_free_ char *t = NULL;
 971
 972                 /* On the unified hierarchy we can check empty state
 973                  * via the "populated" attribute of "cgroup.events". */
 974
 975                 r = cg_read_event(controller, path, "populated", &t);
 976                 if (r == -ENOENT)
 977                         return true;
 978                 if (r < 0)
 979                         return r;
 980
 981                 return streq(t, "0");
 982         } else {
 983                 _cleanup_closedir_ DIR *d = NULL;
 984                 char *fn;
 985
 986                 r = cg_is_empty(controller, path);
 987                 if (r <= 0)
 988                         return r;
 989
 990                 r = cg_enumerate_subgroups(controller, path, &d);
 991                 if (r == -ENOENT)
 992                         return true;
 993                 if (r < 0)
 994                         return r;
 995
 996                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
 997                         _cleanup_free_ char *p = NULL;
 998
 999                         p = path_join(path, fn);
1000                         free(fn);
1001                         if (!p)
1002                                 return -ENOMEM;
1003
1004                         r = cg_is_empty_recursive(controller, p);
1005                         if (r <= 0)
1006                                 return r;
1007                 }
1008                 if (r < 0)
1009                         return r;
1010
1011                 return true;
1012         }
1013 }
1014
1015 int cg_split_spec(const char *spec, char **ret_controller, char **ret_path) {
1016         _cleanup_free_ char *controller = NULL, *path = NULL;
1017         int r;
1018
1019         assert(spec);
1020
1021         if (*spec == '/') {
1022                 if (!path_is_normalized(spec))
1023                         return -EINVAL;
1024
1025                 if (ret_path) {
1026                         r = path_simplify_alloc(spec, &path);
1027                         if (r < 0)
1028                                 return r;
1029                 }
1030
1031         } else {
1032                 const char *e;
1033
1034                 e = strchr(spec, ':');
1035                 if (e) {
1036                         controller = strndup(spec, e-spec);
1037                         if (!controller)
1038                                 return -ENOMEM;
1039                         if (!cg_controller_is_valid(controller))
1040                                 return -EINVAL;
1041
1042                         if (!isempty(e + 1)) {
1043                                 path = strdup(e+1);
1044                                 if (!path)
1045                                         return -ENOMEM;
1046
1047                                 if (!path_is_normalized(path) ||
1048                                     !path_is_absolute(path))
1049                                         return -EINVAL;
1050
1051                                 path_simplify(path);
1052                         }
1053
1054                 } else {
1055                         if (!cg_controller_is_valid(spec))
1056                                 return -EINVAL;
1057
1058                         if (ret_controller) {
1059                                 controller = strdup(spec);
1060                                 if (!controller)
1061                                         return -ENOMEM;
1062                         }
1063                 }
1064         }
1065
1066         if (ret_controller)
1067                 *ret_controller = TAKE_PTR(controller);
1068         if (ret_path)
1069                 *ret_path = TAKE_PTR(path);
1070         return 0;
1071 }
1072
1073 int cg_mangle_path(const char *path, char **ret) {
1074         _cleanup_free_ char *c = NULL, *p = NULL;
1075         int r;
1076
1077         assert(path);
1078         assert(ret);
1079
1080         /* First, check if it already is a filesystem path */
1081         if (path_startswith(path, "/sys/fs/cgroup"))
1082                 return path_simplify_alloc(path, ret);
1083
1084         /* Otherwise, treat it as cg spec */
1085         r = cg_split_spec(path, &c, &p);
1086         if (r < 0)
1087                 return r;
1088
1089         return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, ret);
1090 }
1091
1092 int cg_get_root_path(char **ret_path) {
1093         char *p, *e;
1094         int r;
1095
1096         assert(ret_path);
1097
1098         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
1099         if (r < 0)
1100                 return r;
1101
1102         e = endswith(p, "/" SPECIAL_INIT_SCOPE);
1103         if (!e)
1104                 e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
1105         if (!e)
1106                 e = endswith(p, "/system"); /* even more legacy */
1107         if (e)
1108                 *e = 0;
1109
1110         *ret_path = p;
1111         return 0;
1112 }
1113
1114 int cg_shift_path(const char *cgroup, const char *root, const char **ret_shifted) {
1115         _cleanup_free_ char *rt = NULL;
1116         char *p;
1117         int r;
1118
1119         assert(cgroup);
1120         assert(ret_shifted);
1121
1122         if (!root) {
1123                 /* If the root was specified let's use that, otherwise
1124                  * let's determine it from PID 1 */
1125
1126                 r = cg_get_root_path(&rt);
1127                 if (r < 0)
1128                         return r;
1129
1130                 root = rt;
1131         }
1132
1133         p = path_startswith(cgroup, root);
1134         if (p && p > cgroup)
1135                 *ret_shifted = p - 1;
1136         else
1137                 *ret_shifted = cgroup;
1138
1139         return 0;
1140 }
1141
1142 int cg_pid_get_path_shifted(pid_t pid, const char *root, char **ret_cgroup) {
1143         _cleanup_free_ char *raw = NULL;
1144         const char *c;
1145         int r;
1146
1147         assert(pid >= 0);
1148         assert(ret_cgroup);
1149
1150         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
1151         if (r < 0)
1152                 return r;
1153
1154         r = cg_shift_path(raw, root, &c);
1155         if (r < 0)
1156                 return r;
1157
1158         if (c == raw) {
1159                 *ret_cgroup = TAKE_PTR(raw);
1160                 return 0;
1161         }
1162
1163         return strdup_to(ret_cgroup, c);
1164 }
1165
1166 int cg_path_decode_unit(const char *cgroup, char **ret_unit) {
1167         assert(cgroup);
1168         assert(ret_unit);
1169
1170         size_t n = strcspn(cgroup, "/");
1171         if (n < 3)
1172                 return -ENXIO;
1173
1174         char *c = strndupa_safe(cgroup, n);
1175         c = cg_unescape(c);
1176
1177         if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
1178                 return -ENXIO;
1179
1180         return strdup_to(ret_unit, c);
1181 }
1182
1183 static bool valid_slice_name(const char *p, size_t n) {
1184
1185         if (!p)
1186                 return false;
1187
1188         if (n < STRLEN("x.slice"))
1189                 return false;
1190
1191         if (memcmp(p + n - 6, ".slice", 6) == 0) {
1192                 char buf[n+1], *c;
1193
1194                 memcpy(buf, p, n);
1195                 buf[n] = 0;
1196
1197                 c = cg_unescape(buf);
1198
1199                 return unit_name_is_valid(c, UNIT_NAME_PLAIN);
1200         }
1201
1202         return false;
1203 }
1204
static const char *skip_slices(const char *p) {
        const char *component;
        size_t len;

        assert(p);

        /* Skips over all leading slice components of the path and returns a pointer to the first
         * component that is not a valid slice name (leading slashes already skipped). */

        component = p + strspn(p, "/");
        len = strcspn(component, "/");

        while (valid_slice_name(component, len)) {
                component += len;
                component += strspn(component, "/");
                len = strcspn(component, "/");
        }

        return component;
}
1222
1223 int cg_path_get_unit(const char *path, char **ret) {
1224         _cleanup_free_ char *unit = NULL;
1225         const char *e;
1226         int r;
1227
1228         assert(path);
1229         assert(ret);
1230
1231         e = skip_slices(path);
1232
1233         r = cg_path_decode_unit(e, &unit);
1234         if (r < 0)
1235                 return r;
1236
1237         /* We skipped over the slices, don't accept any now */
1238         if (endswith(unit, ".slice"))
1239                 return -ENXIO;
1240
1241         *ret = TAKE_PTR(unit);
1242         return 0;
1243 }
1244
1245 int cg_path_get_unit_path(const char *path, char **ret) {
1246         _cleanup_free_ char *path_copy = NULL;
1247         char *unit_name;
1248
1249         assert(path);
1250         assert(ret);
1251
1252         path_copy = strdup(path);
1253         if (!path_copy)
1254                 return -ENOMEM;
1255
1256         unit_name = (char *)skip_slices(path_copy);
1257         unit_name[strcspn(unit_name, "/")] = 0;
1258
1259         if (!unit_name_is_valid(cg_unescape(unit_name), UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
1260                 return -ENXIO;
1261
1262         *ret = TAKE_PTR(path_copy);
1263
1264         return 0;
1265 }
1266
1267 int cg_pid_get_unit(pid_t pid, char **ret_unit) {
1268         _cleanup_free_ char *cgroup = NULL;
1269         int r;
1270
1271         assert(ret_unit);
1272
1273         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1274         if (r < 0)
1275                 return r;
1276
1277         return cg_path_get_unit(cgroup, ret_unit);
1278 }
1279
1280 int cg_pidref_get_unit(const PidRef *pidref, char **ret) {
1281         _cleanup_free_ char *unit = NULL;
1282         int r;
1283
1284         assert(ret);
1285
1286         if (!pidref_is_set(pidref))
1287                 return -ESRCH;
1288
1289         r = cg_pid_get_unit(pidref->pid, &unit);
1290         if (r < 0)
1291                 return r;
1292
1293         r = pidref_verify(pidref);
1294         if (r < 0)
1295                 return r;
1296
1297         *ret = TAKE_PTR(unit);
1298         return 0;
1299 }
1300
/**
 * Skips a leading "session-*.scope" component. Returns a pointer to what follows it (with
 * slashes skipped), or NULL if the path does not begin with a valid session scope.
 */
static const char *skip_session(const char *p) {
        size_t n, id_len;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        n = strcspn(p, "/");
        if (n < STRLEN("session-x.scope"))
                return NULL;

        if (memcmp(p, "session-", 8) != 0 || memcmp(p + n - 6, ".scope", 6) != 0)
                return NULL;

        /* Extract the session ID, i.e. what sits between the prefix and the suffix */
        id_len = n - 8 - 6;
        char id[id_len + 1];

        memcpy(id, p + 8, id_len);
        id[id_len] = 0;

        /* Session scopes never need unescaping: they cannot conflict with the kernel's own names,
         * hence there is no need to call cg_unescape() here. */

        if (!session_id_valid(id))
                return NULL;

        p += n;
        return p + strspn(p, "/");
}
1337
/**
 * Skips a leading "user@*.service" component. Returns a pointer to what follows it (with
 * slashes skipped), or NULL if the path does not begin with a valid user manager service.
 */
static const char *skip_user_manager(const char *p) {
        size_t n, uid_len;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        n = strcspn(p, "/");
        if (n < STRLEN("user@x.service"))
                return NULL;

        if (memcmp(p, "user@", 5) != 0 || memcmp(p + n - 8, ".service", 8) != 0)
                return NULL;

        /* Extract the instance part, which must be a UID */
        uid_len = n - 5 - 8;
        char uid[uid_len + 1];

        memcpy(uid, p + 5, uid_len);
        uid[uid_len] = 0;

        /* User manager services never need unescaping: they cannot conflict with the kernel's own
         * names, hence there is no need to call cg_unescape() here. */

        if (parse_uid(uid, NULL) < 0)
                return NULL;

        p += n;
        return p + strspn(p, "/");
}
1375
static const char *skip_user_prefix(const char *path) {
        const char *after_slices;

        assert(path);

        /* Skips the part of the path that leads up to the units of a user: first any slices, then
         * either the user manager service or, failing that, the session scope. Returns NULL if
         * neither of the two is there. */

        after_slices = skip_slices(path);

        return skip_user_manager(after_slices) ?: skip_session(after_slices);
}
1392
int cg_path_get_user_unit(const char *path, char **ret) {
        const char *rest;

        assert(path);
        assert(ret);

        /* Returns the user unit a cgroup path refers to. Returns -ENXIO if the path is not located
         * below a user manager or session. */

        rest = skip_user_prefix(path);

        /* Once the user prefix is gone, the remainder is parsed exactly like a system unit path. */
        return rest ? cg_path_get_unit(rest, ret) : -ENXIO;
}
1407
1408 int cg_pid_get_user_unit(pid_t pid, char **ret_unit) {
1409         _cleanup_free_ char *cgroup = NULL;
1410         int r;
1411
1412         assert(ret_unit);
1413
1414         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1415         if (r < 0)
1416                 return r;
1417
1418         return cg_path_get_user_unit(cgroup, ret_unit);
1419 }
1420
1421 int cg_path_get_machine_name(const char *path, char **ret_machine) {
1422         _cleanup_free_ char *u = NULL;
1423         const char *sl;
1424         int r;
1425
1426         r = cg_path_get_unit(path, &u);
1427         if (r < 0)
1428                 return r;
1429
1430         sl = strjoina("/run/systemd/machines/unit:", u);
1431         return readlink_malloc(sl, ret_machine);
1432 }
1433
1434 int cg_pid_get_machine_name(pid_t pid, char **ret_machine) {
1435         _cleanup_free_ char *cgroup = NULL;
1436         int r;
1437
1438         assert(ret_machine);
1439
1440         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1441         if (r < 0)
1442                 return r;
1443
1444         return cg_path_get_machine_name(cgroup, ret_machine);
1445 }
1446
1447 int cg_path_get_cgroupid(const char *path, uint64_t *ret) {
1448         cg_file_handle fh = CG_FILE_HANDLE_INIT;
1449         int mnt_id;
1450
1451         assert(path);
1452         assert(ret);
1453
1454         /* This is cgroupfs so we know the size of the handle, thus no need to loop around like
1455          * name_to_handle_at_loop() does in mountpoint-util.c */
1456         if (name_to_handle_at(AT_FDCWD, path, &fh.file_handle, &mnt_id, 0) < 0)
1457                 return -errno;
1458
1459         *ret = CG_FILE_HANDLE_CGROUPID(fh);
1460         return 0;
1461 }
1462
1463 int cg_fd_get_cgroupid(int fd, uint64_t *ret) {
1464         cg_file_handle fh = CG_FILE_HANDLE_INIT;
1465         int mnt_id = -1;
1466
1467         assert(fd >= 0);
1468         assert(ret);
1469
1470         if (name_to_handle_at(fd, "", &fh.file_handle, &mnt_id, AT_EMPTY_PATH) < 0)
1471                 return -errno;
1472
1473         *ret = CG_FILE_HANDLE_CGROUPID(fh);
1474         return 0;
1475 }
1476
1477 int cg_path_get_session(const char *path, char **ret_session) {
1478         _cleanup_free_ char *unit = NULL;
1479         char *start, *end;
1480         int r;
1481
1482         assert(path);
1483
1484         r = cg_path_get_unit(path, &unit);
1485         if (r < 0)
1486                 return r;
1487
1488         start = startswith(unit, "session-");
1489         if (!start)
1490                 return -ENXIO;
1491         end = endswith(start, ".scope");
1492         if (!end)
1493                 return -ENXIO;
1494
1495         *end = 0;
1496         if (!session_id_valid(start))
1497                 return -ENXIO;
1498
1499         if (!ret_session)
1500                 return 0;
1501
1502         return strdup_to(ret_session, start);
1503 }
1504
1505 int cg_pid_get_session(pid_t pid, char **ret_session) {
1506         _cleanup_free_ char *cgroup = NULL;
1507         int r;
1508
1509         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1510         if (r < 0)
1511                 return r;
1512
1513         return cg_path_get_session(cgroup, ret_session);
1514 }
1515
1516 int cg_path_get_owner_uid(const char *path, uid_t *ret_uid) {
1517         _cleanup_free_ char *slice = NULL;
1518         char *start, *end;
1519         int r;
1520
1521         assert(path);
1522
1523         r = cg_path_get_slice(path, &slice);
1524         if (r < 0)
1525                 return r;
1526
1527         start = startswith(slice, "user-");
1528         if (!start)
1529                 return -ENXIO;
1530
1531         end = endswith(start, ".slice");
1532         if (!end)
1533                 return -ENXIO;
1534
1535         *end = 0;
1536         if (parse_uid(start, ret_uid) < 0)
1537                 return -ENXIO;
1538
1539         return 0;
1540 }
1541
1542 int cg_pid_get_owner_uid(pid_t pid, uid_t *ret_uid) {
1543         _cleanup_free_ char *cgroup = NULL;
1544         int r;
1545
1546         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1547         if (r < 0)
1548                 return r;
1549
1550         return cg_path_get_owner_uid(cgroup, ret_uid);
1551 }
1552
1553 int cg_path_get_slice(const char *p, char **ret_slice) {
1554         const char *e = NULL;
1555
1556         assert(p);
1557         assert(ret_slice);
1558
1559         /* Finds the right-most slice unit from the beginning, but stops before we come to
1560          * the first non-slice unit. */
1561
1562         for (;;) {
1563                 const char *s;
1564                 int n;
1565
1566                 n = path_find_first_component(&p, /* accept_dot_dot = */ false, &s);
1567                 if (n < 0)
1568                         return n;
1569                 if (!valid_slice_name(s, n))
1570                         break;
1571
1572                 e = s;
1573         }
1574
1575         if (e)
1576                 return cg_path_decode_unit(e, ret_slice);
1577
1578         return strdup_to(ret_slice, SPECIAL_ROOT_SLICE);
1579 }
1580
1581 int cg_pid_get_slice(pid_t pid, char **ret_slice) {
1582         _cleanup_free_ char *cgroup = NULL;
1583         int r;
1584
1585         assert(ret_slice);
1586
1587         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1588         if (r < 0)
1589                 return r;
1590
1591         return cg_path_get_slice(cgroup, ret_slice);
1592 }
1593
int cg_path_get_user_slice(const char *p, char **ret_slice) {
        const char *rest;

        assert(p);
        assert(ret_slice);

        /* Returns the user slice a cgroup path refers to. Returns -ENXIO if the path is not located
         * below a user manager or session. */

        rest = skip_user_prefix(p);

        /* Once the user prefix is gone, the remainder is parsed exactly like a system slice path. */
        return rest ? cg_path_get_slice(rest, ret_slice) : -ENXIO;
}
1607
1608 int cg_pid_get_user_slice(pid_t pid, char **ret_slice) {
1609         _cleanup_free_ char *cgroup = NULL;
1610         int r;
1611
1612         assert(ret_slice);
1613
1614         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1615         if (r < 0)
1616                 return r;
1617
1618         return cg_path_get_user_slice(cgroup, ret_slice);
1619 }
1620
1621 bool cg_needs_escape(const char *p) {
1622
1623         /* Checks if the specified path is a valid cgroup name by our rules, or if it must be escaped. Note
1624          * that we consider escaped cgroup names invalid here, as they need to be escaped a second time if
1625          * they shall be used. Also note that various names cannot be made valid by escaping even if we
1626          * return true here (because too long, or contain the forbidden character "/"). */
1627
1628         if (!filename_is_valid(p))
1629                 return true;
1630
1631         if (IN_SET(p[0], '_', '.'))
1632                 return true;
1633
1634         if (STR_IN_SET(p, "notify_on_release", "release_agent", "tasks"))
1635                 return true;
1636
1637         if (startswith(p, "cgroup."))
1638                 return true;
1639
1640         for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1641                 const char *q;
1642
1643                 q = startswith(p, cgroup_controller_to_string(c));
1644                 if (!q)
1645                         continue;
1646
1647                 if (q[0] == '.')
1648                         return true;
1649         }
1650
1651         return false;
1652 }
1653
1654 int cg_escape(const char *p, char **ret) {
1655         _cleanup_free_ char *n = NULL;
1656
1657         /* This implements very minimal escaping for names to be used as file names in the cgroup tree: any
1658          * name which might conflict with a kernel name or is prefixed with '_' is prefixed with a '_'. That
1659          * way, when reading cgroup names it is sufficient to remove a single prefixing underscore if there
1660          * is one. */
1661
1662         /* The return value of this function (unlike cg_unescape()) needs free()! */
1663
1664         if (cg_needs_escape(p)) {
1665                 n = strjoin("_", p);
1666                 if (!n)
1667                         return -ENOMEM;
1668
1669                 if (!filename_is_valid(n)) /* became invalid due to the prefixing? Or contained things like a slash that cannot be fixed by prefixing? */
1670                         return -EINVAL;
1671         } else {
1672                 n = strdup(p);
1673                 if (!n)
1674                         return -ENOMEM;
1675         }
1676
1677         *ret = TAKE_PTR(n);
1678         return 0;
1679 }
1680
char *cg_unescape(const char *p) {
        assert(p);

        /* Drops a single leading underscore, if there is one. The result points into the string
         * passed in, hence (unlike the result of cg_escape()) it must not be freed. */

        return (char*) p + (p[0] == '_' ? 1 : 0);
}
1692
1693 #define CONTROLLER_VALID                        \
1694         DIGITS LETTERS                          \
1695         "_"
1696
1697 bool cg_controller_is_valid(const char *p) {
1698         const char *t, *s;
1699
1700         if (!p)
1701                 return false;
1702
1703         if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
1704                 return true;
1705
1706         s = startswith(p, "name=");
1707         if (s)
1708                 p = s;
1709
1710         if (IN_SET(*p, 0, '_'))
1711                 return false;
1712
1713         for (t = p; *t; t++)
1714                 if (!strchr(CONTROLLER_VALID, *t))
1715                         return false;
1716
1717         if (t - p > NAME_MAX)
1718                 return false;
1719
1720         return true;
1721 }
1722
1723 int cg_slice_to_path(const char *unit, char **ret) {
1724         _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
1725         const char *dash;
1726         int r;
1727
1728         assert(unit);
1729         assert(ret);
1730
1731         if (streq(unit, SPECIAL_ROOT_SLICE))
1732                 return strdup_to(ret, "");
1733
1734         if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
1735                 return -EINVAL;
1736
1737         if (!endswith(unit, ".slice"))
1738                 return -EINVAL;
1739
1740         r = unit_name_to_prefix(unit, &p);
1741         if (r < 0)
1742                 return r;
1743
1744         dash = strchr(p, '-');
1745
1746         /* Don't allow initial dashes */
1747         if (dash == p)
1748                 return -EINVAL;
1749
1750         while (dash) {
1751                 _cleanup_free_ char *escaped = NULL;
1752                 char n[dash - p + sizeof(".slice")];
1753
1754 #if HAS_FEATURE_MEMORY_SANITIZER
1755                 /* msan doesn't instrument stpncpy, so it thinks
1756                  * n is later used uninitialized:
1757                  * https://github.com/google/sanitizers/issues/926
1758                  */
1759                 zero(n);
1760 #endif
1761
1762                 /* Don't allow trailing or double dashes */
1763                 if (IN_SET(dash[1], 0, '-'))
1764                         return -EINVAL;
1765
1766                 strcpy(stpncpy(n, p, dash - p), ".slice");
1767                 if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
1768                         return -EINVAL;
1769
1770                 r = cg_escape(n, &escaped);
1771                 if (r < 0)
1772                         return r;
1773
1774                 if (!strextend(&s, escaped, "/"))
1775                         return -ENOMEM;
1776
1777                 dash = strchr(dash+1, '-');
1778         }
1779
1780         r = cg_escape(unit, &e);
1781         if (r < 0)
1782                 return r;
1783
1784         if (!strextend(&s, e))
1785                 return -ENOMEM;
1786
1787         *ret = TAKE_PTR(s);
1788         return 0;
1789 }
1790
1791 int cg_is_threaded(const char *path) {
1792         _cleanup_free_ char *fs = NULL, *contents = NULL;
1793         _cleanup_strv_free_ char **v = NULL;
1794         int r;
1795
1796         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "cgroup.type", &fs);
1797         if (r < 0)
1798                 return r;
1799
1800         r = read_full_virtual_file(fs, &contents, NULL);
1801         if (r == -ENOENT)
1802                 return false; /* Assume no. */
1803         if (r < 0)
1804                 return r;
1805
1806         v = strv_split(contents, NULL);
1807         if (!v)
1808                 return -ENOMEM;
1809
1810         /* If the cgroup is in the threaded mode, it contains "threaded".
1811          * If one of the parents or siblings is in the threaded mode, it may contain "invalid". */
1812         return strv_contains(v, "threaded") || strv_contains(v, "invalid");
1813 }
1814
1815 int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
1816         _cleanup_free_ char *p = NULL;
1817         int r;
1818
1819         r = cg_get_path(controller, path, attribute, &p);
1820         if (r < 0)
1821                 return r;
1822
1823         return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER);
1824 }
1825
1826 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
1827         _cleanup_free_ char *p = NULL;
1828         int r;
1829
1830         r = cg_get_path(controller, path, attribute, &p);
1831         if (r < 0)
1832                 return r;
1833
1834         return read_one_line_file(p, ret);
1835 }
1836
1837 int cg_get_attribute_as_uint64(const char *controller, const char *path, const char *attribute, uint64_t *ret) {
1838         _cleanup_free_ char *value = NULL;
1839         uint64_t v;
1840         int r;
1841
1842         assert(ret);
1843
1844         r = cg_get_attribute(controller, path, attribute, &value);
1845         if (r == -ENOENT)
1846                 return -ENODATA;
1847         if (r < 0)
1848                 return r;
1849
1850         if (streq(value, "max")) {
1851                 *ret = CGROUP_LIMIT_MAX;
1852                 return 0;
1853         }
1854
1855         r = safe_atou64(value, &v);
1856         if (r < 0)
1857                 return r;
1858
1859         *ret = v;
1860         return 0;
1861 }
1862
1863 int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret) {
1864         _cleanup_free_ char *value = NULL;
1865         int r;
1866
1867         assert(ret);
1868
1869         r = cg_get_attribute(controller, path, attribute, &value);
1870         if (r == -ENOENT)
1871                 return -ENODATA;
1872         if (r < 0)
1873                 return r;
1874
1875         r = parse_boolean(value);
1876         if (r < 0)
1877                 return r;
1878
1879         *ret = r;
1880         return 0;
1881 }
1882
1883 int cg_get_owner(const char *path, uid_t *ret_uid) {
1884         _cleanup_free_ char *f = NULL;
1885         struct stat stats;
1886         int r;
1887
1888         assert(ret_uid);
1889
1890         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &f);
1891         if (r < 0)
1892                 return r;
1893
1894         if (stat(f, &stats) < 0)
1895                 return -errno;
1896
1897         r = stat_verify_directory(&stats);
1898         if (r < 0)
1899                 return r;
1900
1901         *ret_uid = stats.st_uid;
1902         return 0;
1903 }
1904
1905 int cg_get_keyed_attribute_full(
1906                 const char *controller,
1907                 const char *path,
1908                 const char *attribute,
1909                 char **keys,
1910                 char **ret_values,
1911                 CGroupKeyMode mode) {
1912
1913         _cleanup_free_ char *filename = NULL, *contents = NULL;
1914         const char *p;
1915         size_t n, i, n_done = 0;
1916         char **v;
1917         int r;
1918
1919         /* Reads one or more fields of a cgroup v2 keyed attribute file. The 'keys' parameter should be an strv with
1920          * all keys to retrieve. The 'ret_values' parameter should be passed as string size with the same number of
1921          * entries as 'keys'. On success each entry will be set to the value of the matching key.
1922          *
1923          * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. If mode
1924          * is set to GG_KEY_MODE_GRACEFUL we ignore missing keys and return those that were parsed successfully. */
1925
1926         r = cg_get_path(controller, path, attribute, &filename);
1927         if (r < 0)
1928                 return r;
1929
1930         r = read_full_file(filename, &contents, NULL);
1931         if (r < 0)
1932                 return r;
1933
1934         n = strv_length(keys);
1935         if (n == 0) /* No keys to retrieve? That's easy, we are done then */
1936                 return 0;
1937
1938         /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
1939         v = newa0(char*, n);
1940
1941         for (p = contents; *p;) {
1942                 const char *w = NULL;
1943
1944                 for (i = 0; i < n; i++)
1945                         if (!v[i]) {
1946                                 w = first_word(p, keys[i]);
1947                                 if (w)
1948                                         break;
1949                         }
1950
1951                 if (w) {
1952                         size_t l;
1953
1954                         l = strcspn(w, NEWLINE);
1955                         v[i] = strndup(w, l);
1956                         if (!v[i]) {
1957                                 r = -ENOMEM;
1958                                 goto fail;
1959                         }
1960
1961                         n_done++;
1962                         if (n_done >= n)
1963                                 goto done;
1964
1965                         p = w + l;
1966                 } else
1967                         p += strcspn(p, NEWLINE);
1968
1969                 p += strspn(p, NEWLINE);
1970         }
1971
1972         if (mode & CG_KEY_MODE_GRACEFUL)
1973                 goto done;
1974
1975         r = -ENXIO;
1976
1977 fail:
1978         free_many_charp(v, n);
1979         return r;
1980
1981 done:
1982         memcpy(ret_values, v, sizeof(char*) * n);
1983         if (mode & CG_KEY_MODE_GRACEFUL)
1984                 return n_done;
1985
1986         return 0;
1987 }
1988
1989 int cg_mask_to_string(CGroupMask mask, char **ret) {
1990         _cleanup_free_ char *s = NULL;
1991         bool space = false;
1992         CGroupController c;
1993         size_t n = 0;
1994
1995         assert(ret);
1996
1997         if (mask == 0) {
1998                 *ret = NULL;
1999                 return 0;
2000         }
2001
2002         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2003                 const char *k;
2004                 size_t l;
2005
2006                 if (!FLAGS_SET(mask, CGROUP_CONTROLLER_TO_MASK(c)))
2007                         continue;
2008
2009                 k = cgroup_controller_to_string(c);
2010                 l = strlen(k);
2011
2012                 if (!GREEDY_REALLOC(s, n + space + l + 1))
2013                         return -ENOMEM;
2014
2015                 if (space)
2016                         s[n] = ' ';
2017                 memcpy(s + n + space, k, l);
2018                 n += space + l;
2019
2020                 space = true;
2021         }
2022
2023         assert(s);
2024
2025         s[n] = 0;
2026         *ret = TAKE_PTR(s);
2027
2028         return 0;
2029 }
2030
2031 int cg_mask_from_string(const char *value, CGroupMask *ret) {
2032         CGroupMask m = 0;
2033
2034         assert(ret);
2035         assert(value);
2036
2037         for (;;) {
2038                 _cleanup_free_ char *n = NULL;
2039                 CGroupController v;
2040                 int r;
2041
2042                 r = extract_first_word(&value, &n, NULL, 0);
2043                 if (r < 0)
2044                         return r;
2045                 if (r == 0)
2046                         break;
2047
2048                 v = cgroup_controller_from_string(n);
2049                 if (v < 0)
2050                         continue;
2051
2052                 m |= CGROUP_CONTROLLER_TO_MASK(v);
2053         }
2054
2055         *ret = m;
2056         return 0;
2057 }
2058
2059 int cg_mask_supported_subtree(const char *root, CGroupMask *ret) {
2060         CGroupMask mask;
2061         int r;
2062
2063         /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that
2064          * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz
2065          * pseudo-controllers. */
2066
2067         r = cg_all_unified();
2068         if (r < 0)
2069                 return r;
2070         if (r > 0) {
2071                 _cleanup_free_ char *controllers = NULL, *path = NULL;
2072
2073                 /* In the unified hierarchy we can read the supported and accessible controllers from
2074                  * the top-level cgroup attribute */
2075
2076                 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
2077                 if (r < 0)
2078                         return r;
2079
2080                 r = read_one_line_file(path, &controllers);
2081                 if (r < 0)
2082                         return r;
2083
2084                 r = cg_mask_from_string(controllers, &mask);
2085                 if (r < 0)
2086                         return r;
2087
2088                 /* Mask controllers that are not supported in unified hierarchy. */
2089                 mask &= CGROUP_MASK_V2;
2090
2091         } else {
2092                 CGroupController c;
2093
2094                 /* In the legacy hierarchy, we check which hierarchies are accessible. */
2095
2096                 mask = 0;
2097                 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2098                         CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2099                         const char *n;
2100
2101                         if (!FLAGS_SET(CGROUP_MASK_V1, bit))
2102                                 continue;
2103
2104                         n = cgroup_controller_to_string(c);
2105                         if (controller_is_v1_accessible(root, n) >= 0)
2106                                 mask |= bit;
2107                 }
2108         }
2109
2110         *ret = mask;
2111         return 0;
2112 }
2113
2114 int cg_mask_supported(CGroupMask *ret) {
2115         _cleanup_free_ char *root = NULL;
2116         int r;
2117
2118         r = cg_get_root_path(&root);
2119         if (r < 0)
2120                 return r;
2121
2122         return cg_mask_supported_subtree(root, ret);
2123 }
2124
2125 int cg_kernel_controllers(Set **ret) {
2126         _cleanup_set_free_ Set *controllers = NULL;
2127         _cleanup_fclose_ FILE *f = NULL;
2128         int r;
2129
2130         assert(ret);
2131
2132         /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support
2133          * and controllers that aren't currently accessible (because not mounted). This does not include "name="
2134          * pseudo-controllers. */
2135
2136         r = fopen_unlocked("/proc/cgroups", "re", &f);
2137         if (r == -ENOENT) {
2138                 *ret = NULL;
2139                 return 0;
2140         }
2141         if (r < 0)
2142                 return r;
2143
2144         /* Ignore the header line */
2145         (void) read_line(f, SIZE_MAX, NULL);
2146
2147         for (;;) {
2148                 _cleanup_free_ char *controller = NULL;
2149                 int enabled = 0;
2150
2151                 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
2152
2153                         if (ferror(f))
2154                                 return -errno;
2155
2156                         if (feof(f))
2157                                 break;
2158
2159                         return -EBADMSG;
2160                 }
2161
2162                 if (!enabled)
2163                         continue;
2164
2165                 if (!cg_controller_is_valid(controller))
2166                         return -EBADMSG;
2167
2168                 r = set_ensure_consume(&controllers, &string_hash_ops_free, TAKE_PTR(controller));
2169                 if (r < 0)
2170                         return r;
2171         }
2172
2173         *ret = TAKE_PTR(controllers);
2174
2175         return 0;
2176 }
2177
2178 /* The hybrid mode was initially implemented in v232 and simply mounted cgroup2 on
2179  * /sys/fs/cgroup/systemd. This unfortunately broke other tools (such as docker) which expected the v1
2180  * "name=systemd" hierarchy on /sys/fs/cgroup/systemd. From v233 and on, the hybrid mode mounts v2 on
2181  * /sys/fs/cgroup/unified and maintains "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility
2182  * with other tools.
2183  *
2184  * To keep live upgrade working, we detect and support v232 layout. When v232 layout is detected, to keep
2185  * cgroup v2 process management but disable the compat dual layout, we return true on
2186  * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and false on cg_hybrid_unified().
2187  */
2188 static thread_local bool unified_systemd_v232;
2189
2190 int cg_unified_cached(bool flush) {
2191         static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;
2192
2193         struct statfs fs;
2194
2195         /* Checks if we support the unified hierarchy. Returns an
2196          * error when the cgroup hierarchies aren't mounted yet or we
2197          * have any other trouble determining if the unified hierarchy
2198          * is supported. */
2199
2200         if (flush)
2201                 unified_cache = CGROUP_UNIFIED_UNKNOWN;
2202         else if (unified_cache >= CGROUP_UNIFIED_NONE)
2203                 return unified_cache;
2204
2205         if (statfs("/sys/fs/cgroup/", &fs) < 0)
2206                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\") failed: %m");
2207
2208         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2209                 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2210                 unified_cache = CGROUP_UNIFIED_ALL;
2211         } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2212                 if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
2213                     F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2214                         log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2215                         unified_cache = CGROUP_UNIFIED_SYSTEMD;
2216                         unified_systemd_v232 = false;
2217                 } else {
2218                         if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0) {
2219                                 if (errno == ENOENT) {
2220                                         /* Some other software may have set up /sys/fs/cgroup in a configuration we do not recognize. */
2221                                         log_debug_errno(errno, "Unsupported cgroupsv1 setup detected: name=systemd hierarchy not found.");
2222                                         return -ENOMEDIUM;
2223                                 }
2224                                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
2225                         }
2226
2227                         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2228                                 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2229                                 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2230                                 unified_systemd_v232 = true;
2231                         } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) {
2232                                 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2233                                 unified_cache = CGROUP_UNIFIED_NONE;
2234                         } else {
2235                                 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2236                                           (unsigned long long) fs.f_type);
2237                                 unified_cache = CGROUP_UNIFIED_NONE;
2238                         }
2239                 }
2240         } else if (F_TYPE_EQUAL(fs.f_type, SYSFS_MAGIC)) {
2241                 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2242                                        "No filesystem is currently mounted on /sys/fs/cgroup.");
2243         } else
2244                 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2245                                        "Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2246                                        (unsigned long long)fs.f_type);
2247
2248         return unified_cache;
2249 }
2250
2251 int cg_unified_controller(const char *controller) {
2252         int r;
2253
2254         r = cg_unified_cached(false);
2255         if (r < 0)
2256                 return r;
2257
2258         if (r == CGROUP_UNIFIED_NONE)
2259                 return false;
2260
2261         if (r >= CGROUP_UNIFIED_ALL)
2262                 return true;
2263
2264         return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER);
2265 }
2266
2267 int cg_all_unified(void) {
2268         int r;
2269
2270         r = cg_unified_cached(false);
2271         if (r < 0)
2272                 return r;
2273
2274         return r >= CGROUP_UNIFIED_ALL;
2275 }
2276
2277 int cg_hybrid_unified(void) {
2278         int r;
2279
2280         r = cg_unified_cached(false);
2281         if (r < 0)
2282                 return r;
2283
2284         return r == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232;
2285 }
2286
int cg_is_delegated(const char *path) {
        assert(path);

        /* The trusted xattr is the preferred source of truth. */
        int r = cg_get_xattr_bool(path, "trusted.delegate");
        if (ERRNO_IS_NEG_XATTR_ABSENT(r)) {
                /* Not set: fall back to the untrusted one. Whoever is trusted enough to own the cgroup is
                 * assumed to be trusted enough to decide whether it is delegated, so this should be safe. */
                r = cg_get_xattr_bool(path, "user.delegate");
                if (ERRNO_IS_NEG_XATTR_ABSENT(r))
                        return false;
        }

        return r;
}
2302
int cg_is_delegated_fd(int fd) {
        assert(fd >= 0);

        /* Same lookup order as cg_is_delegated(), but on an already opened file descriptor: first the
         * trusted xattr, then the user one; if neither is present the answer is "no". */
        int r = getxattr_at_bool(fd, /* path= */ NULL, "trusted.delegate", /* flags= */ 0);
        if (ERRNO_IS_NEG_XATTR_ABSENT(r)) {
                r = getxattr_at_bool(fd, /* path= */ NULL, "user.delegate", /* flags= */ 0);
                if (ERRNO_IS_NEG_XATTR_ABSENT(r))
                        return false;
        }

        return r;
}
2315
int cg_has_coredump_receive(const char *path) {
        assert(path);

        /* A missing "user.coredump_receive" xattr counts as false; every other result, including errors,
         * is passed through unchanged. */
        int r = cg_get_xattr_bool(path, "user.coredump_receive");

        return ERRNO_IS_NEG_XATTR_ABSENT(r) ? false : r;
}
2327
2328 const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2329         [CGROUP_IO_RBPS_MAX]    = CGROUP_LIMIT_MAX,
2330         [CGROUP_IO_WBPS_MAX]    = CGROUP_LIMIT_MAX,
2331         [CGROUP_IO_RIOPS_MAX]   = CGROUP_LIMIT_MAX,
2332         [CGROUP_IO_WIOPS_MAX]   = CGROUP_LIMIT_MAX,
2333 };
2334
2335 static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2336         [CGROUP_IO_RBPS_MAX]    = "IOReadBandwidthMax",
2337         [CGROUP_IO_WBPS_MAX]    = "IOWriteBandwidthMax",
2338         [CGROUP_IO_RIOPS_MAX]   = "IOReadIOPSMax",
2339         [CGROUP_IO_WIOPS_MAX]   = "IOWriteIOPSMax",
2340 };
2341
2342 DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2343
2344 bool is_cgroup_fs(const struct statfs *s) {
2345         return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2346                is_fs_type(s, CGROUP2_SUPER_MAGIC);
2347 }
2348
bool fd_is_cgroup_fs(int fd) {
        struct statfs s;

        /* Returns true if 'fd' refers to something on a cgroup or cgroup2 file system.
         *
         * The return type is bool, hence we cannot propagate an error code: returning -errno here would be
         * converted to 'true' and make a failed fstatfs() look like a positive answer. If the file system
         * cannot be queried we therefore report false. */
        if (fstatfs(fd, &s) < 0)
                return false;

        return is_cgroup_fs(&s);
}
2357
2358 static const char *const cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
2359         [CGROUP_CONTROLLER_CPU] = "cpu",
2360         [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
2361         [CGROUP_CONTROLLER_CPUSET] = "cpuset",
2362         [CGROUP_CONTROLLER_IO] = "io",
2363         [CGROUP_CONTROLLER_BLKIO] = "blkio",
2364         [CGROUP_CONTROLLER_MEMORY] = "memory",
2365         [CGROUP_CONTROLLER_DEVICES] = "devices",
2366         [CGROUP_CONTROLLER_PIDS] = "pids",
2367         [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall",
2368         [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices",
2369         [CGROUP_CONTROLLER_BPF_FOREIGN] = "bpf-foreign",
2370         [CGROUP_CONTROLLER_BPF_SOCKET_BIND] = "bpf-socket-bind",
2371         [CGROUP_CONTROLLER_BPF_RESTRICT_NETWORK_INTERFACES] = "bpf-restrict-network-interfaces",
2372 };
2373
2374 DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);
2375
2376 CGroupMask get_cpu_accounting_mask(void) {
2377         static CGroupMask needed_mask = (CGroupMask) -1;
2378
2379         /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is
2380          * provided externally from the CPU controller, which means we don't
2381          * need to enable the CPU controller just to get metrics. This is good,
2382          * because enabling the CPU controller comes at a minor performance
2383          * hit, especially when it's propagated deep into large hierarchies.
2384          * There's also no separate CPU accounting controller available within
2385          * a unified hierarchy.
2386          *
2387          * This combination of factors results in the desired cgroup mask to
2388          * enable for CPU accounting varying as follows:
2389          *
2390          *                   ╔═════════════════════╤═════════════════════╗
2391          *                   ║     Linux ≥4.15     │     Linux <4.15     ║
2392          *   ╔═══════════════╬═════════════════════╪═════════════════════╣
2393          *   ║ Unified       ║ nothing             │ CGROUP_MASK_CPU     ║
2394          *   ╟───────────────╫─────────────────────┼─────────────────────╢
2395          *   ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║
2396          *   ╚═══════════════╩═════════════════════╧═════════════════════╝
2397          *
2398          * We check kernel version here instead of manually checking whether
2399          * cpu.stat is present for every cgroup, as that check in itself would
2400          * already be fairly expensive.
2401          *
2402          * Kernels where this patch has been backported will therefore have the
2403          * CPU controller enabled unnecessarily. This is more expensive than
2404          * necessary, but harmless. ☺️
2405          */
2406
2407         if (needed_mask == (CGroupMask) -1) {
2408                 if (cg_all_unified()) {
2409                         struct utsname u;
2410                         assert_se(uname(&u) >= 0);
2411
2412                         if (strverscmp_improved(u.release, "4.15") < 0)
2413                                 needed_mask = CGROUP_MASK_CPU;
2414                         else
2415                                 needed_mask = 0;
2416                 } else
2417                         needed_mask = CGROUP_MASK_CPUACCT;
2418         }
2419
2420         return needed_mask;
2421 }
2422
2423 bool cpu_accounting_is_cheap(void) {
2424         return get_cpu_accounting_mask() == 0;
2425 }
2426
2427 static const char* const managed_oom_mode_table[_MANAGED_OOM_MODE_MAX] = {
2428         [MANAGED_OOM_AUTO] = "auto",
2429         [MANAGED_OOM_KILL] = "kill",
2430 };
2431
2432 DEFINE_STRING_TABLE_LOOKUP(managed_oom_mode, ManagedOOMMode);
2433
2434 static const char* const managed_oom_preference_table[_MANAGED_OOM_PREFERENCE_MAX] = {
2435         [MANAGED_OOM_PREFERENCE_NONE] = "none",
2436         [MANAGED_OOM_PREFERENCE_AVOID] = "avoid",
2437         [MANAGED_OOM_PREFERENCE_OMIT] = "omit",
2438 };
2439
2440 DEFINE_STRING_TABLE_LOOKUP(managed_oom_preference, ManagedOOMPreference);