src/basic/cgroup-util.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <limits.h>
   5 #include <signal.h>
   6 #include <stddef.h>
   7 #include <stdlib.h>
   8 #include <sys/types.h>
   9 #include <sys/utsname.h>
  10 #include <sys/xattr.h>
  11 #include <unistd.h>
  12
  13 #include "alloc-util.h"
  14 #include "cgroup-util.h"
  15 #include "constants.h"
  16 #include "dirent-util.h"
  17 #include "extract-word.h"
  18 #include "fd-util.h"
  19 #include "fileio.h"
  20 #include "format-util.h"
  21 #include "fs-util.h"
  22 #include "log.h"
  23 #include "login-util.h"
  24 #include "macro.h"
  25 #include "missing_magic.h"
  26 #include "missing_threads.h"
  27 #include "mkdir.h"
  28 #include "parse-util.h"
  29 #include "path-util.h"
  30 #include "process-util.h"
  31 #include "set.h"
  32 #include "special.h"
  33 #include "stat-util.h"
  34 #include "stdio-util.h"
  35 #include "string-table.h"
  36 #include "string-util.h"
  37 #include "strv.h"
  38 #include "unit-name.h"
  39 #include "user-util.h"
  40 #include "xattr-util.h"
  41
  42 static int cg_enumerate_items(const char *controller, const char *path, FILE **ret, const char *item) {
  43         _cleanup_free_ char *fs = NULL;
  44         FILE *f;
  45         int r;
  46
  47         assert(ret);
  48
  49         r = cg_get_path(controller, path, item, &fs);
  50         if (r < 0)
  51                 return r;
  52
  53         f = fopen(fs, "re");
  54         if (!f)
  55                 return -errno;
  56
  57         *ret = f;
  58         return 0;
  59 }
  60
  61 int cg_enumerate_processes(const char *controller, const char *path, FILE **ret) {
  62         return cg_enumerate_items(controller, path, ret, "cgroup.procs");
  63 }
  64
  65 int cg_read_pid(FILE *f, pid_t *ret) {
  66         unsigned long ul;
  67
  68         /* Note that the cgroup.procs might contain duplicates! See cgroups.txt for details. */
  69
  70         assert(f);
  71         assert(ret);
  72
  73         errno = 0;
  74         if (fscanf(f, "%lu", &ul) != 1) {
  75
  76                 if (feof(f)) {
  77                         *ret = 0;
  78                         return 0;
  79                 }
  80
  81                 return errno_or_else(EIO);
  82         }
  83
  84         if (ul <= 0)
  85                 return -EIO;
  86         if (ul > PID_T_MAX)
  87                 return -EIO;
  88
  89         *ret = (pid_t) ul;
  90         return 1;
  91 }
  92
  93 int cg_read_pidref(FILE *f, PidRef *ret) {
  94         int r;
  95
  96         assert(f);
  97         assert(ret);
  98
  99         for (;;) {
 100                 pid_t pid;
 101
 102                 r = cg_read_pid(f, &pid);
 103                 if (r < 0)
 104                         return r;
 105                 if (r == 0) {
 106                         *ret = PIDREF_NULL;
 107                         return 0;
 108                 }
 109
 110                 r = pidref_set_pid(ret, pid);
 111                 if (r >= 0)
 112                         return 1;
 113                 if (r != -ESRCH)
 114                         return r;
 115
 116                 /* ESRCH → gone by now? just skip over it, read the next */
 117         }
 118 }
 119
 120 int cg_read_event(
 121                 const char *controller,
 122                 const char *path,
 123                 const char *event,
 124                 char **ret) {
 125
 126         _cleanup_free_ char *events = NULL, *content = NULL;
 127         int r;
 128
 129         r = cg_get_path(controller, path, "cgroup.events", &events);
 130         if (r < 0)
 131                 return r;
 132
 133         r = read_full_virtual_file(events, &content, NULL);
 134         if (r < 0)
 135                 return r;
 136
 137         for (const char *p = content;;) {
 138                 _cleanup_free_ char *line = NULL, *key = NULL, *val = NULL;
 139                 const char *q;
 140
 141                 r = extract_first_word(&p, &line, "\n", 0);
 142                 if (r < 0)
 143                         return r;
 144                 if (r == 0)
 145                         return -ENOENT;
 146
 147                 q = line;
 148                 r = extract_first_word(&q, &key, " ", 0);
 149                 if (r < 0)
 150                         return r;
 151                 if (r == 0)
 152                         return -EINVAL;
 153
 154                 if (!streq(key, event))
 155                         continue;
 156
 157                 val = strdup(q);
 158                 if (!val)
 159                         return -ENOMEM;
 160
 161                 *ret = TAKE_PTR(val);
 162                 return 0;
 163         }
 164 }
 165
 166 bool cg_ns_supported(void) {
 167         static thread_local int enabled = -1;
 168
 169         if (enabled >= 0)
 170                 return enabled;
 171
 172         if (access("/proc/self/ns/cgroup", F_OK) < 0) {
 173                 if (errno != ENOENT)
 174                         log_debug_errno(errno, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m");
 175                 enabled = false;
 176         } else
 177                 enabled = true;
 178
 179         return enabled;
 180 }
 181
 182 bool cg_freezer_supported(void) {
 183         static thread_local int supported = -1;
 184
 185         if (supported >= 0)
 186                 return supported;
 187
 188         supported = cg_all_unified() > 0 && access("/sys/fs/cgroup/init.scope/cgroup.freeze", F_OK) == 0;
 189
 190         return supported;
 191 }
 192
 193 bool cg_kill_supported(void) {
 194         static thread_local int supported = -1;
 195
 196         if (supported >= 0)
 197                 return supported;
 198
 199         if (cg_all_unified() <= 0)
 200                 supported = false;
 201         else if (access("/sys/fs/cgroup/init.scope/cgroup.kill", F_OK) < 0) {
 202                 if (errno != ENOENT)
 203                         log_debug_errno(errno, "Failed to check if cgroup.kill is available, assuming not: %m");
 204                 supported = false;
 205         } else
 206                 supported = true;
 207
 208         return supported;
 209 }
 210
 211 int cg_enumerate_subgroups(const char *controller, const char *path, DIR **ret) {
 212         _cleanup_free_ char *fs = NULL;
 213         DIR *d;
 214         int r;
 215
 216         assert(ret);
 217
 218         /* This is not recursive! */
 219
 220         r = cg_get_path(controller, path, NULL, &fs);
 221         if (r < 0)
 222                 return r;
 223
 224         d = opendir(fs);
 225         if (!d)
 226                 return -errno;
 227
 228         *ret = d;
 229         return 0;
 230 }
 231
 232 int cg_read_subgroup(DIR *d, char **ret) {
 233         assert(d);
 234         assert(ret);
 235
 236         FOREACH_DIRENT_ALL(de, d, return -errno) {
 237                 char *b;
 238
 239                 if (de->d_type != DT_DIR)
 240                         continue;
 241
 242                 if (dot_or_dot_dot(de->d_name))
 243                         continue;
 244
 245                 b = strdup(de->d_name);
 246                 if (!b)
 247                         return -ENOMEM;
 248
 249                 *ret = b;
 250                 return 1;
 251         }
 252
 253         *ret = NULL;
 254         return 0;
 255 }
 256
 257 int cg_rmdir(const char *controller, const char *path) {
 258         _cleanup_free_ char *p = NULL;
 259         int r;
 260
 261         r = cg_get_path(controller, path, NULL, &p);
 262         if (r < 0)
 263                 return r;
 264
 265         r = rmdir(p);
 266         if (r < 0 && errno != ENOENT)
 267                 return -errno;
 268
 269         r = cg_hybrid_unified();
 270         if (r <= 0)
 271                 return r;
 272
 273         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 274                 r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
 275                 if (r < 0)
 276                         log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
 277         }
 278
 279         return 0;
 280 }
 281
 282 static int cg_kill_items(
 283                 const char *path,
 284                 int sig,
 285                 CGroupFlags flags,
 286                 Set *s,
 287                 cg_kill_log_func_t log_kill,
 288                 void *userdata,
 289                 const char *item) {
 290
 291         _cleanup_set_free_ Set *allocated_set = NULL;
 292         bool done = false;
 293         int r, ret = 0, ret_log_kill = 0;
 294
 295         assert(sig >= 0);
 296
 297          /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
 298           * SIGCONT on SIGKILL. */
 299         if (IN_SET(sig, SIGCONT, SIGKILL))
 300                 flags &= ~CGROUP_SIGCONT;
 301
 302         /* This goes through the tasks list and kills them all. This
 303          * is repeated until no further processes are added to the
 304          * tasks list, to properly handle forking processes */
 305
 306         if (!s) {
 307                 s = allocated_set = set_new(NULL);
 308                 if (!s)
 309                         return -ENOMEM;
 310         }
 311
 312         do {
 313                 _cleanup_fclose_ FILE *f = NULL;
 314                 done = true;
 315
 316                 r = cg_enumerate_items(SYSTEMD_CGROUP_CONTROLLER, path, &f, item);
 317                 if (r == -ENOENT)
 318                         break;
 319                 if (r < 0)
 320                         return RET_GATHER(ret, r);
 321
 322                 for (;;) {
 323                         _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
 324
 325                         r = cg_read_pidref(f, &pidref);
 326                         if (r < 0)
 327                                 return RET_GATHER(ret, r);
 328                         if (r == 0)
 329                                 break;
 330
 331                         if ((flags & CGROUP_IGNORE_SELF) && pidref_is_self(&pidref))
 332                                 continue;
 333
 334                         if (set_get(s, PID_TO_PTR(pidref.pid)) == PID_TO_PTR(pidref.pid))
 335                                 continue;
 336
 337                         if (log_kill)
 338                                 ret_log_kill = log_kill(&pidref, sig, userdata);
 339
 340                         /* If we haven't killed this process yet, kill it */
 341                         r = pidref_kill(&pidref, sig);
 342                         if (r < 0 && r != -ESRCH)
 343                                 RET_GATHER(ret, r);
 344                         if (r >= 0) {
 345                                 if (flags & CGROUP_SIGCONT)
 346                                         (void) pidref_kill(&pidref, SIGCONT);
 347
 348                                 if (ret == 0) {
 349                                         if (log_kill)
 350                                                 ret = ret_log_kill;
 351                                         else
 352                                                 ret = 1;
 353                                 }
 354                         }
 355
 356                         done = false;
 357
 358                         r = set_put(s, PID_TO_PTR(pidref.pid));
 359                         if (r < 0)
 360                                 return RET_GATHER(ret, r);
 361                 }
 362
 363                 /* To avoid racing against processes which fork quicker than we can kill them, we repeat this
 364                  * until no new pids need to be killed. */
 365
 366         } while (!done);
 367
 368         return ret;
 369 }
 370
 371 int cg_kill(
 372                 const char *path,
 373                 int sig,
 374                 CGroupFlags flags,
 375                 Set *s,
 376                 cg_kill_log_func_t log_kill,
 377                 void *userdata) {
 378
 379         int r, ret;
 380
 381         r = cg_kill_items(path, sig, flags, s, log_kill, userdata, "cgroup.procs");
 382         if (r < 0 || sig != SIGKILL)
 383                 return r;
 384
 385         ret = r;
 386
 387         /* Only in case of killing with SIGKILL and when using cgroupsv2, kill remaining threads manually as
 388            a workaround for kernel bug. It was fixed in 5.2-rc5 (c03cd7738a83), backported to 4.19.66
 389            (4340d175b898) and 4.14.138 (feb6b123b7dd). */
 390         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
 391         if (r < 0)
 392                 return r;
 393         if (r == 0)
 394                 return ret;
 395
 396         r = cg_kill_items(path, sig, flags, s, log_kill, userdata, "cgroup.threads");
 397         if (r < 0)
 398                 return r;
 399
 400         return r > 0 || ret > 0;
 401 }
 402
 403 int cg_kill_kernel_sigkill(const char *path) {
 404         /* Kills the cgroup at `path` directly by writing to its cgroup.kill file.  This sends SIGKILL to all
 405          * processes in the cgroup and has the advantage of being completely atomic, unlike cg_kill_items(). */
 406
 407         _cleanup_free_ char *killfile = NULL;
 408         int r;
 409
 410         assert(path);
 411
 412         if (!cg_kill_supported())
 413                 return -EOPNOTSUPP;
 414
 415         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "cgroup.kill", &killfile);
 416         if (r < 0)
 417                 return r;
 418
 419         r = write_string_file(killfile, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
 420         if (r < 0)
 421                 return r;
 422
 423         return 0;
 424 }
 425
 426 int cg_kill_recursive(
 427                 const char *path,
 428                 int sig,
 429                 CGroupFlags flags,
 430                 Set *s,
 431                 cg_kill_log_func_t log_kill,
 432                 void *userdata) {
 433
 434         int r, ret;
 435
 436         assert(path);
 437         assert(sig >= 0);
 438
 439         if (sig == SIGKILL && cg_kill_supported() &&
 440             !FLAGS_SET(flags, CGROUP_IGNORE_SELF) && !s && !log_kill)
 441                 /* ignore CGROUP_SIGCONT, since this is a no-op alongside SIGKILL */
 442                 ret = cg_kill_kernel_sigkill(path);
 443         else {
 444                 _cleanup_set_free_ Set *allocated_set = NULL;
 445                 _cleanup_closedir_ DIR *d = NULL;
 446
 447                 if (!s) {
 448                         s = allocated_set = set_new(NULL);
 449                         if (!s)
 450                                 return -ENOMEM;
 451                 }
 452
 453                 ret = cg_kill(path, sig, flags, s, log_kill, userdata);
 454
 455                 r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
 456                 if (r < 0) {
 457                         if (r != -ENOENT)
 458                                 RET_GATHER(ret, r);
 459
 460                         return ret;
 461                 }
 462
 463                 for (;;) {
 464                         _cleanup_free_ char *fn = NULL, *p = NULL;
 465
 466                         r = cg_read_subgroup(d, &fn);
 467                         if (r < 0) {
 468                                 RET_GATHER(ret, r);
 469                                 break;
 470                         }
 471                         if (r == 0)
 472                                 break;
 473
 474                         p = path_join(empty_to_root(path), fn);
 475                         if (!p)
 476                                 return -ENOMEM;
 477
 478                         r = cg_kill_recursive(p, sig, flags, s, log_kill, userdata);
 479                         if (r != 0 && ret >= 0)
 480                                 ret = r;
 481                 }
 482         }
 483
 484         if (FLAGS_SET(flags, CGROUP_REMOVE)) {
 485                 r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER, path);
 486                 if (!IN_SET(r, -ENOENT, -EBUSY))
 487                         RET_GATHER(ret, r);
 488         }
 489
 490         return ret;
 491 }
 492
 493 static const char *controller_to_dirname(const char *controller) {
 494         assert(controller);
 495
 496         /* Converts a controller name to the directory name below /sys/fs/cgroup/ we want to mount it
 497          * to. Effectively, this just cuts off the name= prefixed used for named hierarchies, if it is
 498          * specified. */
 499
 500         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 501                 if (cg_hybrid_unified() > 0)
 502                         controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
 503                 else
 504                         controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
 505         }
 506
 507         return startswith(controller, "name=") ?: controller;
 508 }
 509
 510 static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **ret) {
 511         const char *dn;
 512         char *t = NULL;
 513
 514         assert(ret);
 515         assert(controller);
 516
 517         dn = controller_to_dirname(controller);
 518
 519         if (isempty(path) && isempty(suffix))
 520                 t = path_join("/sys/fs/cgroup", dn);
 521         else if (isempty(path))
 522                 t = path_join("/sys/fs/cgroup", dn, suffix);
 523         else if (isempty(suffix))
 524                 t = path_join("/sys/fs/cgroup", dn, path);
 525         else
 526                 t = path_join("/sys/fs/cgroup", dn, path, suffix);
 527         if (!t)
 528                 return -ENOMEM;
 529
 530         *ret = t;
 531         return 0;
 532 }
 533
 534 static int join_path_unified(const char *path, const char *suffix, char **ret) {
 535         char *t;
 536
 537         assert(ret);
 538
 539         if (isempty(path) && isempty(suffix))
 540                 t = strdup("/sys/fs/cgroup");
 541         else if (isempty(path))
 542                 t = path_join("/sys/fs/cgroup", suffix);
 543         else if (isempty(suffix))
 544                 t = path_join("/sys/fs/cgroup", path);
 545         else
 546                 t = path_join("/sys/fs/cgroup", path, suffix);
 547         if (!t)
 548                 return -ENOMEM;
 549
 550         *ret = t;
 551         return 0;
 552 }
 553
 554 int cg_get_path(const char *controller, const char *path, const char *suffix, char **ret) {
 555         int r;
 556
 557         assert(ret);
 558
 559         if (!controller) {
 560                 char *t;
 561
 562                 /* If no controller is specified, we return the path *below* the controllers, without any
 563                  * prefix. */
 564
 565                 if (isempty(path) && isempty(suffix))
 566                         return -EINVAL;
 567
 568                 if (isempty(suffix))
 569                         t = strdup(path);
 570                 else if (isempty(path))
 571                         t = strdup(suffix);
 572                 else
 573                         t = path_join(path, suffix);
 574                 if (!t)
 575                         return -ENOMEM;
 576
 577                 *ret = path_simplify(t);
 578                 return 0;
 579         }
 580
 581         if (!cg_controller_is_valid(controller))
 582                 return -EINVAL;
 583
 584         r = cg_all_unified();
 585         if (r < 0)
 586                 return r;
 587         if (r > 0)
 588                 r = join_path_unified(path, suffix, ret);
 589         else
 590                 r = join_path_legacy(controller, path, suffix, ret);
 591         if (r < 0)
 592                 return r;
 593
 594         path_simplify(*ret);
 595         return 0;
 596 }
 597
 598 static int controller_is_v1_accessible(const char *root, const char *controller) {
 599         const char *cpath, *dn;
 600
 601         assert(controller);
 602
 603         dn = controller_to_dirname(controller);
 604
 605         /* If root if specified, we check that:
 606          * - possible subcgroup is created at root,
 607          * - we can modify the hierarchy. */
 608
 609         cpath = strjoina("/sys/fs/cgroup/", dn, root, root ? "/cgroup.procs" : NULL);
 610         return laccess(cpath, root ? W_OK : F_OK);
 611 }
 612
 613 int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **ret) {
 614         int r;
 615
 616         assert(controller);
 617         assert(ret);
 618
 619         if (!cg_controller_is_valid(controller))
 620                 return -EINVAL;
 621
 622         r = cg_all_unified();
 623         if (r < 0)
 624                 return r;
 625         if (r > 0) {
 626                 /* In the unified hierarchy all controllers are considered accessible,
 627                  * except for the named hierarchies */
 628                 if (startswith(controller, "name="))
 629                         return -EOPNOTSUPP;
 630         } else {
 631                 /* Check if the specified controller is actually accessible */
 632                 r = controller_is_v1_accessible(NULL, controller);
 633                 if (r < 0)
 634                         return r;
 635         }
 636
 637         return cg_get_path(controller, path, suffix, ret);
 638 }
 639
 640 int cg_set_xattr(const char *path, const char *name, const void *value, size_t size, int flags) {
 641         _cleanup_free_ char *fs = NULL;
 642         int r;
 643
 644         assert(path);
 645         assert(name);
 646         assert(value || size <= 0);
 647
 648         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
 649         if (r < 0)
 650                 return r;
 651
 652         return RET_NERRNO(setxattr(fs, name, value, size, flags));
 653 }
 654
 655 int cg_get_xattr(const char *path, const char *name, void *value, size_t size) {
 656         _cleanup_free_ char *fs = NULL;
 657         ssize_t n;
 658         int r;
 659
 660         assert(path);
 661         assert(name);
 662
 663         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
 664         if (r < 0)
 665                 return r;
 666
 667         n = getxattr(fs, name, value, size);
 668         if (n < 0)
 669                 return -errno;
 670
 671         return (int) n;
 672 }
 673
 674 int cg_get_xattr_malloc(const char *path, const char *name, char **ret) {
 675         _cleanup_free_ char *fs = NULL;
 676         int r;
 677
 678         assert(path);
 679         assert(name);
 680
 681         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
 682         if (r < 0)
 683                 return r;
 684
 685         return lgetxattr_malloc(fs, name, ret);
 686 }
 687
 688 int cg_get_xattr_bool(const char *path, const char *name) {
 689         _cleanup_free_ char *fs = NULL;
 690         int r;
 691
 692         assert(path);
 693         assert(name);
 694
 695         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
 696         if (r < 0)
 697                 return r;
 698
 699         return getxattr_at_bool(AT_FDCWD, fs, name, /* flags= */ 0);
 700 }
 701
 702 int cg_remove_xattr(const char *path, const char *name) {
 703         _cleanup_free_ char *fs = NULL;
 704         int r;
 705
 706         assert(path);
 707         assert(name);
 708
 709         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
 710         if (r < 0)
 711                 return r;
 712
 713         return RET_NERRNO(removexattr(fs, name));
 714 }
 715
 716 int cg_pid_get_path(const char *controller, pid_t pid, char **ret_path) {
 717         _cleanup_fclose_ FILE *f = NULL;
 718         const char *fs, *controller_str = NULL;  /* avoid false maybe-uninitialized warning */
 719         int unified, r;
 720
 721         assert(pid >= 0);
 722         assert(ret_path);
 723
 724         if (controller) {
 725                 if (!cg_controller_is_valid(controller))
 726                         return -EINVAL;
 727         } else
 728                 controller = SYSTEMD_CGROUP_CONTROLLER;
 729
 730         unified = cg_unified_controller(controller);
 731         if (unified < 0)
 732                 return unified;
 733         if (unified == 0) {
 734                 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
 735                         controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
 736                 else
 737                         controller_str = controller;
 738         }
 739
 740         fs = procfs_file_alloca(pid, "cgroup");
 741         r = fopen_unlocked(fs, "re", &f);
 742         if (r == -ENOENT)
 743                 return -ESRCH;
 744         if (r < 0)
 745                 return r;
 746
 747         for (;;) {
 748                 _cleanup_free_ char *line = NULL;
 749                 char *e;
 750
 751                 r = read_line(f, LONG_LINE_MAX, &line);
 752                 if (r < 0)
 753                         return r;
 754                 if (r == 0)
 755                         return -ENODATA;
 756
 757                 if (unified) {
 758                         e = startswith(line, "0:");
 759                         if (!e)
 760                                 continue;
 761
 762                         e = strchr(e, ':');
 763                         if (!e)
 764                                 continue;
 765                 } else {
 766                         char *l;
 767
 768                         l = strchr(line, ':');
 769                         if (!l)
 770                                 continue;
 771
 772                         l++;
 773                         e = strchr(l, ':');
 774                         if (!e)
 775                                 continue;
 776                         *e = 0;
 777
 778                         assert(controller_str);
 779                         r = string_contains_word(l, ",", controller_str);
 780                         if (r < 0)
 781                                 return r;
 782                         if (r == 0)
 783                                 continue;
 784                 }
 785
 786                 char *path = strdup(e + 1);
 787                 if (!path)
 788                         return -ENOMEM;
 789
 790                 /* Truncate suffix indicating the process is a zombie */
 791                 e = endswith(path, " (deleted)");
 792                 if (e)
 793                         *e = 0;
 794
 795                 *ret_path = path;
 796                 return 0;
 797         }
 798 }
 799
 800 int cg_pidref_get_path(const char *controller, const PidRef *pidref, char **ret_path) {
 801         _cleanup_free_ char *path = NULL;
 802         int r;
 803
 804         assert(ret_path);
 805
 806         if (!pidref_is_set(pidref))
 807                 return -ESRCH;
 808
 809         r = cg_pid_get_path(controller, pidref->pid, &path);
 810         if (r < 0)
 811                 return r;
 812
 813         /* Before we return the path, make sure the procfs entry for this pid still matches the pidref */
 814         r = pidref_verify(pidref);
 815         if (r < 0)
 816                 return r;
 817
 818         *ret_path = TAKE_PTR(path);
 819         return 0;
 820 }
 821
 822 int cg_install_release_agent(const char *controller, const char *agent) {
 823         _cleanup_free_ char *fs = NULL, *contents = NULL;
 824         const char *sc;
 825         int r;
 826
 827         assert(agent);
 828
 829         r = cg_unified_controller(controller);
 830         if (r < 0)
 831                 return r;
 832         if (r > 0) /* doesn't apply to unified hierarchy */
 833                 return -EOPNOTSUPP;
 834
 835         r = cg_get_path(controller, NULL, "release_agent", &fs);
 836         if (r < 0)
 837                 return r;
 838
 839         r = read_one_line_file(fs, &contents);
 840         if (r < 0)
 841                 return r;
 842
 843         sc = strstrip(contents);
 844         if (isempty(sc)) {
 845                 r = write_string_file(fs, agent, WRITE_STRING_FILE_DISABLE_BUFFER);
 846                 if (r < 0)
 847                         return r;
 848         } else if (!path_equal(sc, agent))
 849                 return -EEXIST;
 850
 851         fs = mfree(fs);
 852         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
 853         if (r < 0)
 854                 return r;
 855
 856         contents = mfree(contents);
 857         r = read_one_line_file(fs, &contents);
 858         if (r < 0)
 859                 return r;
 860
 861         sc = strstrip(contents);
 862         if (streq(sc, "0")) {
 863                 r = write_string_file(fs, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
 864                 if (r < 0)
 865                         return r;
 866
 867                 return 1;
 868         }
 869
 870         if (!streq(sc, "1"))
 871                 return -EIO;
 872
 873         return 0;
 874 }
 875
 876 int cg_uninstall_release_agent(const char *controller) {
 877         _cleanup_free_ char *fs = NULL;
 878         int r;
 879
 880         r = cg_unified_controller(controller);
 881         if (r < 0)
 882                 return r;
 883         if (r > 0) /* Doesn't apply to unified hierarchy */
 884                 return -EOPNOTSUPP;
 885
 886         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
 887         if (r < 0)
 888                 return r;
 889
 890         r = write_string_file(fs, "0", WRITE_STRING_FILE_DISABLE_BUFFER);
 891         if (r < 0)
 892                 return r;
 893
 894         fs = mfree(fs);
 895
 896         r = cg_get_path(controller, NULL, "release_agent", &fs);
 897         if (r < 0)
 898                 return r;
 899
 900         r = write_string_file(fs, "", WRITE_STRING_FILE_DISABLE_BUFFER);
 901         if (r < 0)
 902                 return r;
 903
 904         return 0;
 905 }
 906
 907 int cg_is_empty(const char *controller, const char *path) {
 908         _cleanup_fclose_ FILE *f = NULL;
 909         pid_t pid;
 910         int r;
 911
 912         assert(path);
 913
 914         r = cg_enumerate_processes(controller, path, &f);
 915         if (r == -ENOENT)
 916                 return true;
 917         if (r < 0)
 918                 return r;
 919
 920         r = cg_read_pid(f, &pid);
 921         if (r < 0)
 922                 return r;
 923
 924         return r == 0;
 925 }
 926
 927 int cg_is_empty_recursive(const char *controller, const char *path) {
 928         int r;
 929
 930         assert(path);
 931
 932         /* The root cgroup is always populated */
 933         if (controller && empty_or_root(path))
 934                 return false;
 935
 936         r = cg_unified_controller(controller);
 937         if (r < 0)
 938                 return r;
 939         if (r > 0) {
 940                 _cleanup_free_ char *t = NULL;
 941
 942                 /* On the unified hierarchy we can check empty state
 943                  * via the "populated" attribute of "cgroup.events". */
 944
 945                 r = cg_read_event(controller, path, "populated", &t);
 946                 if (r == -ENOENT)
 947                         return true;
 948                 if (r < 0)
 949                         return r;
 950
 951                 return streq(t, "0");
 952         } else {
 953                 _cleanup_closedir_ DIR *d = NULL;
 954                 char *fn;
 955
 956                 r = cg_is_empty(controller, path);
 957                 if (r <= 0)
 958                         return r;
 959
 960                 r = cg_enumerate_subgroups(controller, path, &d);
 961                 if (r == -ENOENT)
 962                         return true;
 963                 if (r < 0)
 964                         return r;
 965
 966                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
 967                         _cleanup_free_ char *p = NULL;
 968
 969                         p = path_join(path, fn);
 970                         free(fn);
 971                         if (!p)
 972                                 return -ENOMEM;
 973
 974                         r = cg_is_empty_recursive(controller, p);
 975                         if (r <= 0)
 976                                 return r;
 977                 }
 978                 if (r < 0)
 979                         return r;
 980
 981                 return true;
 982         }
 983 }
 984
 985 int cg_split_spec(const char *spec, char **ret_controller, char **ret_path) {
 986         _cleanup_free_ char *controller = NULL, *path = NULL;
 987         int r;
 988
 989         assert(spec);
 990
 991         if (*spec == '/') {
 992                 if (!path_is_normalized(spec))
 993                         return -EINVAL;
 994
 995                 if (ret_path) {
 996                         r = path_simplify_alloc(spec, &path);
 997                         if (r < 0)
 998                                 return r;
 999                 }
1000
1001         } else {
1002                 const char *e;
1003
1004                 e = strchr(spec, ':');
1005                 if (e) {
1006                         controller = strndup(spec, e-spec);
1007                         if (!controller)
1008                                 return -ENOMEM;
1009                         if (!cg_controller_is_valid(controller))
1010                                 return -EINVAL;
1011
1012                         if (!isempty(e + 1)) {
1013                                 path = strdup(e+1);
1014                                 if (!path)
1015                                         return -ENOMEM;
1016
1017                                 if (!path_is_normalized(path) ||
1018                                     !path_is_absolute(path))
1019                                         return -EINVAL;
1020
1021                                 path_simplify(path);
1022                         }
1023
1024                 } else {
1025                         if (!cg_controller_is_valid(spec))
1026                                 return -EINVAL;
1027
1028                         if (ret_controller) {
1029                                 controller = strdup(spec);
1030                                 if (!controller)
1031                                         return -ENOMEM;
1032                         }
1033                 }
1034         }
1035
1036         if (ret_controller)
1037                 *ret_controller = TAKE_PTR(controller);
1038         if (ret_path)
1039                 *ret_path = TAKE_PTR(path);
1040         return 0;
1041 }
1042
1043 int cg_mangle_path(const char *path, char **ret) {
1044         _cleanup_free_ char *c = NULL, *p = NULL;
1045         int r;
1046
1047         assert(path);
1048         assert(ret);
1049
1050         /* First, check if it already is a filesystem path */
1051         if (path_startswith(path, "/sys/fs/cgroup"))
1052                 return path_simplify_alloc(path, ret);
1053
1054         /* Otherwise, treat it as cg spec */
1055         r = cg_split_spec(path, &c, &p);
1056         if (r < 0)
1057                 return r;
1058
1059         return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, ret);
1060 }
1061
1062 int cg_get_root_path(char **ret_path) {
1063         char *p, *e;
1064         int r;
1065
1066         assert(ret_path);
1067
1068         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
1069         if (r < 0)
1070                 return r;
1071
1072         e = endswith(p, "/" SPECIAL_INIT_SCOPE);
1073         if (!e)
1074                 e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
1075         if (!e)
1076                 e = endswith(p, "/system"); /* even more legacy */
1077         if (e)
1078                 *e = 0;
1079
1080         *ret_path = p;
1081         return 0;
1082 }
1083
1084 int cg_shift_path(const char *cgroup, const char *root, const char **ret_shifted) {
1085         _cleanup_free_ char *rt = NULL;
1086         char *p;
1087         int r;
1088
1089         assert(cgroup);
1090         assert(ret_shifted);
1091
1092         if (!root) {
1093                 /* If the root was specified let's use that, otherwise
1094                  * let's determine it from PID 1 */
1095
1096                 r = cg_get_root_path(&rt);
1097                 if (r < 0)
1098                         return r;
1099
1100                 root = rt;
1101         }
1102
1103         p = path_startswith(cgroup, root);
1104         if (p && p > cgroup)
1105                 *ret_shifted = p - 1;
1106         else
1107                 *ret_shifted = cgroup;
1108
1109         return 0;
1110 }
1111
1112 int cg_pid_get_path_shifted(pid_t pid, const char *root, char **ret_cgroup) {
1113         _cleanup_free_ char *raw = NULL;
1114         const char *c;
1115         int r;
1116
1117         assert(pid >= 0);
1118         assert(ret_cgroup);
1119
1120         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
1121         if (r < 0)
1122                 return r;
1123
1124         r = cg_shift_path(raw, root, &c);
1125         if (r < 0)
1126                 return r;
1127
1128         if (c == raw)
1129                 *ret_cgroup = TAKE_PTR(raw);
1130         else {
1131                 char *n;
1132
1133                 n = strdup(c);
1134                 if (!n)
1135                         return -ENOMEM;
1136
1137                 *ret_cgroup = n;
1138         }
1139
1140         return 0;
1141 }
1142
1143 int cg_path_decode_unit(const char *cgroup, char **ret_unit) {
1144         char *c, *s;
1145         size_t n;
1146
1147         assert(cgroup);
1148         assert(ret_unit);
1149
1150         n = strcspn(cgroup, "/");
1151         if (n < 3)
1152                 return -ENXIO;
1153
1154         c = strndupa_safe(cgroup, n);
1155         c = cg_unescape(c);
1156
1157         if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
1158                 return -ENXIO;
1159
1160         s = strdup(c);
1161         if (!s)
1162                 return -ENOMEM;
1163
1164         *ret_unit = s;
1165         return 0;
1166 }
1167
1168 static bool valid_slice_name(const char *p, size_t n) {
1169
1170         if (!p)
1171                 return false;
1172
1173         if (n < STRLEN("x.slice"))
1174                 return false;
1175
1176         if (memcmp(p + n - 6, ".slice", 6) == 0) {
1177                 char buf[n+1], *c;
1178
1179                 memcpy(buf, p, n);
1180                 buf[n] = 0;
1181
1182                 c = cg_unescape(buf);
1183
1184                 return unit_name_is_valid(c, UNIT_NAME_PLAIN);
1185         }
1186
1187         return false;
1188 }
1189
/* Skips over all leading slice components of the cgroup path 'p'. The returned pointer refers to the
 * first component that is not a valid slice name (with any slashes in front of it skipped as well), or to
 * the terminating NUL byte if the path consists of slices only. */
static const char *skip_slices(const char *p) {
        assert(p);

        p += strspn(p, "/");

        for (size_t len = strcspn(p, "/"); valid_slice_name(p, len); len = strcspn(p, "/")) {
                /* Jump over the slice component and the separator(s) following it */
                p += len;
                p += strspn(p, "/");
        }

        return p;
}
1207
1208 int cg_path_get_unit(const char *path, char **ret) {
1209         _cleanup_free_ char *unit = NULL;
1210         const char *e;
1211         int r;
1212
1213         assert(path);
1214         assert(ret);
1215
1216         e = skip_slices(path);
1217
1218         r = cg_path_decode_unit(e, &unit);
1219         if (r < 0)
1220                 return r;
1221
1222         /* We skipped over the slices, don't accept any now */
1223         if (endswith(unit, ".slice"))
1224                 return -ENXIO;
1225
1226         *ret = TAKE_PTR(unit);
1227         return 0;
1228 }
1229
1230 int cg_path_get_unit_path(const char *path, char **ret) {
1231         _cleanup_free_ char *path_copy = NULL;
1232         char *unit_name;
1233
1234         assert(path);
1235         assert(ret);
1236
1237         path_copy = strdup(path);
1238         if (!path_copy)
1239                 return -ENOMEM;
1240
1241         unit_name = (char *)skip_slices(path_copy);
1242         unit_name[strcspn(unit_name, "/")] = 0;
1243
1244         if (!unit_name_is_valid(cg_unescape(unit_name), UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
1245                 return -ENXIO;
1246
1247         *ret = TAKE_PTR(path_copy);
1248
1249         return 0;
1250 }
1251
1252 int cg_pid_get_unit(pid_t pid, char **ret_unit) {
1253         _cleanup_free_ char *cgroup = NULL;
1254         int r;
1255
1256         assert(ret_unit);
1257
1258         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1259         if (r < 0)
1260                 return r;
1261
1262         return cg_path_get_unit(cgroup, ret_unit);
1263 }
1264
1265 int cg_pidref_get_unit(const PidRef *pidref, char **ret) {
1266         _cleanup_free_ char *unit = NULL;
1267         int r;
1268
1269         assert(ret);
1270
1271         if (!pidref_is_set(pidref))
1272                 return -ESRCH;
1273
1274         r = cg_pid_get_unit(pidref->pid, &unit);
1275         if (r < 0)
1276                 return r;
1277
1278         r = pidref_verify(pidref);
1279         if (r < 0)
1280                 return r;
1281
1282         *ret = TAKE_PTR(unit);
1283         return 0;
1284 }
1285
/**
 * Skips a leading "session-<id>.scope" component. The component is mandatory: NULL is returned if the
 * path is empty, does not begin with such a component, or carries an invalid session ID in it.
 */
static const char *skip_session(const char *p) {
        size_t len, id_len;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        len = strcspn(p, "/");
        if (len < STRLEN("session-x.scope"))
                return NULL;

        if (memcmp(p, "session-", 8) != 0 || memcmp(p + len - 6, ".scope", 6) != 0)
                return NULL;

        /* What remains between the 8 character prefix and the 6 character suffix is the session ID */
        id_len = len - 8 - 6;

        char id[id_len + 1];

        memcpy(id, p + 8, id_len);
        id[id_len] = 0;

        /* Session scopes never need unescaping, since their names cannot conflict with the kernel's
         * own names, hence there's no need to call cg_unescape() here. */

        if (!session_id_valid(id))
                return NULL;

        p += len;
        return p + strspn(p, "/");
}
1322
/**
 * Skips a leading "user@<uid>.service" component. The component is mandatory: NULL is returned if the
 * path is empty, does not begin with such a component, or the instance part does not parse as UID.
 */
static const char *skip_user_manager(const char *p) {
        size_t len, uid_len;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        len = strcspn(p, "/");
        if (len < STRLEN("user@x.service"))
                return NULL;

        if (memcmp(p, "user@", 5) != 0 || memcmp(p + len - 8, ".service", 8) != 0)
                return NULL;

        /* What remains between the 5 character prefix and the 8 character suffix is the UID */
        uid_len = len - 5 - 8;

        char uid[uid_len + 1];

        memcpy(uid, p + 5, uid_len);
        uid[uid_len] = 0;

        /* User manager services never need unescaping, since their names cannot conflict with the
         * kernel's own names, hence there's no need to call cg_unescape() here. */

        if (parse_uid(uid, NULL) < 0)
                return NULL;

        p += len;
        return p + strspn(p, "/");
}
1360
/* Skips everything in front of the user-level part of a cgroup path: first any slices, then either the
 * user manager service or, failing that, the session scope. Returns NULL if neither of the two follows
 * the slices. */
static const char *skip_user_prefix(const char *path) {
        const char *rest, *after_manager;

        assert(path);

        /* Slices first, if there are any */
        rest = skip_slices(path);

        /* Prefer the user manager; only if it isn't there try the session scope */
        after_manager = skip_user_manager(rest);
        return after_manager ? after_manager : skip_session(rest);
}
1377
/* Extracts the user unit from a cgroup path. Returns -ENXIO if the path has no user manager or session
 * prefix. */
int cg_path_get_user_unit(const char *path, char **ret) {
        const char *rest;

        assert(path);
        assert(ret);

        rest = skip_user_prefix(path);

        /* Past the user prefix the layout is the same as for system units, hence reuse that parser. */
        return rest ? cg_path_get_unit(rest, ret) : -ENXIO;
}
1392
1393 int cg_pid_get_user_unit(pid_t pid, char **ret_unit) {
1394         _cleanup_free_ char *cgroup = NULL;
1395         int r;
1396
1397         assert(ret_unit);
1398
1399         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1400         if (r < 0)
1401                 return r;
1402
1403         return cg_path_get_user_unit(cgroup, ret_unit);
1404 }
1405
1406 int cg_path_get_machine_name(const char *path, char **ret_machine) {
1407         _cleanup_free_ char *u = NULL;
1408         const char *sl;
1409         int r;
1410
1411         r = cg_path_get_unit(path, &u);
1412         if (r < 0)
1413                 return r;
1414
1415         sl = strjoina("/run/systemd/machines/unit:", u);
1416         return readlink_malloc(sl, ret_machine);
1417 }
1418
1419 int cg_pid_get_machine_name(pid_t pid, char **ret_machine) {
1420         _cleanup_free_ char *cgroup = NULL;
1421         int r;
1422
1423         assert(ret_machine);
1424
1425         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1426         if (r < 0)
1427                 return r;
1428
1429         return cg_path_get_machine_name(cgroup, ret_machine);
1430 }
1431
1432 int cg_path_get_cgroupid(const char *path, uint64_t *ret) {
1433         cg_file_handle fh = CG_FILE_HANDLE_INIT;
1434         int mnt_id = -1;
1435
1436         assert(path);
1437         assert(ret);
1438
1439         /* This is cgroupfs so we know the size of the handle, thus no need to loop around like
1440          * name_to_handle_at_loop() does in mountpoint-util.c */
1441         if (name_to_handle_at(AT_FDCWD, path, &fh.file_handle, &mnt_id, 0) < 0)
1442                 return -errno;
1443
1444         *ret = CG_FILE_HANDLE_CGROUPID(fh);
1445         return 0;
1446 }
1447
1448 int cg_path_get_session(const char *path, char **ret_session) {
1449         _cleanup_free_ char *unit = NULL;
1450         char *start, *end;
1451         int r;
1452
1453         assert(path);
1454
1455         r = cg_path_get_unit(path, &unit);
1456         if (r < 0)
1457                 return r;
1458
1459         start = startswith(unit, "session-");
1460         if (!start)
1461                 return -ENXIO;
1462         end = endswith(start, ".scope");
1463         if (!end)
1464                 return -ENXIO;
1465
1466         *end = 0;
1467         if (!session_id_valid(start))
1468                 return -ENXIO;
1469
1470         if (ret_session) {
1471                 char *rr;
1472
1473                 rr = strdup(start);
1474                 if (!rr)
1475                         return -ENOMEM;
1476
1477                 *ret_session = rr;
1478         }
1479
1480         return 0;
1481 }
1482
1483 int cg_pid_get_session(pid_t pid, char **ret_session) {
1484         _cleanup_free_ char *cgroup = NULL;
1485         int r;
1486
1487         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1488         if (r < 0)
1489                 return r;
1490
1491         return cg_path_get_session(cgroup, ret_session);
1492 }
1493
1494 int cg_path_get_owner_uid(const char *path, uid_t *ret_uid) {
1495         _cleanup_free_ char *slice = NULL;
1496         char *start, *end;
1497         int r;
1498
1499         assert(path);
1500
1501         r = cg_path_get_slice(path, &slice);
1502         if (r < 0)
1503                 return r;
1504
1505         start = startswith(slice, "user-");
1506         if (!start)
1507                 return -ENXIO;
1508
1509         end = endswith(start, ".slice");
1510         if (!end)
1511                 return -ENXIO;
1512
1513         *end = 0;
1514         if (parse_uid(start, ret_uid) < 0)
1515                 return -ENXIO;
1516
1517         return 0;
1518 }
1519
1520 int cg_pid_get_owner_uid(pid_t pid, uid_t *ret_uid) {
1521         _cleanup_free_ char *cgroup = NULL;
1522         int r;
1523
1524         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1525         if (r < 0)
1526                 return r;
1527
1528         return cg_path_get_owner_uid(cgroup, ret_uid);
1529 }
1530
1531 int cg_path_get_slice(const char *p, char **ret_slice) {
1532         const char *e = NULL;
1533
1534         assert(p);
1535         assert(ret_slice);
1536
1537         /* Finds the right-most slice unit from the beginning, but
1538          * stops before we come to the first non-slice unit. */
1539
1540         for (;;) {
1541                 size_t n;
1542
1543                 p += strspn(p, "/");
1544
1545                 n = strcspn(p, "/");
1546                 if (!valid_slice_name(p, n)) {
1547
1548                         if (!e) {
1549                                 char *s;
1550
1551                                 s = strdup(SPECIAL_ROOT_SLICE);
1552                                 if (!s)
1553                                         return -ENOMEM;
1554
1555                                 *ret_slice = s;
1556                                 return 0;
1557                         }
1558
1559                         return cg_path_decode_unit(e, ret_slice);
1560                 }
1561
1562                 e = p;
1563                 p += n;
1564         }
1565 }
1566
1567 int cg_pid_get_slice(pid_t pid, char **ret_slice) {
1568         _cleanup_free_ char *cgroup = NULL;
1569         int r;
1570
1571         assert(ret_slice);
1572
1573         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1574         if (r < 0)
1575                 return r;
1576
1577         return cg_path_get_slice(cgroup, ret_slice);
1578 }
1579
/* Extracts the user-level slice from a cgroup path. Returns -ENXIO if the path has no user manager or
 * session prefix. */
int cg_path_get_user_slice(const char *p, char **ret_slice) {
        const char *rest;

        assert(p);
        assert(ret_slice);

        rest = skip_user_prefix(p);

        /* Past the user prefix the layout is the same as for system slices, hence reuse that parser. */
        return rest ? cg_path_get_slice(rest, ret_slice) : -ENXIO;
}
1593
1594 int cg_pid_get_user_slice(pid_t pid, char **ret_slice) {
1595         _cleanup_free_ char *cgroup = NULL;
1596         int r;
1597
1598         assert(ret_slice);
1599
1600         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1601         if (r < 0)
1602                 return r;
1603
1604         return cg_path_get_user_slice(cgroup, ret_slice);
1605 }
1606
1607 bool cg_needs_escape(const char *p) {
1608
1609         /* Checks if the specified path is a valid cgroup name by our rules, or if it must be escaped. Note
1610          * that we consider escaped cgroup names invalid here, as they need to be escaped a second time if
1611          * they shall be used. Also note that various names cannot be made valid by escaping even if we
1612          * return true here (because too long, or contain the forbidden character "/"). */
1613
1614         if (!filename_is_valid(p))
1615                 return true;
1616
1617         if (IN_SET(p[0], '_', '.'))
1618                 return true;
1619
1620         if (STR_IN_SET(p, "notify_on_release", "release_agent", "tasks"))
1621                 return true;
1622
1623         if (startswith(p, "cgroup."))
1624                 return true;
1625
1626         for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1627                 const char *q;
1628
1629                 q = startswith(p, cgroup_controller_to_string(c));
1630                 if (!q)
1631                         continue;
1632
1633                 if (q[0] == '.')
1634                         return true;
1635         }
1636
1637         return false;
1638 }
1639
1640 int cg_escape(const char *p, char **ret) {
1641         _cleanup_free_ char *n = NULL;
1642
1643         /* This implements very minimal escaping for names to be used as file names in the cgroup tree: any
1644          * name which might conflict with a kernel name or is prefixed with '_' is prefixed with a '_'. That
1645          * way, when reading cgroup names it is sufficient to remove a single prefixing underscore if there
1646          * is one. */
1647
1648         /* The return value of this function (unlike cg_unescape()) needs free()! */
1649
1650         if (cg_needs_escape(p)) {
1651                 n = strjoin("_", p);
1652                 if (!n)
1653                         return -ENOMEM;
1654
1655                 if (!filename_is_valid(n)) /* became invalid due to the prefixing? Or contained things like a slash that cannot be fixed by prefixing? */
1656                         return -EINVAL;
1657         } else {
1658                 n = strdup(p);
1659                 if (!n)
1660                         return -ENOMEM;
1661         }
1662
1663         *ret = TAKE_PTR(n);
1664         return 0;
1665 }
1666
/* Undoes cg_escape(): skips a single leading '_', if there is one. The returned pointer refers into the
 * string passed in, hence (unlike the result of cg_escape()) it must not be freed. */
char *cg_unescape(const char *p) {
        assert(p);

        return (char*) (*p == '_' ? p + 1 : p);
}
1678
1679 #define CONTROLLER_VALID                        \
1680         DIGITS LETTERS                          \
1681         "_"
1682
1683 bool cg_controller_is_valid(const char *p) {
1684         const char *t, *s;
1685
1686         if (!p)
1687                 return false;
1688
1689         if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
1690                 return true;
1691
1692         s = startswith(p, "name=");
1693         if (s)
1694                 p = s;
1695
1696         if (IN_SET(*p, 0, '_'))
1697                 return false;
1698
1699         for (t = p; *t; t++)
1700                 if (!strchr(CONTROLLER_VALID, *t))
1701                         return false;
1702
1703         if (t - p > NAME_MAX)
1704                 return false;
1705
1706         return true;
1707 }
1708
/* Converts the name of a slice unit into the cgroup path the slice is located at. For every dash in the
 * name's prefix one parent directory is generated, named after the part in front of that dash plus
 * ".slice"; the full unit name forms the final component. Each component is run through cg_escape().
 * E.g. "foo-bar.slice" becomes "foo.slice/foo-bar.slice" (assuming unit_name_to_prefix() returns the name
 * without its ".slice" suffix — TODO confirm). SPECIAL_ROOT_SLICE maps to the empty string.
 *
 * Returns 0 on success and stores a newly allocated string in *ret. Returns -EINVAL if 'unit' is not a
 * valid plain unit name, does not end in ".slice", or has leading, trailing or doubled dashes in its
 * prefix, and -ENOMEM on allocation failure. */
int cg_slice_to_path(const char *unit, char **ret) {
        _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
        const char *dash;
        int r;

        assert(unit);
        assert(ret);

        /* The root slice is special: it maps to the empty path */
        if (streq(unit, SPECIAL_ROOT_SLICE)) {
                char *x;

                x = strdup("");
                if (!x)
                        return -ENOMEM;
                *ret = x;
                return 0;
        }

        if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
                return -EINVAL;

        if (!endswith(unit, ".slice"))
                return -EINVAL;

        r = unit_name_to_prefix(unit, &p);
        if (r < 0)
                return r;

        dash = strchr(p, '-');

        /* Don't allow initial dashes */
        if (dash == p)
                return -EINVAL;

        /* One iteration per dash: each generates one parent slice directory, which is appended to 's' */
        while (dash) {
                _cleanup_free_ char *escaped = NULL;
                /* Room for the prefix up to (excluding) the dash, plus ".slice" and the trailing NUL
                 * (which sizeof() of the string literal accounts for) */
                char n[dash - p + sizeof(".slice")];

#if HAS_FEATURE_MEMORY_SANITIZER
                /* msan doesn't instrument stpncpy, so it thinks
                 * n is later used uninitialized:
                 * https://github.com/google/sanitizers/issues/926
                 */
                zero(n);
#endif

                /* Don't allow trailing or double dashes */
                if (IN_SET(dash[1], 0, '-'))
                        return -EINVAL;

                /* stpncpy() returns a pointer to the end of what it copied, which is where the suffix
                 * (and with it the NUL terminator) is placed */
                strcpy(stpncpy(n, p, dash - p), ".slice");
                if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
                        return -EINVAL;

                r = cg_escape(n, &escaped);
                if (r < 0)
                        return r;

                if (!strextend(&s, escaped, "/"))
                        return -ENOMEM;

                dash = strchr(dash+1, '-');
        }

        /* Finally, the full unit name itself is the last component */
        r = cg_escape(unit, &e);
        if (r < 0)
                return r;

        if (!strextend(&s, e))
                return -ENOMEM;

        *ret = TAKE_PTR(s);
        return 0;
}
1783
1784 int cg_is_threaded(const char *path) {
1785         _cleanup_free_ char *fs = NULL, *contents = NULL;
1786         _cleanup_strv_free_ char **v = NULL;
1787         int r;
1788
1789         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "cgroup.type", &fs);
1790         if (r < 0)
1791                 return r;
1792
1793         r = read_full_virtual_file(fs, &contents, NULL);
1794         if (r == -ENOENT)
1795                 return false; /* Assume no. */
1796         if (r < 0)
1797                 return r;
1798
1799         v = strv_split(contents, NULL);
1800         if (!v)
1801                 return -ENOMEM;
1802
1803         /* If the cgroup is in the threaded mode, it contains "threaded".
1804          * If one of the parents or siblings is in the threaded mode, it may contain "invalid". */
1805         return strv_contains(v, "threaded") || strv_contains(v, "invalid");
1806 }
1807
1808 int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
1809         _cleanup_free_ char *p = NULL;
1810         int r;
1811
1812         r = cg_get_path(controller, path, attribute, &p);
1813         if (r < 0)
1814                 return r;
1815
1816         return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER);
1817 }
1818
1819 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
1820         _cleanup_free_ char *p = NULL;
1821         int r;
1822
1823         r = cg_get_path(controller, path, attribute, &p);
1824         if (r < 0)
1825                 return r;
1826
1827         return read_one_line_file(p, ret);
1828 }
1829
1830 int cg_get_attribute_as_uint64(const char *controller, const char *path, const char *attribute, uint64_t *ret) {
1831         _cleanup_free_ char *value = NULL;
1832         uint64_t v;
1833         int r;
1834
1835         assert(ret);
1836
1837         r = cg_get_attribute(controller, path, attribute, &value);
1838         if (r == -ENOENT)
1839                 return -ENODATA;
1840         if (r < 0)
1841                 return r;
1842
1843         if (streq(value, "max")) {
1844                 *ret = CGROUP_LIMIT_MAX;
1845                 return 0;
1846         }
1847
1848         r = safe_atou64(value, &v);
1849         if (r < 0)
1850                 return r;
1851
1852         *ret = v;
1853         return 0;
1854 }
1855
1856 int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret) {
1857         _cleanup_free_ char *value = NULL;
1858         int r;
1859
1860         assert(ret);
1861
1862         r = cg_get_attribute(controller, path, attribute, &value);
1863         if (r == -ENOENT)
1864                 return -ENODATA;
1865         if (r < 0)
1866                 return r;
1867
1868         r = parse_boolean(value);
1869         if (r < 0)
1870                 return r;
1871
1872         *ret = r;
1873         return 0;
1874 }
1875
1876 int cg_get_owner(const char *path, uid_t *ret_uid) {
1877         _cleanup_free_ char *f = NULL;
1878         struct stat stats;
1879         int r;
1880
1881         assert(ret_uid);
1882
1883         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &f);
1884         if (r < 0)
1885                 return r;
1886
1887         if (stat(f, &stats) < 0)
1888                 return -errno;
1889
1890         r = stat_verify_directory(&stats);
1891         if (r < 0)
1892                 return r;
1893
1894         *ret_uid = stats.st_uid;
1895         return 0;
1896 }
1897
int cg_get_keyed_attribute_full(
                const char *controller,
                const char *path,
                const char *attribute,
                char **keys,
                char **ret_values,
                CGroupKeyMode mode) {

        _cleanup_free_ char *filename = NULL, *contents = NULL;
        const char *p;
        size_t n, i, n_done = 0;
        char **v;
        int r;

        /* Reads one or more fields of a cgroup v2 keyed attribute file. The 'keys' parameter should be an strv with
         * all keys to retrieve. The 'ret_values' parameter should be passed as a string array with the same number
         * of entries as 'keys'. On success each entry will be set to the (newly allocated) value of the matching key.
         *
         * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. If mode
         * is set to CG_KEY_MODE_GRACEFUL we ignore missing keys and return those that were parsed successfully. In
         * that mode the return value is the number of keys found, and the entries of keys not found are NULL.
         * Otherwise 0 is returned on success. On failure 'ret_values' is left untouched. */

        r = cg_get_path(controller, path, attribute, &filename);
        if (r < 0)
                return r;

        r = read_full_file(filename, &contents, NULL);
        if (r < 0)
                return r;

        n = strv_length(keys);
        if (n == 0) /* No keys to retrieve? That's easy, we are done then */
                return 0;

        /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
        v = newa0(char*, n);

        /* Process the file line by line; 'p' always points to the beginning of the line to look at next */
        for (p = contents; *p;) {
                const char *w = NULL;

                /* Check whether this line begins with one of the keys we haven't found a value for yet. If
                 * so, 'i' is left at the index of that key and 'w' points to what follows the key. */
                for (i = 0; i < n; i++)
                        if (!v[i]) {
                                w = first_word(p, keys[i]);
                                if (w)
                                        break;
                        }

                if (w) {
                        size_t l;

                        /* The value is everything up to the end of the line */
                        l = strcspn(w, NEWLINE);
                        v[i] = strndup(w, l);
                        if (!v[i]) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Got all keys? Then there's no need to look at the rest of the file */
                        n_done++;
                        if (n_done >= n)
                                goto done;

                        p = w + l;
                } else
                        p += strcspn(p, NEWLINE);

                /* Skip over the line break(s), to get to the beginning of the next line */
                p += strspn(p, NEWLINE);
        }

        /* We reached the end of the file without having found all keys. That's fine in graceful mode only. */
        if (mode & CG_KEY_MODE_GRACEFUL)
                goto done;

        r = -ENXIO;

fail:
        /* Release whatever was collected so far */
        free_many_charp(v, n);
        return r;

done:
        /* Pass ownership of the collected strings to the caller */
        memcpy(ret_values, v, sizeof(char*) * n);
        if (mode & CG_KEY_MODE_GRACEFUL)
                return n_done;

        return 0;
}
1981
1982 int cg_mask_to_string(CGroupMask mask, char **ret) {
1983         _cleanup_free_ char *s = NULL;
1984         bool space = false;
1985         CGroupController c;
1986         size_t n = 0;
1987
1988         assert(ret);
1989
1990         if (mask == 0) {
1991                 *ret = NULL;
1992                 return 0;
1993         }
1994
1995         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1996                 const char *k;
1997                 size_t l;
1998
1999                 if (!FLAGS_SET(mask, CGROUP_CONTROLLER_TO_MASK(c)))
2000                         continue;
2001
2002                 k = cgroup_controller_to_string(c);
2003                 l = strlen(k);
2004
2005                 if (!GREEDY_REALLOC(s, n + space + l + 1))
2006                         return -ENOMEM;
2007
2008                 if (space)
2009                         s[n] = ' ';
2010                 memcpy(s + n + space, k, l);
2011                 n += space + l;
2012
2013                 space = true;
2014         }
2015
2016         assert(s);
2017
2018         s[n] = 0;
2019         *ret = TAKE_PTR(s);
2020
2021         return 0;
2022 }
2023
2024 int cg_mask_from_string(const char *value, CGroupMask *ret) {
2025         CGroupMask m = 0;
2026
2027         assert(ret);
2028         assert(value);
2029
2030         for (;;) {
2031                 _cleanup_free_ char *n = NULL;
2032                 CGroupController v;
2033                 int r;
2034
2035                 r = extract_first_word(&value, &n, NULL, 0);
2036                 if (r < 0)
2037                         return r;
2038                 if (r == 0)
2039                         break;
2040
2041                 v = cgroup_controller_from_string(n);
2042                 if (v < 0)
2043                         continue;
2044
2045                 m |= CGROUP_CONTROLLER_TO_MASK(v);
2046         }
2047
2048         *ret = m;
2049         return 0;
2050 }
2051
2052 int cg_mask_supported_subtree(const char *root, CGroupMask *ret) {
2053         CGroupMask mask;
2054         int r;
2055
2056         /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that
2057          * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz
2058          * pseudo-controllers. */
2059
2060         r = cg_all_unified();
2061         if (r < 0)
2062                 return r;
2063         if (r > 0) {
2064                 _cleanup_free_ char *controllers = NULL, *path = NULL;
2065
2066                 /* In the unified hierarchy we can read the supported and accessible controllers from
2067                  * the top-level cgroup attribute */
2068
2069                 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
2070                 if (r < 0)
2071                         return r;
2072
2073                 r = read_one_line_file(path, &controllers);
2074                 if (r < 0)
2075                         return r;
2076
2077                 r = cg_mask_from_string(controllers, &mask);
2078                 if (r < 0)
2079                         return r;
2080
2081                 /* Mask controllers that are not supported in unified hierarchy. */
2082                 mask &= CGROUP_MASK_V2;
2083
2084         } else {
2085                 CGroupController c;
2086
2087                 /* In the legacy hierarchy, we check which hierarchies are accessible. */
2088
2089                 mask = 0;
2090                 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2091                         CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2092                         const char *n;
2093
2094                         if (!FLAGS_SET(CGROUP_MASK_V1, bit))
2095                                 continue;
2096
2097                         n = cgroup_controller_to_string(c);
2098                         if (controller_is_v1_accessible(root, n) >= 0)
2099                                 mask |= bit;
2100                 }
2101         }
2102
2103         *ret = mask;
2104         return 0;
2105 }
2106
2107 int cg_mask_supported(CGroupMask *ret) {
2108         _cleanup_free_ char *root = NULL;
2109         int r;
2110
2111         r = cg_get_root_path(&root);
2112         if (r < 0)
2113                 return r;
2114
2115         return cg_mask_supported_subtree(root, ret);
2116 }
2117
2118 int cg_kernel_controllers(Set **ret) {
2119         _cleanup_set_free_ Set *controllers = NULL;
2120         _cleanup_fclose_ FILE *f = NULL;
2121         int r;
2122
2123         assert(ret);
2124
2125         /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support
2126          * and controllers that aren't currently accessible (because not mounted). This does not include "name="
2127          * pseudo-controllers. */
2128
2129         r = fopen_unlocked("/proc/cgroups", "re", &f);
2130         if (r == -ENOENT) {
2131                 *ret = NULL;
2132                 return 0;
2133         }
2134         if (r < 0)
2135                 return r;
2136
2137         /* Ignore the header line */
2138         (void) read_line(f, SIZE_MAX, NULL);
2139
2140         for (;;) {
2141                 _cleanup_free_ char *controller = NULL;
2142                 int enabled = 0;
2143
2144                 errno = 0;
2145                 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
2146
2147                         if (feof(f))
2148                                 break;
2149
2150                         if (ferror(f))
2151                                 return errno_or_else(EIO);
2152
2153                         return -EBADMSG;
2154                 }
2155
2156                 if (!enabled)
2157                         continue;
2158
2159                 if (!cg_controller_is_valid(controller))
2160                         return -EBADMSG;
2161
2162                 r = set_ensure_consume(&controllers, &string_hash_ops_free, TAKE_PTR(controller));
2163                 if (r < 0)
2164                         return r;
2165         }
2166
2167         *ret = TAKE_PTR(controllers);
2168
2169         return 0;
2170 }
2171
/* The hybrid mode was initially implemented in v232 and simply mounted cgroup2 on
 * /sys/fs/cgroup/systemd. This unfortunately broke other tools (such as docker) which expected the v1
 * "name=systemd" hierarchy on /sys/fs/cgroup/systemd. From v233 on, the hybrid mode mounts v2 on
 * /sys/fs/cgroup/unified and maintains the "name=systemd" hierarchy on /sys/fs/cgroup/systemd for
 * compatibility with other tools.
 *
 * To keep live upgrades working, we detect and support the v232 layout. When the v232 layout is detected,
 * to keep cgroup v2 process management but disable the compat dual layout, we return true on
 * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and false on cg_hybrid_unified().
 *
 * The flag is written by cg_unified_cached() whenever it detects CGROUP_UNIFIED_SYSTEMD: true for the
 * v232 layout (cgroup2 on /sys/fs/cgroup/systemd), false for cgroup2 on /sys/fs/cgroup/unified.
 */
static thread_local bool unified_systemd_v232;
2183
2184 int cg_unified_cached(bool flush) {
2185         static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;
2186
2187         struct statfs fs;
2188
2189         /* Checks if we support the unified hierarchy. Returns an
2190          * error when the cgroup hierarchies aren't mounted yet or we
2191          * have any other trouble determining if the unified hierarchy
2192          * is supported. */
2193
2194         if (flush)
2195                 unified_cache = CGROUP_UNIFIED_UNKNOWN;
2196         else if (unified_cache >= CGROUP_UNIFIED_NONE)
2197                 return unified_cache;
2198
2199         if (statfs("/sys/fs/cgroup/", &fs) < 0)
2200                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\") failed: %m");
2201
2202         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2203                 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2204                 unified_cache = CGROUP_UNIFIED_ALL;
2205         } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2206                 if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
2207                     F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2208                         log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2209                         unified_cache = CGROUP_UNIFIED_SYSTEMD;
2210                         unified_systemd_v232 = false;
2211                 } else {
2212                         if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0) {
2213                                 if (errno == ENOENT) {
2214                                         /* Some other software may have set up /sys/fs/cgroup in a configuration we do not recognize. */
2215                                         log_debug_errno(errno, "Unsupported cgroupsv1 setup detected: name=systemd hierarchy not found.");
2216                                         return -ENOMEDIUM;
2217                                 }
2218                                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
2219                         }
2220
2221                         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2222                                 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2223                                 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2224                                 unified_systemd_v232 = true;
2225                         } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) {
2226                                 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2227                                 unified_cache = CGROUP_UNIFIED_NONE;
2228                         } else {
2229                                 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2230                                           (unsigned long long) fs.f_type);
2231                                 unified_cache = CGROUP_UNIFIED_NONE;
2232                         }
2233                 }
2234         } else if (F_TYPE_EQUAL(fs.f_type, SYSFS_MAGIC)) {
2235                 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2236                                        "No filesystem is currently mounted on /sys/fs/cgroup.");
2237         } else
2238                 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2239                                        "Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2240                                        (unsigned long long)fs.f_type);
2241
2242         return unified_cache;
2243 }
2244
2245 int cg_unified_controller(const char *controller) {
2246         int r;
2247
2248         r = cg_unified_cached(false);
2249         if (r < 0)
2250                 return r;
2251
2252         if (r == CGROUP_UNIFIED_NONE)
2253                 return false;
2254
2255         if (r >= CGROUP_UNIFIED_ALL)
2256                 return true;
2257
2258         return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER);
2259 }
2260
2261 int cg_all_unified(void) {
2262         int r;
2263
2264         r = cg_unified_cached(false);
2265         if (r < 0)
2266                 return r;
2267
2268         return r >= CGROUP_UNIFIED_ALL;
2269 }
2270
2271 int cg_hybrid_unified(void) {
2272         int r;
2273
2274         r = cg_unified_cached(false);
2275         if (r < 0)
2276                 return r;
2277
2278         return r == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232;
2279 }
2280
/* Checks whether the cgroup at 'path' is marked as delegated via extended attributes.
 *
 * Returns the boolean value of the xattr (or a negative error from reading it); returns false if
 * neither xattr is present. */
int cg_is_delegated(const char *path) {
        int r;

        assert(path);

        r = cg_get_xattr_bool(path, "trusted.delegate");
        if (ERRNO_IS_NEG_XATTR_ABSENT(r)) {
                /* If the trusted xattr isn't set (preferred), then check the untrusted one. Under the
                 * assumption that whoever is trusted enough to own the cgroup, is also trusted enough
                 * to decide if it is delegated or not this should be safe. */
                r = cg_get_xattr_bool(path, "user.delegate");
                if (ERRNO_IS_NEG_XATTR_ABSENT(r))
                        return false;
        }

        return r;
}
2296
/* Same check as cg_is_delegated(), but on an already opened file descriptor: consults
 * "trusted.delegate" first and falls back to "user.delegate" if the former is absent. Returns false if
 * neither xattr is present. */
int cg_is_delegated_fd(int fd) {
        int r;

        assert(fd >= 0);

        r = getxattr_at_bool(fd, /* path= */ NULL, "trusted.delegate", /* flags= */ 0);
        if (ERRNO_IS_NEG_XATTR_ABSENT(r)) {
                r = getxattr_at_bool(fd, /* path= */ NULL, "user.delegate", /* flags= */ 0);
                if (ERRNO_IS_NEG_XATTR_ABSENT(r))
                        return false;
        }

        return r;
}
2309
/* Reads the boolean "user.coredump_receive" xattr of the cgroup at 'path'. An absent xattr counts as
 * false; any other result of the lookup (value or negative error) is returned as is. */
int cg_has_coredump_receive(const char *path) {
        int r;

        assert(path);

        r = cg_get_xattr_bool(path, "user.coredump_receive");

        return ERRNO_IS_NEG_XATTR_ABSENT(r) ? false : r;
}
2321
/* Default value of each IO limit, indexed by CGroupIOLimitType: every limit defaults to
 * CGROUP_LIMIT_MAX. */
const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX]    = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WBPS_MAX]    = CGROUP_LIMIT_MAX,
        [CGROUP_IO_RIOPS_MAX]   = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WIOPS_MAX]   = CGROUP_LIMIT_MAX,
};
2328
/* String name of each IO limit, indexed by CGroupIOLimitType. The table is consumed by the
 * DEFINE_STRING_TABLE_LOOKUP() invocation below.
 * NOTE(review): the macro presumably generates the cgroup_io_limit_type_to_string()/_from_string()
 * helpers — confirm against string-table.h. */
static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX]    = "IOReadBandwidthMax",
        [CGROUP_IO_WBPS_MAX]    = "IOWriteBandwidthMax",
        [CGROUP_IO_RIOPS_MAX]   = "IOReadIOPSMax",
        [CGROUP_IO_WIOPS_MAX]   = "IOWriteIOPSMax",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2337
2338 bool is_cgroup_fs(const struct statfs *s) {
2339         return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2340                is_fs_type(s, CGROUP2_SUPER_MAGIC);
2341 }
2342
/* Returns true if 'fd' refers to an inode on a cgroup (v1) or cgroup2 file system.
 *
 * The return type is bool, hence an error cannot be propagated: if fstatfs() fails we cannot tell what
 * kind of file system this is and report false. (Returning -errno here would be converted to true, i.e.
 * a failed query would claim the fd is on a cgroup file system.) */
bool fd_is_cgroup_fs(int fd) {
        struct statfs s;

        if (fstatfs(fd, &s) < 0)
                return false;

        return is_cgroup_fs(&s);
}
2351
/* Name of each controller, indexed by CGroupController. The CGROUP_CONTROLLER_BPF_xyz entries are
 * pseudo-controllers rather than real kernel controllers. The table is consumed by the
 * DEFINE_STRING_TABLE_LOOKUP() invocation below.
 * NOTE(review): that macro presumably provides the cgroup_controller_to_string() and
 * cgroup_controller_from_string() helpers used elsewhere in this file — confirm against string-table.h. */
static const char *const cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
        [CGROUP_CONTROLLER_CPU] = "cpu",
        [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
        [CGROUP_CONTROLLER_CPUSET] = "cpuset",
        [CGROUP_CONTROLLER_IO] = "io",
        [CGROUP_CONTROLLER_BLKIO] = "blkio",
        [CGROUP_CONTROLLER_MEMORY] = "memory",
        [CGROUP_CONTROLLER_DEVICES] = "devices",
        [CGROUP_CONTROLLER_PIDS] = "pids",
        [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall",
        [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices",
        [CGROUP_CONTROLLER_BPF_FOREIGN] = "bpf-foreign",
        [CGROUP_CONTROLLER_BPF_SOCKET_BIND] = "bpf-socket-bind",
        [CGROUP_CONTROLLER_BPF_RESTRICT_NETWORK_INTERFACES] = "bpf-restrict-network-interfaces",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);
2369
2370 CGroupMask get_cpu_accounting_mask(void) {
2371         static CGroupMask needed_mask = (CGroupMask) -1;
2372
2373         /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is
2374          * provided externally from the CPU controller, which means we don't
2375          * need to enable the CPU controller just to get metrics. This is good,
2376          * because enabling the CPU controller comes at a minor performance
2377          * hit, especially when it's propagated deep into large hierarchies.
2378          * There's also no separate CPU accounting controller available within
2379          * a unified hierarchy.
2380          *
2381          * This combination of factors results in the desired cgroup mask to
2382          * enable for CPU accounting varying as follows:
2383          *
2384          *                   ╔═════════════════════╤═════════════════════╗
2385          *                   ║     Linux ≥4.15     │     Linux <4.15     ║
2386          *   ╔═══════════════╬═════════════════════╪═════════════════════╣
2387          *   ║ Unified       ║ nothing             │ CGROUP_MASK_CPU     ║
2388          *   ╟───────────────╫─────────────────────┼─────────────────────╢
2389          *   ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║
2390          *   ╚═══════════════╩═════════════════════╧═════════════════════╝
2391          *
2392          * We check kernel version here instead of manually checking whether
2393          * cpu.stat is present for every cgroup, as that check in itself would
2394          * already be fairly expensive.
2395          *
2396          * Kernels where this patch has been backported will therefore have the
2397          * CPU controller enabled unnecessarily. This is more expensive than
2398          * necessary, but harmless. ☺️
2399          */
2400
2401         if (needed_mask == (CGroupMask) -1) {
2402                 if (cg_all_unified()) {
2403                         struct utsname u;
2404                         assert_se(uname(&u) >= 0);
2405
2406                         if (strverscmp_improved(u.release, "4.15") < 0)
2407                                 needed_mask = CGROUP_MASK_CPU;
2408                         else
2409                                 needed_mask = 0;
2410                 } else
2411                         needed_mask = CGROUP_MASK_CPUACCT;
2412         }
2413
2414         return needed_mask;
2415 }
2416
2417 bool cpu_accounting_is_cheap(void) {
2418         return get_cpu_accounting_mask() == 0;
2419 }
2420
/* String name of each ManagedOOMMode value; consumed by the DEFINE_STRING_TABLE_LOOKUP() invocation
 * below. */
static const char* const managed_oom_mode_table[_MANAGED_OOM_MODE_MAX] = {
        [MANAGED_OOM_AUTO] = "auto",
        [MANAGED_OOM_KILL] = "kill",
};

DEFINE_STRING_TABLE_LOOKUP(managed_oom_mode, ManagedOOMMode);
2427
/* String name of each ManagedOOMPreference value; consumed by the DEFINE_STRING_TABLE_LOOKUP()
 * invocation below. */
static const char* const managed_oom_preference_table[_MANAGED_OOM_PREFERENCE_MAX] = {
        [MANAGED_OOM_PREFERENCE_NONE] = "none",
        [MANAGED_OOM_PREFERENCE_AVOID] = "avoid",
        [MANAGED_OOM_PREFERENCE_OMIT] = "omit",
};

DEFINE_STRING_TABLE_LOOKUP(managed_oom_preference, ManagedOOMPreference);