src/basic/cgroup-util.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <dirent.h>
   4 #include <errno.h>
   5 #include <ftw.h>
   6 #include <limits.h>
   7 #include <signal.h>
   8 #include <stddef.h>
   9 #include <stdio_ext.h>
  10 #include <stdlib.h>
  11 #include <string.h>
  12 #include <sys/stat.h>
  13 #include <sys/statfs.h>
  14 #include <sys/types.h>
  15 #include <sys/xattr.h>
  16 #include <unistd.h>
  17
  18 #include "alloc-util.h"
  19 #include "cgroup-util.h"
  20 #include "def.h"
  21 #include "dirent-util.h"
  22 #include "extract-word.h"
  23 #include "fd-util.h"
  24 #include "fileio.h"
  25 #include "format-util.h"
  26 #include "fs-util.h"
  27 #include "log.h"
  28 #include "login-util.h"
  29 #include "macro.h"
  30 #include "missing.h"
  31 #include "mkdir.h"
  32 #include "parse-util.h"
  33 #include "path-util.h"
  34 #include "proc-cmdline.h"
  35 #include "process-util.h"
  36 #include "set.h"
  37 #include "special.h"
  38 #include "stat-util.h"
  39 #include "stdio-util.h"
  40 #include "string-table.h"
  41 #include "string-util.h"
  42 #include "strv.h"
  43 #include "unit-name.h"
  44 #include "user-util.h"
  45
  46 int cg_enumerate_processes(const char *controller, const char *path, FILE **_f) {
  47         _cleanup_free_ char *fs = NULL;
  48         FILE *f;
  49         int r;
  50
  51         assert(_f);
  52
  53         r = cg_get_path(controller, path, "cgroup.procs", &fs);
  54         if (r < 0)
  55                 return r;
  56
  57         f = fopen(fs, "re");
  58         if (!f)
  59                 return -errno;
  60
  61         *_f = f;
  62         return 0;
  63 }
  64
  65 int cg_read_pid(FILE *f, pid_t *_pid) {
  66         unsigned long ul;
  67
  68         /* Note that the cgroup.procs might contain duplicates! See
  69          * cgroups.txt for details. */
  70
  71         assert(f);
  72         assert(_pid);
  73
  74         errno = 0;
  75         if (fscanf(f, "%lu", &ul) != 1) {
  76
  77                 if (feof(f))
  78                         return 0;
  79
  80                 return errno > 0 ? -errno : -EIO;
  81         }
  82
  83         if (ul <= 0)
  84                 return -EIO;
  85
  86         *_pid = (pid_t) ul;
  87         return 1;
  88 }
  89
  90 int cg_read_event(
  91                 const char *controller,
  92                 const char *path,
  93                 const char *event,
  94                 char **val) {
  95
  96         _cleanup_free_ char *events = NULL, *content = NULL;
  97         char *p, *line;
  98         int r;
  99
 100         r = cg_get_path(controller, path, "cgroup.events", &events);
 101         if (r < 0)
 102                 return r;
 103
 104         r = read_full_file(events, &content, NULL);
 105         if (r < 0)
 106                 return r;
 107
 108         p = content;
 109         while ((line = strsep(&p, "\n"))) {
 110                 char *key;
 111
 112                 key = strsep(&line, " ");
 113                 if (!key || !line)
 114                         return -EINVAL;
 115
 116                 if (strcmp(key, event))
 117                         continue;
 118
 119                 *val = strdup(line);
 120                 return 0;
 121         }
 122
 123         return -ENOENT;
 124 }
 125
 126 bool cg_ns_supported(void) {
 127         static thread_local int enabled = -1;
 128
 129         if (enabled >= 0)
 130                 return enabled;
 131
 132         if (access("/proc/self/ns/cgroup", F_OK) == 0)
 133                 enabled = 1;
 134         else
 135                 enabled = 0;
 136
 137         return enabled;
 138 }
 139
 140 int cg_enumerate_subgroups(const char *controller, const char *path, DIR **_d) {
 141         _cleanup_free_ char *fs = NULL;
 142         int r;
 143         DIR *d;
 144
 145         assert(_d);
 146
 147         /* This is not recursive! */
 148
 149         r = cg_get_path(controller, path, NULL, &fs);
 150         if (r < 0)
 151                 return r;
 152
 153         d = opendir(fs);
 154         if (!d)
 155                 return -errno;
 156
 157         *_d = d;
 158         return 0;
 159 }
 160
 161 int cg_read_subgroup(DIR *d, char **fn) {
 162         struct dirent *de;
 163
 164         assert(d);
 165         assert(fn);
 166
 167         FOREACH_DIRENT_ALL(de, d, return -errno) {
 168                 char *b;
 169
 170                 if (de->d_type != DT_DIR)
 171                         continue;
 172
 173                 if (dot_or_dot_dot(de->d_name))
 174                         continue;
 175
 176                 b = strdup(de->d_name);
 177                 if (!b)
 178                         return -ENOMEM;
 179
 180                 *fn = b;
 181                 return 1;
 182         }
 183
 184         return 0;
 185 }
 186
 187 int cg_rmdir(const char *controller, const char *path) {
 188         _cleanup_free_ char *p = NULL;
 189         int r;
 190
 191         r = cg_get_path(controller, path, NULL, &p);
 192         if (r < 0)
 193                 return r;
 194
 195         r = rmdir(p);
 196         if (r < 0 && errno != ENOENT)
 197                 return -errno;
 198
 199         r = cg_hybrid_unified();
 200         if (r < 0)
 201                 return r;
 202         if (r == 0)
 203                 return 0;
 204
 205         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 206                 r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
 207                 if (r < 0)
 208                         log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
 209         }
 210
 211         return 0;
 212 }
 213
 214 int cg_kill(
 215                 const char *controller,
 216                 const char *path,
 217                 int sig,
 218                 CGroupFlags flags,
 219                 Set *s,
 220                 cg_kill_log_func_t log_kill,
 221                 void *userdata) {
 222
 223         _cleanup_set_free_ Set *allocated_set = NULL;
 224         bool done = false;
 225         int r, ret = 0;
 226         pid_t my_pid;
 227
 228         assert(sig >= 0);
 229
 230          /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
 231           * SIGCONT on SIGKILL. */
 232         if (IN_SET(sig, SIGCONT, SIGKILL))
 233                 flags &= ~CGROUP_SIGCONT;
 234
 235         /* This goes through the tasks list and kills them all. This
 236          * is repeated until no further processes are added to the
 237          * tasks list, to properly handle forking processes */
 238
 239         if (!s) {
 240                 s = allocated_set = set_new(NULL);
 241                 if (!s)
 242                         return -ENOMEM;
 243         }
 244
 245         my_pid = getpid_cached();
 246
 247         do {
 248                 _cleanup_fclose_ FILE *f = NULL;
 249                 pid_t pid = 0;
 250                 done = true;
 251
 252                 r = cg_enumerate_processes(controller, path, &f);
 253                 if (r < 0) {
 254                         if (ret >= 0 && r != -ENOENT)
 255                                 return r;
 256
 257                         return ret;
 258                 }
 259
 260                 while ((r = cg_read_pid(f, &pid)) > 0) {
 261
 262                         if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
 263                                 continue;
 264
 265                         if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
 266                                 continue;
 267
 268                         if (log_kill)
 269                                 log_kill(pid, sig, userdata);
 270
 271                         /* If we haven't killed this process yet, kill
 272                          * it */
 273                         if (kill(pid, sig) < 0) {
 274                                 if (ret >= 0 && errno != ESRCH)
 275                                         ret = -errno;
 276                         } else {
 277                                 if (flags & CGROUP_SIGCONT)
 278                                         (void) kill(pid, SIGCONT);
 279
 280                                 if (ret == 0)
 281                                         ret = 1;
 282                         }
 283
 284                         done = false;
 285
 286                         r = set_put(s, PID_TO_PTR(pid));
 287                         if (r < 0) {
 288                                 if (ret >= 0)
 289                                         return r;
 290
 291                                 return ret;
 292                         }
 293                 }
 294
 295                 if (r < 0) {
 296                         if (ret >= 0)
 297                                 return r;
 298
 299                         return ret;
 300                 }
 301
 302                 /* To avoid racing against processes which fork
 303                  * quicker than we can kill them we repeat this until
 304                  * no new pids need to be killed. */
 305
 306         } while (!done);
 307
 308         return ret;
 309 }
 310
 311 int cg_kill_recursive(
 312                 const char *controller,
 313                 const char *path,
 314                 int sig,
 315                 CGroupFlags flags,
 316                 Set *s,
 317                 cg_kill_log_func_t log_kill,
 318                 void *userdata) {
 319
 320         _cleanup_set_free_ Set *allocated_set = NULL;
 321         _cleanup_closedir_ DIR *d = NULL;
 322         int r, ret;
 323         char *fn;
 324
 325         assert(path);
 326         assert(sig >= 0);
 327
 328         if (!s) {
 329                 s = allocated_set = set_new(NULL);
 330                 if (!s)
 331                         return -ENOMEM;
 332         }
 333
 334         ret = cg_kill(controller, path, sig, flags, s, log_kill, userdata);
 335
 336         r = cg_enumerate_subgroups(controller, path, &d);
 337         if (r < 0) {
 338                 if (ret >= 0 && r != -ENOENT)
 339                         return r;
 340
 341                 return ret;
 342         }
 343
 344         while ((r = cg_read_subgroup(d, &fn)) > 0) {
 345                 _cleanup_free_ char *p = NULL;
 346
 347                 p = strjoin(path, "/", fn);
 348                 free(fn);
 349                 if (!p)
 350                         return -ENOMEM;
 351
 352                 r = cg_kill_recursive(controller, p, sig, flags, s, log_kill, userdata);
 353                 if (r != 0 && ret >= 0)
 354                         ret = r;
 355         }
 356         if (ret >= 0 && r < 0)
 357                 ret = r;
 358
 359         if (flags & CGROUP_REMOVE) {
 360                 r = cg_rmdir(controller, path);
 361                 if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
 362                         return r;
 363         }
 364
 365         return ret;
 366 }
 367
 368 int cg_migrate(
 369                 const char *cfrom,
 370                 const char *pfrom,
 371                 const char *cto,
 372                 const char *pto,
 373                 CGroupFlags flags) {
 374
 375         bool done = false;
 376         _cleanup_set_free_ Set *s = NULL;
 377         int r, ret = 0;
 378         pid_t my_pid;
 379
 380         assert(cfrom);
 381         assert(pfrom);
 382         assert(cto);
 383         assert(pto);
 384
 385         s = set_new(NULL);
 386         if (!s)
 387                 return -ENOMEM;
 388
 389         my_pid = getpid_cached();
 390
 391         do {
 392                 _cleanup_fclose_ FILE *f = NULL;
 393                 pid_t pid = 0;
 394                 done = true;
 395
 396                 r = cg_enumerate_processes(cfrom, pfrom, &f);
 397                 if (r < 0) {
 398                         if (ret >= 0 && r != -ENOENT)
 399                                 return r;
 400
 401                         return ret;
 402                 }
 403
 404                 while ((r = cg_read_pid(f, &pid)) > 0) {
 405
 406                         /* This might do weird stuff if we aren't a
 407                          * single-threaded program. However, we
 408                          * luckily know we are not */
 409                         if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
 410                                 continue;
 411
 412                         if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
 413                                 continue;
 414
 415                         /* Ignore kernel threads. Since they can only
 416                          * exist in the root cgroup, we only check for
 417                          * them there. */
 418                         if (cfrom &&
 419                             empty_or_root(pfrom) &&
 420                             is_kernel_thread(pid) > 0)
 421                                 continue;
 422
 423                         r = cg_attach(cto, pto, pid);
 424                         if (r < 0) {
 425                                 if (ret >= 0 && r != -ESRCH)
 426                                         ret = r;
 427                         } else if (ret == 0)
 428                                 ret = 1;
 429
 430                         done = false;
 431
 432                         r = set_put(s, PID_TO_PTR(pid));
 433                         if (r < 0) {
 434                                 if (ret >= 0)
 435                                         return r;
 436
 437                                 return ret;
 438                         }
 439                 }
 440
 441                 if (r < 0) {
 442                         if (ret >= 0)
 443                                 return r;
 444
 445                         return ret;
 446                 }
 447         } while (!done);
 448
 449         return ret;
 450 }
 451
 452 int cg_migrate_recursive(
 453                 const char *cfrom,
 454                 const char *pfrom,
 455                 const char *cto,
 456                 const char *pto,
 457                 CGroupFlags flags) {
 458
 459         _cleanup_closedir_ DIR *d = NULL;
 460         int r, ret = 0;
 461         char *fn;
 462
 463         assert(cfrom);
 464         assert(pfrom);
 465         assert(cto);
 466         assert(pto);
 467
 468         ret = cg_migrate(cfrom, pfrom, cto, pto, flags);
 469
 470         r = cg_enumerate_subgroups(cfrom, pfrom, &d);
 471         if (r < 0) {
 472                 if (ret >= 0 && r != -ENOENT)
 473                         return r;
 474
 475                 return ret;
 476         }
 477
 478         while ((r = cg_read_subgroup(d, &fn)) > 0) {
 479                 _cleanup_free_ char *p = NULL;
 480
 481                 p = strjoin(pfrom, "/", fn);
 482                 free(fn);
 483                 if (!p)
 484                         return -ENOMEM;
 485
 486                 r = cg_migrate_recursive(cfrom, p, cto, pto, flags);
 487                 if (r != 0 && ret >= 0)
 488                         ret = r;
 489         }
 490
 491         if (r < 0 && ret >= 0)
 492                 ret = r;
 493
 494         if (flags & CGROUP_REMOVE) {
 495                 r = cg_rmdir(cfrom, pfrom);
 496                 if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
 497                         return r;
 498         }
 499
 500         return ret;
 501 }
 502
 503 int cg_migrate_recursive_fallback(
 504                 const char *cfrom,
 505                 const char *pfrom,
 506                 const char *cto,
 507                 const char *pto,
 508                 CGroupFlags flags) {
 509
 510         int r;
 511
 512         assert(cfrom);
 513         assert(pfrom);
 514         assert(cto);
 515         assert(pto);
 516
 517         r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags);
 518         if (r < 0) {
 519                 char prefix[strlen(pto) + 1];
 520
 521                 /* This didn't work? Then let's try all prefixes of the destination */
 522
 523                 PATH_FOREACH_PREFIX(prefix, pto) {
 524                         int q;
 525
 526                         q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags);
 527                         if (q >= 0)
 528                                 return q;
 529                 }
 530         }
 531
 532         return r;
 533 }
 534
 535 static const char *controller_to_dirname(const char *controller) {
 536         const char *e;
 537
 538         assert(controller);
 539
 540         /* Converts a controller name to the directory name below
 541          * /sys/fs/cgroup/ we want to mount it to. Effectively, this
 542          * just cuts off the name= prefixed used for named
 543          * hierarchies, if it is specified. */
 544
 545         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 546                 if (cg_hybrid_unified() > 0)
 547                         controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
 548                 else
 549                         controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
 550         }
 551
 552         e = startswith(controller, "name=");
 553         if (e)
 554                 return e;
 555
 556         return controller;
 557 }
 558
 559 static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **fs) {
 560         const char *dn;
 561         char *t = NULL;
 562
 563         assert(fs);
 564         assert(controller);
 565
 566         dn = controller_to_dirname(controller);
 567
 568         if (isempty(path) && isempty(suffix))
 569                 t = strappend("/sys/fs/cgroup/", dn);
 570         else if (isempty(path))
 571                 t = strjoin("/sys/fs/cgroup/", dn, "/", suffix);
 572         else if (isempty(suffix))
 573                 t = strjoin("/sys/fs/cgroup/", dn, "/", path);
 574         else
 575                 t = strjoin("/sys/fs/cgroup/", dn, "/", path, "/", suffix);
 576         if (!t)
 577                 return -ENOMEM;
 578
 579         *fs = t;
 580         return 0;
 581 }
 582
 583 static int join_path_unified(const char *path, const char *suffix, char **fs) {
 584         char *t;
 585
 586         assert(fs);
 587
 588         if (isempty(path) && isempty(suffix))
 589                 t = strdup("/sys/fs/cgroup");
 590         else if (isempty(path))
 591                 t = strappend("/sys/fs/cgroup/", suffix);
 592         else if (isempty(suffix))
 593                 t = strappend("/sys/fs/cgroup/", path);
 594         else
 595                 t = strjoin("/sys/fs/cgroup/", path, "/", suffix);
 596         if (!t)
 597                 return -ENOMEM;
 598
 599         *fs = t;
 600         return 0;
 601 }
 602
 603 int cg_get_path(const char *controller, const char *path, const char *suffix, char **fs) {
 604         int r;
 605
 606         assert(fs);
 607
 608         if (!controller) {
 609                 char *t;
 610
 611                 /* If no controller is specified, we return the path
 612                  * *below* the controllers, without any prefix. */
 613
 614                 if (!path && !suffix)
 615                         return -EINVAL;
 616
 617                 if (!suffix)
 618                         t = strdup(path);
 619                 else if (!path)
 620                         t = strdup(suffix);
 621                 else
 622                         t = strjoin(path, "/", suffix);
 623                 if (!t)
 624                         return -ENOMEM;
 625
 626                 *fs = path_simplify(t, false);
 627                 return 0;
 628         }
 629
 630         if (!cg_controller_is_valid(controller))
 631                 return -EINVAL;
 632
 633         r = cg_all_unified();
 634         if (r < 0)
 635                 return r;
 636         if (r > 0)
 637                 r = join_path_unified(path, suffix, fs);
 638         else
 639                 r = join_path_legacy(controller, path, suffix, fs);
 640         if (r < 0)
 641                 return r;
 642
 643         path_simplify(*fs, false);
 644         return 0;
 645 }
 646
 647 static int controller_is_accessible(const char *controller) {
 648         int r;
 649
 650         assert(controller);
 651
 652         /* Checks whether a specific controller is accessible,
 653          * i.e. its hierarchy mounted. In the unified hierarchy all
 654          * controllers are considered accessible, except for the named
 655          * hierarchies */
 656
 657         if (!cg_controller_is_valid(controller))
 658                 return -EINVAL;
 659
 660         r = cg_all_unified();
 661         if (r < 0)
 662                 return r;
 663         if (r > 0) {
 664                 /* We don't support named hierarchies if we are using
 665                  * the unified hierarchy. */
 666
 667                 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
 668                         return 0;
 669
 670                 if (startswith(controller, "name="))
 671                         return -EOPNOTSUPP;
 672
 673         } else {
 674                 const char *cc, *dn;
 675
 676                 dn = controller_to_dirname(controller);
 677                 cc = strjoina("/sys/fs/cgroup/", dn);
 678
 679                 if (laccess(cc, F_OK) < 0)
 680                         return -errno;
 681         }
 682
 683         return 0;
 684 }
 685
 686 int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs) {
 687         int r;
 688
 689         assert(controller);
 690         assert(fs);
 691
 692         /* Check if the specified controller is actually accessible */
 693         r = controller_is_accessible(controller);
 694         if (r < 0)
 695                 return r;
 696
 697         return cg_get_path(controller, path, suffix, fs);
 698 }
 699
 700 static int trim_cb(const char *path, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
 701         assert(path);
 702         assert(sb);
 703         assert(ftwbuf);
 704
 705         if (typeflag != FTW_DP)
 706                 return 0;
 707
 708         if (ftwbuf->level < 1)
 709                 return 0;
 710
 711         (void) rmdir(path);
 712         return 0;
 713 }
 714
 715 int cg_trim(const char *controller, const char *path, bool delete_root) {
 716         _cleanup_free_ char *fs = NULL;
 717         int r = 0, q;
 718
 719         assert(path);
 720
 721         r = cg_get_path(controller, path, NULL, &fs);
 722         if (r < 0)
 723                 return r;
 724
 725         errno = 0;
 726         if (nftw(fs, trim_cb, 64, FTW_DEPTH|FTW_MOUNT|FTW_PHYS) != 0) {
 727                 if (errno == ENOENT)
 728                         r = 0;
 729                 else if (errno > 0)
 730                         r = -errno;
 731                 else
 732                         r = -EIO;
 733         }
 734
 735         if (delete_root) {
 736                 if (rmdir(fs) < 0 && errno != ENOENT)
 737                         return -errno;
 738         }
 739
 740         q = cg_hybrid_unified();
 741         if (q < 0)
 742                 return q;
 743         if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 744                 q = cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root);
 745                 if (q < 0)
 746                         log_warning_errno(q, "Failed to trim compat systemd cgroup %s: %m", path);
 747         }
 748
 749         return r;
 750 }
 751
 752 /* Create a cgroup in the hierarchy of controller.
 753  * Returns 0 if the group already existed, 1 on success, negative otherwise.
 754  */
 755 int cg_create(const char *controller, const char *path) {
 756         _cleanup_free_ char *fs = NULL;
 757         int r;
 758
 759         r = cg_get_path_and_check(controller, path, NULL, &fs);
 760         if (r < 0)
 761                 return r;
 762
 763         r = mkdir_parents(fs, 0755);
 764         if (r < 0)
 765                 return r;
 766
 767         r = mkdir_errno_wrapper(fs, 0755);
 768         if (r == -EEXIST)
 769                 return 0;
 770         if (r < 0)
 771                 return r;
 772
 773         r = cg_hybrid_unified();
 774         if (r < 0)
 775                 return r;
 776
 777         if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 778                 r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
 779                 if (r < 0)
 780                         log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path);
 781         }
 782
 783         return 1;
 784 }
 785
 786 int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
 787         int r, q;
 788
 789         assert(pid >= 0);
 790
 791         r = cg_create(controller, path);
 792         if (r < 0)
 793                 return r;
 794
 795         q = cg_attach(controller, path, pid);
 796         if (q < 0)
 797                 return q;
 798
 799         /* This does not remove the cgroup on failure */
 800         return r;
 801 }
 802
 803 int cg_attach(const char *controller, const char *path, pid_t pid) {
 804         _cleanup_free_ char *fs = NULL;
 805         char c[DECIMAL_STR_MAX(pid_t) + 2];
 806         int r;
 807
 808         assert(path);
 809         assert(pid >= 0);
 810
 811         r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs);
 812         if (r < 0)
 813                 return r;
 814
 815         if (pid == 0)
 816                 pid = getpid_cached();
 817
 818         xsprintf(c, PID_FMT "\n", pid);
 819
 820         r = write_string_file(fs, c, 0);
 821         if (r < 0)
 822                 return r;
 823
 824         r = cg_hybrid_unified();
 825         if (r < 0)
 826                 return r;
 827
 828         if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 829                 r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid);
 830                 if (r < 0)
 831                         log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path);
 832         }
 833
 834         return 0;
 835 }
 836
 837 int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
 838         int r;
 839
 840         assert(controller);
 841         assert(path);
 842         assert(pid >= 0);
 843
 844         r = cg_attach(controller, path, pid);
 845         if (r < 0) {
 846                 char prefix[strlen(path) + 1];
 847
 848                 /* This didn't work? Then let's try all prefixes of
 849                  * the destination */
 850
 851                 PATH_FOREACH_PREFIX(prefix, path) {
 852                         int q;
 853
 854                         q = cg_attach(controller, prefix, pid);
 855                         if (q >= 0)
 856                                 return q;
 857                 }
 858         }
 859
 860         return r;
 861 }
 862
 863 int cg_set_access(
 864                 const char *controller,
 865                 const char *path,
 866                 uid_t uid,
 867                 gid_t gid) {
 868
 869         struct Attribute {
 870                 const char *name;
 871                 bool fatal;
 872         };
 873
 874         /* cgroupsv1, aka legacy/non-unified */
 875         static const struct Attribute legacy_attributes[] = {
 876                 { "cgroup.procs",           true  },
 877                 { "tasks",                  false },
 878                 { "cgroup.clone_children",  false },
 879                 {},
 880         };
 881
 882         /* cgroupsv2, aka unified */
 883         static const struct Attribute unified_attributes[] = {
 884                 { "cgroup.procs",           true  },
 885                 { "cgroup.subtree_control", true  },
 886                 { "cgroup.threads",         false },
 887                 {},
 888         };
 889
 890         static const struct Attribute* const attributes[] = {
 891                 [false] = legacy_attributes,
 892                 [true]  = unified_attributes,
 893         };
 894
 895         _cleanup_free_ char *fs = NULL;
 896         const struct Attribute *i;
 897         int r, unified;
 898
 899         assert(path);
 900
 901         if (uid == UID_INVALID && gid == GID_INVALID)
 902                 return 0;
 903
 904         unified = cg_unified_controller(controller);
 905         if (unified < 0)
 906                 return unified;
 907
 908         /* Configure access to the cgroup itself */
 909         r = cg_get_path(controller, path, NULL, &fs);
 910         if (r < 0)
 911                 return r;
 912
 913         r = chmod_and_chown(fs, 0755, uid, gid);
 914         if (r < 0)
 915                 return r;
 916
 917         /* Configure access to the cgroup's attributes */
 918         for (i = attributes[unified]; i->name; i++) {
 919                 fs = mfree(fs);
 920
 921                 r = cg_get_path(controller, path, i->name, &fs);
 922                 if (r < 0)
 923                         return r;
 924
 925                 r = chmod_and_chown(fs, 0644, uid, gid);
 926                 if (r < 0) {
 927                         if (i->fatal)
 928                                 return r;
 929
 930                         log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs);
 931                 }
 932         }
 933
 934         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 935                 r = cg_hybrid_unified();
 936                 if (r < 0)
 937                         return r;
 938                 if (r > 0) {
 939                         /* Always propagate access mode from unified to legacy controller */
 940                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid);
 941                         if (r < 0)
 942                                 log_debug_errno(r, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path);
 943                 }
 944         }
 945
 946         return 0;
 947 }
 948
 949 int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags) {
 950         _cleanup_free_ char *fs = NULL;
 951         int r;
 952
 953         assert(path);
 954         assert(name);
 955         assert(value || size <= 0);
 956
 957         r = cg_get_path(controller, path, NULL, &fs);
 958         if (r < 0)
 959                 return r;
 960
 961         if (setxattr(fs, name, value, size, flags) < 0)
 962                 return -errno;
 963
 964         return 0;
 965 }
 966
 967 int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size) {
 968         _cleanup_free_ char *fs = NULL;
 969         ssize_t n;
 970         int r;
 971
 972         assert(path);
 973         assert(name);
 974
 975         r = cg_get_path(controller, path, NULL, &fs);
 976         if (r < 0)
 977                 return r;
 978
 979         n = getxattr(fs, name, value, size);
 980         if (n < 0)
 981                 return -errno;
 982
 983         return (int) n;
 984 }
 985
/* Determines the cgroup path of the process 'pid' within the hierarchy of 'controller', by parsing
 * the per-process "cgroup" file whose name procfs_file_alloca() provides. If 'controller' is NULL,
 * SYSTEMD_CGROUP_CONTROLLER is used.
 *
 * On success returns 0 and stores a newly allocated string in *path (to be freed by the caller).
 * Returns -EINVAL if 'controller' is not a valid controller name, -ESRCH if the file does not
 * exist, -ENODATA if no line for the requested hierarchy was found, -ENOMEM if out of memory, or
 * another negative errno-style error. */
int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        const char *fs, *controller_str;
        size_t cs = 0;
        int unified;

        assert(path);
        assert(pid >= 0);

        if (controller) {
                if (!cg_controller_is_valid(controller))
                        return -EINVAL;
        } else
                controller = SYSTEMD_CGROUP_CONTROLLER;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;
        if (unified == 0) {
                /* Not unified: remember the controller name to look for in the file (and its
                 * length). SYSTEMD_CGROUP_CONTROLLER is looked up under the name
                 * SYSTEMD_CGROUP_CONTROLLER_LEGACY. */
                if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                        controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
                else
                        controller_str = controller;

                cs = strlen(controller_str);
        }

        fs = procfs_file_alloca(pid, "cgroup");
        f = fopen(fs, "re");
        if (!f)
                return errno == ENOENT ? -ESRCH : -errno;

        (void) __fsetlocking(f, FSETLOCKING_BYCALLER);

        /* NOTE(review): the third macro argument is presumably what is executed on a read error */
        FOREACH_LINE(line, f, return -errno) {
                char *e, *p;

                truncate_nl(line);

                if (unified) {
                        /* Only the line starting with "0:" is of interest here; the path begins
                         * after the next ':' */
                        e = startswith(line, "0:");
                        if (!e)
                                continue;

                        e = strchr(e, ':');
                        if (!e)
                                continue;
                } else {
                        char *l;
                        size_t k;
                        const char *word, *state;
                        bool found = false;

                        /* The line is split at its first two ':' characters: the second field is
                         * a comma-separated list of controller names, the remainder is the path */
                        l = strchr(line, ':');
                        if (!l)
                                continue;

                        l++;
                        e = strchr(l, ':');
                        if (!e)
                                continue;

                        /* Terminate the controller list, so that it can be tokenized on its own */
                        *e = 0;
                        FOREACH_WORD_SEPARATOR(word, k, l, ",", state)
                                if (k == cs && memcmp(word, controller_str, cs) == 0) {
                                        found = true;
                                        break;
                                }
                        if (!found)
                                continue;
                }

                /* In both cases 'e' now points to the separator right before the path */
                p = strdup(e + 1);
                if (!p)
                        return -ENOMEM;

                /* Truncate suffix indicating the process is a zombie */
                e = endswith(p, " (deleted)");
                if (e)
                        *e = 0;

                *path = p;
                return 0;
        }

        return -ENODATA;
}
1074
1075 int cg_install_release_agent(const char *controller, const char *agent) {
1076         _cleanup_free_ char *fs = NULL, *contents = NULL;
1077         const char *sc;
1078         int r;
1079
1080         assert(agent);
1081
1082         r = cg_unified_controller(controller);
1083         if (r < 0)
1084                 return r;
1085         if (r > 0) /* doesn't apply to unified hierarchy */
1086                 return -EOPNOTSUPP;
1087
1088         r = cg_get_path(controller, NULL, "release_agent", &fs);
1089         if (r < 0)
1090                 return r;
1091
1092         r = read_one_line_file(fs, &contents);
1093         if (r < 0)
1094                 return r;
1095
1096         sc = strstrip(contents);
1097         if (isempty(sc)) {
1098                 r = write_string_file(fs, agent, 0);
1099                 if (r < 0)
1100                         return r;
1101         } else if (!path_equal(sc, agent))
1102                 return -EEXIST;
1103
1104         fs = mfree(fs);
1105         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1106         if (r < 0)
1107                 return r;
1108
1109         contents = mfree(contents);
1110         r = read_one_line_file(fs, &contents);
1111         if (r < 0)
1112                 return r;
1113
1114         sc = strstrip(contents);
1115         if (streq(sc, "0")) {
1116                 r = write_string_file(fs, "1", 0);
1117                 if (r < 0)
1118                         return r;
1119
1120                 return 1;
1121         }
1122
1123         if (!streq(sc, "1"))
1124                 return -EIO;
1125
1126         return 0;
1127 }
1128
1129 int cg_uninstall_release_agent(const char *controller) {
1130         _cleanup_free_ char *fs = NULL;
1131         int r;
1132
1133         r = cg_unified_controller(controller);
1134         if (r < 0)
1135                 return r;
1136         if (r > 0) /* Doesn't apply to unified hierarchy */
1137                 return -EOPNOTSUPP;
1138
1139         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1140         if (r < 0)
1141                 return r;
1142
1143         r = write_string_file(fs, "0", 0);
1144         if (r < 0)
1145                 return r;
1146
1147         fs = mfree(fs);
1148
1149         r = cg_get_path(controller, NULL, "release_agent", &fs);
1150         if (r < 0)
1151                 return r;
1152
1153         r = write_string_file(fs, "", 0);
1154         if (r < 0)
1155                 return r;
1156
1157         return 0;
1158 }
1159
1160 int cg_is_empty(const char *controller, const char *path) {
1161         _cleanup_fclose_ FILE *f = NULL;
1162         pid_t pid;
1163         int r;
1164
1165         assert(path);
1166
1167         r = cg_enumerate_processes(controller, path, &f);
1168         if (r == -ENOENT)
1169                 return 1;
1170         if (r < 0)
1171                 return r;
1172
1173         r = cg_read_pid(f, &pid);
1174         if (r < 0)
1175                 return r;
1176
1177         return r == 0;
1178 }
1179
1180 int cg_is_empty_recursive(const char *controller, const char *path) {
1181         int r;
1182
1183         assert(path);
1184
1185         /* The root cgroup is always populated */
1186         if (controller && empty_or_root(path))
1187                 return false;
1188
1189         r = cg_unified_controller(controller);
1190         if (r < 0)
1191                 return r;
1192         if (r > 0) {
1193                 _cleanup_free_ char *t = NULL;
1194
1195                 /* On the unified hierarchy we can check empty state
1196                  * via the "populated" attribute of "cgroup.events". */
1197
1198                 r = cg_read_event(controller, path, "populated", &t);
1199                 if (r < 0)
1200                         return r;
1201
1202                 return streq(t, "0");
1203         } else {
1204                 _cleanup_closedir_ DIR *d = NULL;
1205                 char *fn;
1206
1207                 r = cg_is_empty(controller, path);
1208                 if (r <= 0)
1209                         return r;
1210
1211                 r = cg_enumerate_subgroups(controller, path, &d);
1212                 if (r == -ENOENT)
1213                         return 1;
1214                 if (r < 0)
1215                         return r;
1216
1217                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1218                         _cleanup_free_ char *p = NULL;
1219
1220                         p = strjoin(path, "/", fn);
1221                         free(fn);
1222                         if (!p)
1223                                 return -ENOMEM;
1224
1225                         r = cg_is_empty_recursive(controller, p);
1226                         if (r <= 0)
1227                                 return r;
1228                 }
1229                 if (r < 0)
1230                         return r;
1231
1232                 return true;
1233         }
1234 }
1235
1236 int cg_split_spec(const char *spec, char **controller, char **path) {
1237         char *t = NULL, *u = NULL;
1238         const char *e;
1239
1240         assert(spec);
1241
1242         if (*spec == '/') {
1243                 if (!path_is_normalized(spec))
1244                         return -EINVAL;
1245
1246                 if (path) {
1247                         t = strdup(spec);
1248                         if (!t)
1249                                 return -ENOMEM;
1250
1251                         *path = path_simplify(t, false);
1252                 }
1253
1254                 if (controller)
1255                         *controller = NULL;
1256
1257                 return 0;
1258         }
1259
1260         e = strchr(spec, ':');
1261         if (!e) {
1262                 if (!cg_controller_is_valid(spec))
1263                         return -EINVAL;
1264
1265                 if (controller) {
1266                         t = strdup(spec);
1267                         if (!t)
1268                                 return -ENOMEM;
1269
1270                         *controller = t;
1271                 }
1272
1273                 if (path)
1274                         *path = NULL;
1275
1276                 return 0;
1277         }
1278
1279         t = strndup(spec, e-spec);
1280         if (!t)
1281                 return -ENOMEM;
1282         if (!cg_controller_is_valid(t)) {
1283                 free(t);
1284                 return -EINVAL;
1285         }
1286
1287         if (isempty(e+1))
1288                 u = NULL;
1289         else {
1290                 u = strdup(e+1);
1291                 if (!u) {
1292                         free(t);
1293                         return -ENOMEM;
1294                 }
1295
1296                 if (!path_is_normalized(u) ||
1297                     !path_is_absolute(u)) {
1298                         free(t);
1299                         free(u);
1300                         return -EINVAL;
1301                 }
1302
1303                 path_simplify(u, false);
1304         }
1305
1306         if (controller)
1307                 *controller = t;
1308         else
1309                 free(t);
1310
1311         if (path)
1312                 *path = u;
1313         else
1314                 free(u);
1315
1316         return 0;
1317 }
1318
1319 int cg_mangle_path(const char *path, char **result) {
1320         _cleanup_free_ char *c = NULL, *p = NULL;
1321         char *t;
1322         int r;
1323
1324         assert(path);
1325         assert(result);
1326
1327         /* First, check if it already is a filesystem path */
1328         if (path_startswith(path, "/sys/fs/cgroup")) {
1329
1330                 t = strdup(path);
1331                 if (!t)
1332                         return -ENOMEM;
1333
1334                 *result = path_simplify(t, false);
1335                 return 0;
1336         }
1337
1338         /* Otherwise, treat it as cg spec */
1339         r = cg_split_spec(path, &c, &p);
1340         if (r < 0)
1341                 return r;
1342
1343         return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, result);
1344 }
1345
1346 int cg_get_root_path(char **path) {
1347         char *p, *e;
1348         int r;
1349
1350         assert(path);
1351
1352         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
1353         if (r < 0)
1354                 return r;
1355
1356         e = endswith(p, "/" SPECIAL_INIT_SCOPE);
1357         if (!e)
1358                 e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
1359         if (!e)
1360                 e = endswith(p, "/system"); /* even more legacy */
1361         if (e)
1362                 *e = 0;
1363
1364         *path = p;
1365         return 0;
1366 }
1367
1368 int cg_shift_path(const char *cgroup, const char *root, const char **shifted) {
1369         _cleanup_free_ char *rt = NULL;
1370         char *p;
1371         int r;
1372
1373         assert(cgroup);
1374         assert(shifted);
1375
1376         if (!root) {
1377                 /* If the root was specified let's use that, otherwise
1378                  * let's determine it from PID 1 */
1379
1380                 r = cg_get_root_path(&rt);
1381                 if (r < 0)
1382                         return r;
1383
1384                 root = rt;
1385         }
1386
1387         p = path_startswith(cgroup, root);
1388         if (p && p > cgroup)
1389                 *shifted = p - 1;
1390         else
1391                 *shifted = cgroup;
1392
1393         return 0;
1394 }
1395
1396 int cg_pid_get_path_shifted(pid_t pid, const char *root, char **cgroup) {
1397         _cleanup_free_ char *raw = NULL;
1398         const char *c;
1399         int r;
1400
1401         assert(pid >= 0);
1402         assert(cgroup);
1403
1404         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
1405         if (r < 0)
1406                 return r;
1407
1408         r = cg_shift_path(raw, root, &c);
1409         if (r < 0)
1410                 return r;
1411
1412         if (c == raw)
1413                 *cgroup = TAKE_PTR(raw);
1414         else {
1415                 char *n;
1416
1417                 n = strdup(c);
1418                 if (!n)
1419                         return -ENOMEM;
1420
1421                 *cgroup = n;
1422         }
1423
1424         return 0;
1425 }
1426
1427 int cg_path_decode_unit(const char *cgroup, char **unit) {
1428         char *c, *s;
1429         size_t n;
1430
1431         assert(cgroup);
1432         assert(unit);
1433
1434         n = strcspn(cgroup, "/");
1435         if (n < 3)
1436                 return -ENXIO;
1437
1438         c = strndupa(cgroup, n);
1439         c = cg_unescape(c);
1440
1441         if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
1442                 return -ENXIO;
1443
1444         s = strdup(c);
1445         if (!s)
1446                 return -ENOMEM;
1447
1448         *unit = s;
1449         return 0;
1450 }
1451
1452 static bool valid_slice_name(const char *p, size_t n) {
1453
1454         if (!p)
1455                 return false;
1456
1457         if (n < STRLEN("x.slice"))
1458                 return false;
1459
1460         if (memcmp(p + n - 6, ".slice", 6) == 0) {
1461                 char buf[n+1], *c;
1462
1463                 memcpy(buf, p, n);
1464                 buf[n] = 0;
1465
1466                 c = cg_unescape(buf);
1467
1468                 return unit_name_is_valid(c, UNIT_NAME_PLAIN);
1469         }
1470
1471         return false;
1472 }
1473
/* Skips over all leading path components of 'p' that valid_slice_name() accepts. Returns a
 * pointer to the first component that is not a slice (leading slashes already skipped). */
static const char *skip_slices(const char *p) {
        assert(p);

        for (;;) {
                const char *component;
                size_t len;

                component = p + strspn(p, "/");
                len = strcspn(component, "/");

                if (!valid_slice_name(component, len))
                        return component;

                p = component + len;
        }
}
1491
1492 int cg_path_get_unit(const char *path, char **ret) {
1493         const char *e;
1494         char *unit;
1495         int r;
1496
1497         assert(path);
1498         assert(ret);
1499
1500         e = skip_slices(path);
1501
1502         r = cg_path_decode_unit(e, &unit);
1503         if (r < 0)
1504                 return r;
1505
1506         /* We skipped over the slices, don't accept any now */
1507         if (endswith(unit, ".slice")) {
1508                 free(unit);
1509                 return -ENXIO;
1510         }
1511
1512         *ret = unit;
1513         return 0;
1514 }
1515
1516 int cg_pid_get_unit(pid_t pid, char **unit) {
1517         _cleanup_free_ char *cgroup = NULL;
1518         int r;
1519
1520         assert(unit);
1521
1522         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1523         if (r < 0)
1524                 return r;
1525
1526         return cg_path_get_unit(cgroup, unit);
1527 }
1528
/**
 * Skip session-*.scope, but require it to be there.
 *
 * Leading slashes of 'p' are skipped, then the first path component must have the form
 * "session-<id>.scope", with an <id> that session_id_valid() accepts. On success a pointer to
 * what follows that component is returned (with the slashes after it skipped, too). Returns NULL
 * if 'p' is empty (as per isempty()) or if the first component is not such a session scope.
 */
static const char *skip_session(const char *p) {
        size_t n;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        n = strcspn(p, "/");
        if (n < STRLEN("session-x.scope"))
                return NULL;

        /* 8 and 6 are the lengths of the "session-" prefix and the ".scope" suffix */
        if (memcmp(p, "session-", 8) == 0 && memcmp(p + n - 6, ".scope", 6) == 0) {
                char buf[n - 8 - 6 + 1];

                memcpy(buf, p + 8, n - 8 - 6);
                buf[n - 8 - 6] = 0;

                /* Note that session scopes never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                /* This function returns a pointer, hence report failure as NULL (not as the
                 * boolean 'false'), like every other error path here does. */
                if (!session_id_valid(buf))
                        return NULL;

                p += n;
                p += strspn(p, "/");
                return p;
        }

        return NULL;
}
1565
/**
 * Skip user@*.service, but require it to be there.
 *
 * Returns a pointer to what follows the "user@<uid>.service" component (slashes skipped), or NULL
 * if 'p' is empty or does not start with such a component, where <uid> must be accepted by
 * parse_uid().
 */
static const char *skip_user_manager(const char *p) {
        size_t len;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        len = strcspn(p, "/");
        if (len < STRLEN("user@x.service"))
                return NULL;

        if (memcmp(p, "user@", 5) != 0 || memcmp(p + len - 8, ".service", 8) != 0)
                return NULL;

        {
                /* The part between the "user@" prefix and the ".service" suffix */
                char uid[len - 5 - 8 + 1];

                memcpy(uid, p + 5, len - 5 - 8);
                uid[len - 5 - 8] = 0;

                /* Note that user manager services never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                if (parse_uid(uid, NULL) < 0)
                        return NULL;
        }

        p += len;
        p += strspn(p, "/");

        return p;
}
1603
/* Skips the part of 'path' that leads to a user's units: first all slices, then either a user
 * manager service or, failing that, a session scope. Returns NULL if neither of the two is
 * found after the slices. */
static const char *skip_user_prefix(const char *path) {
        const char *after_slices, *after_manager;

        assert(path);

        /* Slices are optional ... */
        after_slices = skip_slices(path);

        /* ... the user manager is preferred ... */
        after_manager = skip_user_manager(after_slices);
        if (after_manager)
                return after_manager;

        /* ... and the user session is the alternative */
        return skip_session(after_slices);
}
1620
/* Extracts the user unit a cgroup path belongs to. Returns -ENXIO if the path does not carry the
 * prefix skip_user_prefix() looks for, otherwise the result of cg_path_get_unit() for the
 * remainder of the path. */
int cg_path_get_user_unit(const char *path, char **ret) {
        const char *rest;

        assert(path);
        assert(ret);

        rest = skip_user_prefix(path);

        /* Behind the prefix things look just like for a system unit, hence the same parser is
         * used for the rest. */
        return rest ? cg_path_get_unit(rest, ret) : -ENXIO;
}
1636
1637 int cg_pid_get_user_unit(pid_t pid, char **unit) {
1638         _cleanup_free_ char *cgroup = NULL;
1639         int r;
1640
1641         assert(unit);
1642
1643         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1644         if (r < 0)
1645                 return r;
1646
1647         return cg_path_get_user_unit(cgroup, unit);
1648 }
1649
1650 int cg_path_get_machine_name(const char *path, char **machine) {
1651         _cleanup_free_ char *u = NULL;
1652         const char *sl;
1653         int r;
1654
1655         r = cg_path_get_unit(path, &u);
1656         if (r < 0)
1657                 return r;
1658
1659         sl = strjoina("/run/systemd/machines/unit:", u);
1660         return readlink_malloc(sl, machine);
1661 }
1662
1663 int cg_pid_get_machine_name(pid_t pid, char **machine) {
1664         _cleanup_free_ char *cgroup = NULL;
1665         int r;
1666
1667         assert(machine);
1668
1669         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1670         if (r < 0)
1671                 return r;
1672
1673         return cg_path_get_machine_name(cgroup, machine);
1674 }
1675
1676 int cg_path_get_session(const char *path, char **session) {
1677         _cleanup_free_ char *unit = NULL;
1678         char *start, *end;
1679         int r;
1680
1681         assert(path);
1682
1683         r = cg_path_get_unit(path, &unit);
1684         if (r < 0)
1685                 return r;
1686
1687         start = startswith(unit, "session-");
1688         if (!start)
1689                 return -ENXIO;
1690         end = endswith(start, ".scope");
1691         if (!end)
1692                 return -ENXIO;
1693
1694         *end = 0;
1695         if (!session_id_valid(start))
1696                 return -ENXIO;
1697
1698         if (session) {
1699                 char *rr;
1700
1701                 rr = strdup(start);
1702                 if (!rr)
1703                         return -ENOMEM;
1704
1705                 *session = rr;
1706         }
1707
1708         return 0;
1709 }
1710
1711 int cg_pid_get_session(pid_t pid, char **session) {
1712         _cleanup_free_ char *cgroup = NULL;
1713         int r;
1714
1715         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1716         if (r < 0)
1717                 return r;
1718
1719         return cg_path_get_session(cgroup, session);
1720 }
1721
1722 int cg_path_get_owner_uid(const char *path, uid_t *uid) {
1723         _cleanup_free_ char *slice = NULL;
1724         char *start, *end;
1725         int r;
1726
1727         assert(path);
1728
1729         r = cg_path_get_slice(path, &slice);
1730         if (r < 0)
1731                 return r;
1732
1733         start = startswith(slice, "user-");
1734         if (!start)
1735                 return -ENXIO;
1736         end = endswith(start, ".slice");
1737         if (!end)
1738                 return -ENXIO;
1739
1740         *end = 0;
1741         if (parse_uid(start, uid) < 0)
1742                 return -ENXIO;
1743
1744         return 0;
1745 }
1746
1747 int cg_pid_get_owner_uid(pid_t pid, uid_t *uid) {
1748         _cleanup_free_ char *cgroup = NULL;
1749         int r;
1750
1751         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1752         if (r < 0)
1753                 return r;
1754
1755         return cg_path_get_owner_uid(cgroup, uid);
1756 }
1757
1758 int cg_path_get_slice(const char *p, char **slice) {
1759         const char *e = NULL;
1760
1761         assert(p);
1762         assert(slice);
1763
1764         /* Finds the right-most slice unit from the beginning, but
1765          * stops before we come to the first non-slice unit. */
1766
1767         for (;;) {
1768                 size_t n;
1769
1770                 p += strspn(p, "/");
1771
1772                 n = strcspn(p, "/");
1773                 if (!valid_slice_name(p, n)) {
1774
1775                         if (!e) {
1776                                 char *s;
1777
1778                                 s = strdup(SPECIAL_ROOT_SLICE);
1779                                 if (!s)
1780                                         return -ENOMEM;
1781
1782                                 *slice = s;
1783                                 return 0;
1784                         }
1785
1786                         return cg_path_decode_unit(e, slice);
1787                 }
1788
1789                 e = p;
1790                 p += n;
1791         }
1792 }
1793
1794 int cg_pid_get_slice(pid_t pid, char **slice) {
1795         _cleanup_free_ char *cgroup = NULL;
1796         int r;
1797
1798         assert(slice);
1799
1800         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1801         if (r < 0)
1802                 return r;
1803
1804         return cg_path_get_slice(cgroup, slice);
1805 }
1806
/* Determines the user slice a cgroup path belongs to. Returns -ENXIO if the path does not carry
 * the prefix skip_user_prefix() looks for, otherwise the result of cg_path_get_slice() for the
 * remainder of the path. */
int cg_path_get_user_slice(const char *p, char **slice) {
        const char *rest;

        assert(p);
        assert(slice);

        rest = skip_user_prefix(p);

        /* Behind the prefix things look just like for a system slice, hence the same parser is
         * used for the rest. */
        return rest ? cg_path_get_slice(rest, slice) : -ENXIO;
}
1820
1821 int cg_pid_get_user_slice(pid_t pid, char **slice) {
1822         _cleanup_free_ char *cgroup = NULL;
1823         int r;
1824
1825         assert(slice);
1826
1827         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1828         if (r < 0)
1829                 return r;
1830
1831         return cg_path_get_user_slice(cgroup, slice);
1832 }
1833
1834 char *cg_escape(const char *p) {
1835         bool need_prefix = false;
1836
1837         /* This implements very minimal escaping for names to be used
1838          * as file names in the cgroup tree: any name which might
1839          * conflict with a kernel name or is prefixed with '_' is
1840          * prefixed with a '_'. That way, when reading cgroup names it
1841          * is sufficient to remove a single prefixing underscore if
1842          * there is one. */
1843
1844         /* The return value of this function (unlike cg_unescape())
1845          * needs free()! */
1846
1847         if (IN_SET(p[0], 0, '_', '.') ||
1848             streq(p, "notify_on_release") ||
1849             streq(p, "release_agent") ||
1850             streq(p, "tasks") ||
1851             startswith(p, "cgroup."))
1852                 need_prefix = true;
1853         else {
1854                 const char *dot;
1855
1856                 dot = strrchr(p, '.');
1857                 if (dot) {
1858                         CGroupController c;
1859                         size_t l = dot - p;
1860
1861                         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1862                                 const char *n;
1863
1864                                 n = cgroup_controller_to_string(c);
1865
1866                                 if (l != strlen(n))
1867                                         continue;
1868
1869                                 if (memcmp(p, n, l) != 0)
1870                                         continue;
1871
1872                                 need_prefix = true;
1873                                 break;
1874                         }
1875                 }
1876         }
1877
1878         if (need_prefix)
1879                 return strappend("_", p);
1880
1881         return strdup(p);
1882 }
1883
/* Reverses cg_escape(): skips a single leading '_' of 'p', if there is one.
 *
 * The result points into 'p' itself, hence (unlike the one of cg_escape()) it must not be
 * freed. */
char *cg_unescape(const char *p) {
        assert(p);

        return (char*) (*p == '_' ? p + 1 : p);
}
1895
1896 #define CONTROLLER_VALID                        \
1897         DIGITS LETTERS                          \
1898         "_"
1899
1900 bool cg_controller_is_valid(const char *p) {
1901         const char *t, *s;
1902
1903         if (!p)
1904                 return false;
1905
1906         if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
1907                 return true;
1908
1909         s = startswith(p, "name=");
1910         if (s)
1911                 p = s;
1912
1913         if (IN_SET(*p, 0, '_'))
1914                 return false;
1915
1916         for (t = p; *t; t++)
1917                 if (!strchr(CONTROLLER_VALID, *t))
1918                         return false;
1919
1920         if (t - p > FILENAME_MAX)
1921                 return false;
1922
1923         return true;
1924 }
1925
/* Converts the name of a slice unit into a relative cgroup path. For every dash in the prefix of
 * the unit name (as returned by unit_name_to_prefix()) one parent component is generated, made
 * of everything before that dash plus ".slice"; the full unit name is the last component. Each
 * component is passed through cg_escape(). (Presumably "foo-bar.slice" hence becomes
 * "foo.slice/foo-bar.slice" -- TODO confirm against unit_name_to_prefix().) SPECIAL_ROOT_SLICE
 * is converted to the empty string.
 *
 * Returns 0 and a newly allocated string in *ret on success, -EINVAL if 'unit' is not a valid
 * plain unit name ending in ".slice" or contains leading, trailing or double dashes, -ENOMEM if
 * out of memory, or the negative error of unit_name_to_prefix(). */
int cg_slice_to_path(const char *unit, char **ret) {
        _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
        const char *dash;
        int r;

        assert(unit);
        assert(ret);

        if (streq(unit, SPECIAL_ROOT_SLICE)) {
                char *x;

                x = strdup("");
                if (!x)
                        return -ENOMEM;
                *ret = x;
                return 0;
        }

        if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
                return -EINVAL;

        if (!endswith(unit, ".slice"))
                return -EINVAL;

        r = unit_name_to_prefix(unit, &p);
        if (r < 0)
                return r;

        dash = strchr(p, '-');

        /* Don't allow initial dashes */
        if (dash == p)
                return -EINVAL;

        /* One iteration per dash, each appending one parent slice plus "/" to 's' */
        while (dash) {
                _cleanup_free_ char *escaped = NULL;
                /* Room for the prefix up to the dash, ".slice" and the trailing NUL (which
                 * sizeof() of the string literal accounts for) */
                char n[dash - p + sizeof(".slice")];

#if HAS_FEATURE_MEMORY_SANITIZER
                /* msan doesn't instrument stpncpy, so it thinks
                 * n is later used uninitialized:
                 * https://github.com/google/sanitizers/issues/926
                 */
                zero(n);
#endif

                /* Don't allow trailing or double dashes */
                if (IN_SET(dash[1], 0, '-'))
                        return -EINVAL;

                /* Copy the prefix up to the dash and append the suffix right behind it */
                strcpy(stpncpy(n, p, dash - p), ".slice");
                if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
                        return -EINVAL;

                escaped = cg_escape(n);
                if (!escaped)
                        return -ENOMEM;

                if (!strextend(&s, escaped, "/", NULL))
                        return -ENOMEM;

                dash = strchr(dash+1, '-');
        }

        /* Finally, the unit itself is the innermost component */
        e = cg_escape(unit);
        if (!e)
                return -ENOMEM;

        if (!strextend(&s, e, NULL))
                return -ENOMEM;

        *ret = TAKE_PTR(s);

        return 0;
}
2001
2002 int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
2003         _cleanup_free_ char *p = NULL;
2004         int r;
2005
2006         r = cg_get_path(controller, path, attribute, &p);
2007         if (r < 0)
2008                 return r;
2009
2010         return write_string_file(p, value, 0);
2011 }
2012
2013 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
2014         _cleanup_free_ char *p = NULL;
2015         int r;
2016
2017         r = cg_get_path(controller, path, attribute, &p);
2018         if (r < 0)
2019                 return r;
2020
2021         return read_one_line_file(p, ret);
2022 }
2023
int cg_get_keyed_attribute(
                const char *controller,
                const char *path,
                const char *attribute,
                char **keys,
                char **ret_values) {

        _cleanup_free_ char *filename = NULL, *contents = NULL;
        const char *p;
        size_t n, i, n_done = 0;
        char **v;
        int r;

        /* Reads one or more fields of a cgroupsv2 keyed attribute file. The 'keys' parameter should be an strv with
         * all keys to retrieve. The 'ret_values' parameter should be passed as a string array with the same number
         * of entries as 'keys'. On success each entry will be set to a newly allocated copy of the value of the
         * matching key, and 0 is returned. On failure 'ret_values' is left untouched.
         *
         * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. */

        r = cg_get_path(controller, path, attribute, &filename);
        if (r < 0)
                return r;

        r = read_full_file(filename, &contents, NULL);
        if (r < 0)
                return r;

        n = strv_length(keys);
        if (n == 0) /* No keys to retrieve? That's easy, we are done then */
                return 0;

        /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
        v = newa0(char*, n);

        /* Each iteration of this loop processes one line of the file */
        for (p = contents; *p;) {
                const char *w = NULL;

                /* Check whether the line starts with one of the keys we have no value for yet. If so, 'w' is what
                 * first_word() returned for it and 'i' is the index of that key. */
                for (i = 0; i < n; i++)
                        if (!v[i]) {
                                w = first_word(p, keys[i]);
                                if (w)
                                        break;
                        }

                if (w) {
                        size_t l;

                        /* The value extends up to the end of the line */
                        l = strcspn(w, NEWLINE);
                        v[i] = strndup(w, l);
                        if (!v[i]) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        n_done++;
                        if (n_done >= n)
                                goto done;

                        p = w + l;
                } else
                        p += strcspn(p, NEWLINE);

                /* Skip over the line break(s), on to the beginning of the next line */
                p += strspn(p, NEWLINE);
        }

        /* We reached the end of the file without having found all keys */
        r = -ENXIO;

fail:
        for (i = 0; i < n; i++)
                free(v[i]);

        return r;

done:
        /* Ownership of the strings is handed over to the caller */
        memcpy(ret_values, v, sizeof(char*) * n);
        return 0;

}
2102
2103 int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
2104         CGroupController c;
2105         bool created;
2106         int r;
2107
2108         /* This one will create a cgroup in our private tree, but also
2109          * duplicate it in the trees specified in mask, and remove it
2110          * in all others.
2111          *
2112          * Returns 0 if the group already existed in the systemd hierarchy,
2113          * 1 on success, negative otherwise.
2114          */
2115
2116         /* First create the cgroup in our own hierarchy. */
2117         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
2118         if (r < 0)
2119                 return r;
2120         created = !!r;
2121
2122         /* If we are in the unified hierarchy, we are done now */
2123         r = cg_all_unified();
2124         if (r < 0)
2125                 return r;
2126         if (r > 0)
2127                 return created;
2128
2129         /* Otherwise, do the same in the other hierarchies */
2130         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2131                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2132                 const char *n;
2133
2134                 n = cgroup_controller_to_string(c);
2135
2136                 if (mask & bit)
2137                         (void) cg_create(n, path);
2138                 else if (supported & bit)
2139                         (void) cg_trim(n, path, true);
2140         }
2141
2142         return created;
2143 }
2144
2145 int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
2146         CGroupController c;
2147         int r;
2148
2149         r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
2150         if (r < 0)
2151                 return r;
2152
2153         r = cg_all_unified();
2154         if (r < 0)
2155                 return r;
2156         if (r > 0)
2157                 return 0;
2158
2159         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2160                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2161                 const char *p = NULL;
2162
2163                 if (!(supported & bit))
2164                         continue;
2165
2166                 if (path_callback)
2167                         p = path_callback(bit, userdata);
2168
2169                 if (!p)
2170                         p = path;
2171
2172                 (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
2173         }
2174
2175         return 0;
2176 }
2177
2178 int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, cg_migrate_callback_t path_callback, void *userdata) {
2179         Iterator i;
2180         void *pidp;
2181         int r = 0;
2182
2183         SET_FOREACH(pidp, pids, i) {
2184                 pid_t pid = PTR_TO_PID(pidp);
2185                 int q;
2186
2187                 q = cg_attach_everywhere(supported, path, pid, path_callback, userdata);
2188                 if (q < 0 && r >= 0)
2189                         r = q;
2190         }
2191
2192         return r;
2193 }
2194
2195 int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t to_callback, void *userdata) {
2196         CGroupController c;
2197         int r = 0, q;
2198
2199         if (!path_equal(from, to))  {
2200                 r = cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER, from, SYSTEMD_CGROUP_CONTROLLER, to, CGROUP_REMOVE);
2201                 if (r < 0)
2202                         return r;
2203         }
2204
2205         q = cg_all_unified();
2206         if (q < 0)
2207                 return q;
2208         if (q > 0)
2209                 return r;
2210
2211         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2212                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2213                 const char *p = NULL;
2214
2215                 if (!(supported & bit))
2216                         continue;
2217
2218                 if (to_callback)
2219                         p = to_callback(bit, userdata);
2220
2221                 if (!p)
2222                         p = to;
2223
2224                 (void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, to, cgroup_controller_to_string(c), p, 0);
2225         }
2226
2227         return 0;
2228 }
2229
2230 int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
2231         CGroupController c;
2232         int r, q;
2233
2234         r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
2235         if (r < 0)
2236                 return r;
2237
2238         q = cg_all_unified();
2239         if (q < 0)
2240                 return q;
2241         if (q > 0)
2242                 return r;
2243
2244         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2245                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2246
2247                 if (!(supported & bit))
2248                         continue;
2249
2250                 (void) cg_trim(cgroup_controller_to_string(c), path, delete_root);
2251         }
2252
2253         return 0;
2254 }
2255
2256 int cg_mask_to_string(CGroupMask mask, char **ret) {
2257         _cleanup_free_ char *s = NULL;
2258         size_t n = 0, allocated = 0;
2259         bool space = false;
2260         CGroupController c;
2261
2262         assert(ret);
2263
2264         if (mask == 0) {
2265                 *ret = NULL;
2266                 return 0;
2267         }
2268
2269         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2270                 const char *k;
2271                 size_t l;
2272
2273                 if (!(mask & CGROUP_CONTROLLER_TO_MASK(c)))
2274                         continue;
2275
2276                 k = cgroup_controller_to_string(c);
2277                 l = strlen(k);
2278
2279                 if (!GREEDY_REALLOC(s, allocated, n + space + l + 1))
2280                         return -ENOMEM;
2281
2282                 if (space)
2283                         s[n] = ' ';
2284                 memcpy(s + n + space, k, l);
2285                 n += space + l;
2286
2287                 space = true;
2288         }
2289
2290         assert(s);
2291
2292         s[n] = 0;
2293         *ret = TAKE_PTR(s);
2294
2295         return 0;
2296 }
2297
2298 int cg_mask_from_string(const char *value, CGroupMask *mask) {
2299         assert(mask);
2300         assert(value);
2301
2302         for (;;) {
2303                 _cleanup_free_ char *n = NULL;
2304                 CGroupController v;
2305                 int r;
2306
2307                 r = extract_first_word(&value, &n, NULL, 0);
2308                 if (r < 0)
2309                         return r;
2310                 if (r == 0)
2311                         break;
2312
2313                 v = cgroup_controller_from_string(n);
2314                 if (v < 0)
2315                         continue;
2316
2317                 *mask |= CGROUP_CONTROLLER_TO_MASK(v);
2318         }
2319         return 0;
2320 }
2321
2322 int cg_mask_supported(CGroupMask *ret) {
2323         CGroupMask mask = 0;
2324         int r;
2325
2326         /* Determines the mask of supported cgroup controllers. Only
2327          * includes controllers we can make sense of and that are
2328          * actually accessible. */
2329
2330         r = cg_all_unified();
2331         if (r < 0)
2332                 return r;
2333         if (r > 0) {
2334                 _cleanup_free_ char *root = NULL, *controllers = NULL, *path = NULL;
2335
2336                 /* In the unified hierarchy we can read the supported
2337                  * and accessible controllers from a the top-level
2338                  * cgroup attribute */
2339
2340                 r = cg_get_root_path(&root);
2341                 if (r < 0)
2342                         return r;
2343
2344                 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
2345                 if (r < 0)
2346                         return r;
2347
2348                 r = read_one_line_file(path, &controllers);
2349                 if (r < 0)
2350                         return r;
2351
2352                 r = cg_mask_from_string(controllers, &mask);
2353                 if (r < 0)
2354                         return r;
2355
2356                 /* Currently, we support the cpu, memory, io and pids
2357                  * controller in the unified hierarchy, mask
2358                  * everything else off. */
2359                 mask &= CGROUP_MASK_CPU | CGROUP_MASK_MEMORY | CGROUP_MASK_IO | CGROUP_MASK_PIDS;
2360
2361         } else {
2362                 CGroupController c;
2363
2364                 /* In the legacy hierarchy, we check whether which
2365                  * hierarchies are mounted. */
2366
2367                 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2368                         const char *n;
2369
2370                         n = cgroup_controller_to_string(c);
2371                         if (controller_is_accessible(n) >= 0)
2372                                 mask |= CGROUP_CONTROLLER_TO_MASK(c);
2373                 }
2374         }
2375
2376         *ret = mask;
2377         return 0;
2378 }
2379
2380 int cg_kernel_controllers(Set **ret) {
2381         _cleanup_set_free_free_ Set *controllers = NULL;
2382         _cleanup_fclose_ FILE *f = NULL;
2383         int r;
2384
2385         assert(ret);
2386
2387         /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support
2388          * and controllers that aren't currently accessible (because not mounted). This does not include "name="
2389          * pseudo-controllers. */
2390
2391         controllers = set_new(&string_hash_ops);
2392         if (!controllers)
2393                 return -ENOMEM;
2394
2395         f = fopen("/proc/cgroups", "re");
2396         if (!f) {
2397                 if (errno == ENOENT) {
2398                         *ret = NULL;
2399                         return 0;
2400                 }
2401
2402                 return -errno;
2403         }
2404
2405         (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
2406
2407         /* Ignore the header line */
2408         (void) read_line(f, (size_t) -1, NULL);
2409
2410         for (;;) {
2411                 char *controller;
2412                 int enabled = 0;
2413
2414                 errno = 0;
2415                 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
2416
2417                         if (feof(f))
2418                                 break;
2419
2420                         if (ferror(f) && errno > 0)
2421                                 return -errno;
2422
2423                         return -EBADMSG;
2424                 }
2425
2426                 if (!enabled) {
2427                         free(controller);
2428                         continue;
2429                 }
2430
2431                 if (!cg_controller_is_valid(controller)) {
2432                         free(controller);
2433                         return -EBADMSG;
2434                 }
2435
2436                 r = set_consume(controllers, controller);
2437                 if (r < 0)
2438                         return r;
2439         }
2440
2441         *ret = TAKE_PTR(controllers);
2442
2443         return 0;
2444 }
2445
2446 static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;
2447
2448 /* The hybrid mode was initially implemented in v232 and simply mounted cgroup v2 on /sys/fs/cgroup/systemd.  This
2449  * unfortunately broke other tools (such as docker) which expected the v1 "name=systemd" hierarchy on
2450  * /sys/fs/cgroup/systemd.  From v233 and on, the hybrid mode mountnbs v2 on /sys/fs/cgroup/unified and maintains
2451  * "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility with other tools.
2452  *
2453  * To keep live upgrade working, we detect and support v232 layout.  When v232 layout is detected, to keep cgroup v2
2454  * process management but disable the compat dual layout, we return %true on
2455  * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and %false on cg_hybrid_unified().
2456  */
2457 static thread_local bool unified_systemd_v232;
2458
2459 static int cg_unified_update(void) {
2460
2461         struct statfs fs;
2462
2463         /* Checks if we support the unified hierarchy. Returns an
2464          * error when the cgroup hierarchies aren't mounted yet or we
2465          * have any other trouble determining if the unified hierarchy
2466          * is supported. */
2467
2468         if (unified_cache >= CGROUP_UNIFIED_NONE)
2469                 return 0;
2470
2471         if (statfs("/sys/fs/cgroup/", &fs) < 0)
2472                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\") failed: %m");
2473
2474         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2475                 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2476                 unified_cache = CGROUP_UNIFIED_ALL;
2477         } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2478                 if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
2479                     F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2480                         log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2481                         unified_cache = CGROUP_UNIFIED_SYSTEMD;
2482                         unified_systemd_v232 = false;
2483                 } else {
2484                         if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0)
2485                                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
2486
2487                         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2488                                 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2489                                 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2490                                 unified_systemd_v232 = true;
2491                         } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) {
2492                                 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2493                                 unified_cache = CGROUP_UNIFIED_NONE;
2494                         } else {
2495                                 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2496                                           (unsigned long long) fs.f_type);
2497                                 unified_cache = CGROUP_UNIFIED_NONE;
2498                         }
2499                 }
2500         } else {
2501                 log_debug("Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2502                           (unsigned long long) fs.f_type);
2503                 return -ENOMEDIUM;
2504         }
2505
2506         return 0;
2507 }
2508
2509 int cg_unified_controller(const char *controller) {
2510         int r;
2511
2512         r = cg_unified_update();
2513         if (r < 0)
2514                 return r;
2515
2516         if (unified_cache == CGROUP_UNIFIED_NONE)
2517                 return false;
2518
2519         if (unified_cache >= CGROUP_UNIFIED_ALL)
2520                 return true;
2521
2522         return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER);
2523 }
2524
2525 int cg_all_unified(void) {
2526         int r;
2527
2528         r = cg_unified_update();
2529         if (r < 0)
2530                 return r;
2531
2532         return unified_cache >= CGROUP_UNIFIED_ALL;
2533 }
2534
2535 int cg_hybrid_unified(void) {
2536         int r;
2537
2538         r = cg_unified_update();
2539         if (r < 0)
2540                 return r;
2541
2542         return unified_cache == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232;
2543 }
2544
2545 int cg_unified_flush(void) {
2546         unified_cache = CGROUP_UNIFIED_UNKNOWN;
2547
2548         return cg_unified_update();
2549 }
2550
2551 int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) {
2552         _cleanup_fclose_ FILE *f = NULL;
2553         _cleanup_free_ char *fs = NULL;
2554         CGroupController c;
2555         int r;
2556
2557         assert(p);
2558
2559         if (supported == 0)
2560                 return 0;
2561
2562         r = cg_all_unified();
2563         if (r < 0)
2564                 return r;
2565         if (r == 0) /* on the legacy hiearchy there's no joining of controllers defined */
2566                 return 0;
2567
2568         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
2569         if (r < 0)
2570                 return r;
2571
2572         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2573                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2574                 const char *n;
2575
2576                 if (!(supported & bit))
2577                         continue;
2578
2579                 n = cgroup_controller_to_string(c);
2580                 {
2581                         char s[1 + strlen(n) + 1];
2582
2583                         s[0] = mask & bit ? '+' : '-';
2584                         strcpy(s + 1, n);
2585
2586                         if (!f) {
2587                                 f = fopen(fs, "we");
2588                                 if (!f) {
2589                                         log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
2590                                         break;
2591                                 }
2592                         }
2593
2594                         r = write_string_stream(f, s, 0);
2595                         if (r < 0) {
2596                                 log_debug_errno(r, "Failed to enable controller %s for %s (%s): %m", n, p, fs);
2597                                 clearerr(f);
2598                         }
2599                 }
2600         }
2601
2602         return 0;
2603 }
2604
2605 bool cg_is_unified_wanted(void) {
2606         static thread_local int wanted = -1;
2607         int r;
2608         bool b;
2609         const bool is_default = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL;
2610
2611         /* If we have a cached value, return that. */
2612         if (wanted >= 0)
2613                 return wanted;
2614
2615         /* If the hierarchy is already mounted, then follow whatever
2616          * was chosen for it. */
2617         if (cg_unified_flush() >= 0)
2618                 return (wanted = unified_cache >= CGROUP_UNIFIED_ALL);
2619
2620         /* Otherwise, let's see what the kernel command line has to say.
2621          * Since checking is expensive, cache a non-error result. */
2622         r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b);
2623
2624         return (wanted = r > 0 ? b : is_default);
2625 }
2626
2627 bool cg_is_legacy_wanted(void) {
2628         static thread_local int wanted = -1;
2629
2630         /* If we have a cached value, return that. */
2631         if (wanted >= 0)
2632                 return wanted;
2633
2634         /* Check if we have cgroups2 already mounted. */
2635         if (cg_unified_flush() >= 0 &&
2636             unified_cache == CGROUP_UNIFIED_ALL)
2637                 return (wanted = false);
2638
2639         /* Otherwise, assume that at least partial legacy is wanted,
2640          * since cgroups2 should already be mounted at this point. */
2641         return (wanted = true);
2642 }
2643
2644 bool cg_is_hybrid_wanted(void) {
2645         static thread_local int wanted = -1;
2646         int r;
2647         bool b;
2648         const bool is_default = DEFAULT_HIERARCHY >= CGROUP_UNIFIED_SYSTEMD;
2649         /* We default to true if the default is "hybrid", obviously,
2650          * but also when the default is "unified", because if we get
2651          * called, it means that unified hierarchy was not mounted. */
2652
2653         /* If we have a cached value, return that. */
2654         if (wanted >= 0)
2655                 return wanted;
2656
2657         /* If the hierarchy is already mounted, then follow whatever
2658          * was chosen for it. */
2659         if (cg_unified_flush() >= 0 &&
2660             unified_cache == CGROUP_UNIFIED_ALL)
2661                 return (wanted = false);
2662
2663         /* Otherwise, let's see what the kernel command line has to say.
2664          * Since checking is expensive, cache a non-error result. */
2665         r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b);
2666
2667         /* The meaning of the kernel option is reversed wrt. to the return value
2668          * of this function, hence the negation. */
2669         return (wanted = r > 0 ? !b : is_default);
2670 }
2671
2672 int cg_weight_parse(const char *s, uint64_t *ret) {
2673         uint64_t u;
2674         int r;
2675
2676         if (isempty(s)) {
2677                 *ret = CGROUP_WEIGHT_INVALID;
2678                 return 0;
2679         }
2680
2681         r = safe_atou64(s, &u);
2682         if (r < 0)
2683                 return r;
2684
2685         if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX)
2686                 return -ERANGE;
2687
2688         *ret = u;
2689         return 0;
2690 }
2691
2692 const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2693         [CGROUP_IO_RBPS_MAX]    = CGROUP_LIMIT_MAX,
2694         [CGROUP_IO_WBPS_MAX]    = CGROUP_LIMIT_MAX,
2695         [CGROUP_IO_RIOPS_MAX]   = CGROUP_LIMIT_MAX,
2696         [CGROUP_IO_WIOPS_MAX]   = CGROUP_LIMIT_MAX,
2697 };
2698
2699 static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2700         [CGROUP_IO_RBPS_MAX]    = "IOReadBandwidthMax",
2701         [CGROUP_IO_WBPS_MAX]    = "IOWriteBandwidthMax",
2702         [CGROUP_IO_RIOPS_MAX]   = "IOReadIOPSMax",
2703         [CGROUP_IO_WIOPS_MAX]   = "IOWriteIOPSMax",
2704 };
2705
2706 DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2707
2708 int cg_cpu_shares_parse(const char *s, uint64_t *ret) {
2709         uint64_t u;
2710         int r;
2711
2712         if (isempty(s)) {
2713                 *ret = CGROUP_CPU_SHARES_INVALID;
2714                 return 0;
2715         }
2716
2717         r = safe_atou64(s, &u);
2718         if (r < 0)
2719                 return r;
2720
2721         if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX)
2722                 return -ERANGE;
2723
2724         *ret = u;
2725         return 0;
2726 }
2727
2728 int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
2729         uint64_t u;
2730         int r;
2731
2732         if (isempty(s)) {
2733                 *ret = CGROUP_BLKIO_WEIGHT_INVALID;
2734                 return 0;
2735         }
2736
2737         r = safe_atou64(s, &u);
2738         if (r < 0)
2739                 return r;
2740
2741         if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX)
2742                 return -ERANGE;
2743
2744         *ret = u;
2745         return 0;
2746 }
2747
2748 bool is_cgroup_fs(const struct statfs *s) {
2749         return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2750                is_fs_type(s, CGROUP2_SUPER_MAGIC);
2751 }
2752
/* Returns true if the file descriptor 'fd' refers to an object on a cgroup (v1 or v2) file system.
 *
 * If the file system cannot be queried false is returned. The return type is bool, hence an errno-style
 * negative value cannot be propagated here: it would be converted to true and report a cgroup file system
 * where none was detected. */
bool fd_is_cgroup_fs(int fd) {
        struct statfs s;

        if (fstatfs(fd, &s) < 0)
                return false;

        return is_cgroup_fs(&s);
}
2761
2762 static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
2763         [CGROUP_CONTROLLER_CPU] = "cpu",
2764         [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
2765         [CGROUP_CONTROLLER_IO] = "io",
2766         [CGROUP_CONTROLLER_BLKIO] = "blkio",
2767         [CGROUP_CONTROLLER_MEMORY] = "memory",
2768         [CGROUP_CONTROLLER_DEVICES] = "devices",
2769         [CGROUP_CONTROLLER_PIDS] = "pids",
2770 };
2771
2772 DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);