src/basic/cgroup-util.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <dirent.h>
  22 #include <errno.h>
  23 #include <ftw.h>
  24 #include <limits.h>
  25 #include <signal.h>
  26 #include <stddef.h>
  27 #include <stdio_ext.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <sys/stat.h>
  31 #include <sys/statfs.h>
  32 #include <sys/types.h>
  33 #include <sys/xattr.h>
  34 #include <unistd.h>
  35
  36 #include "alloc-util.h"
  37 #include "cgroup-util.h"
  38 #include "def.h"
  39 #include "dirent-util.h"
  40 #include "extract-word.h"
  41 #include "fd-util.h"
  42 #include "fileio.h"
  43 #include "format-util.h"
  44 #include "fs-util.h"
  45 #include "log.h"
  46 #include "login-util.h"
  47 #include "macro.h"
  48 #include "missing.h"
  49 #include "mkdir.h"
  50 #include "parse-util.h"
  51 #include "path-util.h"
  52 #include "proc-cmdline.h"
  53 #include "process-util.h"
  54 #include "set.h"
  55 #include "special.h"
  56 #include "stat-util.h"
  57 #include "stdio-util.h"
  58 #include "string-table.h"
  59 #include "string-util.h"
  60 #include "strv.h"
  61 #include "unit-name.h"
  62 #include "user-util.h"
  63
  64 int cg_enumerate_processes(const char *controller, const char *path, FILE **_f) {
  65         _cleanup_free_ char *fs = NULL;
  66         FILE *f;
  67         int r;
  68
  69         assert(_f);
  70
  71         r = cg_get_path(controller, path, "cgroup.procs", &fs);
  72         if (r < 0)
  73                 return r;
  74
  75         f = fopen(fs, "re");
  76         if (!f)
  77                 return -errno;
  78
  79         *_f = f;
  80         return 0;
  81 }
  82
  83 int cg_read_pid(FILE *f, pid_t *_pid) {
  84         unsigned long ul;
  85
  86         /* Note that the cgroup.procs might contain duplicates! See
  87          * cgroups.txt for details. */
  88
  89         assert(f);
  90         assert(_pid);
  91
  92         errno = 0;
  93         if (fscanf(f, "%lu", &ul) != 1) {
  94
  95                 if (feof(f))
  96                         return 0;
  97
  98                 return errno > 0 ? -errno : -EIO;
  99         }
 100
 101         if (ul <= 0)
 102                 return -EIO;
 103
 104         *_pid = (pid_t) ul;
 105         return 1;
 106 }
 107
 108 int cg_read_event(
 109                 const char *controller,
 110                 const char *path,
 111                 const char *event,
 112                 char **val) {
 113
 114         _cleanup_free_ char *events = NULL, *content = NULL;
 115         char *p, *line;
 116         int r;
 117
 118         r = cg_get_path(controller, path, "cgroup.events", &events);
 119         if (r < 0)
 120                 return r;
 121
 122         r = read_full_file(events, &content, NULL);
 123         if (r < 0)
 124                 return r;
 125
 126         p = content;
 127         while ((line = strsep(&p, "\n"))) {
 128                 char *key;
 129
 130                 key = strsep(&line, " ");
 131                 if (!key || !line)
 132                         return -EINVAL;
 133
 134                 if (strcmp(key, event))
 135                         continue;
 136
 137                 *val = strdup(line);
 138                 return 0;
 139         }
 140
 141         return -ENOENT;
 142 }
 143
 144 bool cg_ns_supported(void) {
 145         static thread_local int enabled = -1;
 146
 147         if (enabled >= 0)
 148                 return enabled;
 149
 150         if (access("/proc/self/ns/cgroup", F_OK) == 0)
 151                 enabled = 1;
 152         else
 153                 enabled = 0;
 154
 155         return enabled;
 156 }
 157
 158 int cg_enumerate_subgroups(const char *controller, const char *path, DIR **_d) {
 159         _cleanup_free_ char *fs = NULL;
 160         int r;
 161         DIR *d;
 162
 163         assert(_d);
 164
 165         /* This is not recursive! */
 166
 167         r = cg_get_path(controller, path, NULL, &fs);
 168         if (r < 0)
 169                 return r;
 170
 171         d = opendir(fs);
 172         if (!d)
 173                 return -errno;
 174
 175         *_d = d;
 176         return 0;
 177 }
 178
 179 int cg_read_subgroup(DIR *d, char **fn) {
 180         struct dirent *de;
 181
 182         assert(d);
 183         assert(fn);
 184
 185         FOREACH_DIRENT_ALL(de, d, return -errno) {
 186                 char *b;
 187
 188                 if (de->d_type != DT_DIR)
 189                         continue;
 190
 191                 if (dot_or_dot_dot(de->d_name))
 192                         continue;
 193
 194                 b = strdup(de->d_name);
 195                 if (!b)
 196                         return -ENOMEM;
 197
 198                 *fn = b;
 199                 return 1;
 200         }
 201
 202         return 0;
 203 }
 204
 205 int cg_rmdir(const char *controller, const char *path) {
 206         _cleanup_free_ char *p = NULL;
 207         int r;
 208
 209         r = cg_get_path(controller, path, NULL, &p);
 210         if (r < 0)
 211                 return r;
 212
 213         r = rmdir(p);
 214         if (r < 0 && errno != ENOENT)
 215                 return -errno;
 216
 217         r = cg_hybrid_unified();
 218         if (r < 0)
 219                 return r;
 220         if (r == 0)
 221                 return 0;
 222
 223         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 224                 r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
 225                 if (r < 0)
 226                         log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
 227         }
 228
 229         return 0;
 230 }
 231
 232 int cg_kill(
 233                 const char *controller,
 234                 const char *path,
 235                 int sig,
 236                 CGroupFlags flags,
 237                 Set *s,
 238                 cg_kill_log_func_t log_kill,
 239                 void *userdata) {
 240
 241         _cleanup_set_free_ Set *allocated_set = NULL;
 242         bool done = false;
 243         int r, ret = 0;
 244         pid_t my_pid;
 245
 246         assert(sig >= 0);
 247
 248          /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
 249           * SIGCONT on SIGKILL. */
 250         if (IN_SET(sig, SIGCONT, SIGKILL))
 251                 flags &= ~CGROUP_SIGCONT;
 252
 253         /* This goes through the tasks list and kills them all. This
 254          * is repeated until no further processes are added to the
 255          * tasks list, to properly handle forking processes */
 256
 257         if (!s) {
 258                 s = allocated_set = set_new(NULL);
 259                 if (!s)
 260                         return -ENOMEM;
 261         }
 262
 263         my_pid = getpid_cached();
 264
 265         do {
 266                 _cleanup_fclose_ FILE *f = NULL;
 267                 pid_t pid = 0;
 268                 done = true;
 269
 270                 r = cg_enumerate_processes(controller, path, &f);
 271                 if (r < 0) {
 272                         if (ret >= 0 && r != -ENOENT)
 273                                 return r;
 274
 275                         return ret;
 276                 }
 277
 278                 while ((r = cg_read_pid(f, &pid)) > 0) {
 279
 280                         if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
 281                                 continue;
 282
 283                         if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
 284                                 continue;
 285
 286                         if (log_kill)
 287                                 log_kill(pid, sig, userdata);
 288
 289                         /* If we haven't killed this process yet, kill
 290                          * it */
 291                         if (kill(pid, sig) < 0) {
 292                                 if (ret >= 0 && errno != ESRCH)
 293                                         ret = -errno;
 294                         } else {
 295                                 if (flags & CGROUP_SIGCONT)
 296                                         (void) kill(pid, SIGCONT);
 297
 298                                 if (ret == 0)
 299                                         ret = 1;
 300                         }
 301
 302                         done = false;
 303
 304                         r = set_put(s, PID_TO_PTR(pid));
 305                         if (r < 0) {
 306                                 if (ret >= 0)
 307                                         return r;
 308
 309                                 return ret;
 310                         }
 311                 }
 312
 313                 if (r < 0) {
 314                         if (ret >= 0)
 315                                 return r;
 316
 317                         return ret;
 318                 }
 319
 320                 /* To avoid racing against processes which fork
 321                  * quicker than we can kill them we repeat this until
 322                  * no new pids need to be killed. */
 323
 324         } while (!done);
 325
 326         return ret;
 327 }
 328
 329 int cg_kill_recursive(
 330                 const char *controller,
 331                 const char *path,
 332                 int sig,
 333                 CGroupFlags flags,
 334                 Set *s,
 335                 cg_kill_log_func_t log_kill,
 336                 void *userdata) {
 337
 338         _cleanup_set_free_ Set *allocated_set = NULL;
 339         _cleanup_closedir_ DIR *d = NULL;
 340         int r, ret;
 341         char *fn;
 342
 343         assert(path);
 344         assert(sig >= 0);
 345
 346         if (!s) {
 347                 s = allocated_set = set_new(NULL);
 348                 if (!s)
 349                         return -ENOMEM;
 350         }
 351
 352         ret = cg_kill(controller, path, sig, flags, s, log_kill, userdata);
 353
 354         r = cg_enumerate_subgroups(controller, path, &d);
 355         if (r < 0) {
 356                 if (ret >= 0 && r != -ENOENT)
 357                         return r;
 358
 359                 return ret;
 360         }
 361
 362         while ((r = cg_read_subgroup(d, &fn)) > 0) {
 363                 _cleanup_free_ char *p = NULL;
 364
 365                 p = strjoin(path, "/", fn);
 366                 free(fn);
 367                 if (!p)
 368                         return -ENOMEM;
 369
 370                 r = cg_kill_recursive(controller, p, sig, flags, s, log_kill, userdata);
 371                 if (r != 0 && ret >= 0)
 372                         ret = r;
 373         }
 374         if (ret >= 0 && r < 0)
 375                 ret = r;
 376
 377         if (flags & CGROUP_REMOVE) {
 378                 r = cg_rmdir(controller, path);
 379                 if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
 380                         return r;
 381         }
 382
 383         return ret;
 384 }
 385
 386 int cg_migrate(
 387                 const char *cfrom,
 388                 const char *pfrom,
 389                 const char *cto,
 390                 const char *pto,
 391                 CGroupFlags flags) {
 392
 393         bool done = false;
 394         _cleanup_set_free_ Set *s = NULL;
 395         int r, ret = 0;
 396         pid_t my_pid;
 397
 398         assert(cfrom);
 399         assert(pfrom);
 400         assert(cto);
 401         assert(pto);
 402
 403         s = set_new(NULL);
 404         if (!s)
 405                 return -ENOMEM;
 406
 407         my_pid = getpid_cached();
 408
 409         do {
 410                 _cleanup_fclose_ FILE *f = NULL;
 411                 pid_t pid = 0;
 412                 done = true;
 413
 414                 r = cg_enumerate_processes(cfrom, pfrom, &f);
 415                 if (r < 0) {
 416                         if (ret >= 0 && r != -ENOENT)
 417                                 return r;
 418
 419                         return ret;
 420                 }
 421
 422                 while ((r = cg_read_pid(f, &pid)) > 0) {
 423
 424                         /* This might do weird stuff if we aren't a
 425                          * single-threaded program. However, we
 426                          * luckily know we are not */
 427                         if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
 428                                 continue;
 429
 430                         if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
 431                                 continue;
 432
 433                         /* Ignore kernel threads. Since they can only
 434                          * exist in the root cgroup, we only check for
 435                          * them there. */
 436                         if (cfrom &&
 437                             (isempty(pfrom) || path_equal(pfrom, "/")) &&
 438                             is_kernel_thread(pid) > 0)
 439                                 continue;
 440
 441                         r = cg_attach(cto, pto, pid);
 442                         if (r < 0) {
 443                                 if (ret >= 0 && r != -ESRCH)
 444                                         ret = r;
 445                         } else if (ret == 0)
 446                                 ret = 1;
 447
 448                         done = false;
 449
 450                         r = set_put(s, PID_TO_PTR(pid));
 451                         if (r < 0) {
 452                                 if (ret >= 0)
 453                                         return r;
 454
 455                                 return ret;
 456                         }
 457                 }
 458
 459                 if (r < 0) {
 460                         if (ret >= 0)
 461                                 return r;
 462
 463                         return ret;
 464                 }
 465         } while (!done);
 466
 467         return ret;
 468 }
 469
 470 int cg_migrate_recursive(
 471                 const char *cfrom,
 472                 const char *pfrom,
 473                 const char *cto,
 474                 const char *pto,
 475                 CGroupFlags flags) {
 476
 477         _cleanup_closedir_ DIR *d = NULL;
 478         int r, ret = 0;
 479         char *fn;
 480
 481         assert(cfrom);
 482         assert(pfrom);
 483         assert(cto);
 484         assert(pto);
 485
 486         ret = cg_migrate(cfrom, pfrom, cto, pto, flags);
 487
 488         r = cg_enumerate_subgroups(cfrom, pfrom, &d);
 489         if (r < 0) {
 490                 if (ret >= 0 && r != -ENOENT)
 491                         return r;
 492
 493                 return ret;
 494         }
 495
 496         while ((r = cg_read_subgroup(d, &fn)) > 0) {
 497                 _cleanup_free_ char *p = NULL;
 498
 499                 p = strjoin(pfrom, "/", fn);
 500                 free(fn);
 501                 if (!p)
 502                         return -ENOMEM;
 503
 504                 r = cg_migrate_recursive(cfrom, p, cto, pto, flags);
 505                 if (r != 0 && ret >= 0)
 506                         ret = r;
 507         }
 508
 509         if (r < 0 && ret >= 0)
 510                 ret = r;
 511
 512         if (flags & CGROUP_REMOVE) {
 513                 r = cg_rmdir(cfrom, pfrom);
 514                 if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
 515                         return r;
 516         }
 517
 518         return ret;
 519 }
 520
 521 int cg_migrate_recursive_fallback(
 522                 const char *cfrom,
 523                 const char *pfrom,
 524                 const char *cto,
 525                 const char *pto,
 526                 CGroupFlags flags) {
 527
 528         int r;
 529
 530         assert(cfrom);
 531         assert(pfrom);
 532         assert(cto);
 533         assert(pto);
 534
 535         r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags);
 536         if (r < 0) {
 537                 char prefix[strlen(pto) + 1];
 538
 539                 /* This didn't work? Then let's try all prefixes of the destination */
 540
 541                 PATH_FOREACH_PREFIX(prefix, pto) {
 542                         int q;
 543
 544                         q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags);
 545                         if (q >= 0)
 546                                 return q;
 547                 }
 548         }
 549
 550         return r;
 551 }
 552
 553 static const char *controller_to_dirname(const char *controller) {
 554         const char *e;
 555
 556         assert(controller);
 557
 558         /* Converts a controller name to the directory name below
 559          * /sys/fs/cgroup/ we want to mount it to. Effectively, this
 560          * just cuts off the name= prefixed used for named
 561          * hierarchies, if it is specified. */
 562
 563         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 564                 if (cg_hybrid_unified() > 0)
 565                         controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
 566                 else
 567                         controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
 568         }
 569
 570         e = startswith(controller, "name=");
 571         if (e)
 572                 return e;
 573
 574         return controller;
 575 }
 576
 577 static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **fs) {
 578         const char *dn;
 579         char *t = NULL;
 580
 581         assert(fs);
 582         assert(controller);
 583
 584         dn = controller_to_dirname(controller);
 585
 586         if (isempty(path) && isempty(suffix))
 587                 t = strappend("/sys/fs/cgroup/", dn);
 588         else if (isempty(path))
 589                 t = strjoin("/sys/fs/cgroup/", dn, "/", suffix);
 590         else if (isempty(suffix))
 591                 t = strjoin("/sys/fs/cgroup/", dn, "/", path);
 592         else
 593                 t = strjoin("/sys/fs/cgroup/", dn, "/", path, "/", suffix);
 594         if (!t)
 595                 return -ENOMEM;
 596
 597         *fs = t;
 598         return 0;
 599 }
 600
 601 static int join_path_unified(const char *path, const char *suffix, char **fs) {
 602         char *t;
 603
 604         assert(fs);
 605
 606         if (isempty(path) && isempty(suffix))
 607                 t = strdup("/sys/fs/cgroup");
 608         else if (isempty(path))
 609                 t = strappend("/sys/fs/cgroup/", suffix);
 610         else if (isempty(suffix))
 611                 t = strappend("/sys/fs/cgroup/", path);
 612         else
 613                 t = strjoin("/sys/fs/cgroup/", path, "/", suffix);
 614         if (!t)
 615                 return -ENOMEM;
 616
 617         *fs = t;
 618         return 0;
 619 }
 620
 621 int cg_get_path(const char *controller, const char *path, const char *suffix, char **fs) {
 622         int r;
 623
 624         assert(fs);
 625
 626         if (!controller) {
 627                 char *t;
 628
 629                 /* If no controller is specified, we return the path
 630                  * *below* the controllers, without any prefix. */
 631
 632                 if (!path && !suffix)
 633                         return -EINVAL;
 634
 635                 if (!suffix)
 636                         t = strdup(path);
 637                 else if (!path)
 638                         t = strdup(suffix);
 639                 else
 640                         t = strjoin(path, "/", suffix);
 641                 if (!t)
 642                         return -ENOMEM;
 643
 644                 *fs = path_kill_slashes(t);
 645                 return 0;
 646         }
 647
 648         if (!cg_controller_is_valid(controller))
 649                 return -EINVAL;
 650
 651         r = cg_all_unified();
 652         if (r < 0)
 653                 return r;
 654         if (r > 0)
 655                 r = join_path_unified(path, suffix, fs);
 656         else
 657                 r = join_path_legacy(controller, path, suffix, fs);
 658         if (r < 0)
 659                 return r;
 660
 661         path_kill_slashes(*fs);
 662         return 0;
 663 }
 664
 665 static int controller_is_accessible(const char *controller) {
 666         int r;
 667
 668         assert(controller);
 669
 670         /* Checks whether a specific controller is accessible,
 671          * i.e. its hierarchy mounted. In the unified hierarchy all
 672          * controllers are considered accessible, except for the named
 673          * hierarchies */
 674
 675         if (!cg_controller_is_valid(controller))
 676                 return -EINVAL;
 677
 678         r = cg_all_unified();
 679         if (r < 0)
 680                 return r;
 681         if (r > 0) {
 682                 /* We don't support named hierarchies if we are using
 683                  * the unified hierarchy. */
 684
 685                 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
 686                         return 0;
 687
 688                 if (startswith(controller, "name="))
 689                         return -EOPNOTSUPP;
 690
 691         } else {
 692                 const char *cc, *dn;
 693
 694                 dn = controller_to_dirname(controller);
 695                 cc = strjoina("/sys/fs/cgroup/", dn);
 696
 697                 if (laccess(cc, F_OK) < 0)
 698                         return -errno;
 699         }
 700
 701         return 0;
 702 }
 703
 704 int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs) {
 705         int r;
 706
 707         assert(controller);
 708         assert(fs);
 709
 710         /* Check if the specified controller is actually accessible */
 711         r = controller_is_accessible(controller);
 712         if (r < 0)
 713                 return r;
 714
 715         return cg_get_path(controller, path, suffix, fs);
 716 }
 717
 718 static int trim_cb(const char *path, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
 719         assert(path);
 720         assert(sb);
 721         assert(ftwbuf);
 722
 723         if (typeflag != FTW_DP)
 724                 return 0;
 725
 726         if (ftwbuf->level < 1)
 727                 return 0;
 728
 729         (void) rmdir(path);
 730         return 0;
 731 }
 732
 733 int cg_trim(const char *controller, const char *path, bool delete_root) {
 734         _cleanup_free_ char *fs = NULL;
 735         int r = 0, q;
 736
 737         assert(path);
 738
 739         r = cg_get_path(controller, path, NULL, &fs);
 740         if (r < 0)
 741                 return r;
 742
 743         errno = 0;
 744         if (nftw(fs, trim_cb, 64, FTW_DEPTH|FTW_MOUNT|FTW_PHYS) != 0) {
 745                 if (errno == ENOENT)
 746                         r = 0;
 747                 else if (errno > 0)
 748                         r = -errno;
 749                 else
 750                         r = -EIO;
 751         }
 752
 753         if (delete_root) {
 754                 if (rmdir(fs) < 0 && errno != ENOENT)
 755                         return -errno;
 756         }
 757
 758         q = cg_hybrid_unified();
 759         if (q < 0)
 760                 return q;
 761         if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 762                 q = cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root);
 763                 if (q < 0)
 764                         log_warning_errno(q, "Failed to trim compat systemd cgroup %s: %m", path);
 765         }
 766
 767         return r;
 768 }
 769
 770 int cg_create(const char *controller, const char *path) {
 771         _cleanup_free_ char *fs = NULL;
 772         int r;
 773
 774         r = cg_get_path_and_check(controller, path, NULL, &fs);
 775         if (r < 0)
 776                 return r;
 777
 778         r = mkdir_parents(fs, 0755);
 779         if (r < 0)
 780                 return r;
 781
 782         r = mkdir_errno_wrapper(fs, 0755);
 783         if (r == -EEXIST)
 784                 return 0;
 785         if (r < 0)
 786                 return r;
 787
 788         r = cg_hybrid_unified();
 789         if (r < 0)
 790                 return r;
 791
 792         if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 793                 r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
 794                 if (r < 0)
 795                         log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path);
 796         }
 797
 798         return 1;
 799 }
 800
 801 int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
 802         int r, q;
 803
 804         assert(pid >= 0);
 805
 806         r = cg_create(controller, path);
 807         if (r < 0)
 808                 return r;
 809
 810         q = cg_attach(controller, path, pid);
 811         if (q < 0)
 812                 return q;
 813
 814         /* This does not remove the cgroup on failure */
 815         return r;
 816 }
 817
 818 int cg_attach(const char *controller, const char *path, pid_t pid) {
 819         _cleanup_free_ char *fs = NULL;
 820         char c[DECIMAL_STR_MAX(pid_t) + 2];
 821         int r;
 822
 823         assert(path);
 824         assert(pid >= 0);
 825
 826         r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs);
 827         if (r < 0)
 828                 return r;
 829
 830         if (pid == 0)
 831                 pid = getpid_cached();
 832
 833         xsprintf(c, PID_FMT "\n", pid);
 834
 835         r = write_string_file(fs, c, 0);
 836         if (r < 0)
 837                 return r;
 838
 839         r = cg_hybrid_unified();
 840         if (r < 0)
 841                 return r;
 842
 843         if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 844                 r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid);
 845                 if (r < 0)
 846                         log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path);
 847         }
 848
 849         return 0;
 850 }
 851
 852 int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
 853         int r;
 854
 855         assert(controller);
 856         assert(path);
 857         assert(pid >= 0);
 858
 859         r = cg_attach(controller, path, pid);
 860         if (r < 0) {
 861                 char prefix[strlen(path) + 1];
 862
 863                 /* This didn't work? Then let's try all prefixes of
 864                  * the destination */
 865
 866                 PATH_FOREACH_PREFIX(prefix, path) {
 867                         int q;
 868
 869                         q = cg_attach(controller, prefix, pid);
 870                         if (q >= 0)
 871                                 return q;
 872                 }
 873         }
 874
 875         return r;
 876 }
 877
 878 int cg_set_access(
 879                 const char *controller,
 880                 const char *path,
 881                 uid_t uid,
 882                 gid_t gid) {
 883
 884         struct Attribute {
 885                 const char *name;
 886                 bool fatal;
 887         };
 888
 889         /* cgroupsv1, aka legacy/non-unified */
 890         static const struct Attribute legacy_attributes[] = {
 891                 { "cgroup.procs",           true  },
 892                 { "tasks",                  false },
 893                 { "cgroup.clone_children",  false },
 894                 {},
 895         };
 896
 897         /* cgroupsv2, aka unified */
 898         static const struct Attribute unified_attributes[] = {
 899                 { "cgroup.procs",           true  },
 900                 { "cgroup.subtree_control", true  },
 901                 { "cgroup.threads",         false },
 902                 {},
 903         };
 904
 905         static const struct Attribute* const attributes[] = {
 906                 [false] = legacy_attributes,
 907                 [true]  = unified_attributes,
 908         };
 909
 910         _cleanup_free_ char *fs = NULL;
 911         const struct Attribute *i;
 912         int r, unified;
 913
 914         assert(path);
 915
 916         if (uid == UID_INVALID && gid == GID_INVALID)
 917                 return 0;
 918
 919         unified = cg_unified_controller(controller);
 920         if (unified < 0)
 921                 return unified;
 922
 923         /* Configure access to the cgroup itself */
 924         r = cg_get_path(controller, path, NULL, &fs);
 925         if (r < 0)
 926                 return r;
 927
 928         r = chmod_and_chown(fs, 0755, uid, gid);
 929         if (r < 0)
 930                 return r;
 931
 932         /* Configure access to the cgroup's attributes */
 933         for (i = attributes[unified]; i->name; i++) {
 934                 fs = mfree(fs);
 935
 936                 r = cg_get_path(controller, path, i->name, &fs);
 937                 if (r < 0)
 938                         return r;
 939
 940                 r = chmod_and_chown(fs, 0644, uid, gid);
 941                 if (r < 0) {
 942                         if (i->fatal)
 943                                 return r;
 944
 945                         log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs);
 946                 }
 947         }
 948
 949         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
 950                 r = cg_hybrid_unified();
 951                 if (r < 0)
 952                         return r;
 953                 if (r > 0) {
 954                         /* Always propagate access mode from unified to legacy controller */
 955                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid);
 956                         if (r < 0)
 957                                 log_debug_errno(r, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path);
 958                 }
 959         }
 960
 961         return 0;
 962 }
 963
 964 int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags) {
 965         _cleanup_free_ char *fs = NULL;
 966         int r;
 967
 968         assert(path);
 969         assert(name);
 970         assert(value || size <= 0);
 971
 972         r = cg_get_path(controller, path, NULL, &fs);
 973         if (r < 0)
 974                 return r;
 975
 976         if (setxattr(fs, name, value, size, flags) < 0)
 977                 return -errno;
 978
 979         return 0;
 980 }
 981
 982 int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size) {
 983         _cleanup_free_ char *fs = NULL;
 984         ssize_t n;
 985         int r;
 986
 987         assert(path);
 988         assert(name);
 989
 990         r = cg_get_path(controller, path, NULL, &fs);
 991         if (r < 0)
 992                 return r;
 993
 994         n = getxattr(fs, name, value, size);
 995         if (n < 0)
 996                 return -errno;
 997
 998         return (int) n;
 999 }
1000
int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        const char *fs, *controller_str;
        size_t cs = 0;
        int unified;

        assert(path);
        assert(pid >= 0);

        /* Determines the cgroup path of process 'pid' in the hierarchy of 'controller' by parsing the
         * process' "cgroup" file in procfs. If 'controller' is NULL, SYSTEMD_CGROUP_CONTROLLER is used.
         *
         * On success returns 0 and stores a newly allocated string in *path, which the caller has to
         * free(). Returns -EINVAL if the controller name is invalid, -ESRCH if the procfs file does not
         * exist, -ENODATA if no line of the file matches, -ENOMEM on allocation failure, or another
         * negative errno-style error code. */

        if (controller) {
                if (!cg_controller_is_valid(controller))
                        return -EINVAL;
        } else
                controller = SYSTEMD_CGROUP_CONTROLLER;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;
        if (unified == 0) {
                /* In the non-unified case we look for the controller by name in the file; our own
                 * controller is matched under its legacy name. 'controller_str' and 'cs' are only used
                 * in this mode. */
                if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                        controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
                else
                        controller_str = controller;

                cs = strlen(controller_str);
        }

        fs = procfs_file_alloca(pid, "cgroup");
        f = fopen(fs, "re");
        if (!f)
                return errno == ENOENT ? -ESRCH : -errno;

        (void) __fsetlocking(f, FSETLOCKING_BYCALLER);

        /* NOTE(review): the third macro argument is presumably what runs when reading a line fails —
         * confirm against the FOREACH_LINE() definition. */
        FOREACH_LINE(line, f, return -errno) {
                char *e, *p;

                truncate_nl(line);

                if (unified) {
                        /* Only lines beginning with "0:" are of interest here. After the prefix we
                         * look for the next colon; the path follows right after it. */
                        e = startswith(line, "0:");
                        if (!e)
                                continue;

                        e = strchr(e, ':');
                        if (!e)
                                continue;
                } else {
                        char *l;
                        size_t k;
                        const char *word, *state;
                        bool found = false;

                        /* Skip the first colon-separated field; the second one is a comma-separated
                         * list of names, the text after the second colon is the path. */
                        l = strchr(line, ':');
                        if (!l)
                                continue;

                        l++;
                        e = strchr(l, ':');
                        if (!e)
                                continue;

                        /* Terminate the list in place, so that it can be iterated on its own */
                        *e = 0;
                        FOREACH_WORD_SEPARATOR(word, k, l, ",", state)
                                /* Require an exact match: same length and same bytes */
                                if (k == cs && memcmp(word, controller_str, cs) == 0) {
                                        found = true;
                                        break;
                                }
                        if (!found)
                                continue;
                }

                /* 'e' points to the colon (or the NUL byte that replaced it) preceding the path */
                p = strdup(e + 1);
                if (!p)
                        return -ENOMEM;

                /* Truncate suffix indicating the process is a zombie */
                e = endswith(p, " (deleted)");
                if (e)
                        *e = 0;

                *path = p;
                return 0;
        }

        /* We reached the end of the file without finding a matching line */
        return -ENODATA;
}
1089
1090 int cg_install_release_agent(const char *controller, const char *agent) {
1091         _cleanup_free_ char *fs = NULL, *contents = NULL;
1092         const char *sc;
1093         int r;
1094
1095         assert(agent);
1096
1097         r = cg_unified_controller(controller);
1098         if (r < 0)
1099                 return r;
1100         if (r > 0) /* doesn't apply to unified hierarchy */
1101                 return -EOPNOTSUPP;
1102
1103         r = cg_get_path(controller, NULL, "release_agent", &fs);
1104         if (r < 0)
1105                 return r;
1106
1107         r = read_one_line_file(fs, &contents);
1108         if (r < 0)
1109                 return r;
1110
1111         sc = strstrip(contents);
1112         if (isempty(sc)) {
1113                 r = write_string_file(fs, agent, 0);
1114                 if (r < 0)
1115                         return r;
1116         } else if (!path_equal(sc, agent))
1117                 return -EEXIST;
1118
1119         fs = mfree(fs);
1120         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1121         if (r < 0)
1122                 return r;
1123
1124         contents = mfree(contents);
1125         r = read_one_line_file(fs, &contents);
1126         if (r < 0)
1127                 return r;
1128
1129         sc = strstrip(contents);
1130         if (streq(sc, "0")) {
1131                 r = write_string_file(fs, "1", 0);
1132                 if (r < 0)
1133                         return r;
1134
1135                 return 1;
1136         }
1137
1138         if (!streq(sc, "1"))
1139                 return -EIO;
1140
1141         return 0;
1142 }
1143
1144 int cg_uninstall_release_agent(const char *controller) {
1145         _cleanup_free_ char *fs = NULL;
1146         int r;
1147
1148         r = cg_unified_controller(controller);
1149         if (r < 0)
1150                 return r;
1151         if (r > 0) /* Doesn't apply to unified hierarchy */
1152                 return -EOPNOTSUPP;
1153
1154         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1155         if (r < 0)
1156                 return r;
1157
1158         r = write_string_file(fs, "0", 0);
1159         if (r < 0)
1160                 return r;
1161
1162         fs = mfree(fs);
1163
1164         r = cg_get_path(controller, NULL, "release_agent", &fs);
1165         if (r < 0)
1166                 return r;
1167
1168         r = write_string_file(fs, "", 0);
1169         if (r < 0)
1170                 return r;
1171
1172         return 0;
1173 }
1174
1175 int cg_is_empty(const char *controller, const char *path) {
1176         _cleanup_fclose_ FILE *f = NULL;
1177         pid_t pid;
1178         int r;
1179
1180         assert(path);
1181
1182         r = cg_enumerate_processes(controller, path, &f);
1183         if (r == -ENOENT)
1184                 return 1;
1185         if (r < 0)
1186                 return r;
1187
1188         r = cg_read_pid(f, &pid);
1189         if (r < 0)
1190                 return r;
1191
1192         return r == 0;
1193 }
1194
1195 int cg_is_empty_recursive(const char *controller, const char *path) {
1196         int r;
1197
1198         assert(path);
1199
1200         /* The root cgroup is always populated */
1201         if (controller && (isempty(path) || path_equal(path, "/")))
1202                 return false;
1203
1204         r = cg_unified_controller(controller);
1205         if (r < 0)
1206                 return r;
1207         if (r > 0) {
1208                 _cleanup_free_ char *t = NULL;
1209
1210                 /* On the unified hierarchy we can check empty state
1211                  * via the "populated" attribute of "cgroup.events". */
1212
1213                 r = cg_read_event(controller, path, "populated", &t);
1214                 if (r < 0)
1215                         return r;
1216
1217                 return streq(t, "0");
1218         } else {
1219                 _cleanup_closedir_ DIR *d = NULL;
1220                 char *fn;
1221
1222                 r = cg_is_empty(controller, path);
1223                 if (r <= 0)
1224                         return r;
1225
1226                 r = cg_enumerate_subgroups(controller, path, &d);
1227                 if (r == -ENOENT)
1228                         return 1;
1229                 if (r < 0)
1230                         return r;
1231
1232                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
1233                         _cleanup_free_ char *p = NULL;
1234
1235                         p = strjoin(path, "/", fn);
1236                         free(fn);
1237                         if (!p)
1238                                 return -ENOMEM;
1239
1240                         r = cg_is_empty_recursive(controller, p);
1241                         if (r <= 0)
1242                                 return r;
1243                 }
1244                 if (r < 0)
1245                         return r;
1246
1247                 return true;
1248         }
1249 }
1250
1251 int cg_split_spec(const char *spec, char **controller, char **path) {
1252         char *t = NULL, *u = NULL;
1253         const char *e;
1254
1255         assert(spec);
1256
1257         if (*spec == '/') {
1258                 if (!path_is_normalized(spec))
1259                         return -EINVAL;
1260
1261                 if (path) {
1262                         t = strdup(spec);
1263                         if (!t)
1264                                 return -ENOMEM;
1265
1266                         *path = path_kill_slashes(t);
1267                 }
1268
1269                 if (controller)
1270                         *controller = NULL;
1271
1272                 return 0;
1273         }
1274
1275         e = strchr(spec, ':');
1276         if (!e) {
1277                 if (!cg_controller_is_valid(spec))
1278                         return -EINVAL;
1279
1280                 if (controller) {
1281                         t = strdup(spec);
1282                         if (!t)
1283                                 return -ENOMEM;
1284
1285                         *controller = t;
1286                 }
1287
1288                 if (path)
1289                         *path = NULL;
1290
1291                 return 0;
1292         }
1293
1294         t = strndup(spec, e-spec);
1295         if (!t)
1296                 return -ENOMEM;
1297         if (!cg_controller_is_valid(t)) {
1298                 free(t);
1299                 return -EINVAL;
1300         }
1301
1302         if (isempty(e+1))
1303                 u = NULL;
1304         else {
1305                 u = strdup(e+1);
1306                 if (!u) {
1307                         free(t);
1308                         return -ENOMEM;
1309                 }
1310
1311                 if (!path_is_normalized(u) ||
1312                     !path_is_absolute(u)) {
1313                         free(t);
1314                         free(u);
1315                         return -EINVAL;
1316                 }
1317
1318                 path_kill_slashes(u);
1319         }
1320
1321         if (controller)
1322                 *controller = t;
1323         else
1324                 free(t);
1325
1326         if (path)
1327                 *path = u;
1328         else
1329                 free(u);
1330
1331         return 0;
1332 }
1333
1334 int cg_mangle_path(const char *path, char **result) {
1335         _cleanup_free_ char *c = NULL, *p = NULL;
1336         char *t;
1337         int r;
1338
1339         assert(path);
1340         assert(result);
1341
1342         /* First, check if it already is a filesystem path */
1343         if (path_startswith(path, "/sys/fs/cgroup")) {
1344
1345                 t = strdup(path);
1346                 if (!t)
1347                         return -ENOMEM;
1348
1349                 *result = path_kill_slashes(t);
1350                 return 0;
1351         }
1352
1353         /* Otherwise, treat it as cg spec */
1354         r = cg_split_spec(path, &c, &p);
1355         if (r < 0)
1356                 return r;
1357
1358         return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, result);
1359 }
1360
1361 int cg_get_root_path(char **path) {
1362         char *p, *e;
1363         int r;
1364
1365         assert(path);
1366
1367         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
1368         if (r < 0)
1369                 return r;
1370
1371         e = endswith(p, "/" SPECIAL_INIT_SCOPE);
1372         if (!e)
1373                 e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
1374         if (!e)
1375                 e = endswith(p, "/system"); /* even more legacy */
1376         if (e)
1377                 *e = 0;
1378
1379         *path = p;
1380         return 0;
1381 }
1382
1383 int cg_shift_path(const char *cgroup, const char *root, const char **shifted) {
1384         _cleanup_free_ char *rt = NULL;
1385         char *p;
1386         int r;
1387
1388         assert(cgroup);
1389         assert(shifted);
1390
1391         if (!root) {
1392                 /* If the root was specified let's use that, otherwise
1393                  * let's determine it from PID 1 */
1394
1395                 r = cg_get_root_path(&rt);
1396                 if (r < 0)
1397                         return r;
1398
1399                 root = rt;
1400         }
1401
1402         p = path_startswith(cgroup, root);
1403         if (p && p > cgroup)
1404                 *shifted = p - 1;
1405         else
1406                 *shifted = cgroup;
1407
1408         return 0;
1409 }
1410
1411 int cg_pid_get_path_shifted(pid_t pid, const char *root, char **cgroup) {
1412         _cleanup_free_ char *raw = NULL;
1413         const char *c;
1414         int r;
1415
1416         assert(pid >= 0);
1417         assert(cgroup);
1418
1419         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
1420         if (r < 0)
1421                 return r;
1422
1423         r = cg_shift_path(raw, root, &c);
1424         if (r < 0)
1425                 return r;
1426
1427         if (c == raw) {
1428                 *cgroup = raw;
1429                 raw = NULL;
1430         } else {
1431                 char *n;
1432
1433                 n = strdup(c);
1434                 if (!n)
1435                         return -ENOMEM;
1436
1437                 *cgroup = n;
1438         }
1439
1440         return 0;
1441 }
1442
1443 int cg_path_decode_unit(const char *cgroup, char **unit) {
1444         char *c, *s;
1445         size_t n;
1446
1447         assert(cgroup);
1448         assert(unit);
1449
1450         n = strcspn(cgroup, "/");
1451         if (n < 3)
1452                 return -ENXIO;
1453
1454         c = strndupa(cgroup, n);
1455         c = cg_unescape(c);
1456
1457         if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
1458                 return -ENXIO;
1459
1460         s = strdup(c);
1461         if (!s)
1462                 return -ENOMEM;
1463
1464         *unit = s;
1465         return 0;
1466 }
1467
1468 static bool valid_slice_name(const char *p, size_t n) {
1469
1470         if (!p)
1471                 return false;
1472
1473         if (n < STRLEN("x.slice"))
1474                 return false;
1475
1476         if (memcmp(p + n - 6, ".slice", 6) == 0) {
1477                 char buf[n+1], *c;
1478
1479                 memcpy(buf, p, n);
1480                 buf[n] = 0;
1481
1482                 c = cg_unescape(buf);
1483
1484                 return unit_name_is_valid(c, UNIT_NAME_PLAIN);
1485         }
1486
1487         return false;
1488 }
1489
static const char *skip_slices(const char *p) {
        assert(p);

        /* Skips over all slice assignments: returns a pointer to the first path component (leading
         * slashes removed) that is not a valid slice name. */

        for (;;) {
                const char *component = p + strspn(p, "/");
                size_t len = strcspn(component, "/");

                if (!valid_slice_name(component, len))
                        return component;

                p = component + len;
        }
}
1507
1508 int cg_path_get_unit(const char *path, char **ret) {
1509         const char *e;
1510         char *unit;
1511         int r;
1512
1513         assert(path);
1514         assert(ret);
1515
1516         e = skip_slices(path);
1517
1518         r = cg_path_decode_unit(e, &unit);
1519         if (r < 0)
1520                 return r;
1521
1522         /* We skipped over the slices, don't accept any now */
1523         if (endswith(unit, ".slice")) {
1524                 free(unit);
1525                 return -ENXIO;
1526         }
1527
1528         *ret = unit;
1529         return 0;
1530 }
1531
1532 int cg_pid_get_unit(pid_t pid, char **unit) {
1533         _cleanup_free_ char *cgroup = NULL;
1534         int r;
1535
1536         assert(unit);
1537
1538         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1539         if (r < 0)
1540                 return r;
1541
1542         return cg_path_get_unit(cgroup, unit);
1543 }
1544
/**
 * Skip session-*.scope, but require it to be there.
 *
 * Returns a pointer to what follows the session scope component (with the
 * slashes after it skipped too), or NULL if the path is empty or its first
 * component is not of the form "session-<id>.scope" with a valid session ID.
 */
static const char *skip_session(const char *p) {
        size_t n;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        /* The component needs room for the prefix, the suffix and at least one character in between */
        n = strcspn(p, "/");
        if (n < STRLEN("session-x.scope"))
                return NULL;

        /* 8 and 6 are the lengths of "session-" and ".scope" */
        if (memcmp(p, "session-", 8) == 0 && memcmp(p + n - 6, ".scope", 6) == 0) {
                char buf[n - 8 - 6 + 1];

                /* Extract what is between prefix and suffix, i.e. the session ID */
                memcpy(buf, p + 8, n - 8 - 6);
                buf[n - 8 - 6] = 0;

                /* Note that session scopes never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                /* This function returns a pointer, hence report failure as NULL like all other
                 * failure paths do */
                if (!session_id_valid(buf))
                        return NULL;

                p += n;
                p += strspn(p, "/");
                return p;
        }

        return NULL;
}
1581
/**
 * Skip user@*.service, but require it to be there.
 *
 * Returns a pointer to what follows the component (slashes after it are
 * skipped as well), or NULL if the component is not there.
 */
static const char *skip_user_manager(const char *p) {
        size_t len;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        len = strcspn(p, "/");
        if (len < STRLEN("user@x.service"))
                return NULL;

        /* 5 and 8 are the lengths of "user@" and ".service" */
        if (memcmp(p, "user@", 5) == 0 && memcmp(p + len - 8, ".service", 8) == 0) {
                char uid_str[len - 5 - 8 + 1];

                /* The instance part of the name has to parse as UID */
                memcpy(uid_str, p + 5, len - 5 - 8);
                uid_str[len - 5 - 8] = 0;

                /* Note that user manager services never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                if (parse_uid(uid_str, NULL) < 0)
                        return NULL;

                p += len;
                return p + strspn(p, "/");
        }

        return NULL;
}
1619
static const char *skip_user_prefix(const char *path) {
        const char *after_slices, *after_manager;

        assert(path);

        /* Skips any leading slices, and then either the user manager service or, if that is not there,
         * the session scope. Returns NULL if neither of the two follows the slices. */

        after_slices = skip_slices(path);

        after_manager = skip_user_manager(after_slices);
        return after_manager ?: skip_session(after_slices);
}
1636
int cg_path_get_user_unit(const char *path, char **ret) {
        const char *rest;

        assert(path);
        assert(ret);

        /* Skip the user prefix first. What remains looks pretty much the same as for a system unit,
         * hence the same parser is used from there on. */

        rest = skip_user_prefix(path);
        if (!rest)
                return -ENXIO;

        return cg_path_get_unit(rest, ret);
}
1652
1653 int cg_pid_get_user_unit(pid_t pid, char **unit) {
1654         _cleanup_free_ char *cgroup = NULL;
1655         int r;
1656
1657         assert(unit);
1658
1659         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1660         if (r < 0)
1661                 return r;
1662
1663         return cg_path_get_user_unit(cgroup, unit);
1664 }
1665
1666 int cg_path_get_machine_name(const char *path, char **machine) {
1667         _cleanup_free_ char *u = NULL;
1668         const char *sl;
1669         int r;
1670
1671         r = cg_path_get_unit(path, &u);
1672         if (r < 0)
1673                 return r;
1674
1675         sl = strjoina("/run/systemd/machines/unit:", u);
1676         return readlink_malloc(sl, machine);
1677 }
1678
1679 int cg_pid_get_machine_name(pid_t pid, char **machine) {
1680         _cleanup_free_ char *cgroup = NULL;
1681         int r;
1682
1683         assert(machine);
1684
1685         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1686         if (r < 0)
1687                 return r;
1688
1689         return cg_path_get_machine_name(cgroup, machine);
1690 }
1691
1692 int cg_path_get_session(const char *path, char **session) {
1693         _cleanup_free_ char *unit = NULL;
1694         char *start, *end;
1695         int r;
1696
1697         assert(path);
1698
1699         r = cg_path_get_unit(path, &unit);
1700         if (r < 0)
1701                 return r;
1702
1703         start = startswith(unit, "session-");
1704         if (!start)
1705                 return -ENXIO;
1706         end = endswith(start, ".scope");
1707         if (!end)
1708                 return -ENXIO;
1709
1710         *end = 0;
1711         if (!session_id_valid(start))
1712                 return -ENXIO;
1713
1714         if (session) {
1715                 char *rr;
1716
1717                 rr = strdup(start);
1718                 if (!rr)
1719                         return -ENOMEM;
1720
1721                 *session = rr;
1722         }
1723
1724         return 0;
1725 }
1726
1727 int cg_pid_get_session(pid_t pid, char **session) {
1728         _cleanup_free_ char *cgroup = NULL;
1729         int r;
1730
1731         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1732         if (r < 0)
1733                 return r;
1734
1735         return cg_path_get_session(cgroup, session);
1736 }
1737
1738 int cg_path_get_owner_uid(const char *path, uid_t *uid) {
1739         _cleanup_free_ char *slice = NULL;
1740         char *start, *end;
1741         int r;
1742
1743         assert(path);
1744
1745         r = cg_path_get_slice(path, &slice);
1746         if (r < 0)
1747                 return r;
1748
1749         start = startswith(slice, "user-");
1750         if (!start)
1751                 return -ENXIO;
1752         end = endswith(start, ".slice");
1753         if (!end)
1754                 return -ENXIO;
1755
1756         *end = 0;
1757         if (parse_uid(start, uid) < 0)
1758                 return -ENXIO;
1759
1760         return 0;
1761 }
1762
1763 int cg_pid_get_owner_uid(pid_t pid, uid_t *uid) {
1764         _cleanup_free_ char *cgroup = NULL;
1765         int r;
1766
1767         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1768         if (r < 0)
1769                 return r;
1770
1771         return cg_path_get_owner_uid(cgroup, uid);
1772 }
1773
1774 int cg_path_get_slice(const char *p, char **slice) {
1775         const char *e = NULL;
1776
1777         assert(p);
1778         assert(slice);
1779
1780         /* Finds the right-most slice unit from the beginning, but
1781          * stops before we come to the first non-slice unit. */
1782
1783         for (;;) {
1784                 size_t n;
1785
1786                 p += strspn(p, "/");
1787
1788                 n = strcspn(p, "/");
1789                 if (!valid_slice_name(p, n)) {
1790
1791                         if (!e) {
1792                                 char *s;
1793
1794                                 s = strdup(SPECIAL_ROOT_SLICE);
1795                                 if (!s)
1796                                         return -ENOMEM;
1797
1798                                 *slice = s;
1799                                 return 0;
1800                         }
1801
1802                         return cg_path_decode_unit(e, slice);
1803                 }
1804
1805                 e = p;
1806                 p += n;
1807         }
1808 }
1809
1810 int cg_pid_get_slice(pid_t pid, char **slice) {
1811         _cleanup_free_ char *cgroup = NULL;
1812         int r;
1813
1814         assert(slice);
1815
1816         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1817         if (r < 0)
1818                 return r;
1819
1820         return cg_path_get_slice(cgroup, slice);
1821 }
1822
int cg_path_get_user_slice(const char *p, char **slice) {
        const char *rest;

        assert(p);
        assert(slice);

        /* Skip the user prefix first. What remains looks pretty much the same as for a system slice,
         * hence the same parser is used from there on. */

        rest = skip_user_prefix(p);
        if (!rest)
                return -ENXIO;

        return cg_path_get_slice(rest, slice);
}
1836
1837 int cg_pid_get_user_slice(pid_t pid, char **slice) {
1838         _cleanup_free_ char *cgroup = NULL;
1839         int r;
1840
1841         assert(slice);
1842
1843         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1844         if (r < 0)
1845                 return r;
1846
1847         return cg_path_get_user_slice(cgroup, slice);
1848 }
1849
1850 char *cg_escape(const char *p) {
1851         bool need_prefix = false;
1852
1853         /* This implements very minimal escaping for names to be used
1854          * as file names in the cgroup tree: any name which might
1855          * conflict with a kernel name or is prefixed with '_' is
1856          * prefixed with a '_'. That way, when reading cgroup names it
1857          * is sufficient to remove a single prefixing underscore if
1858          * there is one. */
1859
1860         /* The return value of this function (unlike cg_unescape())
1861          * needs free()! */
1862
1863         if (IN_SET(p[0], 0, '_', '.') ||
1864             streq(p, "notify_on_release") ||
1865             streq(p, "release_agent") ||
1866             streq(p, "tasks") ||
1867             startswith(p, "cgroup."))
1868                 need_prefix = true;
1869         else {
1870                 const char *dot;
1871
1872                 dot = strrchr(p, '.');
1873                 if (dot) {
1874                         CGroupController c;
1875                         size_t l = dot - p;
1876
1877                         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1878                                 const char *n;
1879
1880                                 n = cgroup_controller_to_string(c);
1881
1882                                 if (l != strlen(n))
1883                                         continue;
1884
1885                                 if (memcmp(p, n, l) != 0)
1886                                         continue;
1887
1888                                 need_prefix = true;
1889                                 break;
1890                         }
1891                 }
1892         }
1893
1894         if (need_prefix)
1895                 return strappend("_", p);
1896
1897         return strdup(p);
1898 }
1899
char *cg_unescape(const char *p) {
        assert(p);

        /* Drops a single leading underscore, if there is one. The result points into the string passed
         * in, hence the return value of this function (unlike cg_escape()) doesn't need free()! */

        return (char*) (*p == '_' ? p + 1 : p);
}
1911
1912 #define CONTROLLER_VALID                        \
1913         DIGITS LETTERS                          \
1914         "_"
1915
1916 bool cg_controller_is_valid(const char *p) {
1917         const char *t, *s;
1918
1919         if (!p)
1920                 return false;
1921
1922         if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
1923                 return true;
1924
1925         s = startswith(p, "name=");
1926         if (s)
1927                 p = s;
1928
1929         if (IN_SET(*p, 0, '_'))
1930                 return false;
1931
1932         for (t = p; *t; t++)
1933                 if (!strchr(CONTROLLER_VALID, *t))
1934                         return false;
1935
1936         if (t - p > FILENAME_MAX)
1937                 return false;
1938
1939         return true;
1940 }
1941
int cg_slice_to_path(const char *unit, char **ret) {
        _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
        const char *dash;
        int r;

        assert(unit);
        assert(ret);

        /* Converts a slice unit name into a cgroup path: for each dash in the name's prefix one path
         * component is generated, consisting of the prefix up to that dash plus ".slice", followed by
         * the full unit name as final component. All components are passed through cg_escape(). The
         * root slice maps to the empty string.
         *
         * On success returns 0 and a newly allocated string in *ret. Returns -EINVAL if 'unit' is not a
         * valid slice name, or if its prefix has leading, trailing or double dashes. */

        if (streq(unit, SPECIAL_ROOT_SLICE)) {
                char *x;

                x = strdup("");
                if (!x)
                        return -ENOMEM;
                *ret = x;
                return 0;
        }

        if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
                return -EINVAL;

        if (!endswith(unit, ".slice"))
                return -EINVAL;

        /* NOTE(review): presumably this returns the unit name without its suffix — confirm against
         * unit_name_to_prefix() */
        r = unit_name_to_prefix(unit, &p);
        if (r < 0)
                return r;

        dash = strchr(p, '-');

        /* Don't allow initial dashes */
        if (dash == p)
                return -EINVAL;

        while (dash) {
                _cleanup_free_ char *escaped = NULL;
                /* Room for the prefix up to the dash, plus ".slice" and the trailing NUL byte */
                char n[dash - p + sizeof(".slice")];

#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
                /* msan doesn't instrument stpncpy, so it thinks
                 * n is later used uninitialized:
                 * https://github.com/google/sanitizers/issues/926
                 */
                zero(n);
#endif

                /* Don't allow trailing or double dashes */
                if (IN_SET(dash[1], 0, '-'))
                        return -EINVAL;

                /* Build the name of the parent slice: everything before the dash, plus ".slice" */
                strcpy(stpncpy(n, p, dash - p), ".slice");
                if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
                        return -EINVAL;

                escaped = cg_escape(n);
                if (!escaped)
                        return -ENOMEM;

                if (!strextend(&s, escaped, "/", NULL))
                        return -ENOMEM;

                dash = strchr(dash+1, '-');
        }

        /* Finally, append the slice unit itself */
        e = cg_escape(unit);
        if (!e)
                return -ENOMEM;

        if (!strextend(&s, e, NULL))
                return -ENOMEM;

        /* Pass ownership to the caller, and make sure the cleanup handler leaves the string alone */
        *ret = s;
        s = NULL;

        return 0;
}
2018
2019 int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
2020         _cleanup_free_ char *p = NULL;
2021         int r;
2022
2023         r = cg_get_path(controller, path, attribute, &p);
2024         if (r < 0)
2025                 return r;
2026
2027         return write_string_file(p, value, 0);
2028 }
2029
2030 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
2031         _cleanup_free_ char *p = NULL;
2032         int r;
2033
2034         r = cg_get_path(controller, path, attribute, &p);
2035         if (r < 0)
2036                 return r;
2037
2038         return read_one_line_file(p, ret);
2039 }
2040
int cg_get_keyed_attribute(
                const char *controller,
                const char *path,
                const char *attribute,
                char **keys,
                char **ret_values) {

        _cleanup_free_ char *filename = NULL, *contents = NULL;
        const char *p;
        size_t n, i, n_done = 0;
        char **v;
        int r;

        /* Reads one or more fields of a cgroupsv2 keyed attribute file. The 'keys' parameter should be an strv with
         * all keys to retrieve. The 'ret_values' parameter should be passed as an array of strings with the same
         * number of entries as 'keys'. On success each entry will be set to a newly allocated copy of the value of
         * the matching key. On failure 'ret_values' is left untouched.
         *
         * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. */

        r = cg_get_path(controller, path, attribute, &filename);
        if (r < 0)
                return r;

        r = read_full_file(filename, &contents, NULL);
        if (r < 0)
                return r;

        n = strv_length(keys);
        if (n == 0) /* No keys to retrieve? That's easy, we are done then */
                return 0;

        /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
        v = newa0(char*, n);

        /* Walk through the file contents line by line */
        for (p = contents; *p;) {
                const char *w = NULL;

                /* Check the current line against all keys we have no value for yet. If one matches, 'i' is
                 * left at its index. NOTE(review): first_word() presumably returns a pointer to what
                 * follows the key if the line begins with it as a whole word — confirm. */
                for (i = 0; i < n; i++)
                        if (!v[i]) {
                                w = first_word(p, keys[i]);
                                if (w)
                                        break;
                        }

                if (w) {
                        size_t l;

                        /* The value extends to the end of the line */
                        l = strcspn(w, NEWLINE);
                        v[i] = strndup(w, l);
                        if (!v[i]) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Stop early once we have a value for every key */
                        n_done++;
                        if (n_done >= n)
                                goto done;

                        p = w + l;
                } else
                        p += strcspn(p, NEWLINE);

                /* Skip over the newline character(s), on to the next line */
                p += strspn(p, NEWLINE);
        }

        /* We reached the end of the file, but at least one key is still missing */
        r = -ENXIO;

fail:
        /* Release what we collected so far; entries never filled in are still NULL */
        for (i = 0; i < n; i++)
                free(v[i]);

        return r;

done:
        /* Everything found, pass ownership of all strings to the caller in one go */
        memcpy(ret_values, v, sizeof(char*) * n);
        return 0;

}
2119
2120 int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
2121         CGroupController c;
2122         int r;
2123
2124         /* This one will create a cgroup in our private tree, but also
2125          * duplicate it in the trees specified in mask, and remove it
2126          * in all others */
2127
2128         /* First create the cgroup in our own hierarchy. */
2129         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
2130         if (r < 0)
2131                 return r;
2132
2133         /* If we are in the unified hierarchy, we are done now */
2134         r = cg_all_unified();
2135         if (r < 0)
2136                 return r;
2137         if (r > 0)
2138                 return 0;
2139
2140         /* Otherwise, do the same in the other hierarchies */
2141         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2142                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2143                 const char *n;
2144
2145                 n = cgroup_controller_to_string(c);
2146
2147                 if (mask & bit)
2148                         (void) cg_create(n, path);
2149                 else if (supported & bit)
2150                         (void) cg_trim(n, path, true);
2151         }
2152
2153         return 0;
2154 }
2155
2156 int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
2157         CGroupController c;
2158         int r;
2159
2160         r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
2161         if (r < 0)
2162                 return r;
2163
2164         r = cg_all_unified();
2165         if (r < 0)
2166                 return r;
2167         if (r > 0)
2168                 return 0;
2169
2170         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2171                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2172                 const char *p = NULL;
2173
2174                 if (!(supported & bit))
2175                         continue;
2176
2177                 if (path_callback)
2178                         p = path_callback(bit, userdata);
2179
2180                 if (!p)
2181                         p = path;
2182
2183                 (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
2184         }
2185
2186         return 0;
2187 }
2188
2189 int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, cg_migrate_callback_t path_callback, void *userdata) {
2190         Iterator i;
2191         void *pidp;
2192         int r = 0;
2193
2194         SET_FOREACH(pidp, pids, i) {
2195                 pid_t pid = PTR_TO_PID(pidp);
2196                 int q;
2197
2198                 q = cg_attach_everywhere(supported, path, pid, path_callback, userdata);
2199                 if (q < 0 && r >= 0)
2200                         r = q;
2201         }
2202
2203         return r;
2204 }
2205
2206 int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t to_callback, void *userdata) {
2207         CGroupController c;
2208         int r = 0, q;
2209
2210         if (!path_equal(from, to))  {
2211                 r = cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER, from, SYSTEMD_CGROUP_CONTROLLER, to, CGROUP_REMOVE);
2212                 if (r < 0)
2213                         return r;
2214         }
2215
2216         q = cg_all_unified();
2217         if (q < 0)
2218                 return q;
2219         if (q > 0)
2220                 return r;
2221
2222         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2223                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2224                 const char *p = NULL;
2225
2226                 if (!(supported & bit))
2227                         continue;
2228
2229                 if (to_callback)
2230                         p = to_callback(bit, userdata);
2231
2232                 if (!p)
2233                         p = to;
2234
2235                 (void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, to, cgroup_controller_to_string(c), p, 0);
2236         }
2237
2238         return 0;
2239 }
2240
2241 int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
2242         CGroupController c;
2243         int r, q;
2244
2245         r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
2246         if (r < 0)
2247                 return r;
2248
2249         q = cg_all_unified();
2250         if (q < 0)
2251                 return q;
2252         if (q > 0)
2253                 return r;
2254
2255         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2256                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2257
2258                 if (!(supported & bit))
2259                         continue;
2260
2261                 (void) cg_trim(cgroup_controller_to_string(c), path, delete_root);
2262         }
2263
2264         return 0;
2265 }
2266
2267 int cg_mask_to_string(CGroupMask mask, char **ret) {
2268         _cleanup_free_ char *s = NULL;
2269         size_t n = 0, allocated = 0;
2270         bool space = false;
2271         CGroupController c;
2272
2273         assert(ret);
2274
2275         if (mask == 0) {
2276                 *ret = NULL;
2277                 return 0;
2278         }
2279
2280         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2281                 const char *k;
2282                 size_t l;
2283
2284                 if (!(mask & CGROUP_CONTROLLER_TO_MASK(c)))
2285                         continue;
2286
2287                 k = cgroup_controller_to_string(c);
2288                 l = strlen(k);
2289
2290                 if (!GREEDY_REALLOC(s, allocated, n + space + l + 1))
2291                         return -ENOMEM;
2292
2293                 if (space)
2294                         s[n] = ' ';
2295                 memcpy(s + n + space, k, l);
2296                 n += space + l;
2297
2298                 space = true;
2299         }
2300
2301         assert(s);
2302
2303         s[n] = 0;
2304         *ret = s;
2305         s = NULL;
2306
2307         return 0;
2308 }
2309
2310 int cg_mask_from_string(const char *value, CGroupMask *mask) {
2311         assert(mask);
2312         assert(value);
2313
2314         for (;;) {
2315                 _cleanup_free_ char *n = NULL;
2316                 CGroupController v;
2317                 int r;
2318
2319                 r = extract_first_word(&value, &n, NULL, 0);
2320                 if (r < 0)
2321                         return r;
2322                 if (r == 0)
2323                         break;
2324
2325                 v = cgroup_controller_from_string(n);
2326                 if (v < 0)
2327                         continue;
2328
2329                 *mask |= CGROUP_CONTROLLER_TO_MASK(v);
2330         }
2331         return 0;
2332 }
2333
2334 int cg_mask_supported(CGroupMask *ret) {
2335         CGroupMask mask = 0;
2336         int r;
2337
2338         /* Determines the mask of supported cgroup controllers. Only
2339          * includes controllers we can make sense of and that are
2340          * actually accessible. */
2341
2342         r = cg_all_unified();
2343         if (r < 0)
2344                 return r;
2345         if (r > 0) {
2346                 _cleanup_free_ char *root = NULL, *controllers = NULL, *path = NULL;
2347
2348                 /* In the unified hierarchy we can read the supported
2349                  * and accessible controllers from a the top-level
2350                  * cgroup attribute */
2351
2352                 r = cg_get_root_path(&root);
2353                 if (r < 0)
2354                         return r;
2355
2356                 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
2357                 if (r < 0)
2358                         return r;
2359
2360                 r = read_one_line_file(path, &controllers);
2361                 if (r < 0)
2362                         return r;
2363
2364                 r = cg_mask_from_string(controllers, &mask);
2365                 if (r < 0)
2366                         return r;
2367
2368                 /* Currently, we support the cpu, memory, io and pids
2369                  * controller in the unified hierarchy, mask
2370                  * everything else off. */
2371                 mask &= CGROUP_MASK_CPU | CGROUP_MASK_MEMORY | CGROUP_MASK_IO | CGROUP_MASK_PIDS;
2372
2373         } else {
2374                 CGroupController c;
2375
2376                 /* In the legacy hierarchy, we check whether which
2377                  * hierarchies are mounted. */
2378
2379                 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2380                         const char *n;
2381
2382                         n = cgroup_controller_to_string(c);
2383                         if (controller_is_accessible(n) >= 0)
2384                                 mask |= CGROUP_CONTROLLER_TO_MASK(c);
2385                 }
2386         }
2387
2388         *ret = mask;
2389         return 0;
2390 }
2391
2392 int cg_kernel_controllers(Set **ret) {
2393         _cleanup_set_free_free_ Set *controllers = NULL;
2394         _cleanup_fclose_ FILE *f = NULL;
2395         int r;
2396
2397         assert(ret);
2398
2399         /* Determines the full list of kernel-known controllers. Might
2400          * include controllers we don't actually support, arbitrary
2401          * named hierarchies and controllers that aren't currently
2402          * accessible (because not mounted). */
2403
2404         controllers = set_new(&string_hash_ops);
2405         if (!controllers)
2406                 return -ENOMEM;
2407
2408         f = fopen("/proc/cgroups", "re");
2409         if (!f) {
2410                 if (errno == ENOENT) {
2411                         *ret = NULL;
2412                         return 0;
2413                 }
2414
2415                 return -errno;
2416         }
2417
2418         (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
2419
2420         /* Ignore the header line */
2421         (void) read_line(f, (size_t) -1, NULL);
2422
2423         for (;;) {
2424                 char *controller;
2425                 int enabled = 0;
2426
2427                 errno = 0;
2428                 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
2429
2430                         if (feof(f))
2431                                 break;
2432
2433                         if (ferror(f) && errno > 0)
2434                                 return -errno;
2435
2436                         return -EBADMSG;
2437                 }
2438
2439                 if (!enabled) {
2440                         free(controller);
2441                         continue;
2442                 }
2443
2444                 if (!cg_controller_is_valid(controller)) {
2445                         free(controller);
2446                         return -EBADMSG;
2447                 }
2448
2449                 r = set_consume(controllers, controller);
2450                 if (r < 0)
2451                         return r;
2452         }
2453
2454         *ret = controllers;
2455         controllers = NULL;
2456
2457         return 0;
2458 }
2459
2460 static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;
2461
2462 /* The hybrid mode was initially implemented in v232 and simply mounted cgroup v2 on /sys/fs/cgroup/systemd.  This
2463  * unfortunately broke other tools (such as docker) which expected the v1 "name=systemd" hierarchy on
2464  * /sys/fs/cgroup/systemd.  From v233 and on, the hybrid mode mountnbs v2 on /sys/fs/cgroup/unified and maintains
2465  * "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility with other tools.
2466  *
2467  * To keep live upgrade working, we detect and support v232 layout.  When v232 layout is detected, to keep cgroup v2
2468  * process management but disable the compat dual layout, we return %true on
2469  * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and %false on cg_hybrid_unified().
2470  */
2471 static thread_local bool unified_systemd_v232;
2472
2473 static int cg_unified_update(void) {
2474
2475         struct statfs fs;
2476
2477         /* Checks if we support the unified hierarchy. Returns an
2478          * error when the cgroup hierarchies aren't mounted yet or we
2479          * have any other trouble determining if the unified hierarchy
2480          * is supported. */
2481
2482         if (unified_cache >= CGROUP_UNIFIED_NONE)
2483                 return 0;
2484
2485         if (statfs("/sys/fs/cgroup/", &fs) < 0)
2486                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\") failed: %m");
2487
2488         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2489                 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2490                 unified_cache = CGROUP_UNIFIED_ALL;
2491         } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2492                 if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
2493                     F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2494                         log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2495                         unified_cache = CGROUP_UNIFIED_SYSTEMD;
2496                         unified_systemd_v232 = false;
2497                 } else {
2498                         if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0)
2499                                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
2500
2501                         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2502                                 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2503                                 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2504                                 unified_systemd_v232 = true;
2505                         } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) {
2506                                 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2507                                 unified_cache = CGROUP_UNIFIED_NONE;
2508                         } else {
2509                                 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2510                                           (unsigned long long) fs.f_type);
2511                                 unified_cache = CGROUP_UNIFIED_NONE;
2512                         }
2513                 }
2514         } else {
2515                 log_debug("Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2516                           (unsigned long long) fs.f_type);
2517                 return -ENOMEDIUM;
2518         }
2519
2520         return 0;
2521 }
2522
2523 int cg_unified_controller(const char *controller) {
2524         int r;
2525
2526         r = cg_unified_update();
2527         if (r < 0)
2528                 return r;
2529
2530         if (unified_cache == CGROUP_UNIFIED_NONE)
2531                 return false;
2532
2533         if (unified_cache >= CGROUP_UNIFIED_ALL)
2534                 return true;
2535
2536         return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER);
2537 }
2538
2539 int cg_all_unified(void) {
2540         int r;
2541
2542         r = cg_unified_update();
2543         if (r < 0)
2544                 return r;
2545
2546         return unified_cache >= CGROUP_UNIFIED_ALL;
2547 }
2548
2549 int cg_hybrid_unified(void) {
2550         int r;
2551
2552         r = cg_unified_update();
2553         if (r < 0)
2554                 return r;
2555
2556         return unified_cache == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232;
2557 }
2558
2559 int cg_unified_flush(void) {
2560         unified_cache = CGROUP_UNIFIED_UNKNOWN;
2561
2562         return cg_unified_update();
2563 }
2564
2565 int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) {
2566         _cleanup_fclose_ FILE *f = NULL;
2567         _cleanup_free_ char *fs = NULL;
2568         CGroupController c;
2569         int r;
2570
2571         assert(p);
2572
2573         if (supported == 0)
2574                 return 0;
2575
2576         r = cg_all_unified();
2577         if (r < 0)
2578                 return r;
2579         if (r == 0) /* on the legacy hiearchy there's no joining of controllers defined */
2580                 return 0;
2581
2582         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
2583         if (r < 0)
2584                 return r;
2585
2586         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2587                 CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2588                 const char *n;
2589
2590                 if (!(supported & bit))
2591                         continue;
2592
2593                 n = cgroup_controller_to_string(c);
2594                 {
2595                         char s[1 + strlen(n) + 1];
2596
2597                         s[0] = mask & bit ? '+' : '-';
2598                         strcpy(s + 1, n);
2599
2600                         if (!f) {
2601                                 f = fopen(fs, "we");
2602                                 if (!f) {
2603                                         log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
2604                                         break;
2605                                 }
2606                         }
2607
2608                         r = write_string_stream(f, s, 0);
2609                         if (r < 0)
2610                                 log_debug_errno(r, "Failed to enable controller %s for %s (%s): %m", n, p, fs);
2611                 }
2612         }
2613
2614         return 0;
2615 }
2616
2617 bool cg_is_unified_wanted(void) {
2618         static thread_local int wanted = -1;
2619         int r;
2620         bool b;
2621         const bool is_default = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL;
2622
2623         /* If we have a cached value, return that. */
2624         if (wanted >= 0)
2625                 return wanted;
2626
2627         /* If the hierarchy is already mounted, then follow whatever
2628          * was chosen for it. */
2629         if (cg_unified_flush() >= 0)
2630                 return (wanted = unified_cache >= CGROUP_UNIFIED_ALL);
2631
2632         /* Otherwise, let's see what the kernel command line has to say.
2633          * Since checking is expensive, cache a non-error result. */
2634         r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b);
2635
2636         return (wanted = r > 0 ? b : is_default);
2637 }
2638
2639 bool cg_is_legacy_wanted(void) {
2640         static thread_local int wanted = -1;
2641
2642         /* If we have a cached value, return that. */
2643         if (wanted >= 0)
2644                 return wanted;
2645
2646         /* Check if we have cgroups2 already mounted. */
2647         if (cg_unified_flush() >= 0 &&
2648             unified_cache == CGROUP_UNIFIED_ALL)
2649                 return (wanted = false);
2650
2651         /* Otherwise, assume that at least partial legacy is wanted,
2652          * since cgroups2 should already be mounted at this point. */
2653         return (wanted = true);
2654 }
2655
2656 bool cg_is_hybrid_wanted(void) {
2657         static thread_local int wanted = -1;
2658         int r;
2659         bool b;
2660         const bool is_default = DEFAULT_HIERARCHY >= CGROUP_UNIFIED_SYSTEMD;
2661         /* We default to true if the default is "hybrid", obviously,
2662          * but also when the default is "unified", because if we get
2663          * called, it means that unified hierarchy was not mounted. */
2664
2665         /* If we have a cached value, return that. */
2666         if (wanted >= 0)
2667                 return wanted;
2668
2669         /* If the hierarchy is already mounted, then follow whatever
2670          * was chosen for it. */
2671         if (cg_unified_flush() >= 0 &&
2672             unified_cache == CGROUP_UNIFIED_ALL)
2673                 return (wanted = false);
2674
2675         /* Otherwise, let's see what the kernel command line has to say.
2676          * Since checking is expensive, cache a non-error result. */
2677         r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b);
2678
2679         /* The meaning of the kernel option is reversed wrt. to the return value
2680          * of this function, hence the negation. */
2681         return (wanted = r > 0 ? !b : is_default);
2682 }
2683
2684 int cg_weight_parse(const char *s, uint64_t *ret) {
2685         uint64_t u;
2686         int r;
2687
2688         if (isempty(s)) {
2689                 *ret = CGROUP_WEIGHT_INVALID;
2690                 return 0;
2691         }
2692
2693         r = safe_atou64(s, &u);
2694         if (r < 0)
2695                 return r;
2696
2697         if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX)
2698                 return -ERANGE;
2699
2700         *ret = u;
2701         return 0;
2702 }
2703
2704 const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2705         [CGROUP_IO_RBPS_MAX]    = CGROUP_LIMIT_MAX,
2706         [CGROUP_IO_WBPS_MAX]    = CGROUP_LIMIT_MAX,
2707         [CGROUP_IO_RIOPS_MAX]   = CGROUP_LIMIT_MAX,
2708         [CGROUP_IO_WIOPS_MAX]   = CGROUP_LIMIT_MAX,
2709 };
2710
2711 static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2712         [CGROUP_IO_RBPS_MAX]    = "IOReadBandwidthMax",
2713         [CGROUP_IO_WBPS_MAX]    = "IOWriteBandwidthMax",
2714         [CGROUP_IO_RIOPS_MAX]   = "IOReadIOPSMax",
2715         [CGROUP_IO_WIOPS_MAX]   = "IOWriteIOPSMax",
2716 };
2717
2718 DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2719
2720 int cg_cpu_shares_parse(const char *s, uint64_t *ret) {
2721         uint64_t u;
2722         int r;
2723
2724         if (isempty(s)) {
2725                 *ret = CGROUP_CPU_SHARES_INVALID;
2726                 return 0;
2727         }
2728
2729         r = safe_atou64(s, &u);
2730         if (r < 0)
2731                 return r;
2732
2733         if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX)
2734                 return -ERANGE;
2735
2736         *ret = u;
2737         return 0;
2738 }
2739
2740 int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
2741         uint64_t u;
2742         int r;
2743
2744         if (isempty(s)) {
2745                 *ret = CGROUP_BLKIO_WEIGHT_INVALID;
2746                 return 0;
2747         }
2748
2749         r = safe_atou64(s, &u);
2750         if (r < 0)
2751                 return r;
2752
2753         if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX)
2754                 return -ERANGE;
2755
2756         *ret = u;
2757         return 0;
2758 }
2759
2760 bool is_cgroup_fs(const struct statfs *s) {
2761         return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2762                is_fs_type(s, CGROUP2_SUPER_MAGIC);
2763 }
2764
/* Returns true if the file descriptor 'fd' refers to a file system that is_cgroup_fs() recognizes.
 *
 * If fstatfs() fails, false is returned. The function returns bool, hence the earlier "return -errno" was
 * implicitly converted to true, i.e. a descriptor that could not be queried was reported as being on a
 * cgroup file system. */
bool fd_is_cgroup_fs(int fd) {
        struct statfs s;

        if (fstatfs(fd, &s) < 0)
                return false;

        return is_cgroup_fs(&s);
}
2773
2774 static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
2775         [CGROUP_CONTROLLER_CPU] = "cpu",
2776         [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
2777         [CGROUP_CONTROLLER_IO] = "io",
2778         [CGROUP_CONTROLLER_BLKIO] = "blkio",
2779         [CGROUP_CONTROLLER_MEMORY] = "memory",
2780         [CGROUP_CONTROLLER_DEVICES] = "devices",
2781         [CGROUP_CONTROLLER_PIDS] = "pids",
2782 };
2783
2784 DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);