1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
5 #include "cgroup-setup.h"
6 #include "cgroup-util.h"
7 #include "errno-util.h"
12 #include "parse-util.h"
13 #include "path-util.h"
14 #include "proc-cmdline.h"
15 #include "process-util.h"
16 #include "recurse-dir.h"
17 #include "stdio-util.h"
18 #include "string-util.h"
19 #include "user-util.h"
22 static int cg_any_controller_used_for_v1(void) {
23 _cleanup_free_
char *buf
= NULL
;
24 _cleanup_strv_free_
char **lines
= NULL
;
28 r
= read_full_virtual_file("/proc/cgroups", &buf
, NULL
);
30 return log_debug_errno(r
, "Could not read /proc/cgroups, ignoring: %m");
32 r
= strv_split_newlines_full(&lines
, buf
, 0);
36 /* The intention of this is to check if the fully unified cgroup tree setup is possible, meaning all
37 * enabled kernel cgroup controllers are currently not in use by cgroup1. For reference:
38 * https://systemd.io/CGROUP_DELEGATION/#three-different-tree-setups-
40 * Note that this is typically only useful to check inside a container where we don't know what
41 * cgroup tree setup is in use by the host; if the host is using legacy or hybrid, we can't use
42 * unified since some or all controllers would be missing. This is not the best way to detect this,
43 * as whatever container manager created our container should have mounted /sys/fs/cgroup
44 * appropriately, but in case that wasn't done, we try to detect if it's possible for us to use
46 STRV_FOREACH(line
, lines
) {
47 _cleanup_free_
char *name
= NULL
, *hierarchy_id
= NULL
, *num
= NULL
, *enabled
= NULL
;
49 /* Skip header line */
50 if (startswith(*line
, "#"))
53 const char *p
= *line
;
54 r
= extract_many_words(&p
, NULL
, 0, &name
, &hierarchy_id
, &num
, &enabled
, NULL
);
56 return log_debug_errno(r
, "Error parsing /proc/cgroups line, ignoring: %m");
58 log_debug("Invalid /proc/cgroups line, ignoring.");
62 /* Ignore disabled controllers. */
63 if (streq(enabled
, "0"))
66 /* Ignore controllers we don't care about. */
67 if (cgroup_controller_from_string(name
) < 0)
70 /* Since the unified cgroup doesn't use multiple hierarchies, if any controller has a
71 * non-zero hierarchy_id that means it's in use already in a legacy (or hybrid) cgroup v1
72 * hierarchy, and can't be used in a unified cgroup. */
73 if (!streq(hierarchy_id
, "0")) {
74 log_debug("Cgroup controller %s in use by legacy v1 hierarchy.", name
);
82 bool cg_is_unified_wanted(void) {
83 static thread_local
int wanted
= -1;
85 const bool is_default
= DEFAULT_HIERARCHY
== CGROUP_UNIFIED_ALL
;
86 _cleanup_free_
char *c
= NULL
;
89 /* If we have a cached value, return that. */
93 /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
94 r
= cg_unified_cached(true);
96 return (wanted
= r
>= CGROUP_UNIFIED_ALL
);
98 /* If we were explicitly passed systemd.unified_cgroup_hierarchy, respect that. */
99 r
= proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b
);
103 /* If we passed cgroup_no_v1=all with no other instructions, it seems highly unlikely that we want to
104 * use hybrid or legacy hierarchy. */
105 r
= proc_cmdline_get_key("cgroup_no_v1", 0, &c
);
106 if (r
> 0 && streq_ptr(c
, "all"))
107 return (wanted
= true);
109 /* If any controller is in use as v1, don't use unified. */
110 if (cg_any_controller_used_for_v1() > 0)
111 return (wanted
= false);
113 return (wanted
= is_default
);
116 bool cg_is_legacy_wanted(void) {
117 static thread_local
int wanted
= -1;
119 /* If we have a cached value, return that. */
123 /* Check if we have cgroup v2 already mounted. */
124 if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL
)
125 return (wanted
= false);
127 /* Otherwise, assume that at least partial legacy is wanted,
128 * since cgroup v2 should already be mounted at this point. */
129 return (wanted
= true);
132 bool cg_is_hybrid_wanted(void) {
133 static thread_local
int wanted
= -1;
136 const bool is_default
= DEFAULT_HIERARCHY
>= CGROUP_UNIFIED_SYSTEMD
;
137 /* We default to true if the default is "hybrid", obviously, but also when the default is "unified",
138 * because if we get called, it means that unified hierarchy was not mounted. */
140 /* If we have a cached value, return that. */
144 /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
145 if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL
)
146 return (wanted
= false);
148 /* Otherwise, let's see what the kernel command line has to say. Since checking is expensive, cache
149 * a non-error result. */
150 r
= proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b
);
152 /* The meaning of the kernel option is reversed wrt. to the return value of this function, hence the
154 return (wanted
= r
> 0 ? !b
: is_default
);
157 int cg_weight_parse(const char *s
, uint64_t *ret
) {
162 *ret
= CGROUP_WEIGHT_INVALID
;
166 r
= safe_atou64(s
, &u
);
170 if (u
< CGROUP_WEIGHT_MIN
|| u
> CGROUP_WEIGHT_MAX
)
177 int cg_cpu_shares_parse(const char *s
, uint64_t *ret
) {
182 *ret
= CGROUP_CPU_SHARES_INVALID
;
186 r
= safe_atou64(s
, &u
);
190 if (u
< CGROUP_CPU_SHARES_MIN
|| u
> CGROUP_CPU_SHARES_MAX
)
197 int cg_blkio_weight_parse(const char *s
, uint64_t *ret
) {
202 *ret
= CGROUP_BLKIO_WEIGHT_INVALID
;
206 r
= safe_atou64(s
, &u
);
210 if (u
< CGROUP_BLKIO_WEIGHT_MIN
|| u
> CGROUP_BLKIO_WEIGHT_MAX
)
218 RecurseDirEvent event
,
222 const struct dirent
*de
,
223 const struct statx
*sx
,
226 /* Failures to delete inner cgroup we ignore (but debug log in case error code is unexpected) */
227 if (event
== RECURSE_DIR_LEAVE
&&
228 de
->d_type
== DT_DIR
&&
229 unlinkat(dir_fd
, de
->d_name
, AT_REMOVEDIR
) < 0 &&
230 !IN_SET(errno
, ENOENT
, ENOTEMPTY
, EBUSY
))
231 log_debug_errno(errno
, "Failed to trim inner cgroup %s, ignoring: %m", path
);
233 return RECURSE_DIR_CONTINUE
;
236 int cg_trim(const char *controller
, const char *path
, bool delete_root
) {
237 _cleanup_free_
char *fs
= NULL
;
243 r
= cg_get_path(controller
, path
, NULL
, &fs
);
251 /* n_depth_max= */ UINT_MAX
,
252 RECURSE_DIR_ENSURE_TYPE
,
255 if (r
== -ENOENT
) /* non-existing is the ultimate trimming, hence no error */
258 log_debug_errno(r
, "Failed to iterate through cgroup %s: %m", path
);
260 /* If we shall delete the top-level cgroup, then propagate the faiure to do so (except if it is
261 * already gone anyway). Also, let's debug log about this failure, except if the error code is an
263 if (delete_root
&& !empty_or_root(path
) &&
264 rmdir(fs
) < 0 && errno
!= ENOENT
) {
265 if (!IN_SET(errno
, ENOTEMPTY
, EBUSY
))
266 log_debug_errno(errno
, "Failed to trim cgroup %s: %m", path
);
271 q
= cg_hybrid_unified();
274 if (q
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
))
275 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, delete_root
);
280 /* Create a cgroup in the hierarchy of controller.
281 * Returns 0 if the group already existed, 1 on success, negative otherwise.
283 int cg_create(const char *controller
, const char *path
) {
284 _cleanup_free_
char *fs
= NULL
;
287 r
= cg_get_path_and_check(controller
, path
, NULL
, &fs
);
291 r
= mkdir_parents(fs
, 0755);
295 r
= RET_NERRNO(mkdir(fs
, 0755));
301 r
= cg_hybrid_unified();
305 if (r
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
306 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
);
308 log_warning_errno(r
, "Failed to create compat systemd cgroup %s: %m", path
);
/* Create the cgroup at path in the given controller's hierarchy and attach pid to it.
 * Returns cg_create()'s result (0 if the group already existed, 1 if it was created) on success,
 * negative errno-style error otherwise. */
int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
        int r, q;

        assert(pid >= 0);

        r = cg_create(controller, path);
        if (r < 0)
                return r;

        q = cg_attach(controller, path, pid);
        if (q < 0)
                return q;

        /* This does not remove the cgroup on failure */
        return r;
}
331 int cg_attach(const char *controller
, const char *path
, pid_t pid
) {
332 _cleanup_free_
char *fs
= NULL
;
333 char c
[DECIMAL_STR_MAX(pid_t
) + 2];
339 r
= cg_get_path_and_check(controller
, path
, "cgroup.procs", &fs
);
344 pid
= getpid_cached();
346 xsprintf(c
, PID_FMT
"\n", pid
);
348 r
= write_string_file(fs
, c
, WRITE_STRING_FILE_DISABLE_BUFFER
);
349 if (r
== -EOPNOTSUPP
&& cg_is_threaded(controller
, path
) > 0)
350 /* When the threaded mode is used, we cannot read/write the file. Let's return recognizable error. */
355 r
= cg_hybrid_unified();
359 if (r
> 0 && streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
360 r
= cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, pid
);
362 log_warning_errno(r
, "Failed to attach "PID_FMT
" to compat systemd cgroup %s: %m", pid
, path
);
368 int cg_attach_fallback(const char *controller
, const char *path
, pid_t pid
) {
375 r
= cg_attach(controller
, path
, pid
);
377 char prefix
[strlen(path
) + 1];
379 /* This didn't work? Then let's try all prefixes of
382 PATH_FOREACH_PREFIX(prefix
, path
) {
385 q
= cg_attach(controller
, prefix
, pid
);
395 const char *controller
,
405 /* cgroup v1, aka legacy/non-unified */
406 static const struct Attribute legacy_attributes
[] = {
407 { "cgroup.procs", true },
409 { "cgroup.clone_children", false },
413 /* cgroup v2, aka unified */
414 static const struct Attribute unified_attributes
[] = {
415 { "cgroup.procs", true },
416 { "cgroup.subtree_control", true },
417 { "cgroup.threads", false },
421 static const struct Attribute
* const attributes
[] = {
422 [false] = legacy_attributes
,
423 [true] = unified_attributes
,
426 _cleanup_free_
char *fs
= NULL
;
427 const struct Attribute
*i
;
432 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
435 unified
= cg_unified_controller(controller
);
439 /* Configure access to the cgroup itself */
440 r
= cg_get_path(controller
, path
, NULL
, &fs
);
444 r
= chmod_and_chown(fs
, 0755, uid
, gid
);
448 /* Configure access to the cgroup's attributes */
449 for (i
= attributes
[unified
]; i
->name
; i
++) {
452 r
= cg_get_path(controller
, path
, i
->name
, &fs
);
456 r
= chmod_and_chown(fs
, 0644, uid
, gid
);
461 log_debug_errno(r
, "Failed to set access on cgroup %s, ignoring: %m", fs
);
465 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER
)) {
466 r
= cg_hybrid_unified();
470 /* Always propagate access mode from unified to legacy controller */
471 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, uid
, gid
);
473 log_debug_errno(r
, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path
);
488 _cleanup_set_free_ Set
*s
= NULL
;
501 my_pid
= getpid_cached();
504 _cleanup_fclose_
FILE *f
= NULL
;
508 r
= cg_enumerate_processes(cfrom
, pfrom
, &f
);
510 if (ret
>= 0 && r
!= -ENOENT
)
516 while ((r
= cg_read_pid(f
, &pid
)) > 0) {
518 /* This might do weird stuff if we aren't a
519 * single-threaded program. However, we
520 * luckily know we are not */
521 if ((flags
& CGROUP_IGNORE_SELF
) && pid
== my_pid
)
524 if (set_get(s
, PID_TO_PTR(pid
)) == PID_TO_PTR(pid
))
527 /* Ignore kernel threads. Since they can only
528 * exist in the root cgroup, we only check for
531 empty_or_root(pfrom
) &&
532 is_kernel_thread(pid
) > 0)
535 r
= cg_attach(cto
, pto
, pid
);
537 if (ret
>= 0 && r
!= -ESRCH
)
544 r
= set_put(s
, PID_TO_PTR(pid
));
564 int cg_migrate_recursive(
571 _cleanup_closedir_
DIR *d
= NULL
;
580 ret
= cg_migrate(cfrom
, pfrom
, cto
, pto
, flags
);
582 r
= cg_enumerate_subgroups(cfrom
, pfrom
, &d
);
584 if (ret
>= 0 && r
!= -ENOENT
)
590 while ((r
= cg_read_subgroup(d
, &fn
)) > 0) {
591 _cleanup_free_
char *p
= NULL
;
593 p
= path_join(empty_to_root(pfrom
), fn
);
598 r
= cg_migrate_recursive(cfrom
, p
, cto
, pto
, flags
);
599 if (r
!= 0 && ret
>= 0)
603 if (r
< 0 && ret
>= 0)
606 if (flags
& CGROUP_REMOVE
) {
607 r
= cg_rmdir(cfrom
, pfrom
);
608 if (r
< 0 && ret
>= 0 && !IN_SET(r
, -ENOENT
, -EBUSY
))
615 int cg_migrate_recursive_fallback(
629 r
= cg_migrate_recursive(cfrom
, pfrom
, cto
, pto
, flags
);
631 char prefix
[strlen(pto
) + 1];
633 /* This didn't work? Then let's try all prefixes of the destination */
635 PATH_FOREACH_PREFIX(prefix
, pto
) {
638 q
= cg_migrate_recursive(cfrom
, pfrom
, cto
, prefix
, flags
);
647 int cg_create_everywhere(CGroupMask supported
, CGroupMask mask
, const char *path
) {
653 /* This one will create a cgroup in our private tree, but also
654 * duplicate it in the trees specified in mask, and remove it
657 * Returns 0 if the group already existed in the systemd hierarchy,
658 * 1 on success, negative otherwise.
661 /* First create the cgroup in our own hierarchy. */
662 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, path
);
667 /* If we are in the unified hierarchy, we are done now */
668 r
= cg_all_unified();
674 supported
&= CGROUP_MASK_V1
;
675 mask
= CGROUP_MASK_EXTEND_JOINED(mask
);
678 /* Otherwise, do the same in the other hierarchies */
679 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
680 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
683 if (!FLAGS_SET(supported
, bit
))
686 if (FLAGS_SET(done
, bit
))
689 n
= cgroup_controller_to_string(c
);
690 if (FLAGS_SET(mask
, bit
))
691 (void) cg_create(n
, path
);
693 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
699 int cg_attach_everywhere(CGroupMask supported
, const char *path
, pid_t pid
, cg_migrate_callback_t path_callback
, void *userdata
) {
702 r
= cg_attach(SYSTEMD_CGROUP_CONTROLLER
, path
, pid
);
706 r
= cg_all_unified();
712 supported
&= CGROUP_MASK_V1
;
715 for (CGroupController c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
716 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
717 const char *p
= NULL
;
719 if (!FLAGS_SET(supported
, bit
))
722 if (FLAGS_SET(done
, bit
))
726 p
= path_callback(bit
, userdata
);
730 (void) cg_attach_fallback(cgroup_controller_to_string(c
), p
, pid
);
731 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
737 int cg_migrate_v1_controllers(CGroupMask supported
, CGroupMask mask
, const char *from
, cg_migrate_callback_t to_callback
, void *userdata
) {
744 supported
&= CGROUP_MASK_V1
;
745 mask
= CGROUP_MASK_EXTEND_JOINED(mask
);
748 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
749 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
750 const char *to
= NULL
;
752 if (!FLAGS_SET(supported
, bit
))
755 if (FLAGS_SET(done
, bit
))
758 if (!FLAGS_SET(mask
, bit
))
761 to
= to_callback(bit
, userdata
);
763 /* Remember first error and try continuing */
764 q
= cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER
, from
, cgroup_controller_to_string(c
), to
, 0);
771 int cg_trim_everywhere(CGroupMask supported
, const char *path
, bool delete_root
) {
774 r
= cg_trim(SYSTEMD_CGROUP_CONTROLLER
, path
, delete_root
);
778 q
= cg_all_unified();
784 return cg_trim_v1_controllers(supported
, _CGROUP_MASK_ALL
, path
, delete_root
);
787 int cg_trim_v1_controllers(CGroupMask supported
, CGroupMask mask
, const char *path
, bool delete_root
) {
792 supported
&= CGROUP_MASK_V1
;
793 mask
= CGROUP_MASK_EXTEND_JOINED(mask
);
796 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
797 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
799 if (!FLAGS_SET(supported
, bit
))
802 if (FLAGS_SET(done
, bit
))
805 if (FLAGS_SET(mask
, bit
)) {
806 /* Remember first error and try continuing */
807 q
= cg_trim(cgroup_controller_to_string(c
), path
, delete_root
);
810 done
|= CGROUP_MASK_EXTEND_JOINED(bit
);
816 int cg_enable_everywhere(
817 CGroupMask supported
,
820 CGroupMask
*ret_result_mask
) {
822 _cleanup_fclose_
FILE *f
= NULL
;
823 _cleanup_free_
char *fs
= NULL
;
830 if (supported
== 0) {
832 *ret_result_mask
= 0;
836 r
= cg_all_unified();
840 /* On the legacy hierarchy there's no concept of "enabling" controllers in cgroups defined. Let's claim
841 * complete success right away. (If you wonder why we return the full mask here, rather than zero: the
842 * caller tends to use the returned mask later on to compare if all controllers where properly joined,
843 * and if not requeues realization. This use is the primary purpose of the return value, hence let's
844 * minimize surprises here and reduce triggers for re-realization by always saying we fully
847 *ret_result_mask
= mask
& supported
& CGROUP_MASK_V2
; /* If you wonder why we mask this with
848 * CGROUP_MASK_V2: The 'supported' mask
849 * might contain pure-V1 or BPF
850 * controllers, and we never want to
851 * claim that we could enable those with
852 * cgroup.subtree_control */
856 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, p
, "cgroup.subtree_control", &fs
);
860 for (c
= 0; c
< _CGROUP_CONTROLLER_MAX
; c
++) {
861 CGroupMask bit
= CGROUP_CONTROLLER_TO_MASK(c
);
864 if (!FLAGS_SET(CGROUP_MASK_V2
, bit
))
867 if (!FLAGS_SET(supported
, bit
))
870 n
= cgroup_controller_to_string(c
);
872 char s
[1 + strlen(n
) + 1];
874 s
[0] = FLAGS_SET(mask
, bit
) ? '+' : '-';
880 return log_debug_errno(errno
, "Failed to open cgroup.subtree_control file of %s: %m", p
);
883 r
= write_string_stream(f
, s
, WRITE_STRING_FILE_DISABLE_BUFFER
);
885 log_debug_errno(r
, "Failed to %s controller %s for %s (%s): %m",
886 FLAGS_SET(mask
, bit
) ? "enable" : "disable", n
, p
, fs
);
889 /* If we can't turn off a controller, leave it on in the reported resulting mask. This
890 * happens for example when we attempt to turn off a controller up in the tree that is
891 * used down in the tree. */
892 if (!FLAGS_SET(mask
, bit
) && r
== -EBUSY
) /* You might wonder why we check for EBUSY
893 * only here, and not follow the same logic
894 * for other errors such as EINVAL or
895 * EOPNOTSUPP or anything else. That's
896 * because EBUSY indicates that the
897 * controllers is currently enabled and
898 * cannot be disabled because something down
899 * the hierarchy is still using it. Any other
900 * error most likely means something like "I
901 * never heard of this controller" or
902 * similar. In the former case it's hence
903 * safe to assume the controller is still on
904 * after the failed operation, while in the
905 * latter case it's safer to assume the
906 * controller is unknown and hence certainly
910 /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
911 if (FLAGS_SET(mask
, bit
))
917 /* Let's return the precise set of controllers now enabled for the cgroup. */
919 *ret_result_mask
= ret
;