/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <unistd.h>

#include "cgroup-setup.h"
#include "cgroup-util.h"
#include "errno-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "missing_threads.h"
#include "mkdir.h"
#include "parse-util.h"
#include "path-util.h"
#include "proc-cmdline.h"
#include "process-util.h"
#include "recurse-dir.h"
#include "stdio-util.h"
#include "string-util.h"
#include "user-util.h"
#include "virt.h"

static int cg_any_controller_used_for_v1(void) {
        _cleanup_free_ char *buf = NULL;
        _cleanup_strv_free_ char **lines = NULL;
        int r;

        r = read_full_virtual_file("/proc/cgroups", &buf, NULL);
        if (r < 0)
                return log_debug_errno(r, "Could not read /proc/cgroups, ignoring: %m");

        r = strv_split_newlines_full(&lines, buf, 0);
        if (r < 0)
                return r;

        /* The intention of this is to check if the fully unified cgroup tree setup is possible, meaning all
         * enabled kernel cgroup controllers are currently not in use by cgroup1. For reference:
         * https://systemd.io/CGROUP_DELEGATION/#three-different-tree-setups-
         *
         * Note that this is typically only useful to check inside a container where we don't know what
         * cgroup tree setup is in use by the host; if the host is using legacy or hybrid, we can't use
         * unified since some or all controllers would be missing. This is not the best way to detect this,
         * as whatever container manager created our container should have mounted /sys/fs/cgroup
         * appropriately, but in case that wasn't done, we try to detect if it's possible for us to use
         * unified cgroups. */
        STRV_FOREACH(line, lines) {
                _cleanup_free_ char *name = NULL, *hierarchy_id = NULL, *num = NULL, *enabled = NULL;

                /* Skip header line */
                if (startswith(*line, "#"))
                        continue;

                const char *p = *line;
                r = extract_many_words(&p, NULL, 0, &name, &hierarchy_id, &num, &enabled, NULL);
                if (r < 0)
                        return log_debug_errno(r, "Error parsing /proc/cgroups line, ignoring: %m");
                else if (r < 4) {
                        log_debug("Invalid /proc/cgroups line, ignoring.");
                        continue;
                }

                /* Ignore disabled controllers. */
                if (streq(enabled, "0"))
                        continue;

                /* Ignore controllers we don't care about. */
                if (cgroup_controller_from_string(name) < 0)
                        continue;

                /* Since the unified cgroup doesn't use multiple hierarchies, if any controller has a
                 * non-zero hierarchy_id that means it's in use already in a legacy (or hybrid) cgroup v1
                 * hierarchy, and can't be used in a unified cgroup. */
                if (!streq(hierarchy_id, "0")) {
                        log_debug("Cgroup controller %s in use by legacy v1 hierarchy.", name);
                        return 1;
                }
        }

        return 0;
}
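
/* For reference, a minimal sketch of the /proc/cgroups format parsed above (columns:
 * subsys_name, hierarchy, num_cgroups, enabled; the values shown are illustrative only):
 *
 *     #subsys_name    hierarchy       num_cgroups     enabled
 *     cpu             0               1               1
 *     memory          4               13              1
 *
 * Here "memory" has a non-zero hierarchy ID, i.e. it is bound to a v1 hierarchy, so the
 * function above would return 1 and a fully unified setup would be ruled out. */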

bool cg_is_unified_wanted(void) {
        static thread_local int wanted = -1;
        bool b;
        const bool is_default = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL;
        _cleanup_free_ char *c = NULL;
        int r;

        /* If we have a cached value, return that. */
        if (wanted >= 0)
                return wanted;

        /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
        r = cg_unified_cached(true);
        if (r >= 0)
                return (wanted = r >= CGROUP_UNIFIED_ALL);

        /* If we were explicitly passed systemd.unified_cgroup_hierarchy, respect that. */
        r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b);
        if (r > 0)
                return (wanted = b);

        /* If we passed cgroup_no_v1=all with no other instructions, it seems highly unlikely that we want to
         * use hybrid or legacy hierarchy. */
        r = proc_cmdline_get_key("cgroup_no_v1", 0, &c);
        if (r > 0 && streq_ptr(c, "all"))
                return (wanted = true);

        /* If any controller is in use as v1, don't use unified. */
        if (cg_any_controller_used_for_v1() > 0)
                return (wanted = false);

        return (wanted = is_default);
}

bool cg_is_legacy_wanted(void) {
        static thread_local int wanted = -1;

        /* If we have a cached value, return that. */
        if (wanted >= 0)
                return wanted;

        /* Check if we have cgroup v2 already mounted. */
        if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL)
                return (wanted = false);

        /* Otherwise, assume that at least partial legacy is wanted,
         * since cgroup v2 should already be mounted at this point. */
        return (wanted = true);
}

bool cg_is_hybrid_wanted(void) {
        static thread_local int wanted = -1;
        int r;
        bool b;
        const bool is_default = DEFAULT_HIERARCHY >= CGROUP_UNIFIED_SYSTEMD;
        /* We default to true if the default is "hybrid", obviously, but also when the default is "unified",
         * because if we get called, it means that unified hierarchy was not mounted. */

        /* If we have a cached value, return that. */
        if (wanted >= 0)
                return wanted;

        /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
        if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL)
                return (wanted = false);

        /* Otherwise, let's see what the kernel command line has to say. Since checking is expensive, cache
         * a non-error result. */
        r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b);

        /* The meaning of the kernel option is reversed wrt. the return value of this function, hence the
         * negation. */
        return (wanted = r > 0 ? !b : is_default);
}
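
/* A rough summary of the kernel command line switches consulted above (illustrative, not
 * exhaustive; an already-mounted hierarchy always takes precedence):
 *
 *     systemd.unified_cgroup_hierarchy=1           → full unified (cgroup v2)
 *     cgroup_no_v1=all                             → full unified (cgroup v2)
 *     systemd.legacy_systemd_cgroup_controller=1   → full legacy (cgroup v1), i.e. no hybrid
 */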

int cg_weight_parse(const char *s, uint64_t *ret) {
        uint64_t u;
        int r;

        if (isempty(s)) {
                *ret = CGROUP_WEIGHT_INVALID;
                return 0;
        }

        r = safe_atou64(s, &u);
        if (r < 0)
                return r;

        if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX)
                return -ERANGE;

        *ret = u;
        return 0;
}

int cg_cpu_weight_parse(const char *s, uint64_t *ret) {
        if (streq_ptr(s, "idle"))
                return *ret = CGROUP_WEIGHT_IDLE;
        return cg_weight_parse(s, ret);
}
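
/* A minimal usage sketch for the parsers above (hypothetical caller, not part of this
 * file). Note that CGROUP_WEIGHT_IDLE lies outside the regular CGROUP_WEIGHT_MIN…MAX
 * range, hence the special-casing of "idle":
 *
 *     uint64_t w;
 *     assert_se(cg_cpu_weight_parse("idle", &w) == 0);      // w == CGROUP_WEIGHT_IDLE
 *     assert_se(cg_cpu_weight_parse("100", &w) == 0);       // w == 100
 *     assert_se(cg_cpu_weight_parse("0", &w) == -ERANGE);   // below CGROUP_WEIGHT_MIN
 */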

int cg_cpu_shares_parse(const char *s, uint64_t *ret) {
        uint64_t u;
        int r;

        if (isempty(s)) {
                *ret = CGROUP_CPU_SHARES_INVALID;
                return 0;
        }

        r = safe_atou64(s, &u);
        if (r < 0)
                return r;

        if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX)
                return -ERANGE;

        *ret = u;
        return 0;
}

int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
        uint64_t u;
        int r;

        if (isempty(s)) {
                *ret = CGROUP_BLKIO_WEIGHT_INVALID;
                return 0;
        }

        r = safe_atou64(s, &u);
        if (r < 0)
                return r;

        if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX)
                return -ERANGE;

        *ret = u;
        return 0;
}

static int trim_cb(
                RecurseDirEvent event,
                const char *path,
                int dir_fd,
                int inode_fd,
                const struct dirent *de,
                const struct statx *sx,
                void *userdata) {

        /* We ignore failures to delete inner cgroups (but debug-log them in case the error code is unexpected) */
        if (event == RECURSE_DIR_LEAVE &&
            de->d_type == DT_DIR &&
            unlinkat(dir_fd, de->d_name, AT_REMOVEDIR) < 0 &&
            !IN_SET(errno, ENOENT, ENOTEMPTY, EBUSY))
                log_debug_errno(errno, "Failed to trim inner cgroup %s, ignoring: %m", path);

        return RECURSE_DIR_CONTINUE;
}

int cg_trim(const char *controller, const char *path, bool delete_root) {
        _cleanup_free_ char *fs = NULL;
        int r, q;

        assert(path);
        assert(controller);

        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        r = recurse_dir_at(
                        AT_FDCWD,
                        fs,
                        /* statx_mask= */ 0,
                        /* n_depth_max= */ UINT_MAX,
                        RECURSE_DIR_ENSURE_TYPE,
                        trim_cb,
                        NULL);
        if (r == -ENOENT) /* non-existing is the ultimate trimming, hence no error */
                r = 0;
        else if (r < 0)
                log_debug_errno(r, "Failed to iterate through cgroup %s: %m", path);

        /* If we shall delete the top-level cgroup, then propagate the failure to do so (except if it is
         * already gone anyway). Also, let's debug log about this failure, except if the error code is an
         * expected one. */
        if (delete_root && !empty_or_root(path) &&
            rmdir(fs) < 0 && errno != ENOENT) {
                if (!IN_SET(errno, ENOTEMPTY, EBUSY))
                        log_debug_errno(errno, "Failed to trim cgroup %s: %m", path);
                if (r >= 0)
                        r = -errno;
        }

        q = cg_hybrid_unified();
        if (q < 0)
                return q;
        if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root);

        return r;
}
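
/* Usage sketch (hypothetical): drop everything below a unit's cgroup but keep the cgroup
 * itself, e.g. after all children have exited:
 *
 *     r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, "/system.slice/foo.service", false);
 *
 * With delete_root=true the top-level directory is removed as well, and a failure to do
 * so (other than ENOENT) is propagated to the caller. */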

/* Create a cgroup in the hierarchy of controller.
 * Returns 0 if the group already existed, 1 on success, negative otherwise.
 */
int cg_create(const char *controller, const char *path) {
        _cleanup_free_ char *fs = NULL;
        int r;

        r = cg_get_path_and_check(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        r = mkdir_parents(fs, 0755);
        if (r < 0)
                return r;

        r = RET_NERRNO(mkdir(fs, 0755));
        if (r == -EEXIST)
                return 0;
        if (r < 0)
                return r;

        r = cg_hybrid_unified();
        if (r < 0)
                return r;

        if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
                if (r < 0)
                        log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path);
        }

        return 1;
}

int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
        int r, q;

        assert(pid >= 0);

        r = cg_create(controller, path);
        if (r < 0)
                return r;

        q = cg_attach(controller, path, pid);
        if (q < 0)
                return q;

        /* This does not remove the cgroup on failure */
        return r;
}
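
/* Usage sketch (hypothetical): create a cgroup and move the invoking process into it in
 * one step; pid 0 is shorthand for the calling process (see cg_attach() below):
 *
 *     r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, "/system.slice/foo.service", 0);
 *     if (r < 0)
 *             log_error_errno(r, "Failed to set up cgroup: %m");
 */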

int cg_attach(const char *controller, const char *path, pid_t pid) {
        _cleanup_free_ char *fs = NULL;
        char c[DECIMAL_STR_MAX(pid_t) + 2];
        int r;

        assert(path);
        assert(pid >= 0);

        r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs);
        if (r < 0)
                return r;

        if (pid == 0)
                pid = getpid_cached();

        xsprintf(c, PID_FMT "\n", pid);

        r = write_string_file(fs, c, WRITE_STRING_FILE_DISABLE_BUFFER);
        if (r == -EOPNOTSUPP && cg_is_threaded(controller, path) > 0)
                /* When the threaded mode is used, we cannot read/write the file. Let's return a recognizable error. */
                return -EUCLEAN;
        if (r < 0)
                return r;

        r = cg_hybrid_unified();
        if (r < 0)
                return r;

        if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid);
                if (r < 0)
                        log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path);
        }

        return 0;
}

int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
        int r;

        assert(controller);
        assert(path);
        assert(pid >= 0);

        r = cg_attach(controller, path, pid);
        if (r < 0) {
                char prefix[strlen(path) + 1];

                /* This didn't work? Then let's try all prefixes of
                 * the destination */

                PATH_FOREACH_PREFIX(prefix, path) {
                        int q;

                        q = cg_attach(controller, prefix, pid);
                        if (q >= 0)
                                return q;
                }
        }

        return r;
}
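
/* To illustrate the prefix walk above: for path "/a/b/c", PATH_FOREACH_PREFIX yields
 * "/a/b", then "/a", then "" (which the cg_* calls treat as the root cgroup), so after
 * the exact path fails we attach to the nearest ancestor cgroup that works instead. */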

int cg_set_access(
                const char *controller,
                const char *path,
                uid_t uid,
                gid_t gid) {

        struct Attribute {
                const char *name;
                bool fatal;
        };

        /* cgroup v1, aka legacy/non-unified */
        static const struct Attribute legacy_attributes[] = {
                { "cgroup.procs",           true  },
                { "tasks",                  false },
                { "cgroup.clone_children",  false },
                {},
        };

        /* cgroup v2, aka unified */
        static const struct Attribute unified_attributes[] = {
                { "cgroup.procs",           true  },
                { "cgroup.subtree_control", true  },
                { "cgroup.threads",         false },
                {},
        };

        static const struct Attribute* const attributes[] = {
                [false] = legacy_attributes,
                [true]  = unified_attributes,
        };

        _cleanup_free_ char *fs = NULL;
        const struct Attribute *i;
        int r, unified;

        assert(path);

        if (uid == UID_INVALID && gid == GID_INVALID)
                return 0;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;

        /* Configure access to the cgroup itself */
        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        r = chmod_and_chown(fs, 0755, uid, gid);
        if (r < 0)
                return r;

        /* Configure access to the cgroup's attributes */
        for (i = attributes[unified]; i->name; i++) {
                fs = mfree(fs);

                r = cg_get_path(controller, path, i->name, &fs);
                if (r < 0)
                        return r;

                r = chmod_and_chown(fs, 0644, uid, gid);
                if (r < 0) {
                        if (i->fatal)
                                return r;

                        log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs);
                }
        }

        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_hybrid_unified();
                if (r < 0)
                        return r;
                if (r > 0) {
                        /* Always propagate access mode from unified to legacy controller */
                        r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid);
                        if (r < 0)
                                log_debug_errno(r, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path);
                }
        }

        return 0;
}

struct access_callback_data {
        uid_t uid;
        gid_t gid;
        int error;
};

static int access_callback(
                RecurseDirEvent event,
                const char *path,
                int dir_fd,
                int inode_fd,
                const struct dirent *de,
                const struct statx *sx,
                void *userdata) {

        struct access_callback_data *d = ASSERT_PTR(userdata);

        if (!IN_SET(event, RECURSE_DIR_ENTER, RECURSE_DIR_ENTRY))
                return RECURSE_DIR_CONTINUE;

        assert(inode_fd >= 0);

        /* fchown() doesn't support O_PATH fds, hence we use the /proc/self/fd/ trick */
        if (chown(FORMAT_PROC_FD_PATH(inode_fd), d->uid, d->gid) < 0) {
                log_debug_errno(errno, "Failed to change ownership of '%s', ignoring: %m", ASSERT_PTR(path));

                if (d->error == 0) /* Return first error to caller */
                        d->error = errno;
        }

        return RECURSE_DIR_CONTINUE;
}
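
/* A sketch of the /proc/self/fd/ trick used above: chown() on the magic symlink follows
 * it to the O_PATH fd's inode, which plain fchown() refuses to touch. Roughly what
 * FORMAT_PROC_FD_PATH() expands to (the path is hypothetical):
 *
 *     int fd = open("/sys/fs/cgroup/foo", O_PATH|O_CLOEXEC);
 *     char p[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
 *     xsprintf(p, "/proc/self/fd/%i", fd);
 *     (void) chown(p, uid, gid);
 */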

int cg_set_access_recursive(
                const char *controller,
                const char *path,
                uid_t uid,
                gid_t gid) {

        _cleanup_close_ int fd = -EBADF;
        _cleanup_free_ char *fs = NULL;
        int r;

        /* A recursive version of cg_set_access(). But note that this one changes ownership of *all* files,
         * not just the allowlist that cg_set_access() uses. Use cg_set_access() on the cgroup you want to
         * delegate, and cg_set_access_recursive() for any subcgroups you might want to create below it. */

        if (!uid_is_valid(uid) && !gid_is_valid(gid))
                return 0;

        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        fd = open(fs, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
        if (fd < 0)
                return -errno;

        struct access_callback_data d = {
                .uid = uid,
                .gid = gid,
        };

        r = recurse_dir(fd,
                        fs,
                        /* statx_mask= */ 0,
                        /* n_depth_max= */ UINT_MAX,
                        RECURSE_DIR_SAME_MOUNT|RECURSE_DIR_INODE_FD|RECURSE_DIR_TOPLEVEL,
                        access_callback,
                        &d);
        if (r < 0)
                return r;

        return -d.error;
}
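
/* Usage sketch (hypothetical): delegate a subtree to an unprivileged user. Per the note
 * above, use cg_set_access() on the delegation boundary itself and the recursive variant
 * only below it:
 *
 *     (void) cg_set_access(SYSTEMD_CGROUP_CONTROLLER, "/foo", uid, gid);
 *     (void) cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, "/foo/payload", uid, gid);
 */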

int cg_migrate(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        bool done = false;
        _cleanup_set_free_ Set *s = NULL;
        int r, ret = 0;
        pid_t my_pid;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        s = set_new(NULL);
        if (!s)
                return -ENOMEM;

        my_pid = getpid_cached();

        do {
                _cleanup_fclose_ FILE *f = NULL;
                pid_t pid = 0;
                done = true;

                r = cg_enumerate_processes(cfrom, pfrom, &f);
                if (r < 0) {
                        if (ret >= 0 && r != -ENOENT)
                                return r;

                        return ret;
                }

                while ((r = cg_read_pid(f, &pid)) > 0) {

                        /* This might do weird stuff if we aren't a
                         * single-threaded program. However, we
                         * luckily know we are. */
                        if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
                                continue;

                        if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
                                continue;

                        /* Ignore kernel threads. Since they can only
                         * exist in the root cgroup, we only check for
                         * them there. */
                        if (cfrom &&
                            empty_or_root(pfrom) &&
                            is_kernel_thread(pid) > 0)
                                continue;

                        r = cg_attach(cto, pto, pid);
                        if (r < 0) {
                                if (ret >= 0 && r != -ESRCH)
                                        ret = r;
                        } else if (ret == 0)
                                ret = 1;

                        done = false;

                        r = set_put(s, PID_TO_PTR(pid));
                        if (r < 0) {
                                if (ret >= 0)
                                        return r;

                                return ret;
                        }
                }

                if (r < 0) {
                        if (ret >= 0)
                                return r;

                        return ret;
                }
        } while (!done);

        return ret;
}
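
/* Usage sketch (hypothetical): move every process of one cgroup into another, skipping
 * ourselves:
 *
 *     r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, "/from",
 *                    SYSTEMD_CGROUP_CONTROLLER, "/to", CGROUP_IGNORE_SELF);
 *
 * The do/while loop above re-enumerates until a full pass moves nobody, since a process
 * may fork while we iterate; the Set of already-seen PIDs keeps us from retrying
 * processes that cannot be moved. */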

int cg_migrate_recursive(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        _cleanup_closedir_ DIR *d = NULL;
        int r, ret = 0;
        char *fn;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        ret = cg_migrate(cfrom, pfrom, cto, pto, flags);

        r = cg_enumerate_subgroups(cfrom, pfrom, &d);
        if (r < 0) {
                if (ret >= 0 && r != -ENOENT)
                        return r;

                return ret;
        }

        while ((r = cg_read_subgroup(d, &fn)) > 0) {
                _cleanup_free_ char *p = NULL;

                p = path_join(empty_to_root(pfrom), fn);
                free(fn);
                if (!p)
                        return -ENOMEM;

                r = cg_migrate_recursive(cfrom, p, cto, pto, flags);
                if (r != 0 && ret >= 0)
                        ret = r;
        }

        if (r < 0 && ret >= 0)
                ret = r;

        if (flags & CGROUP_REMOVE) {
                r = cg_rmdir(cfrom, pfrom);
                if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
                        return r;
        }

        return ret;
}

int cg_migrate_recursive_fallback(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        int r;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags);
        if (r < 0) {
                char prefix[strlen(pto) + 1];

                /* This didn't work? Then let's try all prefixes of the destination */

                PATH_FOREACH_PREFIX(prefix, pto) {
                        int q;

                        q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags);
                        if (q >= 0)
                                return q;
                }
        }

        return r;
}

int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
        CGroupController c;
        CGroupMask done;
        bool created;
        int r;

        /* This one will create a cgroup in our private tree, but also
         * duplicate it in the trees specified in mask, and remove it
         * in all others.
         *
         * Returns 0 if the group already existed in the systemd hierarchy,
         * 1 on success, negative otherwise.
         */

        /* First create the cgroup in our own hierarchy. */
        r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
        if (r < 0)
                return r;
        created = r;

        /* If we are in the unified hierarchy, we are done now */
        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                return created;

        supported &= CGROUP_MASK_V1;
        mask = CGROUP_MASK_EXTEND_JOINED(mask);
        done = 0;

        /* Otherwise, do the same in the other hierarchies */
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *n;

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                n = cgroup_controller_to_string(c);
                if (FLAGS_SET(mask, bit))
                        (void) cg_create(n, path);

                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return created;
}

int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
        int r;

        r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
        if (r < 0)
                return r;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                return 0;

        supported &= CGROUP_MASK_V1;
        CGroupMask done = 0;

        for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *p = NULL;

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                if (path_callback)
                        p = path_callback(bit, userdata);
                if (!p)
                        p = path;

                (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return 0;
}

int cg_migrate_v1_controllers(CGroupMask supported, CGroupMask mask, const char *from, cg_migrate_callback_t to_callback, void *userdata) {
        CGroupController c;
        CGroupMask done;
        int r = 0, q;

        assert(to_callback);

        supported &= CGROUP_MASK_V1;
        mask = CGROUP_MASK_EXTEND_JOINED(mask);
        done = 0;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *to = NULL;

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                if (!FLAGS_SET(mask, bit))
                        continue;

                to = to_callback(bit, userdata);

                /* Remember first error and try continuing */
                q = cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, from, cgroup_controller_to_string(c), to, 0);
                r = (r < 0) ? r : q;

                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return r;
}

int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
        int r, q;

        r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
        if (r < 0)
                return r;

        q = cg_all_unified();
        if (q < 0)
                return q;
        if (q > 0)
                return r;

        return cg_trim_v1_controllers(supported, _CGROUP_MASK_ALL, path, delete_root);
}

int cg_trim_v1_controllers(CGroupMask supported, CGroupMask mask, const char *path, bool delete_root) {
        CGroupController c;
        CGroupMask done;
        int r = 0, q;

        supported &= CGROUP_MASK_V1;
        mask = CGROUP_MASK_EXTEND_JOINED(mask);
        done = 0;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                if (FLAGS_SET(mask, bit)) {
                        /* Remember first error and try continuing */
                        q = cg_trim(cgroup_controller_to_string(c), path, delete_root);
                        r = (r < 0) ? r : q;
                }
                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return r;
}

int cg_enable_everywhere(
                CGroupMask supported,
                CGroupMask mask,
                const char *p,
                CGroupMask *ret_result_mask) {

        _cleanup_fclose_ FILE *f = NULL;
        _cleanup_free_ char *fs = NULL;
        CGroupController c;
        CGroupMask ret = 0;
        int r;

        assert(p);

        if (supported == 0) {
                if (ret_result_mask)
                        *ret_result_mask = 0;
                return 0;
        }

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r == 0) {
                /* On the legacy hierarchy the concept of "enabling" controllers in cgroups is not defined. Let's
                 * claim complete success right away. (If you wonder why we return the full mask here, rather than
                 * zero: the caller tends to use the returned mask later on to compare if all controllers were
                 * properly joined, and if not requeues realization. This use is the primary purpose of the return
                 * value, hence let's minimize surprises here and reduce triggers for re-realization by always
                 * saying we fully succeeded.) */
                if (ret_result_mask)
                        *ret_result_mask = mask & supported & CGROUP_MASK_V2; /* If you wonder why we mask this with
                                                                               * CGROUP_MASK_V2: The 'supported' mask
                                                                               * might contain pure-V1 or BPF
                                                                               * controllers, and we never want to
                                                                               * claim that we could enable those with
                                                                               * cgroup.subtree_control */
                return 0;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
        if (r < 0)
                return r;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *n;

                if (!FLAGS_SET(CGROUP_MASK_V2, bit))
                        continue;

                if (!FLAGS_SET(supported, bit))
                        continue;

                n = cgroup_controller_to_string(c);
                {
                        char s[1 + strlen(n) + 1];

                        s[0] = FLAGS_SET(mask, bit) ? '+' : '-';
                        strcpy(s + 1, n);

                        if (!f) {
                                f = fopen(fs, "we");
                                if (!f)
                                        return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
                        }

                        r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER);
                        if (r < 0) {
                                log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m",
                                                FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs);
                                clearerr(f);

                                /* If we can't turn off a controller, leave it on in the reported resulting mask. This
                                 * happens for example when we attempt to turn off a controller up in the tree that is
                                 * used down in the tree. */
                                if (!FLAGS_SET(mask, bit) && r == -EBUSY) /* You might wonder why we check for EBUSY
                                                                           * only here, and not follow the same logic
                                                                           * for other errors such as EINVAL or
                                                                           * EOPNOTSUPP or anything else. That's
                                                                           * because EBUSY indicates that the
                                                                           * controller is currently enabled and
                                                                           * cannot be disabled because something down
                                                                           * the hierarchy is still using it. Any other
                                                                           * error most likely means something like "I
                                                                           * never heard of this controller" or
                                                                           * similar. In the former case it's hence
                                                                           * safe to assume the controller is still on
                                                                           * after the failed operation, while in the
                                                                           * latter case it's safer to assume the
                                                                           * controller is unknown and hence certainly
                                                                           * not enabled. */
                                        ret |= bit;
                        } else {
                                /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
                                if (FLAGS_SET(mask, bit))
                                        ret |= bit;
                        }
                }
        }

        /* Let's return the precise set of controllers now enabled for the cgroup. */
        if (ret_result_mask)
                *ret_result_mask = ret;

        return 0;
}
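
/* For reference, the writes performed above match what one would do by hand; one token
 * per write, so a failure can be attributed to a specific controller (illustrative shell
 * equivalent):
 *
 *     echo "+cpu" > /sys/fs/cgroup/foo/cgroup.subtree_control
 *     echo "-io"  > /sys/fs/cgroup/foo/cgroup.subtree_control
 */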