/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <unistd.h>

#include "cgroup-setup.h"
#include "cgroup-util.h"
#include "errno-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "missing_threads.h"
#include "mkdir.h"
#include "parse-util.h"
#include "path-util.h"
#include "proc-cmdline.h"
#include "process-util.h"
#include "recurse-dir.h"
#include "stdio-util.h"
#include "string-util.h"
#include "user-util.h"
#include "virt.h"

static int cg_any_controller_used_for_v1(void) {
        _cleanup_free_ char *buf = NULL;
        _cleanup_strv_free_ char **lines = NULL;
        int r;

        r = read_full_virtual_file("/proc/cgroups", &buf, NULL);
        if (r < 0)
                return log_debug_errno(r, "Could not read /proc/cgroups, ignoring: %m");

        r = strv_split_newlines_full(&lines, buf, 0);
        if (r < 0)
                return r;

        /* The intention of this is to check if the fully unified cgroup tree setup is possible, meaning all
         * enabled kernel cgroup controllers are currently not in use by cgroup1. For reference:
         * https://systemd.io/CGROUP_DELEGATION/#three-different-tree-setups-
         *
         * Note that this is typically only useful to check inside a container where we don't know what
         * cgroup tree setup is in use by the host; if the host is using legacy or hybrid, we can't use
         * unified since some or all controllers would be missing. This is not the best way to detect this,
         * as whatever container manager created our container should have mounted /sys/fs/cgroup
         * appropriately, but in case that wasn't done, we try to detect if it's possible for us to use
         * unified cgroups. */
        STRV_FOREACH(line, lines) {
                _cleanup_free_ char *name = NULL, *hierarchy_id = NULL, *num = NULL, *enabled = NULL;

                /* Skip header line */
                if (startswith(*line, "#"))
                        continue;

                const char *p = *line;
                r = extract_many_words(&p, NULL, 0, &name, &hierarchy_id, &num, &enabled);
                if (r < 0)
                        return log_debug_errno(r, "Error parsing /proc/cgroups line, ignoring: %m");
                else if (r < 4) {
                        log_debug("Invalid /proc/cgroups line, ignoring.");
                        continue;
                }

                /* Ignore disabled controllers. */
                if (streq(enabled, "0"))
                        continue;

                /* Ignore controllers we don't care about. */
                if (cgroup_controller_from_string(name) < 0)
                        continue;

                /* Since the unified cgroup doesn't use multiple hierarchies, if any controller has a
                 * non-zero hierarchy_id that means it's in use already in a legacy (or hybrid) cgroup v1
                 * hierarchy, and can't be used in a unified cgroup. */
                if (!streq(hierarchy_id, "0")) {
                        log_debug("Cgroup controller %s in use by legacy v1 hierarchy.", name);
                        return 1;
                }
        }

        return 0;
}
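
/* Editor's note (illustrative, not upstream code): the parser above expects the usual four-column
 * /proc/cgroups layout, e.g.:
 *
 *     #subsys_name    hierarchy       num_cgroups     enabled
 *     cpu             0               1               1
 *     memory          3               42              1
 *
 * In this hypothetical example the "memory" line carries a non-zero hierarchy id, so the function
 * would return 1, i.e. at least one controller is still bound to a cgroup v1 hierarchy. */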

bool cg_is_unified_wanted(void) {
        static thread_local int wanted = -1;
        int r;

        /* If we have a cached value, return that. */
        if (wanted >= 0)
                return wanted;

        /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
        r = cg_unified_cached(true);
        if (r >= 0)
                return (wanted = r >= CGROUP_UNIFIED_ALL);

        /* If we were explicitly passed systemd.unified_cgroup_hierarchy, respect that. */
        bool b;
        r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", /* flags = */ 0, &b);
        if (r > 0)
                return (wanted = b);

        /* If we passed cgroup_no_v1=all with no other instructions, it seems highly unlikely that we want to
         * use hybrid or legacy hierarchy. */
        _cleanup_free_ char *c = NULL;
        r = proc_cmdline_get_key("cgroup_no_v1", 0, &c);
        if (r > 0 && streq_ptr(c, "all"))
                return (wanted = true);

        /* If any controller is in use as v1, don't use unified. */
        return (wanted = (cg_any_controller_used_for_v1() <= 0));
}

bool cg_is_legacy_wanted(void) {
        static thread_local int wanted = -1;

        /* If we have a cached value, return that. */
        if (wanted >= 0)
                return wanted;

        /* Check if we have cgroup v2 already mounted. */
        if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL)
                return (wanted = false);

        /* Otherwise, assume that at least partial legacy is wanted,
         * since cgroup v2 should already be mounted at this point. */
        return (wanted = true);
}

bool cg_is_hybrid_wanted(void) {
        static thread_local int wanted = -1;
        int r;

        /* If we have a cached value, return that. */
        if (wanted >= 0)
                return wanted;

        /* If the hierarchy is already mounted, then follow whatever was chosen for it. */
        if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL)
                return (wanted = false);

        /* Otherwise, let's see what the kernel command line has to say. Since checking is expensive, cache
         * a non-error result.
         * The meaning of the kernel option is reversed with respect to the return value of this function,
         * hence the negation. */
        bool b;
        r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", /* flags = */ 0, &b);
        if (r > 0)
                return (wanted = !b);

        /* The default hierarchy is "unified". But if this is reached, it means that unified hierarchy was
         * not mounted, so return true too. */
        return (wanted = true);
}
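
/* Editor's sketch (hypothetical caller, not upstream code): the three predicates above are meant to be
 * consulted roughly like this when deciding which of the tree setups described in
 * https://systemd.io/CGROUP_DELEGATION/ to mount. */
#if 0
static const char* example_pick_tree_setup(void) {
        /* Note that cg_is_legacy_wanted() is true in both non-unified cases, since v1 controller
         * hierarchies get mounted for the hybrid setup too. */
        if (cg_is_unified_wanted())
                return "unified"; /* cgroup v2 everywhere */
        if (cg_is_hybrid_wanted())
                return "hybrid";  /* v1 controllers, v2 for systemd's own hierarchy */
        return "legacy";          /* cgroup v1 everywhere */
}
#endif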

bool cg_is_legacy_force_enabled(void) {
        bool force;

        if (!cg_is_legacy_wanted())
                return false;

        /* If in container, we have to follow host's cgroup hierarchy. */
        if (detect_container() > 0)
                return true;

        if (proc_cmdline_get_bool("SYSTEMD_CGROUP_ENABLE_LEGACY_FORCE", /* flags = */ 0, &force) < 0)
                return false;

        return force;
}

int cg_weight_parse(const char *s, uint64_t *ret) {
        uint64_t u;
        int r;

        if (isempty(s)) {
                *ret = CGROUP_WEIGHT_INVALID;
                return 0;
        }

        r = safe_atou64(s, &u);
        if (r < 0)
                return r;

        if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX)
                return -ERANGE;

        *ret = u;
        return 0;
}

int cg_cpu_weight_parse(const char *s, uint64_t *ret) {
        if (streq_ptr(s, "idle"))
                return *ret = CGROUP_WEIGHT_IDLE;
        return cg_weight_parse(s, ret);
}

int cg_cpu_shares_parse(const char *s, uint64_t *ret) {
        uint64_t u;
        int r;

        if (isempty(s)) {
                *ret = CGROUP_CPU_SHARES_INVALID;
                return 0;
        }

        r = safe_atou64(s, &u);
        if (r < 0)
                return r;

        if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX)
                return -ERANGE;

        *ret = u;
        return 0;
}

int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
        uint64_t u;
        int r;

        if (isempty(s)) {
                *ret = CGROUP_BLKIO_WEIGHT_INVALID;
                return 0;
        }

        r = safe_atou64(s, &u);
        if (r < 0)
                return r;

        if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX)
                return -ERANGE;

        *ret = u;
        return 0;
}
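
/* Editor's sketch (hypothetical usage, not upstream code): how a caller might use the parsers above,
 * e.g. when reading a CPUWeight= style setting. */
#if 0
static int example_parse_cpu_weight(const char *value, uint64_t *ret_weight) {
        int r;

        /* Per the parsers above: "idle" maps to CGROUP_WEIGHT_IDLE, the empty string to
         * CGROUP_WEIGHT_INVALID, and anything else must lie in [CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX]. */
        r = cg_cpu_weight_parse(value, ret_weight);
        if (r < 0)
                return log_debug_errno(r, "Failed to parse CPU weight '%s': %m", value);

        return 0;
}
#endif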

static int trim_cb(
                RecurseDirEvent event,
                const char *path,
                int dir_fd,
                int inode_fd,
                const struct dirent *de,
                const struct statx *sx,
                void *userdata) {

        /* We ignore failures to delete inner cgroups (but debug-log them in case the error code is unexpected) */
        if (event == RECURSE_DIR_LEAVE &&
            de->d_type == DT_DIR &&
            unlinkat(dir_fd, de->d_name, AT_REMOVEDIR) < 0 &&
            !IN_SET(errno, ENOENT, ENOTEMPTY, EBUSY))
                log_debug_errno(errno, "Failed to trim inner cgroup %s, ignoring: %m", path);

        return RECURSE_DIR_CONTINUE;
}

int cg_trim(const char *controller, const char *path, bool delete_root) {
        _cleanup_free_ char *fs = NULL;
        int r, q;

        assert(path);
        assert(controller);

        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        r = recurse_dir_at(
                        AT_FDCWD,
                        fs,
                        /* statx_mask= */ 0,
                        /* n_depth_max= */ UINT_MAX,
                        RECURSE_DIR_ENSURE_TYPE,
                        trim_cb,
                        NULL);
        if (r == -ENOENT) /* non-existing is the ultimate trimming, hence no error */
                r = 0;
        else if (r < 0)
                log_debug_errno(r, "Failed to iterate through cgroup %s: %m", path);

        /* If we shall delete the top-level cgroup, then propagate the failure to do so (except if it is
         * already gone anyway). Also, let's debug log about this failure, except if the error code is an
         * expected one. */
        if (delete_root && !empty_or_root(path) &&
            rmdir(fs) < 0 && errno != ENOENT) {
                if (!IN_SET(errno, ENOTEMPTY, EBUSY))
                        log_debug_errno(errno, "Failed to trim cgroup %s: %m", path);
                if (r >= 0)
                        r = -errno;
        }

        q = cg_hybrid_unified();
        if (q < 0)
                return q;
        if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root);

        return r;
}
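
/* Editor's note (illustrative, not upstream code): as a hypothetical example,
 * cg_trim(SYSTEMD_CGROUP_CONTROLLER, "/example.slice/example.service", true) removes the nested
 * cgroups underneath the named cgroup and the named cgroup itself, while delete_root=false leaves
 * the named cgroup in place. Cgroups that are still busy or non-empty stay around; ENOTEMPTY and
 * EBUSY are treated as expected errors above. */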

/* Create a cgroup in the hierarchy of controller.
 * Returns 0 if the group already existed, 1 on success, negative otherwise.
 */
int cg_create(const char *controller, const char *path) {
        _cleanup_free_ char *fs = NULL;
        int r;

        r = cg_get_path_and_check(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        r = mkdir_parents(fs, 0755);
        if (r < 0)
                return r;

        r = RET_NERRNO(mkdir(fs, 0755));
        if (r == -EEXIST)
                return 0;
        if (r < 0)
                return r;

        r = cg_hybrid_unified();
        if (r < 0)
                return r;

        if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
                if (r < 0)
                        log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path);
        }

        return 1;
}

int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
        int r, q;

        assert(pid >= 0);

        r = cg_create(controller, path);
        if (r < 0)
                return r;

        q = cg_attach(controller, path, pid);
        if (q < 0)
                return q;

        /* This does not remove the cgroup on failure */
        return r;
}
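
/* Editor's sketch (hypothetical usage, not upstream code): creating a cgroup in systemd's own
 * hierarchy and moving the calling process into it (pid 0 means "myself", see cg_attach() below).
 * The path is made up for illustration. */
#if 0
static int example_create_and_join(void) {
        int r;

        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, "/example.slice/example.service", 0);
        if (r < 0)
                return log_warning_errno(r, "Failed to create and join example cgroup: %m");

        /* r is 1 if the cgroup was freshly created, 0 if it already existed. */
        return r;
}
#endif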

int cg_attach(const char *controller, const char *path, pid_t pid) {
        _cleanup_free_ char *fs = NULL;
        char c[DECIMAL_STR_MAX(pid_t) + 2];
        int r;

        assert(path);
        assert(pid >= 0);

        r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs);
        if (r < 0)
                return r;

        if (pid == 0)
                pid = getpid_cached();

        xsprintf(c, PID_FMT "\n", pid);

        r = write_string_file(fs, c, WRITE_STRING_FILE_DISABLE_BUFFER);
        if (r == -EOPNOTSUPP && cg_is_threaded(path) > 0)
                /* When the threaded mode is used, we cannot read/write the file. Let's return a recognizable error. */
                return -EUCLEAN;
        if (r < 0)
                return r;

        r = cg_hybrid_unified();
        if (r < 0)
                return r;

        if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid);
                if (r < 0)
                        log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path);
        }

        return 0;
}

int cg_fd_attach(int fd, pid_t pid) {
        char c[DECIMAL_STR_MAX(pid_t) + 2];

        assert(fd >= 0);
        assert(pid >= 0);

        if (pid == 0)
                pid = getpid_cached();

        xsprintf(c, PID_FMT "\n", pid);

        return write_string_file_at(fd, "cgroup.procs", c, WRITE_STRING_FILE_DISABLE_BUFFER);
}

int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
        int r;

        assert(controller);
        assert(path);
        assert(pid >= 0);

        r = cg_attach(controller, path, pid);
        if (r < 0) {
                char prefix[strlen(path) + 1];

                /* This didn't work? Then let's try all prefixes of
                 * the destination */

                PATH_FOREACH_PREFIX(prefix, path) {
                        int q;

                        q = cg_attach(controller, prefix, pid);
                        if (q >= 0)
                                return q;
                }
        }

        return r;
}

int cg_set_access(
                const char *controller,
                const char *path,
                uid_t uid,
                gid_t gid) {

        struct Attribute {
                const char *name;
                bool fatal;
        };

        /* cgroup v1, aka legacy/non-unified */
        static const struct Attribute legacy_attributes[] = {
                { "cgroup.procs", true },
                { "tasks", false },
                { "cgroup.clone_children", false },
                {},
        };

        /* cgroup v2, aka unified */
        static const struct Attribute unified_attributes[] = {
                { "cgroup.procs", true },
                { "cgroup.subtree_control", true },
                { "cgroup.threads", false },
                { "memory.oom.group", false },
                { "memory.reclaim", false },
                {},
        };

        static const struct Attribute* const attributes[] = {
                [false] = legacy_attributes,
                [true] = unified_attributes,
        };

        _cleanup_free_ char *fs = NULL;
        const struct Attribute *i;
        int r, unified;

        assert(path);

        if (uid == UID_INVALID && gid == GID_INVALID)
                return 0;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;

        /* Configure access to the cgroup itself */
        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        r = chmod_and_chown(fs, 0755, uid, gid);
        if (r < 0)
                return r;

        /* Configure access to the cgroup's attributes */
        for (i = attributes[unified]; i->name; i++) {
                fs = mfree(fs);

                r = cg_get_path(controller, path, i->name, &fs);
                if (r < 0)
                        return r;

                r = chmod_and_chown(fs, 0644, uid, gid);
                if (r < 0) {
                        if (i->fatal)
                                return r;

                        log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs);
                }
        }

        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
                r = cg_hybrid_unified();
                if (r < 0)
                        return r;
                if (r > 0) {
                        /* Always propagate access mode from unified to legacy controller */
                        r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid);
                        if (r < 0)
                                log_debug_errno(r, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path);
                }
        }

        return 0;
}

struct access_callback_data {
        uid_t uid;
        gid_t gid;
        int error;
};

static int access_callback(
                RecurseDirEvent event,
                const char *path,
                int dir_fd,
                int inode_fd,
                const struct dirent *de,
                const struct statx *sx,
                void *userdata) {

        struct access_callback_data *d = ASSERT_PTR(userdata);

        if (!IN_SET(event, RECURSE_DIR_ENTER, RECURSE_DIR_ENTRY))
                return RECURSE_DIR_CONTINUE;

        assert(inode_fd >= 0);

        /* fchown() doesn't support O_PATH fds, hence we use the /proc/self/fd/ trick */
        if (chown(FORMAT_PROC_FD_PATH(inode_fd), d->uid, d->gid) < 0) {
                log_debug_errno(errno, "Failed to change ownership of '%s', ignoring: %m", ASSERT_PTR(path));

                if (d->error == 0) /* Return the first error to the caller */
                        d->error = errno;
        }

        return RECURSE_DIR_CONTINUE;
}

int cg_set_access_recursive(
                const char *controller,
                const char *path,
                uid_t uid,
                gid_t gid) {

        _cleanup_close_ int fd = -EBADF;
        _cleanup_free_ char *fs = NULL;
        int r;

        /* A recursive version of cg_set_access(). But note that this one changes ownership of *all* files,
         * not just the allowlist that cg_set_access() uses. Use cg_set_access() on the cgroup you want to
         * delegate, and cg_set_access_recursive() for any sub-cgroups you might want to create below it. */

        if (!uid_is_valid(uid) && !gid_is_valid(gid))
                return 0;

        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

        fd = open(fs, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
        if (fd < 0)
                return -errno;

        struct access_callback_data d = {
                .uid = uid,
                .gid = gid,
        };

        r = recurse_dir(fd,
                        fs,
                        /* statx_mask= */ 0,
                        /* n_depth_max= */ UINT_MAX,
                        RECURSE_DIR_SAME_MOUNT|RECURSE_DIR_INODE_FD|RECURSE_DIR_TOPLEVEL,
                        access_callback,
                        &d);
        if (r < 0)
                return r;

        return -d.error;
}
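
/* Editor's sketch (hypothetical usage, not upstream code): following the comment above, delegating a
 * subtree to an unprivileged user by fixing up the delegation root with cg_set_access() and everything
 * below it with cg_set_access_recursive(). The paths are made up for illustration. */
#if 0
static int example_delegate_subtree(uid_t uid, gid_t gid) {
        int r;

        r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, "/example.slice/example.service", uid, gid);
        if (r < 0)
                return r;

        return cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, "/example.slice/example.service/payload", uid, gid);
}
#endif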

int cg_migrate(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        bool done = false;
        _cleanup_set_free_ Set *s = NULL;
        int r, ret = 0;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        do {
                _cleanup_fclose_ FILE *f = NULL;
                pid_t pid;

                done = true;

                r = cg_enumerate_processes(cfrom, pfrom, &f);
                if (r < 0)
                        return RET_GATHER(ret, r);

                while ((r = cg_read_pid(f, &pid)) > 0) {
                        /* This might do weird stuff if we aren't a single-threaded program. However, we
                         * luckily know we are. */
                        if (FLAGS_SET(flags, CGROUP_IGNORE_SELF) && pid == getpid_cached())
                                continue;

                        if (set_contains(s, PID_TO_PTR(pid)))
                                continue;

                        /* Ignore kernel threads. Since they can only exist in the root cgroup, we only
                         * check for them there. */
                        if (cfrom && empty_or_root(pfrom) &&
                            pid_is_kernel_thread(pid) > 0)
                                continue;

                        r = cg_attach(cto, pto, pid);
                        if (r < 0) {
                                if (r != -ESRCH)
                                        RET_GATHER(ret, r);
                        } else if (ret == 0)
                                ret = 1;

                        done = false;

                        r = set_ensure_put(&s, /* hash_ops = */ NULL, PID_TO_PTR(pid));
                        if (r < 0)
                                return RET_GATHER(ret, r);
                }
                if (r < 0)
                        return RET_GATHER(ret, r);
        } while (!done);

        return ret;
}

int cg_migrate_recursive(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        _cleanup_closedir_ DIR *d = NULL;
        int r, ret = 0;
        char *fn;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        ret = cg_migrate(cfrom, pfrom, cto, pto, flags);

        r = cg_enumerate_subgroups(cfrom, pfrom, &d);
        if (r < 0) {
                if (ret >= 0 && r != -ENOENT)
                        return r;

                return ret;
        }

        while ((r = cg_read_subgroup(d, &fn)) > 0) {
                _cleanup_free_ char *p = NULL;

                p = path_join(empty_to_root(pfrom), fn);
                free(fn);
                if (!p)
                        return -ENOMEM;

                r = cg_migrate_recursive(cfrom, p, cto, pto, flags);
                if (r != 0 && ret >= 0)
                        ret = r;
        }

        if (r < 0 && ret >= 0)
                ret = r;

        if (flags & CGROUP_REMOVE) {
                r = cg_rmdir(cfrom, pfrom);
                if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
                        return r;
        }

        return ret;
}
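
/* Editor's sketch (hypothetical usage, not upstream code): moving every process out of one cgroup
 * subtree into another within systemd's own hierarchy, removing the emptied source cgroups and
 * skipping the calling process itself. The paths are made up for illustration. */
#if 0
static int example_empty_subtree(void) {
        return cg_migrate_recursive(
                        SYSTEMD_CGROUP_CONTROLLER, "/example.slice/old",
                        SYSTEMD_CGROUP_CONTROLLER, "/example.slice/new",
                        CGROUP_REMOVE|CGROUP_IGNORE_SELF);
}
#endif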

int cg_migrate_recursive_fallback(
                const char *cfrom,
                const char *pfrom,
                const char *cto,
                const char *pto,
                CGroupFlags flags) {

        int r;

        assert(cfrom);
        assert(pfrom);
        assert(cto);
        assert(pto);

        r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags);
        if (r < 0) {
                char prefix[strlen(pto) + 1];

                /* This didn't work? Then let's try all prefixes of the destination */

                PATH_FOREACH_PREFIX(prefix, pto) {
                        int q;

                        q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags);
                        if (q >= 0)
                                return q;
                }
        }

        return r;
}

int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
        CGroupController c;
        CGroupMask done;
        bool created;
        int r;

        /* This one will create a cgroup in our private tree, but also
         * duplicate it in the trees specified in mask, and remove it
         * in all others.
         *
         * Returns 0 if the group already existed in the systemd hierarchy,
         * 1 on success, negative otherwise.
         */

        /* First create the cgroup in our own hierarchy. */
        r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
        if (r < 0)
                return r;
        created = r;

        /* If we are in the unified hierarchy, we are done now */
        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                return created;

        supported &= CGROUP_MASK_V1;
        mask = CGROUP_MASK_EXTEND_JOINED(mask);
        done = 0;

        /* Otherwise, do the same in the other hierarchies */
        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *n;

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                n = cgroup_controller_to_string(c);
                if (FLAGS_SET(mask, bit))
                        (void) cg_create(n, path);

                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return created;
}

int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
        int r;

        r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
        if (r < 0)
                return r;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r > 0)
                return 0;

        supported &= CGROUP_MASK_V1;
        CGroupMask done = 0;

        for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *p = NULL;

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                if (path_callback)
                        p = path_callback(bit, userdata);
                if (!p)
                        p = path;

                (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return 0;
}

int cg_migrate_v1_controllers(CGroupMask supported, CGroupMask mask, const char *from, cg_migrate_callback_t to_callback, void *userdata) {
        CGroupController c;
        CGroupMask done;
        int r = 0, q;

        assert(to_callback);

        supported &= CGROUP_MASK_V1;
        mask = CGROUP_MASK_EXTEND_JOINED(mask);
        done = 0;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *to = NULL;

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                if (!FLAGS_SET(mask, bit))
                        continue;

                to = to_callback(bit, userdata);

                /* Remember first error and try continuing */
                q = cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, from, cgroup_controller_to_string(c), to, 0);
                r = (r < 0) ? r : q;

                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return r;
}

int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
        int r, q;

        r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
        if (r < 0)
                return r;

        q = cg_all_unified();
        if (q < 0)
                return q;
        if (q > 0)
                return r;

        return cg_trim_v1_controllers(supported, _CGROUP_MASK_ALL, path, delete_root);
}

int cg_trim_v1_controllers(CGroupMask supported, CGroupMask mask, const char *path, bool delete_root) {
        CGroupController c;
        CGroupMask done;
        int r = 0, q;

        supported &= CGROUP_MASK_V1;
        mask = CGROUP_MASK_EXTEND_JOINED(mask);
        done = 0;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);

                if (!FLAGS_SET(supported, bit))
                        continue;

                if (FLAGS_SET(done, bit))
                        continue;

                if (FLAGS_SET(mask, bit)) {
                        /* Remember first error and try continuing */
                        q = cg_trim(cgroup_controller_to_string(c), path, delete_root);
                        r = (r < 0) ? r : q;
                }
                done |= CGROUP_MASK_EXTEND_JOINED(bit);
        }

        return r;
}

int cg_enable_everywhere(
                CGroupMask supported,
                CGroupMask mask,
                const char *p,
                CGroupMask *ret_result_mask) {

        _cleanup_fclose_ FILE *f = NULL;
        _cleanup_free_ char *fs = NULL;
        CGroupController c;
        CGroupMask ret = 0;
        int r;

        assert(p);

        if (supported == 0) {
                if (ret_result_mask)
                        *ret_result_mask = 0;
                return 0;
        }

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r == 0) {
                /* On the legacy hierarchy there's no concept of "enabling" controllers for the cgroups defined. Let's
                 * claim complete success right away. (If you wonder why we return the full mask here, rather than zero:
                 * the caller tends to use the returned mask later on to compare if all controllers were properly joined,
                 * and if not requeues realization. This use is the primary purpose of the return value, hence let's
                 * minimize surprises here and reduce triggers for re-realization by always saying we fully
                 * succeeded.) */
                if (ret_result_mask)
                        *ret_result_mask = mask & supported & CGROUP_MASK_V2; /* If you wonder why we mask this with
                                                                               * CGROUP_MASK_V2: The 'supported' mask
                                                                               * might contain pure-V1 or BPF
                                                                               * controllers, and we never want to
                                                                               * claim that we could enable those with
                                                                               * cgroup.subtree_control */
                return 0;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
        if (r < 0)
                return r;

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                const char *n;

                if (!FLAGS_SET(CGROUP_MASK_V2, bit))
                        continue;

                if (!FLAGS_SET(supported, bit))
                        continue;

                n = cgroup_controller_to_string(c);
                {
                        char s[1 + strlen(n) + 1];

                        s[0] = FLAGS_SET(mask, bit) ? '+' : '-';
                        strcpy(s + 1, n);

                        if (!f) {
                                f = fopen(fs, "we");
                                if (!f)
                                        return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
                        }

                        r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER);
                        if (r < 0) {
                                log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m",
                                                FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs);
                                clearerr(f);

                                /* If we can't turn off a controller, leave it on in the reported resulting mask. This
                                 * happens for example when we attempt to turn off a controller up in the tree that is
                                 * used down in the tree. */
                                if (!FLAGS_SET(mask, bit) && r == -EBUSY) /* You might wonder why we check for EBUSY
                                                                           * only here, and not follow the same logic
                                                                           * for other errors such as EINVAL or
                                                                           * EOPNOTSUPP or anything else. That's
                                                                           * because EBUSY indicates that the
                                                                           * controller is currently enabled and
                                                                           * cannot be disabled because something down
                                                                           * the hierarchy is still using it. Any other
                                                                           * error most likely means something like "I
                                                                           * never heard of this controller" or
                                                                           * similar. In the former case it's hence
                                                                           * safe to assume the controller is still on
                                                                           * after the failed operation, while in the
                                                                           * latter case it's safer to assume the
                                                                           * controller is unknown and hence certainly
                                                                           * not enabled. */
                                        ret |= bit;
                        } else {
                                /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
                                if (FLAGS_SET(mask, bit))
                                        ret |= bit;
                        }
                }
        }

        /* Let's return the precise set of controllers now enabled for the cgroup. */
        if (ret_result_mask)
                *ret_result_mask = ret;

        return 0;
}
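
/* Editor's note (illustrative, not upstream code): on cgroup v2 the loop above enables or disables
 * controllers by writing "+name" or "-name" tokens to cgroup.subtree_control, e.g. hypothetically:
 *
 *     echo "+cpu" > /sys/fs/cgroup/example.slice/cgroup.subtree_control
 *     echo "-io"  > /sys/fs/cgroup/example.slice/cgroup.subtree_control
 *
 * As the comment above explains, such a write fails with EBUSY when a controller cannot be disabled
 * because a descendant cgroup still has it enabled, which is why that error is treated specially. */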