src/shared/mount-setup.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <stdlib.h>
   5 #include <sys/mount.h>
   6 #include <sys/statvfs.h>
   7 #include <unistd.h>
   8
   9 #include "alloc-util.h"
  10 #include "bus-util.h"
  11 #include "cgroup-setup.h"
  12 #include "cgroup-util.h"
  13 #include "conf-files.h"
  14 #include "dev-setup.h"
  15 #include "dirent-util.h"
  16 #include "efi-loader.h"
  17 #include "fd-util.h"
  18 #include "fileio.h"
  19 #include "fs-util.h"
  20 #include "label-util.h"
  21 #include "log.h"
  22 #include "macro.h"
  23 #include "mkdir-label.h"
  24 #include "mount-setup.h"
  25 #include "mount-util.h"
  26 #include "mountpoint-util.h"
  27 #include "nulstr-util.h"
  28 #include "path-util.h"
  29 #include "recurse-dir.h"
  30 #include "set.h"
  31 #include "smack-util.h"
  32 #include "strv.h"
  33 #include "user-util.h"
  34 #include "virt.h"
  35
  36 typedef enum MountMode {
  37         MNT_NONE           = 0,
  38         MNT_FATAL          = 1 << 0,
  39         MNT_IN_CONTAINER   = 1 << 1,
  40         MNT_CHECK_WRITABLE = 1 << 2,
  41         MNT_FOLLOW_SYMLINK = 1 << 3,
  42 } MountMode;
  43
  44 typedef struct MountPoint {
  45         const char *what;
  46         const char *where;
  47         const char *type;
  48         const char *options;
  49         unsigned long flags;
  50         bool (*condition_fn)(void);
  51         MountMode mode;
  52 } MountPoint;
  53
  54 /* The first three entries we might need before SELinux is up. The
  55  * fourth (securityfs) is needed by IMA to load a custom policy. The
  56  * other ones we can delay until SELinux and IMA are loaded. When
  57  * SMACK is enabled we need smackfs, too, so it's a fifth one. */
  58 #if ENABLE_SMACK
  59 #define N_EARLY_MOUNT 5
  60 #else
  61 #define N_EARLY_MOUNT 4
  62 #endif
  63
  64 static bool check_recursiveprot_supported(void) {
  65         int r;
  66
  67         if (!cg_is_unified_wanted())
  68                 return false;
  69
  70         r = mount_option_supported("cgroup2", "memory_recursiveprot", NULL);
  71         if (r < 0)
  72                 log_debug_errno(r, "Failed to determiner whether the 'memory_recursiveprot' mount option is supported, assuming not: %m");
  73         else if (r == 0)
  74                 log_debug("This kernel version does not support 'memory_recursiveprot', not using mount option.");
  75
  76         return r > 0;
  77 }
  78
  79 static const MountPoint mount_table[] = {
  80         { "proc",        "/proc",                     "proc",       NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
  81           NULL,          MNT_FATAL|MNT_IN_CONTAINER|MNT_FOLLOW_SYMLINK },
  82         { "sysfs",       "/sys",                      "sysfs",      NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
  83           NULL,          MNT_FATAL|MNT_IN_CONTAINER },
  84         { "devtmpfs",    "/dev",                      "devtmpfs",   "mode=0755" TMPFS_LIMITS_DEV,               MS_NOSUID|MS_STRICTATIME,
  85           NULL,          MNT_FATAL|MNT_IN_CONTAINER },
  86         { "securityfs",  "/sys/kernel/security",      "securityfs", NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
  87           NULL,          MNT_NONE                   },
  88 #if ENABLE_SMACK
  89         { "smackfs",     "/sys/fs/smackfs",           "smackfs",    "smackfsdef=*",                             MS_NOSUID|MS_NOEXEC|MS_NODEV,
  90           mac_smack_use, MNT_FATAL                  },
  91         { "tmpfs",       "/dev/shm",                  "tmpfs",      "mode=01777,smackfsroot=*",                 MS_NOSUID|MS_NODEV|MS_STRICTATIME,
  92           mac_smack_use, MNT_FATAL                  },
  93 #endif
  94         { "tmpfs",       "/dev/shm",                  "tmpfs",      "mode=01777",                               MS_NOSUID|MS_NODEV|MS_STRICTATIME,
  95           NULL,          MNT_FATAL|MNT_IN_CONTAINER },
  96         { "devpts",      "/dev/pts",                  "devpts",     "mode=0620,gid=" STRINGIFY(TTY_GID),        MS_NOSUID|MS_NOEXEC,
  97           NULL,          MNT_IN_CONTAINER           },
  98 #if ENABLE_SMACK
  99         { "tmpfs",       "/run",                      "tmpfs",      "mode=0755,smackfsroot=*" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
 100           mac_smack_use, MNT_FATAL                  },
 101 #endif
 102         { "tmpfs",       "/run",                      "tmpfs",      "mode=0755" TMPFS_LIMITS_RUN,               MS_NOSUID|MS_NODEV|MS_STRICTATIME,
 103           NULL,          MNT_FATAL|MNT_IN_CONTAINER },
 104         { "cgroup2",     "/sys/fs/cgroup",            "cgroup2",    "nsdelegate,memory_recursiveprot",          MS_NOSUID|MS_NOEXEC|MS_NODEV,
 105           check_recursiveprot_supported, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
 106         { "cgroup2",     "/sys/fs/cgroup",            "cgroup2",    "nsdelegate",                               MS_NOSUID|MS_NOEXEC|MS_NODEV,
 107           cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
 108         { "cgroup2",     "/sys/fs/cgroup",            "cgroup2",    NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
 109           cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
 110 #if ENABLE_PSTORE
 111         { "pstore",      "/sys/fs/pstore",            "pstore",     NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
 112           NULL,          MNT_NONE                   },
 113 #endif
 114 #if ENABLE_EFI
 115         { "efivarfs",    "/sys/firmware/efi/efivars", "efivarfs",   NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
 116           is_efi_boot,   MNT_NONE                   },
 117 #endif
 118         { "bpf",         "/sys/fs/bpf",               "bpf",        "mode=0700",                                MS_NOSUID|MS_NOEXEC|MS_NODEV,
 119           NULL,          MNT_NONE,                  },
 120 };
 121
 122 assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
 123
 124 bool mount_point_is_api(const char *path) {
 125         /* Checks if this mount point is considered "API", and hence
 126          * should be ignored */
 127
 128         FOREACH_ARRAY(i, mount_table, ELEMENTSOF(mount_table))
 129                 if (path_equal(path, i->where))
 130                         return true;
 131
 132         return path_startswith(path, "/sys/fs/cgroup/");
 133 }
 134
 135 bool mount_point_ignore(const char *path) {
 136         /* These are API file systems that might be mounted by other software, we just list them here so that
 137          * we know that we should ignore them. */
 138         FOREACH_STRING(i,
 139                        /* SELinux file systems */
 140                        "/sys/fs/selinux",
 141                        /* Container bind mounts */
 142                        "/dev/console",
 143                        "/proc/kmsg",
 144                        "/proc/sys",
 145                        "/proc/sys/kernel/random/boot_id")
 146                 if (path_equal(path, i))
 147                         return true;
 148
 149         if (path_startswith(path, "/run/host")) /* All mounts passed in from the container manager are
 150                                                  * something we better ignore. */
 151                 return true;
 152
 153         return false;
 154 }
 155
 156 static int mount_one(const MountPoint *p, bool relabel) {
 157         int r, priority;
 158
 159         assert(p);
 160         assert(p->what);
 161         assert(p->where);
 162         assert(p->type);
 163
 164         priority = FLAGS_SET(p->mode, MNT_FATAL) ? LOG_ERR : LOG_DEBUG;
 165
 166         if (p->condition_fn && !p->condition_fn())
 167                 return 0;
 168
 169         /* Relabel first, just in case */
 170         if (relabel)
 171                 (void) label_fix(p->where, LABEL_IGNORE_ENOENT|LABEL_IGNORE_EROFS);
 172
 173         r = path_is_mount_point_full(p->where, /* root = */ NULL, AT_SYMLINK_FOLLOW);
 174         if (r < 0 && r != -ENOENT) {
 175                 log_full_errno(priority, r, "Failed to determine whether %s is a mount point: %m", p->where);
 176                 return FLAGS_SET(p->mode, MNT_FATAL) ? r : 0;
 177         }
 178         if (r > 0)
 179                 return 0;
 180
 181         /* Skip securityfs in a container */
 182         if (!FLAGS_SET(p->mode, MNT_IN_CONTAINER) && detect_container() > 0)
 183                 return 0;
 184
 185         /* The access mode here doesn't really matter too much, since
 186          * the mounted file system will take precedence anyway. */
 187         if (relabel)
 188                 (void) mkdir_p_label(p->where, 0755);
 189         else
 190                 (void) mkdir_p(p->where, 0755);
 191
 192         log_debug("Mounting %s to %s of type %s with options %s.",
 193                   p->what,
 194                   p->where,
 195                   p->type,
 196                   strna(p->options));
 197
 198         if (FLAGS_SET(p->mode, MNT_FOLLOW_SYMLINK))
 199                 r = mount_follow_verbose(priority, p->what, p->where, p->type, p->flags, p->options);
 200         else
 201                 r = mount_nofollow_verbose(priority, p->what, p->where, p->type, p->flags, p->options);
 202         if (r < 0)
 203                 return FLAGS_SET(p->mode, MNT_FATAL) ? r : 0;
 204
 205         /* Relabel again, since we now mounted something fresh here */
 206         if (relabel)
 207                 (void) label_fix(p->where, 0);
 208
 209         if (FLAGS_SET(p->mode, MNT_CHECK_WRITABLE))
 210                 if (access(p->where, W_OK) < 0) {
 211                         r = -errno;
 212
 213                         (void) umount2(p->where, UMOUNT_NOFOLLOW);
 214                         (void) rmdir(p->where);
 215
 216                         log_full_errno(priority, r, "Mount point '%s' not writable after mounting, undoing: %m", p->where);
 217                         return FLAGS_SET(p->mode, MNT_FATAL) ? r : 0;
 218                 }
 219
 220         return 1;
 221 }
 222
 223 static int mount_points_setup(size_t n, bool loaded_policy) {
 224         int r = 0;
 225
 226         assert(n <= ELEMENTSOF(mount_table));
 227
 228         FOREACH_ARRAY(mp, mount_table, n)
 229                 RET_GATHER(r, mount_one(mp, loaded_policy));
 230
 231         return r;
 232 }
 233
 234 int mount_setup_early(void) {
 235         /* Do a minimal mount of /proc and friends to enable the most basic stuff, such as SELinux */
 236         return mount_points_setup(N_EARLY_MOUNT, /* loaded_policy= */ false);
 237 }
 238
 239 static const char *join_with(const char *controller) {
 240
 241         static const char* const pairs[] = {
 242                 "cpu", "cpuacct",
 243                 "net_cls", "net_prio",
 244                 NULL
 245         };
 246
 247         assert(controller);
 248
 249         /* This will lookup which controller to mount another controller with. Input is a controller name, and output
 250          * is the other controller name. The function works both ways: you can input one and get the other, and input
 251          * the other to get the one. */
 252
 253         STRV_FOREACH_PAIR(x, y, pairs) {
 254                 if (streq(controller, *x))
 255                         return *y;
 256                 if (streq(controller, *y))
 257                         return *x;
 258         }
 259
 260         return NULL;
 261 }
 262
 263 static int symlink_controller(const char *target, const char *alias) {
 264         const char *a;
 265         int r;
 266
 267         assert(target);
 268         assert(alias);
 269
 270         a = strjoina("/sys/fs/cgroup/", alias);
 271
 272         r = symlink_idempotent(target, a, false);
 273         if (r < 0)
 274                 return log_error_errno(r, "Failed to create symlink %s: %m", a);
 275
 276 #if HAVE_SMACK_RUN_LABEL
 277         const char *p;
 278
 279         p = strjoina("/sys/fs/cgroup/", target);
 280
 281         r = mac_smack_copy(a, p);
 282         if (r < 0 && !ERRNO_IS_NOT_SUPPORTED(r))
 283                 return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", p, a);
 284 #endif
 285
 286         return 0;
 287 }
 288
 289 #if HAVE_SELINUX || ENABLE_SMACK
 290 static int relabel_cb(
 291                 RecurseDirEvent event,
 292                 const char *path,
 293                 int dir_fd,
 294                 int inode_fd,
 295                 const struct dirent *de,
 296                 const struct statx *sx,
 297                 void *userdata) {
 298
 299         switch (event) {
 300
 301         case RECURSE_DIR_LEAVE:
 302         case RECURSE_DIR_SKIP_MOUNT:
 303                 /* If we already saw this dirent when entering it or this is a dirent that on a different
 304                  * mount, don't relabel it. */
 305                 return RECURSE_DIR_CONTINUE;
 306
 307         case RECURSE_DIR_ENTER:
 308                 /* /run/initramfs/ + /run/nextroot/ are static data and big, no need to dynamically relabel
 309                  * its contents at boot... */
 310                 if (PATH_STARTSWITH_SET(path, "/run/initramfs", "/run/nextroot"))
 311                         return RECURSE_DIR_SKIP_ENTRY;
 312
 313                 _fallthrough_;
 314
 315         default:
 316                 /* Otherwise, label it, even if we had trouble stat()ing it and similar. SELinux can figure this out */
 317                 (void) label_fix(path, 0);
 318                 return RECURSE_DIR_CONTINUE;
 319         }
 320 }
 321
 322 static int relabel_tree(const char *path) {
 323         int r;
 324
 325         r = recurse_dir_at(AT_FDCWD, path, 0, UINT_MAX, RECURSE_DIR_ENSURE_TYPE|RECURSE_DIR_SAME_MOUNT, relabel_cb, NULL);
 326         if (r < 0)
 327                 log_debug_errno(r, "Failed to recursively relabel '%s': %m", path);
 328
 329         return r;
 330 }
 331
 332 static int relabel_extra(void) {
 333         _cleanup_strv_free_ char **files = NULL;
 334         int r, c = 0;
 335
 336         /* Support for relabelling additional files or directories after loading the policy. For this, code in the
 337          * initrd simply has to drop in *.relabel files into /run/systemd/relabel-extra.d/. We'll read all such files
 338          * expecting one absolute path by line and will relabel each (and everyone below that in case the path refers
 339          * to a directory). These drop-in files are supposed to be absolutely minimal, and do not understand comments
 340          * and such. After the operation succeeded the files are removed, and the drop-in directory as well, if
 341          * possible.
 342          */
 343
 344         r = conf_files_list(&files, ".relabel", NULL,
 345                             CONF_FILES_FILTER_MASKED | CONF_FILES_REGULAR,
 346                             "/run/systemd/relabel-extra.d/");
 347         if (r < 0)
 348                 return log_error_errno(r, "Failed to enumerate /run/systemd/relabel-extra.d/, ignoring: %m");
 349
 350         STRV_FOREACH(file, files) {
 351                 _cleanup_fclose_ FILE *f = NULL;
 352
 353                 f = fopen(*file, "re");
 354                 if (!f) {
 355                         log_warning_errno(errno, "Failed to open %s, ignoring: %m", *file);
 356                         continue;
 357                 }
 358
 359                 for (;;) {
 360                         _cleanup_free_ char *line = NULL;
 361
 362                         r = read_line(f, LONG_LINE_MAX, &line);
 363                         if (r < 0) {
 364                                 log_warning_errno(r, "Failed to read %s, ignoring: %m", *file);
 365                                 break;
 366                         }
 367                         if (r == 0) /* EOF */
 368                                 break;
 369
 370                         path_simplify(line);
 371
 372                         if (!path_is_normalized(line)) {
 373                                 log_warning("Path to relabel is not normalized, ignoring: %s", line);
 374                                 continue;
 375                         }
 376
 377                         if (!path_is_absolute(line)) {
 378                                 log_warning("Path to relabel is not absolute, ignoring: %s", line);
 379                                 continue;
 380                         }
 381
 382                         log_debug("Relabelling additional file/directory '%s'.", line);
 383                         (void) label_fix(line, 0);
 384                         (void) relabel_tree(line);
 385                         c++;
 386                 }
 387
 388                 if (unlink(*file) < 0)
 389                         log_warning_errno(errno, "Failed to remove %s, ignoring: %m", *file);
 390         }
 391
 392         /* Remove when we complete things. */
 393         if (rmdir("/run/systemd/relabel-extra.d") < 0 &&
 394             errno != ENOENT)
 395                 log_warning_errno(errno, "Failed to remove /run/systemd/relabel-extra.d/ directory: %m");
 396
 397         return c;
 398 }
 399 #endif
 400
 401 int mount_setup(bool loaded_policy, bool leave_propagation) {
 402         int r;
 403
 404         r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy);
 405         if (r < 0)
 406                 return r;
 407
 408 #if HAVE_SELINUX || ENABLE_SMACK
 409         /* Nodes in devtmpfs and /run need to be manually updated for
 410          * the appropriate labels, after mounting. The other virtual
 411          * API file systems like /sys and /proc do not need that, they
 412          * use the same label for all their files. */
 413         if (loaded_policy) {
 414                 usec_t before_relabel, after_relabel;
 415                 int n_extra;
 416
 417                 before_relabel = now(CLOCK_MONOTONIC);
 418
 419                 FOREACH_STRING(i, "/dev", "/dev/shm", "/run")
 420                         (void) relabel_tree(i);
 421
 422                 n_extra = relabel_extra();
 423
 424                 after_relabel = now(CLOCK_MONOTONIC);
 425
 426                 log_info("Relabeled /dev/, /dev/shm/, /run/%s in %s.",
 427                          n_extra > 0 ? ", and additional files" : "",
 428                          FORMAT_TIMESPAN(after_relabel - before_relabel, 0));
 429         }
 430 #endif
 431
 432         /* Create a few default symlinks, which are normally created
 433          * by udevd, but some scripts might need them before we start
 434          * udevd. */
 435         dev_setup(NULL, UID_INVALID, GID_INVALID);
 436
 437         /* Mark the root directory as shared in regards to mount propagation. The kernel defaults to "private", but we
 438          * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of
 439          * the box. If specific setups need other settings they can reset the propagation mode to private if
 440          * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a
 441          * container manager we assume the container manager knows what it is doing (for example, because it set up
 442          * some directories with different propagation modes). */
 443         if (detect_container() <= 0 && !leave_propagation)
 444                 if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
 445                         log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m");
 446
 447         /* Create a few directories we always want around, Note that sd_booted() checks for /run/systemd/system, so
 448          * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will
 449          * misdetect systemd. */
 450         (void) mkdir_label("/run/systemd", 0755);
 451         (void) mkdir_label("/run/systemd/system", 0755);
 452
 453         /* Make sure there's always a place where sandboxed environments can mount root file systems they are
 454          * about to move into, even when unprivileged, without having to create a temporary one in /tmp/
 455          * (which they then have to keep track of and clean) */
 456         (void) mkdir_label("/run/systemd/mount-rootfs", 0555);
 457
 458         /* Make sure we have a mount point to hide in sandboxes */
 459         (void) mkdir_label("/run/credentials", 0755);
 460
 461         /* Also create /run/systemd/inaccessible nodes, so that we always have something to mount
 462          * inaccessible nodes from. If we run in a container the host might have created these for us already
 463          * in /run/host/inaccessible/. Use those if we can, since that way we likely get access to block/char
 464          * device nodes that are inaccessible, and if userns is used to nodes that are on mounts owned by a
 465          * userns outside the container and thus nicely read-only and not remountable. */
 466         if (access("/run/host/inaccessible/", F_OK) < 0) {
 467                 if (errno != ENOENT)
 468                         log_debug_errno(errno, "Failed to check if /run/host/inaccessible exists, ignoring: %m");
 469
 470                 (void) make_inaccessible_nodes("/run/systemd", UID_INVALID, GID_INVALID);
 471         } else
 472                 (void) symlink("../host/inaccessible", "/run/systemd/inaccessible");
 473
 474         return 0;
 475 }
 476
 477 static const MountPoint cgroupv1_mount_table[] = {
 478         { "tmpfs",       "/sys/fs/cgroup",            "tmpfs",      "mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP,     MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
 479           cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
 480         { "cgroup2",     "/sys/fs/cgroup/unified",    "cgroup2",    "nsdelegate",                               MS_NOSUID|MS_NOEXEC|MS_NODEV,
 481           cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
 482         { "cgroup2",     "/sys/fs/cgroup/unified",    "cgroup2",    NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
 483           cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
 484         { "cgroup",      "/sys/fs/cgroup/systemd",    "cgroup",     "none,name=systemd,xattr",                  MS_NOSUID|MS_NOEXEC|MS_NODEV,
 485           cg_is_legacy_wanted, MNT_IN_CONTAINER     },
 486         { "cgroup",      "/sys/fs/cgroup/systemd",    "cgroup",     "none,name=systemd",                        MS_NOSUID|MS_NOEXEC|MS_NODEV,
 487           cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
 488 };
 489
 490 static void relabel_cgroup_legacy_hierarchy(void) {
 491 #if HAVE_SELINUX || ENABLE_SMACK
 492         struct statfs st;
 493
 494         assert(cg_is_legacy_wanted());
 495
 496         /* Temporarily remount the root cgroup filesystem to give it a proper label. Do this
 497            only when the filesystem has been already populated by a previous instance of systemd
 498            running from initrd. Otherwise don't remount anything and leave the filesystem read-write
 499            for the cgroup filesystems to be mounted inside. */
 500         if (statfs("/sys/fs/cgroup", &st) < 0)
 501                 return (void) log_error_errno(errno, "Failed to determine mount flags for /sys/fs/cgroup/: %m");
 502
 503         if (st.f_flags & ST_RDONLY)
 504                 (void) mount_nofollow(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, NULL);
 505
 506         (void) label_fix("/sys/fs/cgroup", 0);
 507         (void) relabel_tree("/sys/fs/cgroup");
 508
 509         if (st.f_flags & ST_RDONLY)
 510                 (void) mount_nofollow(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT|MS_RDONLY, NULL);
 511 #endif
 512 }
 513
 514 int mount_cgroup_legacy_controllers(bool loaded_policy) {
 515         _cleanup_set_free_ Set *controllers = NULL;
 516         int r;
 517
 518         if (!cg_is_legacy_wanted())
 519                 return 0;
 520
 521         FOREACH_ARRAY(mp, cgroupv1_mount_table, ELEMENTSOF(cgroupv1_mount_table)) {
 522                 r = mount_one(mp, loaded_policy);
 523                 if (r < 0)
 524                         return r;
 525         }
 526
 527         if (loaded_policy)
 528                 relabel_cgroup_legacy_hierarchy();
 529
 530         /* Mount all available cgroup controllers that are built into the kernel. */
 531         r = cg_kernel_controllers(&controllers);
 532         if (r < 0)
 533                 return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
 534
 535         for (;;) {
 536                 _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
 537                 const char *other_controller;
 538                 MountPoint p = {
 539                         .what = "cgroup",
 540                         .type = "cgroup",
 541                         .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
 542                         .mode = MNT_IN_CONTAINER,
 543                 };
 544
 545                 controller = set_steal_first(controllers);
 546                 if (!controller)
 547                         break;
 548
 549                 /* Check if we shall mount this together with another controller */
 550                 other_controller = join_with(controller);
 551                 if (other_controller) {
 552                         _cleanup_free_ char *c = NULL;
 553
 554                         /* Check if the other controller is actually available in the kernel too */
 555                         c = set_remove(controllers, other_controller);
 556                         if (c) {
 557
 558                                 /* Join the two controllers into one string, and maintain a stable ordering */
 559                                 if (strcmp(controller, other_controller) < 0)
 560                                         options = strjoin(controller, ",", other_controller);
 561                                 else
 562                                         options = strjoin(other_controller, ",", controller);
 563                                 if (!options)
 564                                         return log_oom();
 565                         }
 566                 }
 567
 568                 /* The simple case, where there's only one controller to mount together */
 569                 if (!options)
 570                         options = TAKE_PTR(controller);
 571
 572                 where = path_join("/sys/fs/cgroup", options);
 573                 if (!where)
 574                         return log_oom();
 575
 576                 p.where = where;
 577                 p.options = options;
 578
 579                 r = mount_one(&p, true);
 580                 if (r < 0)
 581                         return r;
 582
 583                 /* Create symlinks from the individual controller names, in case we have a joined mount */
 584                 if (controller)
 585                         (void) symlink_controller(options, controller);
 586                 if (other_controller)
 587                         (void) symlink_controller(options, other_controller);
 588         }
 589
 590         /* Now that we mounted everything, let's make the tmpfs the cgroup file systems are mounted into read-only. */
 591         (void) mount_nofollow("tmpfs", "/sys/fs/cgroup", "tmpfs",
 592                               MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY,
 593                               "mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP);
 594
 595         return 1;
 596 }