src/shared/mount-setup.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <stdlib.h>
   5 #include <sys/mount.h>
   6 #include <sys/statvfs.h>
   7 #include <unistd.h>
   8
   9 #include "alloc-util.h"
  10 #include "bus-util.h"
  11 #include "cgroup-setup.h"
  12 #include "cgroup-util.h"
  13 #include "conf-files.h"
  14 #include "dev-setup.h"
  15 #include "dirent-util.h"
  16 #include "efi-loader.h"
  17 #include "fd-util.h"
  18 #include "fileio.h"
  19 #include "fs-util.h"
  20 #include "label-util.h"
  21 #include "log.h"
  22 #include "macro.h"
  23 #include "mkdir-label.h"
  24 #include "mount-setup.h"
  25 #include "mount-util.h"
  26 #include "mountpoint-util.h"
  27 #include "nulstr-util.h"
  28 #include "path-util.h"
  29 #include "recurse-dir.h"
  30 #include "set.h"
  31 #include "smack-util.h"
  32 #include "strv.h"
  33 #include "user-util.h"
  34 #include "virt.h"
  35
/* Per-entry behaviour flags for a MountPoint, evaluated by mount_one(). The values are bit flags and
 * may be OR-ed together. */
typedef enum MountMode {
        MNT_NONE           = 0,
        /* Failures are logged at LOG_ERR and returned to the caller; without this flag they are logged at
         * LOG_DEBUG and mount_one() returns 0. */
        MNT_FATAL          = 1 << 0,
        /* The entry is also processed when a container is detected; without this flag it is skipped
         * whenever detect_container() reports a container. */
        MNT_IN_CONTAINER   = 1 << 1,
        /* After mounting, verify the mount point is writable; if not, unmount and remove it again. */
        MNT_CHECK_WRITABLE = 1 << 2,
        /* Mount via mount_follow_verbose() rather than mount_nofollow_verbose(). */
        MNT_FOLLOW_SYMLINK = 1 << 3,
} MountMode;
  43
/* Description of one file system to mount. The field order is relied upon by the positional
 * initializers in mount_table[]. */
typedef struct MountPoint {
        const char *what;           /* Mount source, handed to the mount helper as-is */
        const char *where;          /* Target directory; created by mount_one() if missing */
        const char *type;           /* File system type */
        const char *options;        /* File system specific option string, may be NULL */
        unsigned long flags;        /* MS_* mount flags */
        bool (*condition_fn)(void); /* Optional; if set and it returns false the entry is skipped */
        MountMode mode;             /* Combination of MountMode flags */
} MountPoint;
  53
/* The first three entries we might need before SELinux is up. The
 * fourth (securityfs) is needed by IMA to load a custom policy. The
 * other ones we can delay until SELinux and IMA are loaded. When
 * SMACK is enabled we need smackfs, too, so it's a fifth one.
 *
 * N_EARLY_MOUNT is the number of leading mount_table[] entries that mount_setup_early() processes,
 * hence it has to be kept in sync with the order of that table. */
#if ENABLE_SMACK
#define N_EARLY_MOUNT 5
#else
#define N_EARLY_MOUNT 4
#endif
  63
  64 static bool check_recursiveprot_supported(void) {
  65         int r;
  66
  67         if (!cg_is_unified_wanted())
  68                 return false;
  69
  70         r = mount_option_supported("cgroup2", "memory_recursiveprot", NULL);
  71         if (r < 0)
  72                 log_debug_errno(r, "Failed to determiner whether the 'memory_recursiveprot' mount option is supported, assuming not: %m");
  73         else if (r == 0)
  74                 log_debug("This kernel version does not support 'memory_recursiveprot', not using mount option.");
  75
  76         return r > 0;
  77 }
  78
/* The API file systems we mount ourselves, in the order in which they are attempted.
 *
 * Each entry is { what, where, type, options, flags, condition_fn, mode }, see MountPoint.
 *
 * Order matters in two ways:
 *  - mount_setup_early() only processes the first N_EARLY_MOUNT entries, mount_setup() all of them.
 *  - Several entries share the same 'where'. mount_one() skips an entry whose target already is a
 *    mount point, so the first variant that succeeds wins and the following ones (typically with
 *    fewer options) only act as fallbacks if an earlier variant was skipped by its condition or
 *    failed non-fatally. */
static const MountPoint mount_table[] = {
        { "proc",        "/proc",                     "proc",       NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
          NULL,          MNT_FATAL|MNT_IN_CONTAINER|MNT_FOLLOW_SYMLINK },
        { "sysfs",       "/sys",                      "sysfs",      NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
          NULL,          MNT_FATAL|MNT_IN_CONTAINER },
        { "devtmpfs",    "/dev",                      "devtmpfs",   "mode=0755" TMPFS_LIMITS_DEV,               MS_NOSUID|MS_STRICTATIME,
          NULL,          MNT_FATAL|MNT_IN_CONTAINER },
        { "securityfs",  "/sys/kernel/security",      "securityfs", NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
          NULL,          MNT_NONE                   },
#if ENABLE_SMACK
        { "smackfs",     "/sys/fs/smackfs",           "smackfs",    "smackfsdef=*",                             MS_NOSUID|MS_NOEXEC|MS_NODEV,
          mac_smack_use, MNT_FATAL                  },
        { "tmpfs",       "/dev/shm",                  "tmpfs",      "mode=01777,smackfsroot=*",                 MS_NOSUID|MS_NODEV|MS_STRICTATIME,
          mac_smack_use, MNT_FATAL                  },
#endif
        { "tmpfs",       "/dev/shm",                  "tmpfs",      "mode=01777",                               MS_NOSUID|MS_NODEV|MS_STRICTATIME,
          NULL,          MNT_FATAL|MNT_IN_CONTAINER },
        { "devpts",      "/dev/pts",                  "devpts",     "mode=0620,gid=" STRINGIFY(TTY_GID),        MS_NOSUID|MS_NOEXEC,
          NULL,          MNT_IN_CONTAINER           },
#if ENABLE_SMACK
        { "tmpfs",       "/run",                      "tmpfs",      "mode=0755,smackfsroot=*" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
          mac_smack_use, MNT_FATAL                  },
#endif
        { "tmpfs",       "/run",                      "tmpfs",      "mode=0755" TMPFS_LIMITS_RUN,               MS_NOSUID|MS_NODEV|MS_STRICTATIME,
          NULL,          MNT_FATAL|MNT_IN_CONTAINER },
        /* cgroup2 on /sys/fs/cgroup: tried with successively fewer mount options */
        { "cgroup2",     "/sys/fs/cgroup",            "cgroup2",    "nsdelegate,memory_recursiveprot",          MS_NOSUID|MS_NOEXEC|MS_NODEV,
          check_recursiveprot_supported, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
        { "cgroup2",     "/sys/fs/cgroup",            "cgroup2",    "nsdelegate",                               MS_NOSUID|MS_NOEXEC|MS_NODEV,
          cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
        { "cgroup2",     "/sys/fs/cgroup",            "cgroup2",    NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
          cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
        /* tmpfs on /sys/fs/cgroup that the individual hierarchies get mounted into; it is remounted
         * read-only at the end of mount_cgroup_controllers() */
        { "tmpfs",       "/sys/fs/cgroup",            "tmpfs",      "mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP,     MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
          cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
        { "cgroup2",     "/sys/fs/cgroup/unified",    "cgroup2",    "nsdelegate",                               MS_NOSUID|MS_NOEXEC|MS_NODEV,
          cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
        { "cgroup2",     "/sys/fs/cgroup/unified",    "cgroup2",    NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
          cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
        /* The name=systemd hierarchy: first with "xattr" (non-fatal), then without (fatal) */
        { "cgroup",      "/sys/fs/cgroup/systemd",    "cgroup",     "none,name=systemd,xattr",                  MS_NOSUID|MS_NOEXEC|MS_NODEV,
          cg_is_legacy_wanted, MNT_IN_CONTAINER     },
        { "cgroup",      "/sys/fs/cgroup/systemd",    "cgroup",     "none,name=systemd",                        MS_NOSUID|MS_NOEXEC|MS_NODEV,
          cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
#if ENABLE_PSTORE
        { "pstore",      "/sys/fs/pstore",            "pstore",     NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
          NULL,          MNT_NONE                   },
#endif
#if ENABLE_EFI
        { "efivarfs",    "/sys/firmware/efi/efivars", "efivarfs",   NULL,                                       MS_NOSUID|MS_NOEXEC|MS_NODEV,
          is_efi_boot,   MNT_NONE                   },
#endif
        { "bpf",         "/sys/fs/bpf",               "bpf",        "mode=0700",                                MS_NOSUID|MS_NOEXEC|MS_NODEV,
          NULL,          MNT_NONE,                  },
};

/* Compile-time check: the early-mount prefix must not be longer than the table itself. */
assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
 133
 134 bool mount_point_is_api(const char *path) {
 135         /* Checks if this mount point is considered "API", and hence
 136          * should be ignored */
 137
 138         for (size_t i = 0; i < ELEMENTSOF(mount_table); i++)
 139                 if (path_equal(path, mount_table[i].where))
 140                         return true;
 141
 142         return path_startswith(path, "/sys/fs/cgroup/");
 143 }
 144
 145 bool mount_point_ignore(const char *path) {
 146         /* These are API file systems that might be mounted by other software, we just list them here so that
 147          * we know that we should ignore them. */
 148         FOREACH_STRING(i,
 149                        /* SELinux file systems */
 150                        "/sys/fs/selinux",
 151                        /* Container bind mounts */
 152                        "/dev/console",
 153                        "/proc/kmsg",
 154                        "/proc/sys",
 155                        "/proc/sys/kernel/random/boot_id")
 156                 if (path_equal(path, i))
 157                         return true;
 158
 159         if (path_startswith(path, "/run/host")) /* All mounts passed in from the container manager are
 160                                                  * something we better ignore. */
 161                 return true;
 162
 163         return false;
 164 }
 165
 166 static int mount_one(const MountPoint *p, bool relabel) {
 167         int r, priority;
 168
 169         assert(p);
 170
 171         priority = (p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG;
 172
 173         if (p->condition_fn && !p->condition_fn())
 174                 return 0;
 175
 176         /* Relabel first, just in case */
 177         if (relabel)
 178                 (void) label_fix(p->where, LABEL_IGNORE_ENOENT|LABEL_IGNORE_EROFS);
 179
 180         r = path_is_mount_point(p->where, NULL, AT_SYMLINK_FOLLOW);
 181         if (r < 0 && r != -ENOENT) {
 182                 log_full_errno(priority, r, "Failed to determine whether %s is a mount point: %m", p->where);
 183                 return (p->mode & MNT_FATAL) ? r : 0;
 184         }
 185         if (r > 0)
 186                 return 0;
 187
 188         /* Skip securityfs in a container */
 189         if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0)
 190                 return 0;
 191
 192         /* The access mode here doesn't really matter too much, since
 193          * the mounted file system will take precedence anyway. */
 194         if (relabel)
 195                 (void) mkdir_p_label(p->where, 0755);
 196         else
 197                 (void) mkdir_p(p->where, 0755);
 198
 199         log_debug("Mounting %s to %s of type %s with options %s.",
 200                   p->what,
 201                   p->where,
 202                   p->type,
 203                   strna(p->options));
 204
 205         if (FLAGS_SET(p->mode, MNT_FOLLOW_SYMLINK))
 206                 r = mount_follow_verbose(priority, p->what, p->where, p->type, p->flags, p->options);
 207         else
 208                 r = mount_nofollow_verbose(priority, p->what, p->where, p->type, p->flags, p->options);
 209         if (r < 0)
 210                 return (p->mode & MNT_FATAL) ? r : 0;
 211
 212         /* Relabel again, since we now mounted something fresh here */
 213         if (relabel)
 214                 (void) label_fix(p->where, 0);
 215
 216         if (p->mode & MNT_CHECK_WRITABLE) {
 217                 if (access(p->where, W_OK) < 0) {
 218                         r = -errno;
 219
 220                         (void) umount2(p->where, UMOUNT_NOFOLLOW);
 221                         (void) rmdir(p->where);
 222
 223                         log_full_errno(priority, r, "Mount point %s not writable after mounting, undoing: %m", p->where);
 224                         return (p->mode & MNT_FATAL) ? r : 0;
 225                 }
 226         }
 227
 228         return 1;
 229 }
 230
 231 static int mount_points_setup(size_t n, bool loaded_policy) {
 232         int ret = 0, r;
 233
 234         assert(n <= ELEMENTSOF(mount_table));
 235
 236         FOREACH_ARRAY(mp, mount_table, n) {
 237                 r = mount_one(mp, loaded_policy);
 238                 if (r != 0 && ret >= 0)
 239                         ret = r;
 240         }
 241
 242         return ret;
 243 }
 244
 245 int mount_setup_early(void) {
 246         /* Do a minimal mount of /proc and friends to enable the most basic stuff, such as SELinux */
 247         return mount_points_setup(N_EARLY_MOUNT, /* loaded_policy= */ false);
 248 }
 249
 250 static const char *join_with(const char *controller) {
 251
 252         static const char* const pairs[] = {
 253                 "cpu", "cpuacct",
 254                 "net_cls", "net_prio",
 255                 NULL
 256         };
 257
 258         assert(controller);
 259
 260         /* This will lookup which controller to mount another controller with. Input is a controller name, and output
 261          * is the other controller name. The function works both ways: you can input one and get the other, and input
 262          * the other to get the one. */
 263
 264         STRV_FOREACH_PAIR(x, y, pairs) {
 265                 if (streq(controller, *x))
 266                         return *y;
 267                 if (streq(controller, *y))
 268                         return *x;
 269         }
 270
 271         return NULL;
 272 }
 273
 274 static int symlink_controller(const char *target, const char *alias) {
 275         const char *a;
 276         int r;
 277
 278         assert(target);
 279         assert(alias);
 280
 281         a = strjoina("/sys/fs/cgroup/", alias);
 282
 283         r = symlink_idempotent(target, a, false);
 284         if (r < 0)
 285                 return log_error_errno(r, "Failed to create symlink %s: %m", a);
 286
 287 #if HAVE_SMACK_RUN_LABEL
 288         const char *p;
 289
 290         p = strjoina("/sys/fs/cgroup/", target);
 291
 292         r = mac_smack_copy(a, p);
 293         if (r < 0 && !ERRNO_IS_NOT_SUPPORTED(r))
 294                 return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", p, a);
 295 #endif
 296
 297         return 0;
 298 }
 299
 300 int mount_cgroup_controllers(void) {
 301         _cleanup_set_free_ Set *controllers = NULL;
 302         int r;
 303
 304         if (!cg_is_legacy_wanted())
 305                 return 0;
 306
 307         /* Mount all available cgroup controllers that are built into the kernel. */
 308         r = cg_kernel_controllers(&controllers);
 309         if (r < 0)
 310                 return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
 311
 312         for (;;) {
 313                 _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
 314                 const char *other_controller;
 315                 MountPoint p = {
 316                         .what = "cgroup",
 317                         .type = "cgroup",
 318                         .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
 319                         .mode = MNT_IN_CONTAINER,
 320                 };
 321
 322                 controller = set_steal_first(controllers);
 323                 if (!controller)
 324                         break;
 325
 326                 /* Check if we shall mount this together with another controller */
 327                 other_controller = join_with(controller);
 328                 if (other_controller) {
 329                         _cleanup_free_ char *c = NULL;
 330
 331                         /* Check if the other controller is actually available in the kernel too */
 332                         c = set_remove(controllers, other_controller);
 333                         if (c) {
 334
 335                                 /* Join the two controllers into one string, and maintain a stable ordering */
 336                                 if (strcmp(controller, other_controller) < 0)
 337                                         options = strjoin(controller, ",", other_controller);
 338                                 else
 339                                         options = strjoin(other_controller, ",", controller);
 340                                 if (!options)
 341                                         return log_oom();
 342                         }
 343                 }
 344
 345                 /* The simple case, where there's only one controller to mount together */
 346                 if (!options)
 347                         options = TAKE_PTR(controller);
 348
 349                 where = path_join("/sys/fs/cgroup", options);
 350                 if (!where)
 351                         return log_oom();
 352
 353                 p.where = where;
 354                 p.options = options;
 355
 356                 r = mount_one(&p, true);
 357                 if (r < 0)
 358                         return r;
 359
 360                 /* Create symlinks from the individual controller names, in case we have a joined mount */
 361                 if (controller)
 362                         (void) symlink_controller(options, controller);
 363                 if (other_controller)
 364                         (void) symlink_controller(options, other_controller);
 365         }
 366
 367         /* Now that we mounted everything, let's make the tmpfs the cgroup file systems are mounted into read-only. */
 368         (void) mount_nofollow("tmpfs", "/sys/fs/cgroup", "tmpfs",
 369                               MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY,
 370                               "mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP);
 371
 372         return 0;
 373 }
 374
 375 #if HAVE_SELINUX || ENABLE_SMACK
 376 static int relabel_cb(
 377                 RecurseDirEvent event,
 378                 const char *path,
 379                 int dir_fd,
 380                 int inode_fd,
 381                 const struct dirent *de,
 382                 const struct statx *sx,
 383                 void *userdata) {
 384
 385         switch (event) {
 386
 387         case RECURSE_DIR_LEAVE:
 388         case RECURSE_DIR_SKIP_MOUNT:
 389                 /* If we already saw this dirent when entering it or this is a dirent that on a different
 390                  * mount, don't relabel it. */
 391                 return RECURSE_DIR_CONTINUE;
 392
 393         case RECURSE_DIR_ENTER:
 394                 /* /run/initramfs/ + /run/nextroot/ are static data and big, no need to dynamically relabel
 395                  * its contents at boot... */
 396                 if (PATH_STARTSWITH_SET(path, "/run/initramfs", "/run/nextroot"))
 397                         return RECURSE_DIR_SKIP_ENTRY;
 398
 399                 _fallthrough_;
 400
 401         default:
 402                 /* Otherwise, label it, even if we had trouble stat()ing it and similar. SELinux can figure this out */
 403                 (void) label_fix(path, 0);
 404                 return RECURSE_DIR_CONTINUE;
 405         }
 406 }
 407
 408 static int relabel_tree(const char *path) {
 409         int r;
 410
 411         r = recurse_dir_at(AT_FDCWD, path, 0, UINT_MAX, RECURSE_DIR_ENSURE_TYPE|RECURSE_DIR_SAME_MOUNT, relabel_cb, NULL);
 412         if (r < 0)
 413                 log_debug_errno(r, "Failed to recursively relabel '%s': %m", path);
 414
 415         return r;
 416 }
 417
 418 static int relabel_cgroup_filesystems(void) {
 419         int r;
 420         struct statfs st;
 421
 422         r = cg_all_unified();
 423         if (r == 0) {
 424                 /* Temporarily remount the root cgroup filesystem to give it a proper label. Do this
 425                    only when the filesystem has been already populated by a previous instance of systemd
 426                    running from initrd. Otherwise don't remount anything and leave the filesystem read-write
 427                    for the cgroup filesystems to be mounted inside. */
 428                 if (statfs("/sys/fs/cgroup", &st) < 0)
 429                         return log_error_errno(errno, "Failed to determine mount flags for /sys/fs/cgroup: %m");
 430
 431                 if (st.f_flags & ST_RDONLY)
 432                         (void) mount_nofollow(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, NULL);
 433
 434                 (void) label_fix("/sys/fs/cgroup", 0);
 435                 (void) relabel_tree("/sys/fs/cgroup");
 436
 437                 if (st.f_flags & ST_RDONLY)
 438                         (void) mount_nofollow(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT|MS_RDONLY, NULL);
 439
 440         } else if (r < 0)
 441                 return log_error_errno(r, "Failed to determine whether we are in all unified mode: %m");
 442
 443         return 0;
 444 }
 445
 446 static int relabel_extra(void) {
 447         _cleanup_strv_free_ char **files = NULL;
 448         int r, c = 0;
 449
 450         /* Support for relabelling additional files or directories after loading the policy. For this, code in the
 451          * initrd simply has to drop in *.relabel files into /run/systemd/relabel-extra.d/. We'll read all such files
 452          * expecting one absolute path by line and will relabel each (and everyone below that in case the path refers
 453          * to a directory). These drop-in files are supposed to be absolutely minimal, and do not understand comments
 454          * and such. After the operation succeeded the files are removed, and the drop-in directory as well, if
 455          * possible.
 456          */
 457
 458         r = conf_files_list(&files, ".relabel", NULL,
 459                             CONF_FILES_FILTER_MASKED | CONF_FILES_REGULAR,
 460                             "/run/systemd/relabel-extra.d/");
 461         if (r < 0)
 462                 return log_error_errno(r, "Failed to enumerate /run/systemd/relabel-extra.d/, ignoring: %m");
 463
 464         STRV_FOREACH(file, files) {
 465                 _cleanup_fclose_ FILE *f = NULL;
 466
 467                 f = fopen(*file, "re");
 468                 if (!f) {
 469                         log_warning_errno(errno, "Failed to open %s, ignoring: %m", *file);
 470                         continue;
 471                 }
 472
 473                 for (;;) {
 474                         _cleanup_free_ char *line = NULL;
 475
 476                         r = read_line(f, LONG_LINE_MAX, &line);
 477                         if (r < 0) {
 478                                 log_warning_errno(r, "Failed to read %s, ignoring: %m", *file);
 479                                 break;
 480                         }
 481                         if (r == 0) /* EOF */
 482                                 break;
 483
 484                         path_simplify(line);
 485
 486                         if (!path_is_normalized(line)) {
 487                                 log_warning("Path to relabel is not normalized, ignoring: %s", line);
 488                                 continue;
 489                         }
 490
 491                         if (!path_is_absolute(line)) {
 492                                 log_warning("Path to relabel is not absolute, ignoring: %s", line);
 493                                 continue;
 494                         }
 495
 496                         log_debug("Relabelling additional file/directory '%s'.", line);
 497                         (void) label_fix(line, 0);
 498                         (void) relabel_tree(line);
 499                         c++;
 500                 }
 501
 502                 if (unlink(*file) < 0)
 503                         log_warning_errno(errno, "Failed to remove %s, ignoring: %m", *file);
 504         }
 505
 506         /* Remove when we complete things. */
 507         if (rmdir("/run/systemd/relabel-extra.d") < 0 &&
 508             errno != ENOENT)
 509                 log_warning_errno(errno, "Failed to remove /run/systemd/relabel-extra.d/ directory: %m");
 510
 511         return c;
 512 }
 513 #endif
 514
 515 int mount_setup(bool loaded_policy, bool leave_propagation) {
 516         int r;
 517
 518         r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy);
 519         if (r < 0)
 520                 return r;
 521
 522 #if HAVE_SELINUX || ENABLE_SMACK
 523         /* Nodes in devtmpfs and /run need to be manually updated for
 524          * the appropriate labels, after mounting. The other virtual
 525          * API file systems like /sys and /proc do not need that, they
 526          * use the same label for all their files. */
 527         if (loaded_policy) {
 528                 usec_t before_relabel, after_relabel;
 529                 int n_extra;
 530
 531                 before_relabel = now(CLOCK_MONOTONIC);
 532
 533                 FOREACH_STRING(i, "/dev", "/dev/shm", "/run")
 534                         (void) relabel_tree(i);
 535
 536                 (void) relabel_cgroup_filesystems();
 537
 538                 n_extra = relabel_extra();
 539
 540                 after_relabel = now(CLOCK_MONOTONIC);
 541
 542                 log_info("Relabeled /dev, /dev/shm, /run, /sys/fs/cgroup%s in %s.",
 543                          n_extra > 0 ? ", additional files" : "",
 544                          FORMAT_TIMESPAN(after_relabel - before_relabel, 0));
 545         }
 546 #endif
 547
 548         /* Create a few default symlinks, which are normally created
 549          * by udevd, but some scripts might need them before we start
 550          * udevd. */
 551         dev_setup(NULL, UID_INVALID, GID_INVALID);
 552
 553         /* Mark the root directory as shared in regards to mount propagation. The kernel defaults to "private", but we
 554          * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of
 555          * the box. If specific setups need other settings they can reset the propagation mode to private if
 556          * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a
 557          * container manager we assume the container manager knows what it is doing (for example, because it set up
 558          * some directories with different propagation modes). */
 559         if (detect_container() <= 0 && !leave_propagation)
 560                 if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
 561                         log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m");
 562
 563         /* Create a few directories we always want around, Note that sd_booted() checks for /run/systemd/system, so
 564          * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will
 565          * misdetect systemd. */
 566         (void) mkdir_label("/run/systemd", 0755);
 567         (void) mkdir_label("/run/systemd/system", 0755);
 568
 569         /* Make sure there's always a place where sandboxed environments can mount root file systems they are
 570          * about to move into, even when unprivileged, without having to create a temporary one in /tmp/
 571          * (which they then have to keep track of and clean) */
 572         (void) mkdir_label("/run/systemd/mount-rootfs", 0555);
 573
 574         /* Make sure we have a mount point to hide in sandboxes */
 575         (void) mkdir_label("/run/credentials", 0755);
 576
 577         /* Also create /run/systemd/inaccessible nodes, so that we always have something to mount
 578          * inaccessible nodes from. If we run in a container the host might have created these for us already
 579          * in /run/host/inaccessible/. Use those if we can, since that way we likely get access to block/char
 580          * device nodes that are inaccessible, and if userns is used to nodes that are on mounts owned by a
 581          * userns outside the container and thus nicely read-only and not remountable. */
 582         if (access("/run/host/inaccessible/", F_OK) < 0) {
 583                 if (errno != ENOENT)
 584                         log_debug_errno(errno, "Failed to check if /run/host/inaccessible exists, ignoring: %m");
 585
 586                 (void) make_inaccessible_nodes("/run/systemd", UID_INVALID, GID_INVALID);
 587         } else
 588                 (void) symlink("../host/inaccessible", "/run/systemd/inaccessible");
 589
 590         return 0;
 591 }