src/core/mount-setup.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <ftw.h>
   5 #include <stdlib.h>
   6 #include <sys/mount.h>
   7 #include <sys/statvfs.h>
   8 #include <unistd.h>
   9
  10 #include "alloc-util.h"
  11 #include "bus-util.h"
  12 #include "cgroup-util.h"
  13 #include "dev-setup.h"
  14 #include "efivars.h"
  15 #include "fileio.h"
  16 #include "fs-util.h"
  17 #include "label.h"
  18 #include "log.h"
  19 #include "macro.h"
  20 #include "missing.h"
  21 #include "mkdir.h"
  22 #include "mount-setup.h"
  23 #include "mount-util.h"
  24 #include "path-util.h"
  25 #include "set.h"
  26 #include "smack-util.h"
  27 #include "strv.h"
  28 #include "user-util.h"
  29 #include "util.h"
  30 #include "virt.h"
  31
  /* Behaviour flags for a MountPoint entry. The values are distinct bits and may be OR-ed together. */
  typedef enum MountMode {
          /* No special treatment: failures are logged at debug level and ignored, and the entry is
           * skipped when running in a container. */
          MNT_NONE           = 0x0,

          /* A failure to mount this entry is logged as an error and reported to the caller. */
          MNT_FATAL          = 0x1,

          /* The entry is mounted even if a container environment is detected. */
          MNT_IN_CONTAINER   = 0x2,

          /* After mounting, verify that the mount point is writable and undo the mount if it is not. */
          MNT_CHECK_WRITABLE = 0x4,
  } MountMode;
  38
  39 typedef struct MountPoint {
  40         const char *what;
  41         const char *where;
  42         const char *type;
  43         const char *options;
  44         unsigned long flags;
  45         bool (*condition_fn)(void);
  46         MountMode mode;
  47 } MountPoint;
  48
  /* Number of leading mount_table entries that mount_setup_early() processes. The first three may be
   * needed before SELinux is up, the fourth (securityfs) is required by IMA to load a custom policy.
   * Everything after that can wait until SELinux and IMA are loaded. With SMACK support compiled in,
   * smackfs is needed early as well, which makes it five. */
  #if !ENABLE_SMACK
  #define N_EARLY_MOUNT 4
  #else
  #define N_EARLY_MOUNT 5
  #endif
  58
  59 static const MountPoint mount_table[] = {
  60         { "sysfs",       "/sys",                      "sysfs",      NULL,                      MS_NOSUID|MS_NOEXEC|MS_NODEV,
  61           NULL,          MNT_FATAL|MNT_IN_CONTAINER },
  62         { "proc",        "/proc",                     "proc",       NULL,                      MS_NOSUID|MS_NOEXEC|MS_NODEV,
  63           NULL,          MNT_FATAL|MNT_IN_CONTAINER },
  64         { "devtmpfs",    "/dev",                      "devtmpfs",   "mode=755",                MS_NOSUID|MS_STRICTATIME,
  65           NULL,          MNT_FATAL|MNT_IN_CONTAINER },
  66         { "securityfs",  "/sys/kernel/security",      "securityfs", NULL,                      MS_NOSUID|MS_NOEXEC|MS_NODEV,
  67           NULL,          MNT_NONE                   },
  68 #if ENABLE_SMACK
  69         { "smackfs",     "/sys/fs/smackfs",           "smackfs",    "smackfsdef=*",            MS_NOSUID|MS_NOEXEC|MS_NODEV,
  70           mac_smack_use, MNT_FATAL                  },
  71         { "tmpfs",       "/dev/shm",                  "tmpfs",      "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
  72           mac_smack_use, MNT_FATAL                  },
  73 #endif
  74         { "tmpfs",       "/dev/shm",                  "tmpfs",      "mode=1777",               MS_NOSUID|MS_NODEV|MS_STRICTATIME,
  75           NULL,          MNT_FATAL|MNT_IN_CONTAINER },
  76         { "devpts",      "/dev/pts",                  "devpts",     "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
  77           NULL,          MNT_IN_CONTAINER           },
  78 #if ENABLE_SMACK
  79         { "tmpfs",       "/run",                      "tmpfs",      "mode=755,smackfsroot=*",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,
  80           mac_smack_use, MNT_FATAL                  },
  81 #endif
  82         { "tmpfs",       "/run",                      "tmpfs",      "mode=755",                MS_NOSUID|MS_NODEV|MS_STRICTATIME,
  83           NULL,          MNT_FATAL|MNT_IN_CONTAINER },
  84         { "cgroup2",     "/sys/fs/cgroup",            "cgroup2",    "nsdelegate",              MS_NOSUID|MS_NOEXEC|MS_NODEV,
  85           cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
  86         { "cgroup2",     "/sys/fs/cgroup",            "cgroup2",    NULL,                      MS_NOSUID|MS_NOEXEC|MS_NODEV,
  87           cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
  88         { "tmpfs",       "/sys/fs/cgroup",            "tmpfs",      "mode=755",                MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
  89           cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
  90         { "cgroup2",     "/sys/fs/cgroup/unified",    "cgroup2",    "nsdelegate",              MS_NOSUID|MS_NOEXEC|MS_NODEV,
  91           cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
  92         { "cgroup2",     "/sys/fs/cgroup/unified",    "cgroup2",    NULL,                      MS_NOSUID|MS_NOEXEC|MS_NODEV,
  93           cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
  94         { "cgroup",      "/sys/fs/cgroup/systemd",    "cgroup",     "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
  95           cg_is_legacy_wanted, MNT_IN_CONTAINER     },
  96         { "cgroup",      "/sys/fs/cgroup/systemd",    "cgroup",     "none,name=systemd",       MS_NOSUID|MS_NOEXEC|MS_NODEV,
  97           cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
  98         { "pstore",      "/sys/fs/pstore",            "pstore",     NULL,                      MS_NOSUID|MS_NOEXEC|MS_NODEV,
  99           NULL,          MNT_NONE                   },
 100 #if ENABLE_EFI
 101         { "efivarfs",    "/sys/firmware/efi/efivars", "efivarfs",   NULL,                      MS_NOSUID|MS_NOEXEC|MS_NODEV,
 102           is_efi_boot,   MNT_NONE                   },
 103 #endif
 104         { "bpf",         "/sys/fs/bpf",               "bpf",        "mode=700",                MS_NOSUID|MS_NOEXEC|MS_NODEV,
 105           NULL,          MNT_NONE,                  },
 106 };
 107
 /* API file systems and bind mounts that other software may set up. They are listed here, as a
  * NUL-separated string list, only so that mount_point_ignore() can recognize them. */

 static const char ignore_paths[] =
         "/sys/fs/selinux\0"     /* SELinux file system */
         "/proc/sys\0"           /* Container bind mount */
         "/dev/console\0"        /* Container bind mount */
         "/proc/kmsg\0";         /* Container bind mount */
 118
 119 bool mount_point_is_api(const char *path) {
 120         unsigned i;
 121
 122         /* Checks if this mount point is considered "API", and hence
 123          * should be ignored */
 124
 125         for (i = 0; i < ELEMENTSOF(mount_table); i ++)
 126                 if (path_equal(path, mount_table[i].where))
 127                         return true;
 128
 129         return path_startswith(path, "/sys/fs/cgroup/");
 130 }
 131
 132 bool mount_point_ignore(const char *path) {
 133         const char *i;
 134
 135         NULSTR_FOREACH(i, ignore_paths)
 136                 if (path_equal(path, i))
 137                         return true;
 138
 139         return false;
 140 }
 141
 142 static int mount_one(const MountPoint *p, bool relabel) {
 143         int r, priority;
 144
 145         assert(p);
 146
 147         priority = (p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG;
 148
 149         if (p->condition_fn && !p->condition_fn())
 150                 return 0;
 151
 152         /* Relabel first, just in case */
 153         if (relabel)
 154                 (void) label_fix(p->where, LABEL_IGNORE_ENOENT|LABEL_IGNORE_EROFS);
 155
 156         r = path_is_mount_point(p->where, NULL, AT_SYMLINK_FOLLOW);
 157         if (r < 0 && r != -ENOENT) {
 158                 log_full_errno(priority, r, "Failed to determine whether %s is a mount point: %m", p->where);
 159                 return (p->mode & MNT_FATAL) ? r : 0;
 160         }
 161         if (r > 0)
 162                 return 0;
 163
 164         /* Skip securityfs in a container */
 165         if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0)
 166                 return 0;
 167
 168         /* The access mode here doesn't really matter too much, since
 169          * the mounted file system will take precedence anyway. */
 170         if (relabel)
 171                 (void) mkdir_p_label(p->where, 0755);
 172         else
 173                 (void) mkdir_p(p->where, 0755);
 174
 175         log_debug("Mounting %s to %s of type %s with options %s.",
 176                   p->what,
 177                   p->where,
 178                   p->type,
 179                   strna(p->options));
 180
 181         if (mount(p->what,
 182                   p->where,
 183                   p->type,
 184                   p->flags,
 185                   p->options) < 0) {
 186                 log_full_errno(priority, errno, "Failed to mount %s at %s: %m", p->type, p->where);
 187                 return (p->mode & MNT_FATAL) ? -errno : 0;
 188         }
 189
 190         /* Relabel again, since we now mounted something fresh here */
 191         if (relabel)
 192                 (void) label_fix(p->where, 0);
 193
 194         if (p->mode & MNT_CHECK_WRITABLE) {
 195                 if (access(p->where, W_OK) < 0) {
 196                         r = -errno;
 197
 198                         (void) umount(p->where);
 199                         (void) rmdir(p->where);
 200
 201                         log_full_errno(priority, r, "Mount point %s not writable after mounting: %m", p->where);
 202                         return (p->mode & MNT_FATAL) ? r : 0;
 203                 }
 204         }
 205
 206         return 1;
 207 }
 208
 209 static int mount_points_setup(unsigned n, bool loaded_policy) {
 210         unsigned i;
 211         int r = 0;
 212
 213         for (i = 0; i < n; i ++) {
 214                 int j;
 215
 216                 j = mount_one(mount_table + i, loaded_policy);
 217                 if (j != 0 && r >= 0)
 218                         r = j;
 219         }
 220
 221         return r;
 222 }
 223
 224 int mount_setup_early(void) {
 225         assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
 226
 227         /* Do a minimal mount of /proc and friends to enable the most
 228          * basic stuff, such as SELinux */
 229         return mount_points_setup(N_EARLY_MOUNT, false);
 230 }
 231
 232 static const char *join_with(const char *controller) {
 233
 234         static const char* const pairs[] = {
 235                 "cpu", "cpuacct",
 236                 "net_cls", "net_prio",
 237                 NULL
 238         };
 239
 240         const char *const *x, *const *y;
 241
 242         assert(controller);
 243
 244         /* This will lookup which controller to mount another controller with. Input is a controller name, and output
 245          * is the other controller name. The function works both ways: you can input one and get the other, and input
 246          * the other to get the one. */
 247
 248         STRV_FOREACH_PAIR(x, y, pairs) {
 249                 if (streq(controller, *x))
 250                         return *y;
 251                 if (streq(controller, *y))
 252                         return *x;
 253         }
 254
 255         return NULL;
 256 }
 257
 /* Creates the symlink /sys/fs/cgroup/<alias> with <target> as its (relative) destination. If
  * SMACK_RUN_LABEL is defined, the SMACK label is additionally copied between the symlink and
  * /sys/fs/cgroup/<target> via mac_smack_copy(); -EOPNOTSUPP from that step is tolerated.
  *
  * Returns 0 on success, or the negative error that was logged. */
 static int symlink_controller(const char *target, const char *alias) {
         const char *link_path;
 #ifdef SMACK_RUN_LABEL
         const char *target_path;
 #endif
         int r;

         assert(target);
         assert(alias);

         link_path = strjoina("/sys/fs/cgroup/", alias);

         r = symlink_idempotent(target, link_path, false);
         if (r < 0)
                 return log_error_errno(r, "Failed to create symlink %s: %m", link_path);

 #ifdef SMACK_RUN_LABEL
         target_path = strjoina("/sys/fs/cgroup/", target);

         r = mac_smack_copy(link_path, target_path);
         if (r < 0 && r != -EOPNOTSUPP)
                 return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", target_path, link_path);
 #endif

         return 0;
 }
 283
 284 int mount_cgroup_controllers(void) {
 285         _cleanup_set_free_free_ Set *controllers = NULL;
 286         int r;
 287
 288         if (!cg_is_legacy_wanted())
 289                 return 0;
 290
 291         /* Mount all available cgroup controllers that are built into the kernel. */
 292         r = cg_kernel_controllers(&controllers);
 293         if (r < 0)
 294                 return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
 295
 296         for (;;) {
 297                 _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
 298                 const char *other_controller;
 299                 MountPoint p = {
 300                         .what = "cgroup",
 301                         .type = "cgroup",
 302                         .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
 303                         .mode = MNT_IN_CONTAINER,
 304                 };
 305
 306                 controller = set_steal_first(controllers);
 307                 if (!controller)
 308                         break;
 309
 310                 /* Check if we shall mount this together with another controller */
 311                 other_controller = join_with(controller);
 312                 if (other_controller) {
 313                         _cleanup_free_ char *c = NULL;
 314
 315                         /* Check if the other controller is actually available in the kernel too */
 316                         c = set_remove(controllers, other_controller);
 317                         if (c) {
 318
 319                                 /* Join the two controllers into one string, and maintain a stable ordering */
 320                                 if (strcmp(controller, other_controller) < 0)
 321                                         options = strjoin(controller, ",", other_controller);
 322                                 else
 323                                         options = strjoin(other_controller, ",", controller);
 324                                 if (!options)
 325                                         return log_oom();
 326                         }
 327                 }
 328
 329                 /* The simple case, where there's only one controller to mount together */
 330                 if (!options)
 331                         options = TAKE_PTR(controller);
 332
 333                 where = strappend("/sys/fs/cgroup/", options);
 334                 if (!where)
 335                         return log_oom();
 336
 337                 p.where = where;
 338                 p.options = options;
 339
 340                 r = mount_one(&p, true);
 341                 if (r < 0)
 342                         return r;
 343
 344                 /* Create symlinks from the individual controller names, in case we have a joined mount */
 345                 if (controller)
 346                         (void) symlink_controller(options, controller);
 347                 if (other_controller)
 348                         (void) symlink_controller(options, other_controller);
 349         }
 350
 351         /* Now that we mounted everything, let's make the tmpfs the cgroup file systems are mounted into read-only. */
 352         (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
 353
 354         return 0;
 355 }
 356
 357 #if HAVE_SELINUX || ENABLE_SMACK
 358 static int nftw_cb(
 359                 const char *fpath,
 360                 const struct stat *sb,
 361                 int tflag,
 362                 struct FTW *ftwbuf) {
 363
 364         /* No need to label /dev twice in a row... */
 365         if (_unlikely_(ftwbuf->level == 0))
 366                 return FTW_CONTINUE;
 367
 368         (void) label_fix(fpath, 0);
 369
 370         /* /run/initramfs is static data and big, no need to
 371          * dynamically relabel its contents at boot... */
 372         if (_unlikely_(ftwbuf->level == 1 &&
 373                       tflag == FTW_D &&
 374                       streq(fpath, "/run/initramfs")))
 375                 return FTW_SKIP_SUBTREE;
 376
 377         return FTW_CONTINUE;
 378 };
 379
 380 static int relabel_cgroup_filesystems(void) {
 381         int r;
 382         struct statfs st;
 383
 384         r = cg_all_unified();
 385         if (r == 0) {
 386                 /* Temporarily remount the root cgroup filesystem to give it a proper label. Do this
 387                    only when the filesystem has been already populated by a previous instance of systemd
 388                    running from initrd. Otherwise don't remount anything and leave the filesystem read-write
 389                    for the cgroup filesystems to be mounted inside. */
 390                 if (statfs("/sys/fs/cgroup", &st) < 0)
 391                         return log_error_errno(errno, "Failed to determine mount flags for /sys/fs/cgroup: %m");
 392
 393                 if (st.f_flags & ST_RDONLY)
 394                         (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, NULL);
 395
 396                 (void) label_fix("/sys/fs/cgroup", 0);
 397                 (void) nftw("/sys/fs/cgroup", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
 398
 399                 if (st.f_flags & ST_RDONLY)
 400                         (void) mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT|MS_RDONLY, NULL);
 401
 402         } else if (r < 0)
 403                 return log_error_errno(r, "Failed to determine whether we are in all unified mode: %m");
 404
 405         return 0;
 406 }
 407 #endif
 408
 409 int mount_setup(bool loaded_policy) {
 410         int r = 0;
 411
 412         r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy);
 413         if (r < 0)
 414                 return r;
 415
 416 #if HAVE_SELINUX || ENABLE_SMACK
 417         /* Nodes in devtmpfs and /run need to be manually updated for
 418          * the appropriate labels, after mounting. The other virtual
 419          * API file systems like /sys and /proc do not need that, they
 420          * use the same label for all their files. */
 421         if (loaded_policy) {
 422                 usec_t before_relabel, after_relabel;
 423                 char timespan[FORMAT_TIMESPAN_MAX];
 424
 425                 before_relabel = now(CLOCK_MONOTONIC);
 426
 427                 (void) nftw("/dev", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
 428                 (void) nftw("/dev/shm", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
 429                 (void) nftw("/run", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
 430
 431                 r = relabel_cgroup_filesystems();
 432                 if (r < 0)
 433                         return r;
 434
 435                 after_relabel = now(CLOCK_MONOTONIC);
 436
 437                 log_info("Relabelled /dev, /run and /sys/fs/cgroup in %s.",
 438                          format_timespan(timespan, sizeof(timespan), after_relabel - before_relabel, 0));
 439         }
 440 #endif
 441
 442         /* Create a few default symlinks, which are normally created
 443          * by udevd, but some scripts might need them before we start
 444          * udevd. */
 445         dev_setup(NULL, UID_INVALID, GID_INVALID);
 446
 447         /* Mark the root directory as shared in regards to mount propagation. The kernel defaults to "private", but we
 448          * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of
 449          * the box. If specific setups need other settings they can reset the propagation mode to private if
 450          * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a
 451          * container manager we assume the container manager knows what it is doing (for example, because it set up
 452          * some directories with different propagation modes). */
 453         if (detect_container() <= 0)
 454                 if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
 455                         log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m");
 456
 457         /* Create a few directories we always want around, Note that sd_booted() checks for /run/systemd/system, so
 458          * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will
 459          * misdetect systemd. */
 460         (void) mkdir_label("/run/systemd", 0755);
 461         (void) mkdir_label("/run/systemd/system", 0755);
 462
 463         /* Set up inaccessible (and empty) file nodes of all types */
 464         (void) mkdir_label("/run/systemd/inaccessible", 0000);
 465         (void) mknod("/run/systemd/inaccessible/reg", S_IFREG | 0000, 0);
 466         (void) mkdir_label("/run/systemd/inaccessible/dir", 0000);
 467         (void) mkfifo("/run/systemd/inaccessible/fifo", 0000);
 468         (void) mknod("/run/systemd/inaccessible/sock", S_IFSOCK | 0000, 0);
 469
 470         /* The following two are likely to fail if we lack the privs for it (for example in an userns environment, if
 471          * CAP_SYS_MKNOD is missing, or if a device node policy prohibit major/minor of 0 device nodes to be
 472          * created). But that's entirely fine. Consumers of these files should carry fallback to use a different node
 473          * then, for example /run/systemd/inaccessible/sock, which is close enough in behaviour and semantics for most
 474          * uses. */
 475         (void) mknod("/run/systemd/inaccessible/chr", S_IFCHR | 0000, makedev(0, 0));
 476         (void) mknod("/run/systemd/inaccessible/blk", S_IFBLK | 0000, makedev(0, 0));
 477
 478         return 0;
 479 }