src/core/namespace.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <linux/loop.h>
   5 #include <sched.h>
   6 #include <stdio.h>
   7 #include <sys/file.h>
   8 #include <sys/mount.h>
   9 #include <unistd.h>
  10 #if WANT_LINUX_FS_H
  11 #include <linux/fs.h>
  12 #endif
  13
  14 #include "alloc-util.h"
  15 #include "base-filesystem.h"
  16 #include "chase-symlinks.h"
  17 #include "dev-setup.h"
  18 #include "devnum-util.h"
  19 #include "env-util.h"
  20 #include "escape.h"
  21 #include "extension-release.h"
  22 #include "fd-util.h"
  23 #include "format-util.h"
  24 #include "glyph-util.h"
  25 #include "label.h"
  26 #include "list.h"
  27 #include "loop-util.h"
  28 #include "loopback-setup.h"
  29 #include "missing_syscall.h"
  30 #include "mkdir-label.h"
  31 #include "mount-util.h"
  32 #include "mountpoint-util.h"
  33 #include "namespace-util.h"
  34 #include "namespace.h"
  35 #include "nsflags.h"
  36 #include "nulstr-util.h"
  37 #include "os-util.h"
  38 #include "path-util.h"
  39 #include "selinux-util.h"
  40 #include "socket-util.h"
  41 #include "sort-util.h"
  42 #include "stat-util.h"
  43 #include "string-table.h"
  44 #include "string-util.h"
  45 #include "strv.h"
  46 #include "tmpfile-util.h"
  47 #include "umask-util.h"
  48 #include "user-util.h"
  49
/* Set of MS_* mount flags grouped under a /dev-related name. No use is visible in this part of the
 * file — presumably applied when mounting the private /dev; confirm at the use site. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
  51
/* The kind of mount operation a MountEntry describes.
 *
 * The numeric order is significant: mount_path_compare() orders entries with identical paths by
 * ascending mode value, and drop_duplicates() keeps the first entry of such a run. Hence the
 * enumerator listed earlier wins over one listed later for the same path.
 *
 * MountEntry stores this in a 5-bit bit-field, so all values must fit into 5 bits. */
typedef enum MountMode {
        /* This is ordered by priority! */
        INACCESSIBLE,
        OVERLAY_MOUNT,
        MOUNT_IMAGES,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_TMP_READONLY,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
        NOEXEC,
        EXEC,
        TMPFS,
        RUN,
        EXTENSION_DIRECTORIES, /* Bind-mounted outside the root directory, and used by subsequent mounts */
        EXTENSION_IMAGES, /* Mounted outside the root directory, and used by subsequent mounts */
        MQUEUEFS,
        READWRITE_IMPLICIT, /* Should have the lowest priority. */
        _MOUNT_MODE_MAX, /* Number of modes; also sizes mount_mode_table[] */
} MountMode;
  78
/* One entry of the mount table that is being assembled.
 *
 * Most string members come in pairs: a '_const' pointer that is borrowed (static or owned by the
 * caller) and a '_malloc' pointer that is owned by the entry and released by mount_entry_done().
 * The accessor helpers (mount_entry_path(), mount_entry_source(), ...) prefer the '_malloc' variant
 * when it is set.
 *
 * Note that the static tables below use positional initializers, which rely on the first three
 * members being, in this order: path_const, mode, ignore. */
typedef struct MountEntry {
        const char *path_const;   /* Memory allocated on stack or static */
        MountMode mode:5;
        bool ignore:1;            /* Ignore if path does not exist? */
        bool has_prefix:1;        /* Already is prefixed by the root dir? */
        bool read_only:1;         /* Shall this mount point be read-only? */
        bool nosuid:1;            /* Shall set MS_NOSUID on the mount itself */
        bool noexec:1;            /* Shall set MS_NOEXEC on the mount itself */
        bool exec:1;              /* Shall clear MS_NOEXEC on the mount itself */
        bool applied:1;           /* Already applied */
        char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
        const char *unprefixed_path_const; /* If the path was amended with a prefix, these will save the original */
        char *unprefixed_path_malloc;
        const char *source_const; /* The source path, for bind mounts or images */
        char *source_malloc;      /* Heap-allocated source path; takes precedence over 'source_const' */
        const char *options_const;/* Mount options for tmpfs */
        char *options_malloc;     /* Heap-allocated mount options; takes precedence over 'options_const' */
        unsigned long flags;      /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
        unsigned n_followed;      /* NOTE(review): not used in the visible part of the file — presumably counts followed symlinks; confirm */
        LIST_HEAD(MountOptions, image_options); /* Released via mount_options_free_all() in mount_entry_done() */
} MountEntry;
 100
/* If MountAPIVFS= is used, let's mount /sys, /proc, /dev and /run into it, but only as a fallback if the user hasn't mounted
 * something there already. These mounts are hence overridden by any other explicitly configured mounts.
 *
 * Entries are { path, mode, ignore-if-missing }, optionally followed by designated members. */
static const MountEntry apivfs_table[] = {
        { "/proc",               PROCFS,       false },
        { "/dev",                BIND_DEV,     false },
        { "/sys",                SYSFS,        false },
        { "/run",                RUN,          false, .options_const = "mode=0755" TMPFS_LIMITS_RUN, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME },
};
 109
/* ProtectKernelTunables= option and the related filesystem APIs.
 *
 * This is the /proc part. Entries are { path, mode, ignore-if-missing }; all of them set the
 * ignore flag, i.e. a path that does not exist is skipped. */
static const MountEntry protect_kernel_tunables_proc_table[] = {
        { "/proc/acpi",          READONLY,           true  },
        { "/proc/apm",           READONLY,           true  }, /* Obsolete API, there's no point in permitting access to this, ever */
        { "/proc/asound",        READONLY,           true  },
        { "/proc/bus",           READONLY,           true  },
        { "/proc/fs",            READONLY,           true  },
        { "/proc/irq",           READONLY,           true  },
        { "/proc/kallsyms",      INACCESSIBLE,       true  },
        { "/proc/kcore",         INACCESSIBLE,       true  },
        { "/proc/latency_stats", READONLY,           true  },
        { "/proc/mtrr",          READONLY,           true  },
        { "/proc/scsi",          READONLY,           true  },
        { "/proc/sys",           READONLY,           true  },
        { "/proc/sysrq-trigger", READONLY,           true  },
        { "/proc/timer_stats",   READONLY,           true  },
};
 127
/* The /sys part of ProtectKernelTunables=. Entries are { path, mode, ignore-if-missing }.
 * /sys itself is made read-only, while the READWRITE_IMPLICIT entries below it are marked with
 * the lowest-priority mode, so that any other entry for the same path takes precedence. */
static const MountEntry protect_kernel_tunables_sys_table[] = {
        { "/sys",                READONLY,           false },
        { "/sys/fs/bpf",         READONLY,           true  },
        { "/sys/fs/cgroup",      READWRITE_IMPLICIT, false }, /* READONLY is set by ProtectControlGroups= option */
        { "/sys/fs/selinux",     READWRITE_IMPLICIT, true  },
        { "/sys/kernel/debug",   READONLY,           true  },
        { "/sys/kernel/tracing", READONLY,           true  },
};
 136
/* ProtectKernelModules= option: the module directories are made inaccessible.
 * Entries are { path, mode, ignore-if-missing }. /lib/modules is only listed when built with
 * HAVE_SPLIT_USR. */
static const MountEntry protect_kernel_modules_table[] = {
#if HAVE_SPLIT_USR
        { "/lib/modules",        INACCESSIBLE, true  },
#endif
        { "/usr/lib/modules",    INACCESSIBLE, true  },
};
 144
/* ProtectKernelLogs= option, /proc part. Entries are { path, mode, ignore-if-missing }. */
static const MountEntry protect_kernel_logs_proc_table[] = {
        { "/proc/kmsg",          INACCESSIBLE, true },
};
 149
/* The /dev counterpart of protect_kernel_logs_proc_table. Entries are { path, mode, ignore-if-missing }. */
static const MountEntry protect_kernel_logs_dev_table[] = {
        { "/dev/kmsg",           INACCESSIBLE, true },
};
 153
/*
 * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
 * system should be protected by ProtectSystem=
 *
 * Entries are { path, mode, ignore-if-missing }. The same three paths appear in all three
 * ProtectHome= tables; only the mode differs.
 */
static const MountEntry protect_home_read_only_table[] = {
        { "/home",               READONLY,     true  },
        { "/run/user",           READONLY,     true  },
        { "/root",               READONLY,     true  },
};
 163
/* ProtectHome=tmpfs table.
 *
 * Entries are { path, mode, ignore-if-missing } followed by designated members: each tmpfs is
 * declared read-only, with explicit mount options (note the stricter mode=0700 for /root) and
 * mount flags. */
static const MountEntry protect_home_tmpfs_table[] = {
        { "/home",               TMPFS,        true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
        { "/run/user",           TMPFS,        true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
        { "/root",               TMPFS,        true, .read_only = true, .options_const = "mode=0700" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
};
 170
/* ProtectHome=yes table: the home directories are made inaccessible.
 * Entries are { path, mode, ignore-if-missing }. */
static const MountEntry protect_home_yes_table[] = {
        { "/home",               INACCESSIBLE, true  },
        { "/run/user",           INACCESSIBLE, true  },
        { "/root",               INACCESSIBLE, true  },
};
 177
/* ProtectSystem=yes table.
 *
 * Entries are { path, mode, ignore-if-missing }. /usr is the only entry that does not set the
 * ignore flag. The /lib, /lib64, /bin (and /sbin) entries are only compiled in for split-usr
 * (and split-bin) builds. */
static const MountEntry protect_system_yes_table[] = {
        { "/usr",                READONLY,     false },
        { "/boot",               READONLY,     true  },
        { "/efi",                READONLY,     true  },
#if HAVE_SPLIT_USR
        { "/lib",                READONLY,     true  },
        { "/lib64",              READONLY,     true  },
        { "/bin",                READONLY,     true  },
#  if HAVE_SPLIT_BIN
        { "/sbin",               READONLY,     true  },
#  endif
#endif
};
 192
/* ProtectSystem=full includes ProtectSystem=yes.
 *
 * I.e. this repeats every entry of protect_system_yes_table and adds /etc on top.
 * Entries are { path, mode, ignore-if-missing }. */
static const MountEntry protect_system_full_table[] = {
        { "/usr",                READONLY,     false },
        { "/boot",               READONLY,     true  },
        { "/efi",                READONLY,     true  },
        { "/etc",                READONLY,     false },
#if HAVE_SPLIT_USR
        { "/lib",                READONLY,     true  },
        { "/lib64",              READONLY,     true  },
        { "/bin",                READONLY,     true  },
#  if HAVE_SPLIT_BIN
        { "/sbin",               READONLY,     true  },
#  endif
#endif
};
 208
/*
 * ProtectSystem=strict table. In this strict mode, we mount everything
 * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
 * which are left writable, but PrivateDevices= + ProtectKernelTunables=
 * protect those, and these options should be fully orthogonal.
 * (And of course /home and friends are also left writable, as ProtectHome=
 * shall manage those, orthogonally).
 *
 * Entries are { path, mode, ignore-if-missing }. The exceptions use READWRITE_IMPLICIT, the
 * lowest-priority mode, so that an entry for the same path added by the option named in the
 * trailing comment takes precedence.
 */
static const MountEntry protect_system_strict_table[] = {
        { "/",                   READONLY,           false },
        { "/proc",               READWRITE_IMPLICIT, false },      /* ProtectKernelTunables= */
        { "/sys",                READWRITE_IMPLICIT, false },      /* ProtectKernelTunables= */
        { "/dev",                READWRITE_IMPLICIT, false },      /* PrivateDevices= */
        { "/home",               READWRITE_IMPLICIT, true  },      /* ProtectHome= */
        { "/run/user",           READWRITE_IMPLICIT, true  },      /* ProtectHome= */
        { "/root",               READWRITE_IMPLICIT, true  },      /* ProtectHome= */
};
 226
 227 static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
 228         [INACCESSIBLE]         = "inaccessible",
 229         [OVERLAY_MOUNT]        = "overlay",
 230         [BIND_MOUNT]           = "bind",
 231         [BIND_MOUNT_RECURSIVE] = "rbind",
 232         [PRIVATE_TMP]          = "private-tmp",
 233         [PRIVATE_DEV]          = "private-dev",
 234         [BIND_DEV]             = "bind-dev",
 235         [EMPTY_DIR]            = "empty",
 236         [SYSFS]                = "sysfs",
 237         [PROCFS]               = "procfs",
 238         [READONLY]             = "read-only",
 239         [READWRITE]            = "read-write",
 240         [TMPFS]                = "tmpfs",
 241         [MOUNT_IMAGES]         = "mount-images",
 242         [READWRITE_IMPLICIT]   = "rw-implicit",
 243         [EXEC]                 = "exec",
 244         [NOEXEC]               = "noexec",
 245         [MQUEUEFS]             = "mqueuefs",
 246 };
 247
 248 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
 249
 250 static const char *mount_entry_path(const MountEntry *p) {
 251         assert(p);
 252
 253         /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
 254          * otherwise the stack/static ->path field is returned. */
 255
 256         return p->path_malloc ?: p->path_const;
 257 }
 258
 259 static const char *mount_entry_unprefixed_path(const MountEntry *p) {
 260         assert(p);
 261
 262         /* Returns the unprefixed path (ie: before prefix_where_needed() ran), if any */
 263
 264         return p->unprefixed_path_malloc ?: p->unprefixed_path_const ?: mount_entry_path(p);
 265 }
 266
 267 static void mount_entry_consume_prefix(MountEntry *p, char *new_path) {
 268         assert(p);
 269         assert(p->path_malloc || p->path_const);
 270         assert(new_path);
 271
 272         /* Saves current path in unprefixed_ variable, and takes over new_path */
 273
 274         free_and_replace(p->unprefixed_path_malloc, p->path_malloc);
 275         /* If we didn't have a path on the heap, then it's a static one */
 276         if (!p->unprefixed_path_malloc)
 277                 p->unprefixed_path_const = p->path_const;
 278         p->path_malloc = new_path;
 279         p->has_prefix = true;
 280 }
 281
 282 static bool mount_entry_read_only(const MountEntry *p) {
 283         assert(p);
 284
 285         return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE, PRIVATE_TMP_READONLY);
 286 }
 287
 288 static bool mount_entry_noexec(const MountEntry *p) {
 289         assert(p);
 290
 291         return p->noexec || IN_SET(p->mode, NOEXEC, INACCESSIBLE, SYSFS, PROCFS);
 292 }
 293
 294 static bool mount_entry_exec(const MountEntry *p) {
 295         assert(p);
 296
 297         return p->exec || p->mode == EXEC;
 298 }
 299
 300 static const char *mount_entry_source(const MountEntry *p) {
 301         assert(p);
 302
 303         return p->source_malloc ?: p->source_const;
 304 }
 305
 306 static const char *mount_entry_options(const MountEntry *p) {
 307         assert(p);
 308
 309         return p->options_malloc ?: p->options_const;
 310 }
 311
 312 static void mount_entry_done(MountEntry *p) {
 313         assert(p);
 314
 315         p->path_malloc = mfree(p->path_malloc);
 316         p->unprefixed_path_malloc = mfree(p->unprefixed_path_malloc);
 317         p->source_malloc = mfree(p->source_malloc);
 318         p->options_malloc = mfree(p->options_malloc);
 319         p->image_options = mount_options_free_all(p->image_options);
 320 }
 321
 322 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
 323         assert(p);
 324
 325         /* Adds a list of user-supplied READWRITE/READWRITE_IMPLICIT/READONLY/INACCESSIBLE entries */
 326
 327         STRV_FOREACH(i, strv) {
 328                 bool ignore = false, needs_prefix = false;
 329                 const char *e = *i;
 330
 331                 /* Look for any prefixes */
 332                 if (startswith(e, "-")) {
 333                         e++;
 334                         ignore = true;
 335                 }
 336                 if (startswith(e, "+")) {
 337                         e++;
 338                         needs_prefix = true;
 339                 }
 340
 341                 if (!path_is_absolute(e))
 342                         return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
 343                                                "Path is not absolute: %s", e);
 344
 345                 *((*p)++) = (MountEntry) {
 346                         .path_const = e,
 347                         .mode = mode,
 348                         .ignore = ignore,
 349                         .has_prefix = !needs_prefix && !forcibly_require_prefix,
 350                 };
 351         }
 352
 353         return 0;
 354 }
 355
 356 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
 357         assert(p);
 358
 359         /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
 360          * "/private/" boundary directories for DynamicUser=1. */
 361
 362         STRV_FOREACH(i, strv) {
 363
 364                 *((*p)++) = (MountEntry) {
 365                         .path_const = *i,
 366                         .mode = EMPTY_DIR,
 367                         .ignore = false,
 368                         .read_only = true,
 369                         .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
 370                         .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
 371                 };
 372         }
 373
 374         return 0;
 375 }
 376
 377 static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) {
 378         assert(p);
 379
 380         for (size_t i = 0; i < n; i++) {
 381                 const BindMount *b = binds + i;
 382
 383                 *((*p)++) = (MountEntry) {
 384                         .path_const = b->destination,
 385                         .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
 386                         .read_only = b->read_only,
 387                         .nosuid = b->nosuid,
 388                         .source_const = b->source,
 389                         .ignore = b->ignore_enoent,
 390                 };
 391         }
 392
 393         return 0;
 394 }
 395
 396 static int append_mount_images(MountEntry **p, const MountImage *mount_images, size_t n) {
 397         assert(p);
 398
 399         for (size_t i = 0; i < n; i++) {
 400                 const MountImage *m = mount_images + i;
 401
 402                 *((*p)++) = (MountEntry) {
 403                         .path_const = m->destination,
 404                         .mode = MOUNT_IMAGES,
 405                         .source_const = m->source,
 406                         .image_options = m->mount_options,
 407                         .ignore = m->ignore_enoent,
 408                 };
 409         }
 410
 411         return 0;
 412 }
 413
 414 static int append_extensions(
 415                 MountEntry **p,
 416                 const char *root,
 417                 const char *extension_dir,
 418                 char **hierarchies,
 419                 const MountImage *mount_images,
 420                 size_t n,
 421                 char **extension_directories) {
 422
 423         _cleanup_strv_free_ char **overlays = NULL;
 424         int r;
 425
 426         if (n == 0 && strv_isempty(extension_directories))
 427                 return 0;
 428
 429         assert(p);
 430         assert(extension_dir);
 431
 432         /* Prepare a list of overlays, that will have as each element a string suitable for being
 433          * passed as a lowerdir= parameter, so start with the hierarchy on the root.
 434          * The overlays vector will have the same number of elements and will correspond to the
 435          * hierarchies vector, so they can be iterated upon together. */
 436         STRV_FOREACH(hierarchy, hierarchies) {
 437                 _cleanup_free_ char *prefixed_hierarchy = NULL;
 438
 439                 prefixed_hierarchy = path_join(root, *hierarchy);
 440                 if (!prefixed_hierarchy)
 441                         return -ENOMEM;
 442
 443                 r = strv_consume(&overlays, TAKE_PTR(prefixed_hierarchy));
 444                 if (r < 0)
 445                         return r;
 446         }
 447
 448         /* First, prepare a mount for each image, but these won't be visible to the unit, instead
 449          * they will be mounted in our propagate directory, and used as a source for the overlay. */
 450         for (size_t i = 0; i < n; i++) {
 451                 _cleanup_free_ char *mount_point = NULL;
 452                 const MountImage *m = mount_images + i;
 453
 454                 r = asprintf(&mount_point, "%s/%zu", extension_dir, i);
 455                 if (r < 0)
 456                         return -ENOMEM;
 457
 458                 for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
 459                         _cleanup_free_ char *prefixed_hierarchy = NULL, *escaped = NULL, *lowerdir = NULL;
 460
 461                         prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
 462                         if (!prefixed_hierarchy)
 463                                 return -ENOMEM;
 464
 465                         escaped = shell_escape(prefixed_hierarchy, ",:");
 466                         if (!escaped)
 467                                 return -ENOMEM;
 468
 469                         /* Note that lowerdir= parameters are in 'reverse' order, so the
 470                          * top-most directory in the overlay comes first in the list. */
 471                         lowerdir = strjoin(escaped, ":", overlays[j]);
 472                         if (!lowerdir)
 473                                 return -ENOMEM;
 474
 475                         free_and_replace(overlays[j], lowerdir);
 476                 }
 477
 478                 *((*p)++) = (MountEntry) {
 479                         .path_malloc = TAKE_PTR(mount_point),
 480                         .image_options = m->mount_options,
 481                         .ignore = m->ignore_enoent,
 482                         .source_const = m->source,
 483                         .mode = EXTENSION_IMAGES,
 484                         .has_prefix = true,
 485                 };
 486         }
 487
 488         /* Secondly, extend the lowerdir= parameters with each ExtensionDirectory.
 489          * Bind mount them in the same location as the ExtensionImages, so that we
 490          * can check that they are valid trees (extension-release.d). */
 491         STRV_FOREACH(extension_directory, extension_directories) {
 492                 _cleanup_free_ char *mount_point = NULL, *source = NULL;
 493                 const char *e = *extension_directory;
 494                 bool ignore_enoent = false;
 495
 496                 /* Pick up the counter where the ExtensionImages left it. */
 497                 r = asprintf(&mount_point, "%s/%zu", extension_dir, n++);
 498                 if (r < 0)
 499                         return -ENOMEM;
 500
 501                 /* Look for any prefixes */
 502                 if (startswith(e, "-")) {
 503                         e++;
 504                         ignore_enoent = true;
 505                 }
 506                 /* Ignore this for now */
 507                 if (startswith(e, "+"))
 508                         e++;
 509
 510                 source = strdup(e);
 511                 if (!source)
 512                         return -ENOMEM;
 513
 514                 for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
 515                         _cleanup_free_ char *prefixed_hierarchy = NULL, *escaped = NULL, *lowerdir = NULL;
 516
 517                         prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
 518                         if (!prefixed_hierarchy)
 519                                 return -ENOMEM;
 520
 521                         escaped = shell_escape(prefixed_hierarchy, ",:");
 522                         if (!escaped)
 523                                 return -ENOMEM;
 524
 525                         /* Note that lowerdir= parameters are in 'reverse' order, so the
 526                          * top-most directory in the overlay comes first in the list. */
 527                         lowerdir = strjoin(escaped, ":", overlays[j]);
 528                         if (!lowerdir)
 529                                 return -ENOMEM;
 530
 531                         free_and_replace(overlays[j], lowerdir);
 532                 }
 533
 534                 *((*p)++) = (MountEntry) {
 535                         .path_malloc = TAKE_PTR(mount_point),
 536                         .source_const = TAKE_PTR(source),
 537                         .mode = EXTENSION_DIRECTORIES,
 538                         .ignore = ignore_enoent,
 539                         .has_prefix = true,
 540                         .read_only = true,
 541                 };
 542         }
 543
 544         /* Then, for each hierarchy, prepare an overlay with the list of lowerdir= strings
 545          * set up earlier. */
 546         for (size_t i = 0; hierarchies && hierarchies[i]; ++i) {
 547                 _cleanup_free_ char *prefixed_hierarchy = NULL;
 548
 549                 prefixed_hierarchy = path_join(root, hierarchies[i]);
 550                 if (!prefixed_hierarchy)
 551                         return -ENOMEM;
 552
 553                 *((*p)++) = (MountEntry) {
 554                         .path_malloc = TAKE_PTR(prefixed_hierarchy),
 555                         .options_malloc = TAKE_PTR(overlays[i]),
 556                         .mode = OVERLAY_MOUNT,
 557                         .has_prefix = true,
 558                         .ignore = true, /* If the source image doesn't set the ignore bit it will fail earlier. */
 559                 };
 560         }
 561
 562         return 0;
 563 }
 564
 565 static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) {
 566         assert(p);
 567
 568         for (size_t i = 0; i < n; i++) {
 569                 const TemporaryFileSystem *t = tmpfs + i;
 570                 _cleanup_free_ char *o = NULL, *str = NULL;
 571                 unsigned long flags;
 572                 bool ro = false;
 573                 int r;
 574
 575                 if (!path_is_absolute(t->path))
 576                         return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
 577                                                "Path is not absolute: %s",
 578                                                t->path);
 579
 580                 str = strjoin("mode=0755" NESTED_TMPFS_LIMITS ",", t->options);
 581                 if (!str)
 582                         return -ENOMEM;
 583
 584                 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
 585                 if (r < 0)
 586                         return log_debug_errno(r, "Failed to parse mount option '%s': %m", str);
 587
 588                 ro = flags & MS_RDONLY;
 589                 if (ro)
 590                         flags ^= MS_RDONLY;
 591
 592                 *((*p)++) = (MountEntry) {
 593                         .path_const = t->path,
 594                         .mode = TMPFS,
 595                         .read_only = ro,
 596                         .options_malloc = TAKE_PTR(o),
 597                         .flags = flags,
 598                 };
 599         }
 600
 601         return 0;
 602 }
 603
 604 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, size_t n, bool ignore_protect) {
 605         assert(p);
 606         assert(mounts);
 607
 608         /* Adds a list of static pre-defined entries */
 609
 610         for (size_t i = 0; i < n; i++)
 611                 *((*p)++) = (MountEntry) {
 612                         .path_const = mount_entry_path(mounts+i),
 613                         .mode = mounts[i].mode,
 614                         .ignore = mounts[i].ignore || ignore_protect,
 615                 };
 616
 617         return 0;
 618 }
 619
 620 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
 621         assert(p);
 622
 623         switch (protect_home) {
 624
 625         case PROTECT_HOME_NO:
 626                 return 0;
 627
 628         case PROTECT_HOME_READ_ONLY:
 629                 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
 630
 631         case PROTECT_HOME_TMPFS:
 632                 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
 633
 634         case PROTECT_HOME_YES:
 635                 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
 636
 637         default:
 638                 assert_not_reached();
 639         }
 640 }
 641
 642 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
 643         assert(p);
 644
 645         switch (protect_system) {
 646
 647         case PROTECT_SYSTEM_NO:
 648                 return 0;
 649
 650         case PROTECT_SYSTEM_STRICT:
 651                 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
 652
 653         case PROTECT_SYSTEM_YES:
 654                 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
 655
 656         case PROTECT_SYSTEM_FULL:
 657                 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
 658
 659         default:
 660                 assert_not_reached();
 661         }
 662 }
 663
 664 static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
 665         int d;
 666
 667         /* ExtensionImages/Directories will be used by other mounts as a base, so sort them first
 668          * regardless of the prefix - they are set up in the propagate directory anyway */
 669         d = -CMP(a->mode == EXTENSION_IMAGES, b->mode == EXTENSION_IMAGES);
 670         if (d != 0)
 671                 return d;
 672         d = -CMP(a->mode == EXTENSION_DIRECTORIES, b->mode == EXTENSION_DIRECTORIES);
 673         if (d != 0)
 674                 return d;
 675
 676         /* If the paths are not equal, then order prefixes first */
 677         d = path_compare(mount_entry_path(a), mount_entry_path(b));
 678         if (d != 0)
 679                 return d;
 680
 681         /* If the paths are equal, check the mode */
 682         return CMP((int) a->mode, (int) b->mode);
 683 }
 684
 685 static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) {
 686         /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */
 687
 688         assert(m || n == 0);
 689
 690         for (size_t i = 0; i < n; i++) {
 691                 char *s;
 692
 693                 if (m[i].has_prefix)
 694                         continue;
 695
 696                 s = path_join(root_directory, mount_entry_path(m+i));
 697                 if (!s)
 698                         return -ENOMEM;
 699
 700                 mount_entry_consume_prefix(&m[i], s);
 701         }
 702
 703         return 0;
 704 }
 705
 706 static void drop_duplicates(MountEntry *m, size_t *n) {
 707         MountEntry *f, *t, *previous;
 708
 709         assert(m);
 710         assert(n);
 711
 712         /* Drops duplicate entries. Expects that the array is properly ordered already. */
 713
 714         for (f = m, t = m, previous = NULL; f < m + *n; f++) {
 715
 716                 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
 717                  * above. Note that we only drop duplicates that haven't been mounted yet. */
 718                 if (previous &&
 719                     path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
 720                     !f->applied && !previous->applied) {
 721                         log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode));
 722                         /* Propagate the flags to the remaining entry */
 723                         previous->read_only = previous->read_only || mount_entry_read_only(f);
 724                         previous->noexec = previous->noexec || mount_entry_noexec(f);
 725                         previous->exec = previous->exec || mount_entry_exec(f);
 726                         mount_entry_done(f);
 727                         continue;
 728                 }
 729
 730                 *t = *f;
 731                 previous = t;
 732                 t++;
 733         }
 734
 735         *n = t - m;
 736 }
 737
 738 static void drop_inaccessible(MountEntry *m, size_t *n) {
 739         MountEntry *f, *t;
 740         const char *clear = NULL;
 741
 742         assert(m);
 743         assert(n);
 744
 745         /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
 746          * ordered already. */
 747
 748         for (f = m, t = m; f < m + *n; f++) {
 749
 750                 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
 751                  * it, as inaccessible paths really should drop the entire subtree. */
 752                 if (clear && path_startswith(mount_entry_path(f), clear)) {
 753                         log_debug("%s is masked by %s.", mount_entry_path(f), clear);
 754                         mount_entry_done(f);
 755                         continue;
 756                 }
 757
 758                 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
 759
 760                 *t = *f;
 761                 t++;
 762         }
 763
 764         *n = t - m;
 765 }
 766
 767 static void drop_nop(MountEntry *m, size_t *n) {
 768         MountEntry *f, *t;
 769
 770         assert(m);
 771         assert(n);
 772
 773         /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
 774          * list is ordered by prefixes. */
 775
 776         for (f = m, t = m; f < m + *n; f++) {
 777
 778                 /* Only suppress such subtrees for READONLY, READWRITE and READWRITE_IMPLICIT entries */
 779                 if (IN_SET(f->mode, READONLY, READWRITE, READWRITE_IMPLICIT)) {
 780                         MountEntry *found = NULL;
 781
 782                         /* Now let's find the first parent of the entry we are looking at. */
 783                         for (MountEntry *p = PTR_SUB1(t, m); p; p = PTR_SUB1(p, m))
 784                                 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
 785                                         found = p;
 786                                         break;
 787                                 }
 788
 789                         /* We found it, let's see if it's the same mode, if so, we can drop this entry */
 790                         if (found && found->mode == f->mode) {
 791                                 log_debug("%s (%s) is made redundant by %s (%s)",
 792                                           mount_entry_path(f), mount_mode_to_string(f->mode),
 793                                           mount_entry_path(found), mount_mode_to_string(found->mode));
 794                                 mount_entry_done(f);
 795                                 continue;
 796                         }
 797                 }
 798
 799                 *t = *f;
 800                 t++;
 801         }
 802
 803         *n = t - m;
 804 }
 805
 806 static void drop_outside_root(const char *root_directory, MountEntry *m, size_t *n) {
 807         MountEntry *f, *t;
 808
 809         assert(m);
 810         assert(n);
 811
 812         /* Nothing to do */
 813         if (!root_directory)
 814                 return;
 815
 816         /* Drops all mounts that are outside of the root directory. */
 817
 818         for (f = m, t = m; f < m + *n; f++) {
 819
 820                 /* ExtensionImages/Directories bases are opened in /run/systemd/unit-extensions on the host */
 821                 if (!IN_SET(f->mode, EXTENSION_IMAGES, EXTENSION_DIRECTORIES) && !path_startswith(mount_entry_path(f), root_directory)) {
 822                         log_debug("%s is outside of root directory.", mount_entry_path(f));
 823                         mount_entry_done(f);
 824                         continue;
 825                 }
 826
 827                 *t = *f;
 828                 t++;
 829         }
 830
 831         *n = t - m;
 832 }
 833
 834 static int clone_device_node(
 835                 const char *d,
 836                 const char *temporary_mount,
 837                 bool *make_devnode) {
 838
 839         _cleanup_free_ char *sl = NULL;
 840         const char *dn, *bn, *t;
 841         struct stat st;
 842         int r;
 843
 844         if (stat(d, &st) < 0) {
 845                 if (errno == ENOENT) {
 846                         log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
 847                         return -ENXIO;
 848                 }
 849
 850                 return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
 851         }
 852
 853         if (!S_ISBLK(st.st_mode) &&
 854             !S_ISCHR(st.st_mode))
 855                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
 856                                        "Device node '%s' to clone is not a device node, ignoring.",
 857                                        d);
 858
 859         dn = strjoina(temporary_mount, d);
 860
 861         /* First, try to create device node properly */
 862         if (*make_devnode) {
 863                 mac_selinux_create_file_prepare(d, st.st_mode);
 864                 r = mknod(dn, st.st_mode, st.st_rdev);
 865                 mac_selinux_create_file_clear();
 866                 if (r >= 0)
 867                         goto add_symlink;
 868                 if (errno != EPERM)
 869                         return log_debug_errno(errno, "mknod failed for %s: %m", d);
 870
 871                 /* This didn't work, let's not try this again for the next iterations. */
 872                 *make_devnode = false;
 873         }
 874
 875         /* We're about to fall back to bind-mounting the device node. So create a dummy bind-mount target.
 876          * Do not prepare device-node SELinux label (see issue 13762) */
 877         r = mknod(dn, S_IFREG, 0);
 878         if (r < 0 && errno != EEXIST)
 879                 return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);
 880
 881         /* Fallback to bind-mounting: The assumption here is that all used device nodes carry standard
 882          * properties. Specifically, the devices nodes we bind-mount should either be owned by root:root or
 883          * root:tty (e.g. /dev/tty, /dev/ptmx) and should not carry ACLs. */
 884         r = mount_nofollow_verbose(LOG_DEBUG, d, dn, NULL, MS_BIND, NULL);
 885         if (r < 0)
 886                 return r;
 887
 888 add_symlink:
 889         bn = path_startswith(d, "/dev/");
 890         if (!bn)
 891                 return 0;
 892
 893         /* Create symlinks like /dev/char/1:9 → ../urandom */
 894         if (asprintf(&sl, "%s/dev/%s/" DEVNUM_FORMAT_STR,
 895                      temporary_mount,
 896                      S_ISCHR(st.st_mode) ? "char" : "block",
 897                      DEVNUM_FORMAT_VAL(st.st_rdev)) < 0)
 898                 return log_oom();
 899
 900         (void) mkdir_parents(sl, 0755);
 901
 902         t = strjoina("../", bn);
 903         if (symlink(t, sl) < 0)
 904                 log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
 905
 906         return 0;
 907 }
 908
 909 static int mount_private_dev(MountEntry *m) {
 910         static const char devnodes[] =
 911                 "/dev/null\0"
 912                 "/dev/zero\0"
 913                 "/dev/full\0"
 914                 "/dev/random\0"
 915                 "/dev/urandom\0"
 916                 "/dev/tty\0";
 917
 918         char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
 919         const char *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
 920         bool can_mknod = true;
 921         int r;
 922
 923         assert(m);
 924
 925         if (!mkdtemp(temporary_mount))
 926                 return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
 927
 928         dev = strjoina(temporary_mount, "/dev");
 929         (void) mkdir(dev, 0755);
 930         r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=0755" TMPFS_LIMITS_PRIVATE_DEV);
 931         if (r < 0)
 932                 goto fail;
 933
 934         r = label_fix_full(AT_FDCWD, dev, "/dev", 0);
 935         if (r < 0) {
 936                 log_debug_errno(r, "Failed to fix label of '%s' as /dev: %m", dev);
 937                 goto fail;
 938         }
 939
 940         devpts = strjoina(temporary_mount, "/dev/pts");
 941         (void) mkdir(devpts, 0755);
 942         r = mount_nofollow_verbose(LOG_DEBUG, "/dev/pts", devpts, NULL, MS_BIND, NULL);
 943         if (r < 0)
 944                 goto fail;
 945
 946         /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
 947          * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
 948          * Thus, in that case make a clone.
 949          * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
 950         r = is_symlink("/dev/ptmx");
 951         if (r < 0) {
 952                 log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
 953                 goto fail;
 954         } else if (r > 0) {
 955                 devptmx = strjoina(temporary_mount, "/dev/ptmx");
 956                 if (symlink("pts/ptmx", devptmx) < 0) {
 957                         r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
 958                         goto fail;
 959                 }
 960         } else {
 961                 r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
 962                 if (r < 0)
 963                         goto fail;
 964         }
 965
 966         devshm = strjoina(temporary_mount, "/dev/shm");
 967         (void) mkdir(devshm, 0755);
 968         r = mount_nofollow_verbose(LOG_DEBUG, "/dev/shm", devshm, NULL, MS_BIND, NULL);
 969         if (r < 0)
 970                 goto fail;
 971
 972         devmqueue = strjoina(temporary_mount, "/dev/mqueue");
 973         (void) mkdir(devmqueue, 0755);
 974         (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
 975
 976         devhugepages = strjoina(temporary_mount, "/dev/hugepages");
 977         (void) mkdir(devhugepages, 0755);
 978         (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
 979
 980         devlog = strjoina(temporary_mount, "/dev/log");
 981         if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
 982                 log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);
 983
 984         NULSTR_FOREACH(d, devnodes) {
 985                 r = clone_device_node(d, temporary_mount, &can_mknod);
 986                 /* ENXIO means the *source* is not a device file, skip creation in that case */
 987                 if (r < 0 && r != -ENXIO)
 988                         goto fail;
 989         }
 990
 991         r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
 992         if (r < 0)
 993                 log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
 994
 995         /* Create the /dev directory if missing. It is more likely to be missing when the service is started
 996          * with RootDirectory. This is consistent with mount units creating the mount points when missing. */
 997         (void) mkdir_p_label(mount_entry_path(m), 0755);
 998
 999         /* Unmount everything in old /dev */
1000         r = umount_recursive(mount_entry_path(m), 0);
1001         if (r < 0)
1002                 log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));
1003
1004         r = mount_nofollow_verbose(LOG_DEBUG, dev, mount_entry_path(m), NULL, MS_MOVE, NULL);
1005         if (r < 0)
1006                 goto fail;
1007
1008         (void) rmdir(dev);
1009         (void) rmdir(temporary_mount);
1010
1011         return 0;
1012
1013 fail:
1014         if (devpts)
1015                 (void) umount_verbose(LOG_DEBUG, devpts, UMOUNT_NOFOLLOW);
1016
1017         if (devshm)
1018                 (void) umount_verbose(LOG_DEBUG, devshm, UMOUNT_NOFOLLOW);
1019
1020         if (devhugepages)
1021                 (void) umount_verbose(LOG_DEBUG, devhugepages, UMOUNT_NOFOLLOW);
1022
1023         if (devmqueue)
1024                 (void) umount_verbose(LOG_DEBUG, devmqueue, UMOUNT_NOFOLLOW);
1025
1026         (void) umount_verbose(LOG_DEBUG, dev, UMOUNT_NOFOLLOW);
1027         (void) rmdir(dev);
1028         (void) rmdir(temporary_mount);
1029
1030         return r;
1031 }
1032
1033 static int mount_bind_dev(const MountEntry *m) {
1034         int r;
1035
1036         assert(m);
1037
1038         /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the
1039          * service's /dev. This is only used when RootDirectory= is set. */
1040
1041         (void) mkdir_p_label(mount_entry_path(m), 0755);
1042
1043         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
1044         if (r < 0)
1045                 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
1046         if (r > 0) /* make this a NOP if /dev is already a mount point */
1047                 return 0;
1048
1049         r = mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
1050         if (r < 0)
1051                 return r;
1052
1053         return 1;
1054 }
1055
1056 static int mount_sysfs(const MountEntry *m) {
1057         int r;
1058
1059         assert(m);
1060
1061         (void) mkdir_p_label(mount_entry_path(m), 0755);
1062
1063         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
1064         if (r < 0)
1065                 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
1066         if (r > 0) /* make this a NOP if /sys is already a mount point */
1067                 return 0;
1068
1069         /* Bind mount the host's version so that we get all child mounts of it, too. */
1070         r = mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
1071         if (r < 0)
1072                 return r;
1073
1074         return 1;
1075 }
1076
1077 static bool mount_option_supported(const char *fstype, const char *key, const char *value) {
1078         _cleanup_close_ int fd = -EBADF;
1079         int r;
1080
1081         /* This function assumes support by default. Only if the fsconfig() call fails with -EINVAL/-EOPNOTSUPP
1082          * will it report that the option/value is not supported. */
1083
1084         fd = fsopen(fstype, FSOPEN_CLOEXEC);
1085         if (fd < 0) {
1086                 if (errno != ENOSYS)
1087                         log_debug_errno(errno, "Failed to open superblock context for '%s': %m", fstype);
1088                 return true; /* If fsopen() fails for whatever reason, assume the value is supported. */
1089         }
1090
1091         r = fsconfig(fd, FSCONFIG_SET_STRING, key, value, 0);
1092         if (r < 0 && !IN_SET(errno, EINVAL, EOPNOTSUPP, ENOSYS))
1093                 log_debug_errno(errno, "Failed to set '%s=%s' on '%s' superblock context: %m", key, value, fstype);
1094
1095         return r >= 0 || !IN_SET(errno, EINVAL, EOPNOTSUPP);
1096 }
1097
1098 static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
1099         _cleanup_free_ char *opts = NULL;
1100         const char *entry_path;
1101         int r, n;
1102
1103         assert(m);
1104         assert(ns_info);
1105
1106         if (ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
1107             ns_info->proc_subset != PROC_SUBSET_ALL) {
1108
1109                 /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it
1110                  * pretended to be per-instance but actually was per-namespace), hence let's make use of it
1111                  * if requested. To make sure this logic succeeds only on kernels where hidepid= is
1112                  * per-instance, we'll exclusively use the textual value for hidepid=, since support was
1113                  * added in the same commit: if it's supported it is thus also per-instance. */
1114
1115                 const char *hpv = ns_info->protect_proc == PROTECT_PROC_DEFAULT ?
1116                                   "off" :
1117                                   protect_proc_to_string(ns_info->protect_proc);
1118
1119                 /* hidepid= support was added in 5.8, so we can use fsconfig()/fsopen() (which were added in
1120                  * 5.2) to check if hidepid= is supported. This avoids a noisy dmesg log by the kernel when
1121                  * trying to use hidepid= on systems where it isn't supported. The same applies for subset=.
1122                  * fsopen()/fsconfig() was also backported on some distros which allows us to detect
1123                  * hidepid=/subset= support in even more scenarios. */
1124
1125                 if (mount_option_supported("proc", "hidepid", hpv)) {
1126                         opts = strjoin("hidepid=", hpv);
1127                         if (!opts)
1128                                 return -ENOMEM;
1129                 }
1130
1131                 if (ns_info->proc_subset == PROC_SUBSET_PID && mount_option_supported("proc", "subset", "pid"))
1132                         if (!strextend_with_separator(&opts, ",", "subset=pid"))
1133                                 return -ENOMEM;
1134         }
1135
1136         entry_path = mount_entry_path(m);
1137         (void) mkdir_p_label(entry_path, 0755);
1138
1139         /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in
1140          * one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by
1141          * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
1142          * mounted on /proc/ first. */
1143
1144         n = umount_recursive(entry_path, 0);
1145
1146         r = mount_nofollow_verbose(LOG_DEBUG, "proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
1147         if (r == -EINVAL && opts)
1148                 /* If this failed with EINVAL then this likely means the textual hidepid= stuff is
1149                  * not supported by the kernel, and thus the per-instance hidepid= neither, which
1150                  * means we really don't want to use it, since it would affect our host's /proc
1151                  * mount. Hence let's gracefully fallback to a classic, unrestricted version. */
1152                 r = mount_nofollow_verbose(LOG_DEBUG, "proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
1153         if (r == -EPERM) {
1154                 /* When we do not have enough privileges to mount /proc, fallback to use existing /proc. */
1155
1156                 if (n > 0)
1157                         /* /proc or some of sub-mounts are umounted in the above. Refuse incomplete tree.
1158                          * Propagate the original error code returned by mount() in the above. */
1159                         return -EPERM;
1160
1161                 r = path_is_mount_point(entry_path, NULL, 0);
1162                 if (r < 0)
1163                         return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
1164                 if (r == 0) {
1165                         /* We lack permissions to mount a new instance of /proc, and it is not already
1166                          * mounted. But we can access the host's, so as a final fallback bind-mount it to
1167                          * the destination, as most likely we are inside a user manager in an unprivileged
1168                          * user namespace. */
1169                         r = mount_nofollow_verbose(LOG_DEBUG, "/proc", entry_path, NULL, MS_BIND|MS_REC, NULL);
1170                         if (r < 0)
1171                                 return -EPERM;
1172                 }
1173         } else if (r < 0)
1174                 return r;
1175
1176         return 1;
1177 }
1178
1179 static int mount_tmpfs(const MountEntry *m) {
1180         const char *entry_path, *inner_path;
1181         int r;
1182
1183         assert(m);
1184
1185         entry_path = mount_entry_path(m);
1186         inner_path = mount_entry_unprefixed_path(m);
1187
1188         /* First, get rid of everything that is below if there is anything. Then, overmount with our new
1189          * tmpfs */
1190
1191         (void) mkdir_p_label(entry_path, 0755);
1192         (void) umount_recursive(entry_path, 0);
1193
1194         r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", entry_path, "tmpfs", m->flags, mount_entry_options(m));
1195         if (r < 0)
1196                 return r;
1197
1198         r = label_fix_full(AT_FDCWD, entry_path, inner_path, 0);
1199         if (r < 0)
1200                 return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, inner_path);
1201
1202         return 1;
1203 }
1204
1205 static int mount_run(const MountEntry *m) {
1206         int r;
1207
1208         assert(m);
1209
1210         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
1211         if (r < 0 && r != -ENOENT)
1212                 return log_debug_errno(r, "Unable to determine whether /run is already mounted: %m");
1213         if (r > 0) /* make this a NOP if /run is already a mount point */
1214                 return 0;
1215
1216         return mount_tmpfs(m);
1217 }
1218
1219 static int mount_mqueuefs(const MountEntry *m) {
1220         int r;
1221         const char *entry_path;
1222
1223         assert(m);
1224
1225         entry_path = mount_entry_path(m);
1226
1227         (void) mkdir_p_label(entry_path, 0755);
1228         (void) umount_recursive(entry_path, 0);
1229
1230         r = mount_nofollow_verbose(LOG_DEBUG, "mqueue", entry_path, "mqueue", m->flags, mount_entry_options(m));
1231         if (r < 0)
1232                 return r;
1233
1234         return 0;
1235 }
1236
1237 static int mount_image(const MountEntry *m, const char *root_directory) {
1238
1239         _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL,
1240                             *host_os_release_sysext_level = NULL;
1241         int r;
1242
1243         assert(m);
1244
1245         if (m->mode == EXTENSION_IMAGES) {
1246                 r = parse_os_release(
1247                                 empty_to_root(root_directory),
1248                                 "ID", &host_os_release_id,
1249                                 "VERSION_ID", &host_os_release_version_id,
1250                                 "SYSEXT_LEVEL", &host_os_release_sysext_level,
1251                                 NULL);
1252                 if (r < 0)
1253                         return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
1254                 if (isempty(host_os_release_id))
1255                         return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
1256         }
1257
1258         r = verity_dissect_and_mount(
1259                         /* src_fd= */ -1, mount_entry_source(m), mount_entry_path(m), m->image_options,
1260                         host_os_release_id, host_os_release_version_id, host_os_release_sysext_level, NULL);
1261         if (r == -ENOENT && m->ignore)
1262                 return 0;
1263         if (r == -ESTALE && host_os_release_id)
1264                 return log_error_errno(r,
1265                                        "Failed to mount image %s, extension-release metadata does not match the lower layer's: ID=%s%s%s%s%s",
1266                                        mount_entry_source(m),
1267                                        host_os_release_id,
1268                                        host_os_release_version_id ? " VERSION_ID=" : "",
1269                                        strempty(host_os_release_version_id),
1270                                        host_os_release_sysext_level ? " SYSEXT_LEVEL=" : "",
1271                                        strempty(host_os_release_sysext_level));
1272         if (r < 0)
1273                 return log_debug_errno(r, "Failed to mount image %s on %s: %m", mount_entry_source(m), mount_entry_path(m));
1274
1275         return 1;
1276 }
1277
1278 static int mount_overlay(const MountEntry *m) {
1279         const char *options;
1280         int r;
1281
1282         assert(m);
1283
1284         options = strjoina("lowerdir=", mount_entry_options(m));
1285
1286         (void) mkdir_p_label(mount_entry_path(m), 0755);
1287
1288         r = mount_nofollow_verbose(LOG_DEBUG, "overlay", mount_entry_path(m), "overlay", MS_RDONLY, options);
1289         if (r == -ENOENT && m->ignore)
1290                 return 0;
1291         if (r < 0)
1292                 return r;
1293
1294         return 1;
1295 }
1296
1297 static int follow_symlink(
1298                 const char *root_directory,
1299                 MountEntry *m) {
1300
1301         _cleanup_free_ char *target = NULL;
1302         int r;
1303
1304         /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
1305          * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
1306          * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
1307          * end and already have a fully normalized name. */
1308
1309         r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target, NULL);
1310         if (r < 0)
1311                 return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
1312         if (r > 0) /* Reached the end, nothing more to resolve */
1313                 return 1;
1314
1315         if (m->n_followed >= CHASE_SYMLINKS_MAX) /* put a boundary on things */
1316                 return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
1317                                        "Symlink loop on '%s'.",
1318                                        mount_entry_path(m));
1319
1320         log_debug("Followed mount entry path symlink %s %s %s.",
1321                   mount_entry_path(m), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), target);
1322
1323         mount_entry_consume_prefix(m, TAKE_PTR(target));
1324
1325         m->n_followed ++;
1326
1327         return 0;
1328 }
1329
1330 static int apply_one_mount(
1331                 const char *root_directory,
1332                 MountEntry *m,
1333                 const NamespaceInfo *ns_info) {
1334
1335         _cleanup_free_ char *inaccessible = NULL;
1336         bool rbind = true, make = false;
1337         const char *what;
1338         int r;
1339
1340         assert(m);
1341         assert(ns_info);
1342
1343         log_debug("Applying namespace mount on %s", mount_entry_path(m));
1344
1345         switch (m->mode) {
1346
1347         case INACCESSIBLE: {
1348                 _cleanup_free_ char *tmp = NULL;
1349                 const char *runtime_dir;
1350                 struct stat target;
1351
1352                 /* First, get rid of everything that is below if there
1353                  * is anything... Then, overmount it with an
1354                  * inaccessible path. */
1355                 (void) umount_recursive(mount_entry_path(m), 0);
1356
1357                 if (lstat(mount_entry_path(m), &target) < 0) {
1358                         if (errno == ENOENT && m->ignore)
1359                                 return 0;
1360
1361                         return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m",
1362                                                mount_entry_path(m));
1363                 }
1364
1365                 if (geteuid() == 0)
1366                         runtime_dir = "/run";
1367                 else {
1368                         if (asprintf(&tmp, "/run/user/" UID_FMT, geteuid()) < 0)
1369                                 return -ENOMEM;
1370
1371                         runtime_dir = tmp;
1372                 }
1373
1374                 r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
1375                 if (r < 0)
1376                         return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
1377                                                "File type not supported for inaccessible mounts. Note that symlinks are not allowed");
1378                 what = inaccessible;
1379                 break;
1380         }
1381
1382         case READONLY:
1383         case READWRITE:
1384         case READWRITE_IMPLICIT:
1385         case EXEC:
1386         case NOEXEC:
1387                 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
1388                 if (r == -ENOENT && m->ignore)
1389                         return 0;
1390                 if (r < 0)
1391                         return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m",
1392                                                mount_entry_path(m));
1393                 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY
1394                             * and MS_NOEXEC bits for the mount point if needed. */
1395                         return 0;
1396                 /* This isn't a mount point yet, let's make it one. */
1397                 what = mount_entry_path(m);
1398                 break;
1399
1400         case EXTENSION_DIRECTORIES: {
1401                 _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL,
1402                                 *host_os_release_sysext_level = NULL, *extension_name = NULL;
1403                 _cleanup_strv_free_ char **extension_release = NULL;
1404
1405                 r = path_extract_filename(mount_entry_source(m), &extension_name);
1406                 if (r < 0)
1407                         return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m));
1408
1409                 r = parse_os_release(
1410                                 empty_to_root(root_directory),
1411                                 "ID", &host_os_release_id,
1412                                 "VERSION_ID", &host_os_release_version_id,
1413                                 "SYSEXT_LEVEL", &host_os_release_sysext_level,
1414                                 NULL);
1415                 if (r < 0)
1416                         return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
1417                 if (isempty(host_os_release_id))
1418                         return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
1419
1420                 r = load_extension_release_pairs(mount_entry_source(m), extension_name, /* relax_extension_release_check= */ false, &extension_release);
1421                 if (r == -ENOENT && m->ignore)
1422                         return 0;
1423                 if (r < 0)
1424                         return log_debug_errno(r, "Failed to parse directory %s extension-release metadata: %m", extension_name);
1425
1426                 r = extension_release_validate(
1427                                 extension_name,
1428                                 host_os_release_id,
1429                                 host_os_release_version_id,
1430                                 host_os_release_sysext_level,
1431                                 /* host_sysext_scope */ NULL, /* Leave empty, we need to accept both system and portable */
1432                                 extension_release);
1433                 if (r == 0)
1434                         return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Directory %s extension-release metadata does not match the root's", extension_name);
1435                 if (r < 0)
1436                         return log_debug_errno(r, "Failed to compare directory %s extension-release metadata with the root's os-release: %m", extension_name);
1437
1438                 _fallthrough_;
1439         }
1440
1441         case BIND_MOUNT:
1442                 rbind = false;
1443
1444                 _fallthrough_;
1445         case BIND_MOUNT_RECURSIVE: {
1446                 _cleanup_free_ char *chased = NULL;
1447
1448                 /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note
1449                  * that bind mount source paths are always relative to the host root, hence we pass NULL as
1450                  * root directory to chase_symlinks() here. */
1451
1452                 r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL);
1453                 if (r == -ENOENT && m->ignore) {
1454                         log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
1455                         return 0;
1456                 }
1457                 if (r < 0)
1458                         return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
1459
1460                 log_debug("Followed source symlinks %s %s %s.",
1461                           mount_entry_source(m), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), chased);
1462
1463                 free_and_replace(m->source_malloc, chased);
1464
1465                 what = mount_entry_source(m);
1466                 make = true;
1467                 break;
1468         }
1469
1470         case EMPTY_DIR:
1471         case TMPFS:
1472                 return mount_tmpfs(m);
1473
1474         case PRIVATE_TMP:
1475         case PRIVATE_TMP_READONLY:
1476                 what = mount_entry_source(m);
1477                 make = true;
1478                 break;
1479
1480         case PRIVATE_DEV:
1481                 return mount_private_dev(m);
1482
1483         case BIND_DEV:
1484                 return mount_bind_dev(m);
1485
1486         case SYSFS:
1487                 return mount_sysfs(m);
1488
1489         case PROCFS:
1490                 return mount_procfs(m, ns_info);
1491
1492         case RUN:
1493                 return mount_run(m);
1494
1495         case MQUEUEFS:
1496                 return mount_mqueuefs(m);
1497
1498         case MOUNT_IMAGES:
1499                 return mount_image(m, NULL);
1500
1501         case EXTENSION_IMAGES:
1502                 return mount_image(m, root_directory);
1503
1504         case OVERLAY_MOUNT:
1505                 return mount_overlay(m);
1506
1507         default:
1508                 assert_not_reached();
1509         }
1510
1511         assert(what);
1512
1513         r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
1514         if (r < 0) {
1515                 bool try_again = false;
1516
1517                 if (r == -ENOENT && make) {
1518                         int q;
1519
1520                         /* Hmm, either the source or the destination are missing. Let's see if we can create
1521                            the destination, then try again. */
1522
1523                         (void) mkdir_parents(mount_entry_path(m), 0755);
1524
1525                         q = make_mount_point_inode_from_path(what, mount_entry_path(m), 0755);
1526                         if (q < 0 && q != -EEXIST)
1527                                 log_error_errno(q, "Failed to create destination mount point node '%s': %m",
1528                                                 mount_entry_path(m));
1529                         else
1530                                 try_again = true;
1531                 }
1532
1533                 if (try_again)
1534                         r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
1535                 if (r < 0)
1536                         return log_error_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
1537         }
1538
1539         log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
1540         return 0;
1541 }
1542
1543 static int make_read_only(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
1544         unsigned long new_flags = 0, flags_mask = 0;
1545         bool submounts;
1546         int r;
1547
1548         assert(m);
1549         assert(proc_self_mountinfo);
1550
1551         if (mount_entry_read_only(m) || m->mode == PRIVATE_DEV) {
1552                 new_flags |= MS_RDONLY;
1553                 flags_mask |= MS_RDONLY;
1554         }
1555
1556         if (m->nosuid) {
1557                 new_flags |= MS_NOSUID;
1558                 flags_mask |= MS_NOSUID;
1559         }
1560
1561         if (flags_mask == 0) /* No Change? */
1562                 return 0;
1563
1564         /* We generally apply these changes recursively, except for /dev, and the cases we know there's
1565          * nothing further down.  Set /dev readonly, but not submounts like /dev/shm. Also, we only set the
1566          * per-mount read-only flag.  We can't set it on the superblock, if we are inside a user namespace
1567          * and running Linux <= 4.17. */
1568         submounts =
1569                 mount_entry_read_only(m) &&
1570                 !IN_SET(m->mode, EMPTY_DIR, TMPFS);
1571         if (submounts)
1572                 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
1573         else
1574                 r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo);
1575
1576         /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked
1577          * read-only already stays this way. This improves compatibility with container managers, where we
1578          * won't attempt to undo read-only mounts already applied. */
1579
1580         if (r == -ENOENT && m->ignore)
1581                 return 0;
1582         if (r < 0)
1583                 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
1584                                        submounts ? " and its submounts" : "");
1585         return 0;
1586 }
1587
1588 static int make_noexec(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
1589         unsigned long new_flags = 0, flags_mask = 0;
1590         bool submounts;
1591         int r;
1592
1593         assert(m);
1594         assert(proc_self_mountinfo);
1595
1596         if (mount_entry_noexec(m)) {
1597                 new_flags |= MS_NOEXEC;
1598                 flags_mask |= MS_NOEXEC;
1599         } else if (mount_entry_exec(m)) {
1600                 new_flags &= ~MS_NOEXEC;
1601                 flags_mask |= MS_NOEXEC;
1602         }
1603
1604         if (flags_mask == 0) /* No Change? */
1605                 return 0;
1606
1607         submounts = !IN_SET(m->mode, EMPTY_DIR, TMPFS);
1608
1609         if (submounts)
1610                 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
1611         else
1612                 r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo);
1613
1614         if (r == -ENOENT && m->ignore)
1615                 return 0;
1616         if (r < 0)
1617                 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
1618                                        submounts ? " and its submounts" : "");
1619         return 0;
1620 }
1621
1622 static int make_nosuid(const MountEntry *m, FILE *proc_self_mountinfo) {
1623         bool submounts;
1624         int r;
1625
1626         assert(m);
1627         assert(proc_self_mountinfo);
1628
1629         submounts = !IN_SET(m->mode, EMPTY_DIR, TMPFS);
1630
1631         if (submounts)
1632                 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, NULL, proc_self_mountinfo);
1633         else
1634                 r = bind_remount_one_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, proc_self_mountinfo);
1635         if (r == -ENOENT && m->ignore)
1636                 return 0;
1637         if (r < 0)
1638                 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
1639                                        submounts ? " and its submounts" : "");
1640         return 0;
1641 }
1642
1643 static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) {
1644         assert(ns_info);
1645
1646         /*
1647          * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1648          * since to protect the API VFS mounts, they need to be around in the
1649          * first place...
1650          */
1651
1652         return ns_info->mount_apivfs ||
1653                 ns_info->protect_control_groups ||
1654                 ns_info->protect_kernel_tunables ||
1655                 ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
1656                 ns_info->proc_subset != PROC_SUBSET_ALL;
1657 }
1658
1659 static size_t namespace_calculate_mounts(
1660                 const NamespaceInfo *ns_info,
1661                 char** read_write_paths,
1662                 char** read_only_paths,
1663                 char** inaccessible_paths,
1664                 char** exec_paths,
1665                 char** no_exec_paths,
1666                 char** empty_directories,
1667                 size_t n_bind_mounts,
1668                 size_t n_temporary_filesystems,
1669                 size_t n_mount_images,
1670                 size_t n_extension_images,
1671                 size_t n_extension_directories,
1672                 size_t n_hierarchies,
1673                 const char* tmp_dir,
1674                 const char* var_tmp_dir,
1675                 const char *creds_path,
1676                 const char* log_namespace,
1677                 bool setup_propagate,
1678                 const char* notify_socket) {
1679
1680         size_t protect_home_cnt;
1681         size_t protect_system_cnt =
1682                 (ns_info->protect_system == PROTECT_SYSTEM_STRICT ?
1683                  ELEMENTSOF(protect_system_strict_table) :
1684                  ((ns_info->protect_system == PROTECT_SYSTEM_FULL) ?
1685                   ELEMENTSOF(protect_system_full_table) :
1686                   ((ns_info->protect_system == PROTECT_SYSTEM_YES) ?
1687                    ELEMENTSOF(protect_system_yes_table) : 0)));
1688
1689         protect_home_cnt =
1690                 (ns_info->protect_home == PROTECT_HOME_YES ?
1691                  ELEMENTSOF(protect_home_yes_table) :
1692                  ((ns_info->protect_home == PROTECT_HOME_READ_ONLY) ?
1693                   ELEMENTSOF(protect_home_read_only_table) :
1694                   ((ns_info->protect_home == PROTECT_HOME_TMPFS) ?
1695                    ELEMENTSOF(protect_home_tmpfs_table) : 0)));
1696
1697         return !!tmp_dir + !!var_tmp_dir +
1698                 strv_length(read_write_paths) +
1699                 strv_length(read_only_paths) +
1700                 strv_length(inaccessible_paths) +
1701                 strv_length(exec_paths) +
1702                 strv_length(no_exec_paths) +
1703                 strv_length(empty_directories) +
1704                 n_bind_mounts +
1705                 n_mount_images +
1706                 (n_extension_images > 0 || n_extension_directories > 0 ? /* Mount each image and directory plus an overlay per hierarchy */
1707                  n_hierarchies + n_extension_images + n_extension_directories: 0) +
1708                 n_temporary_filesystems +
1709                 ns_info->private_dev +
1710                 (ns_info->protect_kernel_tunables ?
1711                  ELEMENTSOF(protect_kernel_tunables_proc_table) + ELEMENTSOF(protect_kernel_tunables_sys_table) : 0) +
1712                 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
1713                 (ns_info->protect_kernel_logs ?
1714                  ELEMENTSOF(protect_kernel_logs_proc_table) + ELEMENTSOF(protect_kernel_logs_dev_table) : 0) +
1715                 (ns_info->protect_control_groups ? 1 : 0) +
1716                 protect_home_cnt + protect_system_cnt +
1717                 (ns_info->protect_hostname ? 2 : 0) +
1718                 (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0) +
1719                 (creds_path ? 2 : 1) +
1720                 !!log_namespace +
1721                 setup_propagate + /* /run/systemd/incoming */
1722                 !!notify_socket +
1723                 ns_info->private_ipc; /* /dev/mqueue */
1724 }
1725
1726 /* Walk all mount entries and dropping any unused mounts. This affects all
1727  * mounts:
1728  * - that are implicitly protected by a path that has been rendered inaccessible
1729  * - whose immediate parent requests the same protection mode as the mount itself
1730  * - that are outside of the relevant root directory
1731  * - which are duplicates
1732  */
1733 static void drop_unused_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) {
1734         assert(root_directory);
1735         assert(n_mounts);
1736         assert(mounts || *n_mounts == 0);
1737
1738         typesafe_qsort(mounts, *n_mounts, mount_path_compare);
1739
1740         drop_duplicates(mounts, n_mounts);
1741         drop_outside_root(root_directory, mounts, n_mounts);
1742         drop_inaccessible(mounts, n_mounts);
1743         drop_nop(mounts, n_mounts);
1744 }
1745
1746 static int create_symlinks_from_tuples(const char *root, char **strv_symlinks) {
1747         int r;
1748
1749         STRV_FOREACH_PAIR(src, dst, strv_symlinks) {
1750                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
1751
1752                 src_abs = path_join(root, *src);
1753                 dst_abs = path_join(root, *dst);
1754                 if (!src_abs || !dst_abs)
1755                         return -ENOMEM;
1756
1757                 r = mkdir_parents_label(dst_abs, 0755);
1758                 if (r < 0)
1759                         return r;
1760
1761                 r = symlink_idempotent(src_abs, dst_abs, true);
1762                 if (r < 0)
1763                         return r;
1764         }
1765
1766         return 0;
1767 }
1768
1769 static int apply_mounts(
1770                 const char *root,
1771                 const NamespaceInfo *ns_info,
1772                 MountEntry *mounts,
1773                 size_t *n_mounts,
1774                 char **exec_dir_symlinks,
1775                 char **error_path) {
1776
1777         _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
1778         _cleanup_free_ char **deny_list = NULL;
1779         int r;
1780
1781         if (n_mounts == 0) /* Shortcut: nothing to do */
1782                 return 0;
1783
1784         assert(root);
1785         assert(mounts);
1786         assert(n_mounts);
1787
1788         /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of
1789          * /proc. For example, this is the case with the option: 'InaccessiblePaths=/proc'. */
1790         proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1791         if (!proc_self_mountinfo) {
1792                 r = -errno;
1793
1794                 if (error_path)
1795                         *error_path = strdup("/proc/self/mountinfo");
1796
1797                 return log_debug_errno(r, "Failed to open /proc/self/mountinfo: %m");
1798         }
1799
1800         /* First round, establish all mounts we need */
1801         for (;;) {
1802                 bool again = false;
1803
1804                 for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
1805
1806                         if (m->applied)
1807                                 continue;
1808
1809                         /* ExtensionImages/Directories are first opened in the propagate directory, not in the root_directory */
1810                         r = follow_symlink(!IN_SET(m->mode, EXTENSION_IMAGES, EXTENSION_DIRECTORIES) ? root : NULL, m);
1811                         if (r < 0) {
1812                                 if (error_path && mount_entry_path(m))
1813                                         *error_path = strdup(mount_entry_path(m));
1814                                 return r;
1815                         }
1816                         if (r == 0) {
1817                                 /* We hit a symlinked mount point. The entry got rewritten and might
1818                                  * point to a very different place now. Let's normalize the changed
1819                                  * list, and start from the beginning. After all to mount the entry
1820                                  * at the new location we might need some other mounts first */
1821                                 again = true;
1822                                 break;
1823                         }
1824
1825                         r = apply_one_mount(root, m, ns_info);
1826                         if (r < 0) {
1827                                 if (error_path && mount_entry_path(m))
1828                                         *error_path = strdup(mount_entry_path(m));
1829                                 return r;
1830                         }
1831
1832                         m->applied = true;
1833                 }
1834
1835                 if (!again)
1836                         break;
1837
1838                 drop_unused_mounts(root, mounts, n_mounts);
1839         }
1840
1841         /* Now that all filesystems have been set up, but before the
1842          * read-only switches are flipped, create the exec dirs symlinks.
1843          * Note that when /var/lib is not empty/tmpfs, these symlinks will already
1844          * exist, which means this will be a no-op. */
1845         r = create_symlinks_from_tuples(root, exec_dir_symlinks);
1846         if (r < 0)
1847                 return log_debug_errno(r, "Failed to set up ExecDirectories symlinks inside mount namespace: %m");
1848
1849         /* Create a deny list we can pass to bind_mount_recursive() */
1850         deny_list = new(char*, (*n_mounts)+1);
1851         if (!deny_list)
1852                 return -ENOMEM;
1853         for (size_t j = 0; j < *n_mounts; j++)
1854                 deny_list[j] = (char*) mount_entry_path(mounts+j);
1855         deny_list[*n_mounts] = NULL;
1856
1857         /* Second round, flip the ro bits if necessary. */
1858         for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
1859                 r = make_read_only(m, deny_list, proc_self_mountinfo);
1860                 if (r < 0) {
1861                         if (error_path && mount_entry_path(m))
1862                                 *error_path = strdup(mount_entry_path(m));
1863                         return r;
1864                 }
1865         }
1866
1867         /* Third round, flip the noexec bits with a simplified deny list. */
1868         for (size_t j = 0; j < *n_mounts; j++)
1869                 if (IN_SET((mounts+j)->mode, EXEC, NOEXEC))
1870                         deny_list[j] = (char*) mount_entry_path(mounts+j);
1871         deny_list[*n_mounts] = NULL;
1872
1873         for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
1874                 r = make_noexec(m, deny_list, proc_self_mountinfo);
1875                 if (r < 0) {
1876                         if (error_path && mount_entry_path(m))
1877                                 *error_path = strdup(mount_entry_path(m));
1878                         return r;
1879                 }
1880         }
1881
1882         /* Fourth round, flip the nosuid bits without a deny list. */
1883         if (ns_info->mount_nosuid)
1884                 for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
1885                         r = make_nosuid(m, proc_self_mountinfo);
1886                         if (r < 0) {
1887                                 if (error_path && mount_entry_path(m))
1888                                         *error_path = strdup(mount_entry_path(m));
1889                                 return r;
1890                         }
1891                 }
1892
1893         return 1;
1894 }
1895
1896 static bool root_read_only(
1897                 char **read_only_paths,
1898                 ProtectSystem protect_system) {
1899
1900         /* Determine whether the root directory is going to be read-only given the configured settings. */
1901
1902         if (protect_system == PROTECT_SYSTEM_STRICT)
1903                 return true;
1904
1905         if (prefixed_path_strv_contains(read_only_paths, "/"))
1906                 return true;
1907
1908         return false;
1909 }
1910
1911 static bool home_read_only(
1912                 char** read_only_paths,
1913                 char** inaccessible_paths,
1914                 char** empty_directories,
1915                 const BindMount *bind_mounts,
1916                 size_t n_bind_mounts,
1917                 const TemporaryFileSystem *temporary_filesystems,
1918                 size_t n_temporary_filesystems,
1919                 ProtectHome protect_home) {
1920
1921         /* Determine whether the /home directory is going to be read-only given the configured settings. Yes,
1922          * this is a bit sloppy, since we don't bother checking for cases where / is affected by multiple
1923          * settings. */
1924
1925         if (protect_home != PROTECT_HOME_NO)
1926                 return true;
1927
1928         if (prefixed_path_strv_contains(read_only_paths, "/home") ||
1929             prefixed_path_strv_contains(inaccessible_paths, "/home") ||
1930             prefixed_path_strv_contains(empty_directories, "/home"))
1931                 return true;
1932
1933         for (size_t i = 0; i < n_temporary_filesystems; i++)
1934                 if (path_equal(temporary_filesystems[i].path, "/home"))
1935                         return true;
1936
1937         /* If /home is overmounted with some dir from the host it's not writable. */
1938         for (size_t i = 0; i < n_bind_mounts; i++)
1939                 if (path_equal(bind_mounts[i].destination, "/home"))
1940                         return true;
1941
1942         return false;
1943 }
1944
1945 static int verity_settings_prepare(
1946                 VeritySettings *verity,
1947                 const char *root_image,
1948                 const void *root_hash,
1949                 size_t root_hash_size,
1950                 const char *root_hash_path,
1951                 const void *root_hash_sig,
1952                 size_t root_hash_sig_size,
1953                 const char *root_hash_sig_path,
1954                 const char *verity_data_path) {
1955
1956         int r;
1957
1958         assert(verity);
1959
1960         if (root_hash) {
1961                 void *d;
1962
1963                 d = memdup(root_hash, root_hash_size);
1964                 if (!d)
1965                         return -ENOMEM;
1966
1967                 free_and_replace(verity->root_hash, d);
1968                 verity->root_hash_size = root_hash_size;
1969                 verity->designator = PARTITION_ROOT;
1970         }
1971
1972         if (root_hash_sig) {
1973                 void *d;
1974
1975                 d = memdup(root_hash_sig, root_hash_sig_size);
1976                 if (!d)
1977                         return -ENOMEM;
1978
1979                 free_and_replace(verity->root_hash_sig, d);
1980                 verity->root_hash_sig_size = root_hash_sig_size;
1981                 verity->designator = PARTITION_ROOT;
1982         }
1983
1984         if (verity_data_path) {
1985                 r = free_and_strdup(&verity->data_path, verity_data_path);
1986                 if (r < 0)
1987                         return r;
1988         }
1989
1990         r = verity_settings_load(
1991                         verity,
1992                         root_image,
1993                         root_hash_path,
1994                         root_hash_sig_path);
1995         if (r < 0)
1996                 return log_debug_errno(r, "Failed to load root hash: %m");
1997
1998         return 0;
1999 }
2000
2001 int setup_namespace(
2002                 const char* root_directory,
2003                 const char* root_image,
2004                 const MountOptions *root_image_options,
2005                 const NamespaceInfo *ns_info,
2006                 char** read_write_paths,
2007                 char** read_only_paths,
2008                 char** inaccessible_paths,
2009                 char** exec_paths,
2010                 char** no_exec_paths,
2011                 char** empty_directories,
2012                 char** exec_dir_symlinks,
2013                 const BindMount *bind_mounts,
2014                 size_t n_bind_mounts,
2015                 const TemporaryFileSystem *temporary_filesystems,
2016                 size_t n_temporary_filesystems,
2017                 const MountImage *mount_images,
2018                 size_t n_mount_images,
2019                 const char* tmp_dir,
2020                 const char* var_tmp_dir,
2021                 const char *creds_path,
2022                 const char *log_namespace,
2023                 unsigned long mount_flags,
2024                 const void *root_hash,
2025                 size_t root_hash_size,
2026                 const char *root_hash_path,
2027                 const void *root_hash_sig,
2028                 size_t root_hash_sig_size,
2029                 const char *root_hash_sig_path,
2030                 const char *verity_data_path,
2031                 const MountImage *extension_images,
2032                 size_t n_extension_images,
2033                 char **extension_directories,
2034                 const char *propagate_dir,
2035                 const char *incoming_dir,
2036                 const char *extension_dir,
2037                 const char *notify_socket,
2038                 char **error_path) {
2039
2040         _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
2041         _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
2042         _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
2043         _cleanup_strv_free_ char **hierarchies = NULL;
2044         MountEntry *m = NULL, *mounts = NULL;
2045         bool require_prefix = false, setup_propagate = false;
2046         const char *root;
2047         DissectImageFlags dissect_image_flags =
2048                 DISSECT_IMAGE_GENERIC_ROOT |
2049                 DISSECT_IMAGE_REQUIRE_ROOT |
2050                 DISSECT_IMAGE_DISCARD_ON_LOOP |
2051                 DISSECT_IMAGE_RELAX_VAR_CHECK |
2052                 DISSECT_IMAGE_FSCK |
2053                 DISSECT_IMAGE_USR_NO_ROOT |
2054                 DISSECT_IMAGE_GROWFS |
2055                 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
2056                 DISSECT_IMAGE_PIN_PARTITION_DEVICES;
2057         size_t n_mounts;
2058         int r;
2059
2060         assert(ns_info);
2061
2062         /* Make sure that all mknod(), mkdir() calls we do are unaffected by the umask, and the access modes
2063          * we configure take effect */
2064         BLOCK_WITH_UMASK(0000);
2065
2066         if (!isempty(propagate_dir) && !isempty(incoming_dir))
2067                 setup_propagate = true;
2068
2069         if (mount_flags == 0)
2070                 mount_flags = MS_SHARED;
2071
2072         if (root_image) {
2073                 /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
2074                 if (root_read_only(read_only_paths,
2075                                    ns_info->protect_system) &&
2076                     home_read_only(read_only_paths, inaccessible_paths, empty_directories,
2077                                    bind_mounts, n_bind_mounts, temporary_filesystems, n_temporary_filesystems,
2078                                    ns_info->protect_home) &&
2079                     strv_isempty(read_write_paths))
2080                         dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
2081
2082                 r = verity_settings_prepare(
2083                                 &verity,
2084                                 root_image,
2085                                 root_hash, root_hash_size, root_hash_path,
2086                                 root_hash_sig, root_hash_sig_size, root_hash_sig_path,
2087                                 verity_data_path);
2088                 if (r < 0)
2089                         return r;
2090
2091                 SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity.data_path);
2092
2093                 r = loop_device_make_by_path(
2094                                 root_image,
2095                                 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
2096                                 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
2097                                 LOCK_SH,
2098                                 &loop_device);
2099                 if (r < 0)
2100                         return log_debug_errno(r, "Failed to create loop device for root image: %m");
2101
2102                 r = dissect_loop_device(
2103                                 loop_device,
2104                                 &verity,
2105                                 root_image_options,
2106                                 dissect_image_flags,
2107                                 &dissected_image);
2108                 if (r < 0)
2109                         return log_debug_errno(r, "Failed to dissect image: %m");
2110
2111                 r = dissected_image_load_verity_sig_partition(
2112                                 dissected_image,
2113                                 loop_device->fd,
2114                                 &verity);
2115                 if (r < 0)
2116                         return r;
2117
2118                 r = dissected_image_decrypt(
2119                                 dissected_image,
2120                                 NULL,
2121                                 &verity,
2122                                 dissect_image_flags);
2123                 if (r < 0)
2124                         return log_debug_errno(r, "Failed to decrypt dissected image: %m");
2125         }
2126
2127         if (root_directory)
2128                 root = root_directory;
2129         else {
2130                 /* /run/systemd should have been created by PID 1 early on already, but in some cases, like
2131                  * when running tests (test-execute), it might not have been created yet so let's make sure
2132                  * we create it if it doesn't already exist. */
2133                 (void) mkdir_p_label("/run/systemd", 0755);
2134
2135                 /* Always create the mount namespace in a temporary directory, instead of operating directly
2136                  * in the root. The temporary directory prevents any mounts from being potentially obscured
2137                  * by other mounts we already applied.  We use the same mount point for all images, which is
2138                  * safe, since they all live in their own namespaces after all, and hence won't see each
2139                  * other. */
2140
2141                 root = "/run/systemd/unit-root";
2142                 (void) mkdir_label(root, 0700);
2143                 require_prefix = true;
2144         }
2145
2146         if (n_extension_images > 0 || !strv_isempty(extension_directories)) {
2147                 r = parse_env_extension_hierarchies(&hierarchies);
2148                 if (r < 0)
2149                         return r;
2150         }
2151
2152         n_mounts = namespace_calculate_mounts(
2153                         ns_info,
2154                         read_write_paths,
2155                         read_only_paths,
2156                         inaccessible_paths,
2157                         exec_paths,
2158                         no_exec_paths,
2159                         empty_directories,
2160                         n_bind_mounts,
2161                         n_temporary_filesystems,
2162                         n_mount_images,
2163                         n_extension_images,
2164                         strv_length(extension_directories),
2165                         strv_length(hierarchies),
2166                         tmp_dir, var_tmp_dir,
2167                         creds_path,
2168                         log_namespace,
2169                         setup_propagate,
2170                         notify_socket);
2171
2172         if (n_mounts > 0) {
2173                 m = mounts = new0(MountEntry, n_mounts);
2174                 if (!mounts)
2175                         return -ENOMEM;
2176
2177                 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
2178                 if (r < 0)
2179                         goto finish;
2180
2181                 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
2182                 if (r < 0)
2183                         goto finish;
2184
2185                 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
2186                 if (r < 0)
2187                         goto finish;
2188
2189                 r = append_access_mounts(&m, exec_paths, EXEC, require_prefix);
2190                 if (r < 0)
2191                         goto finish;
2192
2193                 r = append_access_mounts(&m, no_exec_paths, NOEXEC, require_prefix);
2194                 if (r < 0)
2195                         goto finish;
2196
2197                 r = append_empty_dir_mounts(&m, empty_directories);
2198                 if (r < 0)
2199                         goto finish;
2200
2201                 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
2202                 if (r < 0)
2203                         goto finish;
2204
2205                 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
2206                 if (r < 0)
2207                         goto finish;
2208
2209                 if (tmp_dir) {
2210                         bool ro = streq(tmp_dir, RUN_SYSTEMD_EMPTY);
2211
2212                         *(m++) = (MountEntry) {
2213                                 .path_const = "/tmp",
2214                                 .mode = ro ? PRIVATE_TMP_READONLY : PRIVATE_TMP,
2215                                 .source_const = tmp_dir,
2216                         };
2217                 }
2218
2219                 if (var_tmp_dir) {
2220                         bool ro = streq(var_tmp_dir, RUN_SYSTEMD_EMPTY);
2221
2222                         *(m++) = (MountEntry) {
2223                                 .path_const = "/var/tmp",
2224                                 .mode = ro ? PRIVATE_TMP_READONLY : PRIVATE_TMP,
2225                                 .source_const = var_tmp_dir,
2226                         };
2227                 }
2228
2229                 r = append_mount_images(&m, mount_images, n_mount_images);
2230                 if (r < 0)
2231                         goto finish;
2232
2233                 r = append_extensions(&m, root, extension_dir, hierarchies, extension_images, n_extension_images, extension_directories);
2234                 if (r < 0)
2235                         goto finish;
2236
2237                 if (ns_info->private_dev)
2238                         *(m++) = (MountEntry) {
2239                                 .path_const = "/dev",
2240                                 .mode = PRIVATE_DEV,
2241                                 .flags = DEV_MOUNT_OPTIONS,
2242                         };
2243
2244                 /* In case /proc is successfully mounted with pid tree subset only (ProcSubset=pid), the
2245                    protective mounts to non-pid /proc paths would fail. But the pid only option may have
2246                    failed gracefully, so let's try the mounts but it's not fatal if they don't succeed. */
2247                 bool ignore_protect_proc = ns_info->ignore_protect_paths || ns_info->proc_subset == PROC_SUBSET_PID;
2248                 if (ns_info->protect_kernel_tunables) {
2249                         r = append_static_mounts(&m,
2250                                                  protect_kernel_tunables_proc_table,
2251                                                  ELEMENTSOF(protect_kernel_tunables_proc_table),
2252                                                  ignore_protect_proc);
2253                         if (r < 0)
2254                                 goto finish;
2255
2256                         r = append_static_mounts(&m,
2257                                                  protect_kernel_tunables_sys_table,
2258                                                  ELEMENTSOF(protect_kernel_tunables_sys_table),
2259                                                  ns_info->ignore_protect_paths);
2260                         if (r < 0)
2261                                 goto finish;
2262                 }
2263
2264                 if (ns_info->protect_kernel_modules) {
2265                         r = append_static_mounts(&m,
2266                                                  protect_kernel_modules_table,
2267                                                  ELEMENTSOF(protect_kernel_modules_table),
2268                                                  ns_info->ignore_protect_paths);
2269                         if (r < 0)
2270                                 goto finish;
2271                 }
2272
2273                 if (ns_info->protect_kernel_logs) {
2274                         r = append_static_mounts(&m,
2275                                                  protect_kernel_logs_proc_table,
2276                                                  ELEMENTSOF(protect_kernel_logs_proc_table),
2277                                                  ignore_protect_proc);
2278                         if (r < 0)
2279                                 goto finish;
2280
2281                         r = append_static_mounts(&m,
2282                                                  protect_kernel_logs_dev_table,
2283                                                  ELEMENTSOF(protect_kernel_logs_dev_table),
2284                                                  ns_info->ignore_protect_paths);
2285                         if (r < 0)
2286                                 goto finish;
2287                 }
2288
2289                 if (ns_info->protect_control_groups)
2290                         *(m++) = (MountEntry) {
2291                                 .path_const = "/sys/fs/cgroup",
2292                                 .mode = READONLY,
2293                         };
2294
2295                 r = append_protect_home(&m, ns_info->protect_home, ns_info->ignore_protect_paths);
2296                 if (r < 0)
2297                         goto finish;
2298
2299                 r = append_protect_system(&m, ns_info->protect_system, false);
2300                 if (r < 0)
2301                         goto finish;
2302
2303                 if (namespace_info_mount_apivfs(ns_info)) {
2304                         r = append_static_mounts(&m,
2305                                                  apivfs_table,
2306                                                  ELEMENTSOF(apivfs_table),
2307                                                  ns_info->ignore_protect_paths);
2308                         if (r < 0)
2309                                 goto finish;
2310                 }
2311
2312                 /* Note, if proc is mounted with subset=pid then neither of the
2313                  * two paths will exist, i.e. they are implicitly protected by
2314                  * the mount option. */
2315                 if (ns_info->protect_hostname) {
2316                         *(m++) = (MountEntry) {
2317                                 .path_const = "/proc/sys/kernel/hostname",
2318                                 .mode = READONLY,
2319                                 .ignore = ignore_protect_proc,
2320                         };
2321                         *(m++) = (MountEntry) {
2322                                 .path_const = "/proc/sys/kernel/domainname",
2323                                 .mode = READONLY,
2324                                 .ignore = ignore_protect_proc,
2325                         };
2326                 }
2327
2328                 if (ns_info->private_ipc)
2329                         *(m++) = (MountEntry) {
2330                                 .path_const = "/dev/mqueue",
2331                                 .mode = MQUEUEFS,
2332                                 .flags = MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
2333                         };
2334
2335                 if (creds_path) {
2336                         /* If our service has a credentials store configured, then bind that one in, but hide
2337                          * everything else. */
2338
2339                         *(m++) = (MountEntry) {
2340                                 .path_const = "/run/credentials",
2341                                 .mode = TMPFS,
2342                                 .read_only = true,
2343                                 .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
2344                                 .flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC,
2345                         };
2346
2347                         *(m++) = (MountEntry) {
2348                                 .path_const = creds_path,
2349                                 .mode = BIND_MOUNT,
2350                                 .read_only = true,
2351                                 .source_const = creds_path,
2352                         };
2353                 } else {
2354                         /* If our service has no credentials store configured, then make the whole
2355                          * credentials tree inaccessible wholesale. */
2356
2357                         *(m++) = (MountEntry) {
2358                                 .path_const = "/run/credentials",
2359                                 .mode = INACCESSIBLE,
2360                                 .ignore = true,
2361                         };
2362                 }
2363
2364                 if (log_namespace) {
2365                         _cleanup_free_ char *q = NULL;
2366
2367                         q = strjoin("/run/systemd/journal.", log_namespace);
2368                         if (!q) {
2369                                 r = -ENOMEM;
2370                                 goto finish;
2371                         }
2372
2373                         *(m++) = (MountEntry) {
2374                                 .path_const = "/run/systemd/journal",
2375                                 .mode = BIND_MOUNT_RECURSIVE,
2376                                 .read_only = true,
2377                                 .source_malloc = TAKE_PTR(q),
2378                         };
2379                 }
2380
2381                 /* Will be used to add bind mounts at runtime */
2382                 if (setup_propagate)
2383                         *(m++) = (MountEntry) {
2384                                 .source_const = propagate_dir,
2385                                 .path_const = incoming_dir,
2386                                 .mode = BIND_MOUNT,
2387                                 .read_only = true,
2388                         };
2389
2390                 if (notify_socket)
2391                         *(m++) = (MountEntry) {
2392                                 .path_const = notify_socket,
2393                                 .source_const = notify_socket,
2394                                 .mode = BIND_MOUNT,
2395                                 .read_only = true,
2396                         };
2397
2398                 assert(mounts + n_mounts == m);
2399
2400                 /* Prepend the root directory where that's necessary */
2401                 r = prefix_where_needed(mounts, n_mounts, root);
2402                 if (r < 0)
2403                         goto finish;
2404
2405                 drop_unused_mounts(root, mounts, &n_mounts);
2406         }
2407
2408         /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */
2409
2410         if (unshare(CLONE_NEWNS) < 0) {
2411                 r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");
2412                 if (IN_SET(r, -EACCES, -EPERM, -EOPNOTSUPP, -ENOSYS))
2413                         /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter
2414                          * in place that doesn't allow us to create namespaces (or a missing cap), then
2415                          * propagate a recognizable error back, which the caller can use to detect this case
2416                          * (and only this) and optionally continue without namespacing applied. */
2417                         r = -ENOANO;
2418
2419                 goto finish;
2420         }
2421
2422         /* Create the source directory to allow runtime propagation of mounts */
2423         if (setup_propagate)
2424                 (void) mkdir_p(propagate_dir, 0600);
2425
2426         if (n_extension_images > 0 || !strv_isempty(extension_directories))
2427                 /* ExtensionImages/Directories mountpoint directories will be created while parsing the
2428                  * mounts to create, so have the parent ready */
2429                 (void) mkdir_p(extension_dir, 0600);
2430
2431         /* Remount / as SLAVE so that nothing now mounted in the namespace
2432          * shows up in the parent */
2433         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2434                 r = log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
2435                 goto finish;
2436         }
2437
2438         if (root_image) {
2439                 /* A root image is specified, mount it to the right place */
2440                 r = dissected_image_mount(dissected_image, root, UID_INVALID, UID_INVALID, dissect_image_flags);
2441                 if (r < 0) {
2442                         log_debug_errno(r, "Failed to mount root image: %m");
2443                         goto finish;
2444                 }
2445
2446                 /* Now release the block device lock, so that udevd is free to call BLKRRPART on the device
2447                  * if it likes. */
2448                 r = loop_device_flock(loop_device, LOCK_UN);
2449                 if (r < 0) {
2450                         log_debug_errno(r, "Failed to release lock on loopback block device: %m");
2451                         goto finish;
2452                 }
2453
2454                 r = dissected_image_relinquish(dissected_image);
2455                 if (r < 0) {
2456                         log_debug_errno(r, "Failed to relinquish dissected image: %m");
2457                         goto finish;
2458                 }
2459
2460         } else if (root_directory) {
2461
2462                 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
2463                 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
2464                 if (r < 0) {
2465                         log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
2466                         goto finish;
2467                 }
2468                 if (r == 0) {
2469                         r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
2470                         if (r < 0)
2471                                 goto finish;
2472                 }
2473
2474         } else {
2475                 /* Let's mount the main root directory to the root directory to use */
2476                 r = mount_nofollow_verbose(LOG_DEBUG, "/", root, NULL, MS_BIND|MS_REC, NULL);
2477                 if (r < 0)
2478                         goto finish;
2479         }
2480
2481         /* Try to set up the new root directory before mounting anything else there. */
2482         if (root_image || root_directory)
2483                 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
2484
2485         /* Now make the magic happen */
2486         r = apply_mounts(root, ns_info, mounts, &n_mounts, exec_dir_symlinks, error_path);
2487         if (r < 0)
2488                 goto finish;
2489
2490         /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
2491         r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
2492         if (r == -EINVAL && root_directory) {
2493                 /* If we are using root_directory and we don't have privileges (ie: user manager in a user
2494                  * namespace) and the root_directory is already a mount point in the parent namespace,
2495                  * MS_MOVE will fail as we don't have permission to change it (with EINVAL rather than
2496                  * EPERM). Attempt to bind-mount it over itself (like we do above if it's not already a
2497                  * mount point) and try again. */
2498                 r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
2499                 if (r < 0)
2500                         goto finish;
2501                 r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
2502         }
2503         if (r < 0) {
2504                 log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
2505                 goto finish;
2506         }
2507
2508         /* Remount / as the desired mode. Note that this will not
2509          * reestablish propagation from our side to the host, since
2510          * what's disconnected is disconnected. */
2511         if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
2512                 r = log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");
2513                 goto finish;
2514         }
2515
2516         /* bind_mount_in_namespace() will MS_MOVE into that directory, and that's only
2517          * supported for non-shared mounts. This needs to happen after remounting / or it will fail. */
2518         if (setup_propagate) {
2519                 r = mount(NULL, incoming_dir, NULL, MS_SLAVE, NULL);
2520                 if (r < 0) {
2521                         log_error_errno(r, "Failed to remount %s with MS_SLAVE: %m", incoming_dir);
2522                         goto finish;
2523                 }
2524         }
2525
2526         r = 0;
2527
2528 finish:
2529         if (n_mounts > 0)
2530                 for (m = mounts; m < mounts + n_mounts; m++)
2531                         mount_entry_done(m);
2532
2533         free(mounts);
2534
2535         return r;
2536 }
2537
2538 void bind_mount_free_many(BindMount *b, size_t n) {
2539         assert(b || n == 0);
2540
2541         for (size_t i = 0; i < n; i++) {
2542                 free(b[i].source);
2543                 free(b[i].destination);
2544         }
2545
2546         free(b);
2547 }
2548
2549 int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
2550         _cleanup_free_ char *s = NULL, *d = NULL;
2551         BindMount *c;
2552
2553         assert(b);
2554         assert(n);
2555         assert(item);
2556
2557         s = strdup(item->source);
2558         if (!s)
2559                 return -ENOMEM;
2560
2561         d = strdup(item->destination);
2562         if (!d)
2563                 return -ENOMEM;
2564
2565         c = reallocarray(*b, *n + 1, sizeof(BindMount));
2566         if (!c)
2567                 return -ENOMEM;
2568
2569         *b = c;
2570
2571         c[(*n) ++] = (BindMount) {
2572                 .source = TAKE_PTR(s),
2573                 .destination = TAKE_PTR(d),
2574                 .read_only = item->read_only,
2575                 .nosuid = item->nosuid,
2576                 .recursive = item->recursive,
2577                 .ignore_enoent = item->ignore_enoent,
2578         };
2579
2580         return 0;
2581 }
2582
2583 MountImage* mount_image_free_many(MountImage *m, size_t *n) {
2584         assert(n);
2585         assert(m || *n == 0);
2586
2587         for (size_t i = 0; i < *n; i++) {
2588                 free(m[i].source);
2589                 free(m[i].destination);
2590                 mount_options_free_all(m[i].mount_options);
2591         }
2592
2593         free(m);
2594         *n = 0;
2595         return NULL;
2596 }
2597
2598 int mount_image_add(MountImage **m, size_t *n, const MountImage *item) {
2599         _cleanup_free_ char *s = NULL, *d = NULL;
2600         _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
2601         MountImage *c;
2602
2603         assert(m);
2604         assert(n);
2605         assert(item);
2606
2607         s = strdup(item->source);
2608         if (!s)
2609                 return -ENOMEM;
2610
2611         if (item->destination) {
2612                 d = strdup(item->destination);
2613                 if (!d)
2614                         return -ENOMEM;
2615         }
2616
2617         LIST_FOREACH(mount_options, i, item->mount_options) {
2618                 _cleanup_(mount_options_free_allp) MountOptions *o = NULL;
2619
2620                 o = new(MountOptions, 1);
2621                 if (!o)
2622                         return -ENOMEM;
2623
2624                 *o = (MountOptions) {
2625                         .partition_designator = i->partition_designator,
2626                         .options = strdup(i->options),
2627                 };
2628                 if (!o->options)
2629                         return -ENOMEM;
2630
2631                 LIST_APPEND(mount_options, options, TAKE_PTR(o));
2632         }
2633
2634         c = reallocarray(*m, *n + 1, sizeof(MountImage));
2635         if (!c)
2636                 return -ENOMEM;
2637
2638         *m = c;
2639
2640         c[(*n) ++] = (MountImage) {
2641                 .source = TAKE_PTR(s),
2642                 .destination = TAKE_PTR(d),
2643                 .mount_options = TAKE_PTR(options),
2644                 .ignore_enoent = item->ignore_enoent,
2645                 .type = item->type,
2646         };
2647
2648         return 0;
2649 }
2650
2651 void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
2652         assert(t || n == 0);
2653
2654         for (size_t i = 0; i < n; i++) {
2655                 free(t[i].path);
2656                 free(t[i].options);
2657         }
2658
2659         free(t);
2660 }
2661
2662 int temporary_filesystem_add(
2663                 TemporaryFileSystem **t,
2664                 size_t *n,
2665                 const char *path,
2666                 const char *options) {
2667
2668         _cleanup_free_ char *p = NULL, *o = NULL;
2669         TemporaryFileSystem *c;
2670
2671         assert(t);
2672         assert(n);
2673         assert(path);
2674
2675         p = strdup(path);
2676         if (!p)
2677                 return -ENOMEM;
2678
2679         if (!isempty(options)) {
2680                 o = strdup(options);
2681                 if (!o)
2682                         return -ENOMEM;
2683         }
2684
2685         c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
2686         if (!c)
2687                 return -ENOMEM;
2688
2689         *t = c;
2690
2691         c[(*n) ++] = (TemporaryFileSystem) {
2692                 .path = TAKE_PTR(p),
2693                 .options = TAKE_PTR(o),
2694         };
2695
2696         return 0;
2697 }
2698
2699 static int make_tmp_prefix(const char *prefix) {
2700         _cleanup_free_ char *t = NULL;
2701         _cleanup_close_ int fd = -EBADF;
2702         int r;
2703
2704         /* Don't do anything unless we know the dir is actually missing */
2705         r = access(prefix, F_OK);
2706         if (r >= 0)
2707                 return 0;
2708         if (errno != ENOENT)
2709                 return -errno;
2710
2711         WITH_UMASK(000)
2712                 r = mkdir_parents(prefix, 0755);
2713         if (r < 0)
2714                 return r;
2715
2716         r = tempfn_random(prefix, NULL, &t);
2717         if (r < 0)
2718                 return r;
2719
2720         /* umask will corrupt this access mode, but that doesn't matter, we need to call chmod() anyway for
2721          * the suid bit, below. */
2722         fd = open_mkdir_at(AT_FDCWD, t, O_EXCL|O_CLOEXEC, 0777);
2723         if (fd < 0)
2724                 return fd;
2725
2726         r = RET_NERRNO(fchmod(fd, 01777));
2727         if (r < 0) {
2728                 (void) rmdir(t);
2729                 return r;
2730         }
2731
2732         r = RET_NERRNO(rename(t, prefix));
2733         if (r < 0) {
2734                 (void) rmdir(t);
2735                 return r == -EEXIST ? 0 : r; /* it's fine if someone else created the dir by now */
2736         }
2737
2738         return 0;
2739
2740 }
2741
2742 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, char **tmp_path) {
2743         _cleanup_free_ char *x = NULL;
2744         _cleanup_free_ char *y = NULL;
2745         sd_id128_t boot_id;
2746         bool rw = true;
2747         int r;
2748
2749         assert(id);
2750         assert(prefix);
2751         assert(path);
2752
2753         /* We include the boot id in the directory so that after a
2754          * reboot we can easily identify obsolete directories. */
2755
2756         r = sd_id128_get_boot(&boot_id);
2757         if (r < 0)
2758                 return r;
2759
2760         x = strjoin(prefix, "/systemd-private-", SD_ID128_TO_STRING(boot_id), "-", id, "-XXXXXX");
2761         if (!x)
2762                 return -ENOMEM;
2763
2764         r = make_tmp_prefix(prefix);
2765         if (r < 0)
2766                 return r;
2767
2768         WITH_UMASK(0077)
2769                 if (!mkdtemp(x)) {
2770                         if (errno == EROFS || ERRNO_IS_DISK_SPACE(errno))
2771                                 rw = false;
2772                         else
2773                                 return -errno;
2774                 }
2775
2776         if (rw) {
2777                 y = strjoin(x, "/tmp");
2778                 if (!y)
2779                         return -ENOMEM;
2780
2781                 WITH_UMASK(0000)
2782                         if (mkdir(y, 0777 | S_ISVTX) < 0)
2783                                 return -errno;
2784
2785                 r = label_fix_full(AT_FDCWD, y, prefix, 0);
2786                 if (r < 0)
2787                         return r;
2788
2789                 if (tmp_path)
2790                         *tmp_path = TAKE_PTR(y);
2791         } else {
2792                 /* Trouble: we failed to create the directory. Instead of failing, let's simulate /tmp being
2793                  * read-only. This way the service will get the EROFS result as if it was writing to the real
2794                  * file system. */
2795                 WITH_UMASK(0000)
2796                         r = mkdir_p(RUN_SYSTEMD_EMPTY, 0500);
2797                 if (r < 0)
2798                         return r;
2799
2800                 r = free_and_strdup(&x, RUN_SYSTEMD_EMPTY);
2801                 if (r < 0)
2802                         return r;
2803         }
2804
2805         *path = TAKE_PTR(x);
2806         return 0;
2807 }
2808
2809 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
2810         _cleanup_(namespace_cleanup_tmpdirp) char *a = NULL;
2811         _cleanup_(rmdir_and_freep) char *a_tmp = NULL;
2812         char *b;
2813         int r;
2814
2815         assert(id);
2816         assert(tmp_dir);
2817         assert(var_tmp_dir);
2818
2819         r = setup_one_tmp_dir(id, "/tmp", &a, &a_tmp);
2820         if (r < 0)
2821                 return r;
2822
2823         r = setup_one_tmp_dir(id, "/var/tmp", &b, NULL);
2824         if (r < 0)
2825                 return r;
2826
2827         a_tmp = mfree(a_tmp); /* avoid rmdir */
2828         *tmp_dir = TAKE_PTR(a);
2829         *var_tmp_dir = TAKE_PTR(b);
2830
2831         return 0;
2832 }
2833
2834 int setup_shareable_ns(const int ns_storage_socket[static 2], unsigned long nsflag) {
2835         _cleanup_close_ int ns = -1;
2836         int r, q;
2837         const char *ns_name, *ns_path;
2838
2839         assert(ns_storage_socket);
2840         assert(ns_storage_socket[0] >= 0);
2841         assert(ns_storage_socket[1] >= 0);
2842
2843         ns_name = namespace_single_flag_to_string(nsflag);
2844         assert(ns_name);
2845
2846         /* We use the passed socketpair as a storage buffer for our
2847          * namespace reference fd. Whatever process runs this first
2848          * shall create a new namespace, all others should just join
2849          * it. To serialize that we use a file lock on the socket
2850          * pair.
2851          *
2852          * It's a bit crazy, but hey, works great! */
2853
2854         if (lockf(ns_storage_socket[0], F_LOCK, 0) < 0)
2855                 return -errno;
2856
2857         ns = receive_one_fd(ns_storage_socket[0], MSG_DONTWAIT);
2858         if (ns == -EAGAIN) {
2859                 /* Nothing stored yet, so let's create a new namespace. */
2860
2861                 if (unshare(nsflag) < 0) {
2862                         r = -errno;
2863                         goto fail;
2864                 }
2865
2866                 (void) loopback_setup();
2867
2868                 ns_path = strjoina("/proc/self/ns/", ns_name);
2869                 ns = open(ns_path, O_RDONLY|O_CLOEXEC|O_NOCTTY);
2870                 if (ns < 0) {
2871                         r = -errno;
2872                         goto fail;
2873                 }
2874
2875                 r = 1;
2876
2877         } else if (ns < 0) {
2878                 r = ns;
2879                 goto fail;
2880
2881         } else {
2882                 /* Yay, found something, so let's join the namespace */
2883                 if (setns(ns, nsflag) < 0) {
2884                         r = -errno;
2885                         goto fail;
2886                 }
2887
2888                 r = 0;
2889         }
2890
2891         q = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
2892         if (q < 0) {
2893                 r = q;
2894                 goto fail;
2895         }
2896
2897 fail:
2898         (void) lockf(ns_storage_socket[0], F_ULOCK, 0);
2899         return r;
2900 }
2901
2902 int open_shareable_ns_path(const int ns_storage_socket[static 2], const char *path, unsigned long nsflag) {
2903         _cleanup_close_ int ns = -1;
2904         int q, r;
2905
2906         assert(ns_storage_socket);
2907         assert(ns_storage_socket[0] >= 0);
2908         assert(ns_storage_socket[1] >= 0);
2909         assert(path);
2910
2911         /* If the storage socket doesn't contain a ns fd yet, open one via the file system and store it in
2912          * it. This is supposed to be called ahead of time, i.e. before setup_shareable_ns() which will
2913          * allocate a new anonymous ns if needed. */
2914
2915         if (lockf(ns_storage_socket[0], F_LOCK, 0) < 0)
2916                 return -errno;
2917
2918         ns = receive_one_fd(ns_storage_socket[0], MSG_DONTWAIT);
2919         if (ns == -EAGAIN) {
2920                 /* Nothing stored yet. Open the file from the file system. */
2921
2922                 ns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
2923                 if (ns < 0) {
2924                         r = -errno;
2925                         goto fail;
2926                 }
2927
2928                 r = fd_is_ns(ns, nsflag);
2929                 if (r == 0) { /* Not a ns of our type? Refuse early. */
2930                         r = -EINVAL;
2931                         goto fail;
2932                 }
2933                 if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */
2934                         goto fail;
2935
2936                 r = 1;
2937
2938         } else if (ns < 0) {
2939                 r = ns;
2940                 goto fail;
2941         } else
2942                 r = 0; /* Already allocated */
2943
2944         q = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
2945         if (q < 0) {
2946                 r = q;
2947                 goto fail;
2948         }
2949
2950 fail:
2951         (void) lockf(ns_storage_socket[0], F_ULOCK, 0);
2952         return r;
2953 }
2954
2955 bool ns_type_supported(NamespaceType type) {
2956         const char *t, *ns_proc;
2957
2958         t = namespace_type_to_string(type);
2959         if (!t) /* Don't know how to translate this? Then it's not supported */
2960                 return false;
2961
2962         ns_proc = strjoina("/proc/self/ns/", t);
2963         return access(ns_proc, F_OK) == 0;
2964 }
2965
2966 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
2967         [PROTECT_HOME_NO]        = "no",
2968         [PROTECT_HOME_YES]       = "yes",
2969         [PROTECT_HOME_READ_ONLY] = "read-only",
2970         [PROTECT_HOME_TMPFS]     = "tmpfs",
2971 };
2972
2973 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES);
2974
2975 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
2976         [PROTECT_SYSTEM_NO]     = "no",
2977         [PROTECT_SYSTEM_YES]    = "yes",
2978         [PROTECT_SYSTEM_FULL]   = "full",
2979         [PROTECT_SYSTEM_STRICT] = "strict",
2980 };
2981
2982 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
2983
2984 static const char* const namespace_type_table[] = {
2985         [NAMESPACE_MOUNT]  = "mnt",
2986         [NAMESPACE_CGROUP] = "cgroup",
2987         [NAMESPACE_UTS]    = "uts",
2988         [NAMESPACE_IPC]    = "ipc",
2989         [NAMESPACE_USER]   = "user",
2990         [NAMESPACE_PID]    = "pid",
2991         [NAMESPACE_NET]    = "net",
2992         [NAMESPACE_TIME]   = "time",
2993 };
2994
2995 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);
2996
2997 static const char* const protect_proc_table[_PROTECT_PROC_MAX] = {
2998         [PROTECT_PROC_DEFAULT]    = "default",
2999         [PROTECT_PROC_NOACCESS]   = "noaccess",
3000         [PROTECT_PROC_INVISIBLE]  = "invisible",
3001         [PROTECT_PROC_PTRACEABLE] = "ptraceable",
3002 };
3003
3004 DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc);
3005
3006 static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
3007         [PROC_SUBSET_ALL] = "all",
3008         [PROC_SUBSET_PID] = "pid",
3009 };
3010
3011 DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);