src/core/namespace.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <errno.h>
  22 #include <sched.h>
  23 #include <stdio.h>
  24 #include <string.h>
  25 #include <sys/mount.h>
  26 #include <sys/stat.h>
  27 #include <unistd.h>
  28 #include <linux/fs.h>
  29
  30 #include "alloc-util.h"
  31 #include "base-filesystem.h"
  32 #include "dev-setup.h"
  33 #include "fd-util.h"
  34 #include "fs-util.h"
  35 #include "label.h"
  36 #include "loop-util.h"
  37 #include "loopback-setup.h"
  38 #include "missing.h"
  39 #include "mkdir.h"
  40 #include "mount-util.h"
  41 #include "namespace.h"
  42 #include "path-util.h"
  43 #include "selinux-util.h"
  44 #include "socket-util.h"
  45 #include "stat-util.h"
  46 #include "string-table.h"
  47 #include "string-util.h"
  48 #include "strv.h"
  49 #include "umask-util.h"
  50 #include "user-util.h"
  51 #include "util.h"
  52
  53 #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
  54
/* The kind of operation a MountEntry asks for. The value is stored in the 5-bit 'mode' bit-field of MountEntry. */
typedef enum MountMode {
        /* This is ordered by priority! mount_path_compare() sorts entries for the same path by ascending value, and
         * drop_duplicates() keeps the first one, hence the enumerator listed first wins. Do not reorder. */
        INACCESSIBLE,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
        TMPFS,
} MountMode;
  70
/* One entry of the namespace mount table. Each string comes as a pair: a '_const' pointer we do not own, and a
 * '_malloc' pointer we do own (released by mount_entry_done()). The mount_entry_*() accessors prefer the '_malloc'
 * variant when it is set.
 *
 * NOTE: the static tables in this file use positional initializers for the first three members (path_const, mode,
 * ignore), hence the order of those members must not be changed. */
typedef struct MountEntry {
        const char *path_const;   /* Memory allocated on stack or static */
        MountMode mode:5;
        bool ignore:1;            /* Ignore if path does not exist? */
        bool has_prefix:1;        /* Already is prefixed by the root dir? */
        bool read_only:1;         /* Shall this mount point be read-only? */
        char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
        const char *source_const; /* The source path, for bind mounts */
        char *source_malloc;      /* Use this instead of 'source_const' if we had to allocate memory */
        const char *options_const;/* Mount options for tmpfs */
        char *options_malloc;     /* Use this instead of 'options_const' if we had to allocate memory */
        unsigned long flags;      /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
} MountEntry;
  84
  85 /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
  86  * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
  87 static const MountEntry apivfs_table[] = {
  88         { "/proc",               PROCFS,       false },
  89         { "/dev",                BIND_DEV,     false },
  90         { "/sys",                SYSFS,        false },
  91 };
  92
  93 /* ProtectKernelTunables= option and the related filesystem APIs */
  94 static const MountEntry protect_kernel_tunables_table[] = {
  95         { "/proc/sys",           READONLY,     false },
  96         { "/proc/sysrq-trigger", READONLY,     true  },
  97         { "/proc/latency_stats", READONLY,     true  },
  98         { "/proc/mtrr",          READONLY,     true  },
  99         { "/proc/apm",           READONLY,     true  }, /* Obsolete API, there's no point in permitting access to this, ever */
 100         { "/proc/acpi",          READONLY,     true  },
 101         { "/proc/timer_stats",   READONLY,     true  },
 102         { "/proc/asound",        READONLY,     true  },
 103         { "/proc/bus",           READONLY,     true  },
 104         { "/proc/fs",            READONLY,     true  },
 105         { "/proc/irq",           READONLY,     true  },
 106         { "/sys",                READONLY,     false },
 107         { "/sys/kernel/debug",   READONLY,     true  },
 108         { "/sys/kernel/tracing", READONLY,     true  },
 109         { "/sys/fs/bpf",         READONLY,     true  },
 110         { "/sys/fs/cgroup",      READWRITE,    false }, /* READONLY is set by ProtectControlGroups= option */
 111         { "/sys/fs/selinux",     READWRITE,    true  },
 112 };
 113
 114 /* ProtectKernelModules= option */
 115 static const MountEntry protect_kernel_modules_table[] = {
 116 #if HAVE_SPLIT_USR
 117         { "/lib/modules",        INACCESSIBLE, true  },
 118 #endif
 119         { "/usr/lib/modules",    INACCESSIBLE, true  },
 120 };
 121
 122 /*
 123  * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
 124  * system should be protected by ProtectSystem=
 125  */
 126 static const MountEntry protect_home_read_only_table[] = {
 127         { "/home",               READONLY,     true  },
 128         { "/run/user",           READONLY,     true  },
 129         { "/root",               READONLY,     true  },
 130 };
 131
 132 /* ProtectHome=tmpfs table */
 133 static const MountEntry protect_home_tmpfs_table[] = {
 134         { "/home",               TMPFS,        true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
 135         { "/run/user",           TMPFS,        true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
 136         { "/root",               TMPFS,        true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
 137 };
 138
 139 /* ProtectHome=yes table */
 140 static const MountEntry protect_home_yes_table[] = {
 141         { "/home",               INACCESSIBLE, true  },
 142         { "/run/user",           INACCESSIBLE, true  },
 143         { "/root",               INACCESSIBLE, true  },
 144 };
 145
 146 /* ProtectSystem=yes table */
 147 static const MountEntry protect_system_yes_table[] = {
 148         { "/usr",                READONLY,     false },
 149         { "/boot",               READONLY,     true  },
 150         { "/efi",                READONLY,     true  },
 151 #if HAVE_SPLIT_USR
 152         { "/lib",                READONLY,     true  },
 153         { "/lib64",              READONLY,     true  },
 154         { "/bin",                READONLY,     true  },
 155         { "/sbin",               READONLY,     true  },
 156 #endif
 157 };
 158
 159 /* ProtectSystem=full includes ProtectSystem=yes */
 160 static const MountEntry protect_system_full_table[] = {
 161         { "/usr",                READONLY,     false },
 162         { "/boot",               READONLY,     true  },
 163         { "/efi",                READONLY,     true  },
 164         { "/etc",                READONLY,     false },
 165 #if HAVE_SPLIT_USR
 166         { "/lib",                READONLY,     true  },
 167         { "/lib64",              READONLY,     true  },
 168         { "/bin",                READONLY,     true  },
 169         { "/sbin",               READONLY,     true  },
 170 #endif
 171 };
 172
 173 /*
 174  * ProtectSystem=strict table. In this strict mode, we mount everything
 175  * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
 176  * which are left writable, but PrivateDevices= + ProtectKernelTunables=
 177  * protect those, and these options should be fully orthogonal.
 178  * (And of course /home and friends are also left writable, as ProtectHome=
 179  * shall manage those, orthogonally).
 180  */
 181 static const MountEntry protect_system_strict_table[] = {
 182         { "/",                   READONLY,     false },
 183         { "/proc",               READWRITE,    false },      /* ProtectKernelTunables= */
 184         { "/sys",                READWRITE,    false },      /* ProtectKernelTunables= */
 185         { "/dev",                READWRITE,    false },      /* PrivateDevices= */
 186         { "/home",               READWRITE,    true  },      /* ProtectHome= */
 187         { "/run/user",           READWRITE,    true  },      /* ProtectHome= */
 188         { "/root",               READWRITE,    true  },      /* ProtectHome= */
 189 };
 190
 191 static const char *mount_entry_path(const MountEntry *p) {
 192         assert(p);
 193
 194         /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
 195          * otherwise the stack/static ->path field is returned. */
 196
 197         return p->path_malloc ?: p->path_const;
 198 }
 199
 200 static bool mount_entry_read_only(const MountEntry *p) {
 201         assert(p);
 202
 203         return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
 204 }
 205
 206 static const char *mount_entry_source(const MountEntry *p) {
 207         assert(p);
 208
 209         return p->source_malloc ?: p->source_const;
 210 }
 211
 212 static const char *mount_entry_options(const MountEntry *p) {
 213         assert(p);
 214
 215         return p->options_malloc ?: p->options_const;
 216 }
 217
 218 static void mount_entry_done(MountEntry *p) {
 219         assert(p);
 220
 221         p->path_malloc = mfree(p->path_malloc);
 222         p->source_malloc = mfree(p->source_malloc);
 223         p->options_malloc = mfree(p->options_malloc);
 224 }
 225
 226 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
 227         char **i;
 228
 229         assert(p);
 230
 231         /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
 232
 233         STRV_FOREACH(i, strv) {
 234                 bool ignore = false, needs_prefix = false;
 235                 const char *e = *i;
 236
 237                 /* Look for any prefixes */
 238                 if (startswith(e, "-")) {
 239                         e++;
 240                         ignore = true;
 241                 }
 242                 if (startswith(e, "+")) {
 243                         e++;
 244                         needs_prefix = true;
 245                 }
 246
 247                 if (!path_is_absolute(e))
 248                         return -EINVAL;
 249
 250                 *((*p)++) = (MountEntry) {
 251                         .path_const = e,
 252                         .mode = mode,
 253                         .ignore = ignore,
 254                         .has_prefix = !needs_prefix && !forcibly_require_prefix,
 255                 };
 256         }
 257
 258         return 0;
 259 }
 260
 261 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
 262         char **i;
 263
 264         assert(p);
 265
 266         /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
 267          * "/private/" boundary directories for DynamicUser=1. */
 268
 269         STRV_FOREACH(i, strv) {
 270
 271                 *((*p)++) = (MountEntry) {
 272                         .path_const = *i,
 273                         .mode = EMPTY_DIR,
 274                         .ignore = false,
 275                         .has_prefix = false,
 276                         .read_only = true,
 277                         .options_const = "mode=755",
 278                         .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
 279                 };
 280         }
 281
 282         return 0;
 283 }
 284
 285 static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
 286         unsigned i;
 287
 288         assert(p);
 289
 290         for (i = 0; i < n; i++) {
 291                 const BindMount *b = binds + i;
 292
 293                 *((*p)++) = (MountEntry) {
 294                         .path_const = b->destination,
 295                         .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
 296                         .read_only = b->read_only,
 297                         .source_const = b->source,
 298                         .ignore = b->ignore_enoent,
 299                 };
 300         }
 301
 302         return 0;
 303 }
 304
 305 static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, unsigned n) {
 306         unsigned i;
 307         int r;
 308
 309         assert(p);
 310
 311         for (i = 0; i < n; i++) {
 312                 const TemporaryFileSystem *t = tmpfs + i;
 313                 _cleanup_free_ char *o = NULL, *str = NULL;
 314                 unsigned long flags = MS_NODEV|MS_STRICTATIME;
 315                 bool ro = false;
 316
 317                 if (!path_is_absolute(t->path))
 318                         return -EINVAL;
 319
 320                 if (!isempty(t->options)) {
 321                         str = strjoin("mode=0755,", t->options);
 322                         if (!str)
 323                                 return -ENOMEM;
 324
 325                         r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
 326                         if (r < 0)
 327                                 return r;
 328
 329                         ro = !!(flags & MS_RDONLY);
 330                         if (ro)
 331                                 flags ^= MS_RDONLY;
 332                 }
 333
 334                 *((*p)++) = (MountEntry) {
 335                         .path_const = t->path,
 336                         .mode = TMPFS,
 337                         .read_only = ro,
 338                         .options_malloc = o,
 339                         .flags = flags,
 340                 };
 341
 342                 o = NULL;
 343         }
 344
 345         return 0;
 346 }
 347
 348 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
 349         unsigned i;
 350
 351         assert(p);
 352         assert(mounts);
 353
 354         /* Adds a list of static pre-defined entries */
 355
 356         for (i = 0; i < n; i++)
 357                 *((*p)++) = (MountEntry) {
 358                         .path_const = mount_entry_path(mounts+i),
 359                         .mode = mounts[i].mode,
 360                         .ignore = mounts[i].ignore || ignore_protect,
 361                 };
 362
 363         return 0;
 364 }
 365
 366 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
 367         assert(p);
 368
 369         switch (protect_home) {
 370
 371         case PROTECT_HOME_NO:
 372                 return 0;
 373
 374         case PROTECT_HOME_READ_ONLY:
 375                 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
 376
 377         case PROTECT_HOME_TMPFS:
 378                 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
 379
 380         case PROTECT_HOME_YES:
 381                 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
 382
 383         default:
 384                 assert_not_reached("Unexpected ProtectHome= value");
 385         }
 386 }
 387
 388 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
 389         assert(p);
 390
 391         switch (protect_system) {
 392
 393         case PROTECT_SYSTEM_NO:
 394                 return 0;
 395
 396         case PROTECT_SYSTEM_STRICT:
 397                 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
 398
 399         case PROTECT_SYSTEM_YES:
 400                 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
 401
 402         case PROTECT_SYSTEM_FULL:
 403                 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
 404
 405         default:
 406                 assert_not_reached("Unexpected ProtectSystem= value");
 407         }
 408 }
 409
 410 static int mount_path_compare(const void *a, const void *b) {
 411         const MountEntry *p = a, *q = b;
 412         int d;
 413
 414         /* If the paths are not equal, then order prefixes first */
 415         d = path_compare(mount_entry_path(p), mount_entry_path(q));
 416         if (d != 0)
 417                 return d;
 418
 419         /* If the paths are equal, check the mode */
 420         if (p->mode < q->mode)
 421                 return -1;
 422
 423         if (p->mode > q->mode)
 424                 return 1;
 425
 426         return 0;
 427 }
 428
 429 static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
 430         unsigned i;
 431
 432         /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
 433          * that. */
 434
 435         if (!root_directory)
 436                 return 0;
 437
 438         for (i = 0; i < n; i++) {
 439                 char *s;
 440
 441                 if (m[i].has_prefix)
 442                         continue;
 443
 444                 s = prefix_root(root_directory, mount_entry_path(m+i));
 445                 if (!s)
 446                         return -ENOMEM;
 447
 448                 free_and_replace(m[i].path_malloc, s);
 449                 m[i].has_prefix = true;
 450         }
 451
 452         return 0;
 453 }
 454
 455 static void drop_duplicates(MountEntry *m, unsigned *n) {
 456         MountEntry *f, *t, *previous;
 457
 458         assert(m);
 459         assert(n);
 460
 461         /* Drops duplicate entries. Expects that the array is properly ordered already. */
 462
 463         for (f = m, t = m, previous = NULL; f < m + *n; f++) {
 464
 465                 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
 466                  * above. */
 467                 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
 468                         log_debug("%s is duplicate.", mount_entry_path(f));
 469                         previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
 470                         mount_entry_done(f);
 471                         continue;
 472                 }
 473
 474                 *t = *f;
 475                 previous = t;
 476                 t++;
 477         }
 478
 479         *n = t - m;
 480 }
 481
 482 static void drop_inaccessible(MountEntry *m, unsigned *n) {
 483         MountEntry *f, *t;
 484         const char *clear = NULL;
 485
 486         assert(m);
 487         assert(n);
 488
 489         /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
 490          * ordered already. */
 491
 492         for (f = m, t = m; f < m + *n; f++) {
 493
 494                 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
 495                  * it, as inaccessible paths really should drop the entire subtree. */
 496                 if (clear && path_startswith(mount_entry_path(f), clear)) {
 497                         log_debug("%s is masked by %s.", mount_entry_path(f), clear);
 498                         mount_entry_done(f);
 499                         continue;
 500                 }
 501
 502                 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
 503
 504                 *t = *f;
 505                 t++;
 506         }
 507
 508         *n = t - m;
 509 }
 510
 511 static void drop_nop(MountEntry *m, unsigned *n) {
 512         MountEntry *f, *t;
 513
 514         assert(m);
 515         assert(n);
 516
 517         /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
 518          * list is ordered by prefixes. */
 519
 520         for (f = m, t = m; f < m + *n; f++) {
 521
 522                 /* Only suppress such subtrees for READONLY and READWRITE entries */
 523                 if (IN_SET(f->mode, READONLY, READWRITE)) {
 524                         MountEntry *p;
 525                         bool found = false;
 526
 527                         /* Now let's find the first parent of the entry we are looking at. */
 528                         for (p = t-1; p >= m; p--) {
 529                                 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
 530                                         found = true;
 531                                         break;
 532                                 }
 533                         }
 534
 535                         /* We found it, let's see if it's the same mode, if so, we can drop this entry */
 536                         if (found && p->mode == f->mode) {
 537                                 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
 538                                 mount_entry_done(f);
 539                                 continue;
 540                         }
 541                 }
 542
 543                 *t = *f;
 544                 t++;
 545         }
 546
 547         *n = t - m;
 548 }
 549
 550 static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
 551         MountEntry *f, *t;
 552
 553         assert(m);
 554         assert(n);
 555
 556         /* Nothing to do */
 557         if (!root_directory)
 558                 return;
 559
 560         /* Drops all mounts that are outside of the root directory. */
 561
 562         for (f = m, t = m; f < m + *n; f++) {
 563
 564                 if (!path_startswith(mount_entry_path(f), root_directory)) {
 565                         log_debug("%s is outside of root directory.", mount_entry_path(f));
 566                         mount_entry_done(f);
 567                         continue;
 568                 }
 569
 570                 *t = *f;
 571                 t++;
 572         }
 573
 574         *n = t - m;
 575 }
 576
 577 static int clone_device_node(const char *d, const char *temporary_mount) {
 578         const char *dn;
 579         struct stat st;
 580         int r;
 581
 582         if (stat(d, &st) < 0) {
 583                 if (errno == ENOENT)
 584                         return 0;
 585                 return -errno;
 586         }
 587
 588         if (!S_ISBLK(st.st_mode) &&
 589             !S_ISCHR(st.st_mode))
 590                 return -EINVAL;
 591
 592         if (st.st_rdev == 0)
 593                 return 0;
 594
 595         dn = strjoina(temporary_mount, d);
 596
 597         mac_selinux_create_file_prepare(d, st.st_mode);
 598         r = mknod(dn, st.st_mode, st.st_rdev);
 599         mac_selinux_create_file_clear();
 600         if (r < 0)
 601                 return log_debug_errno(errno, "mknod failed for %s: %m", d);
 602
 603         return 1;
 604 }
 605
/* Builds a minimal, private /dev in a temporary directory and moves it onto the path of the entry 'm'.
 *
 * The new /dev is a tmpfs that gets: bind mounts of the host's /dev/pts and /dev/shm (both required), bind mounts of
 * /dev/mqueue and /dev/hugepages (best effort), a /dev/ptmx symlink or device node, a /dev/log symlink, clones of the
 * device nodes listed in 'devnodes', and whatever dev_setup() adds.
 *
 * Returns 0 on success, a negative errno-style error otherwise. On failure the mounts established so far and the
 * temporary directories are cleaned up on a best-effort basis. */
static int mount_private_dev(MountEntry *m) {
        /* NUL-separated list of the device nodes to clone from the host */
        static const char devnodes[] =
                "/dev/null\0"
                "/dev/zero\0"
                "/dev/full\0"
                "/dev/random\0"
                "/dev/urandom\0"
                "/dev/tty\0";

        char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
        const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
        _cleanup_umask_ mode_t u; /* presumably restores the previous umask when going out of scope — see the definition of _cleanup_umask_ */
        int r;

        assert(m);

        /* Clear the umask, so that the modes passed to mkdir()/mknod() below are applied verbatim */
        u = umask(0000);

        if (!mkdtemp(temporary_mount))
                return -errno;

        dev = strjoina(temporary_mount, "/dev");
        (void) mkdir(dev, 0755);
        if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
                r = -errno;
                goto fail;
        }

        devpts = strjoina(temporary_mount, "/dev/pts");
        (void) mkdir(devpts, 0755);
        if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
                r = -errno;
                goto fail;
        }

        /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
         * When /dev/ptmx is a device node, /dev/pts/ptmx has 000 permissions, making it inaccessible,
         * thus in that case we make a clone of the node.
         *
         * In nspawn and other containers it will be a symlink, in that case we make it a symlink, too.
         */
        r = is_symlink("/dev/ptmx");
        if (r < 0)
                goto fail;
        if (r > 0) {
                devptmx = strjoina(temporary_mount, "/dev/ptmx");
                if (symlink("pts/ptmx", devptmx) < 0) {
                        r = -errno;
                        goto fail;
                }
        } else {
                r = clone_device_node("/dev/ptmx", temporary_mount);
                if (r < 0)
                        goto fail;
                if (r == 0) {
                        /* Nothing was cloned, but we cannot do without /dev/ptmx */
                        r = -ENXIO;
                        goto fail;
                }
        }

        devshm = strjoina(temporary_mount, "/dev/shm");
        (void) mkdir(devshm, 0755);
        r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        /* The following two bind mounts are optional, failures are ignored */
        devmqueue = strjoina(temporary_mount, "/dev/mqueue");
        (void) mkdir(devmqueue, 0755);
        (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);

        devhugepages = strjoina(temporary_mount, "/dev/hugepages");
        (void) mkdir(devhugepages, 0755);
        (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);

        devlog = strjoina(temporary_mount, "/dev/log");
        (void) symlink("/run/systemd/journal/dev-log", devlog);

        /* Device nodes that do not exist on the host are skipped (clone_device_node() returns 0 for them), any
         * other failure is fatal */
        NULSTR_FOREACH(d, devnodes) {
                r = clone_device_node(d, temporary_mount);
                if (r < 0)
                        goto fail;
        }

        dev_setup(temporary_mount, UID_INVALID, GID_INVALID);

        /* Create the /dev directory if missing. It is more likely to be
         * missing when the service is started with RootDirectory. This is
         * consistent with mount units creating the mount points when missing.
         */
        (void) mkdir_p_label(mount_entry_path(m), 0755);

        /* Unmount everything in old /dev */
        umount_recursive(mount_entry_path(m), 0);
        if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
                r = -errno;
                goto fail;
        }

        /* The tmpfs has been moved to its final place, the now empty temporary directories can go */
        rmdir(dev);
        rmdir(temporary_mount);

        return 0;

fail:
        /* Best-effort cleanup: only the paths that were already set up at the time of the failure are non-NULL. The
         * results are not checked, 'r' carries the original error. */
        if (devpts)
                umount(devpts);

        if (devshm)
                umount(devshm);

        if (devhugepages)
                umount(devhugepages);

        if (devmqueue)
                umount(devmqueue);

        umount(dev);
        rmdir(dev);
        rmdir(temporary_mount);

        return r;
}
 730
 731 static int mount_bind_dev(const MountEntry *m) {
 732         int r;
 733
 734         assert(m);
 735
 736         /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
 737          * /dev. This is only used when RootDirectory= is set. */
 738
 739         (void) mkdir_p_label(mount_entry_path(m), 0755);
 740
 741         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 742         if (r < 0)
 743                 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
 744         if (r > 0) /* make this a NOP if /dev is already a mount point */
 745                 return 0;
 746
 747         if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 748                 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
 749
 750         return 1;
 751 }
 752
 753 static int mount_sysfs(const MountEntry *m) {
 754         int r;
 755
 756         assert(m);
 757
 758         (void) mkdir_p_label(mount_entry_path(m), 0755);
 759
 760         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 761         if (r < 0)
 762                 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
 763         if (r > 0) /* make this a NOP if /sys is already a mount point */
 764                 return 0;
 765
 766         /* Bind mount the host's version so that we get all child mounts of it, too. */
 767         if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 768                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 769
 770         return 1;
 771 }
 772
 773 static int mount_procfs(const MountEntry *m) {
 774         int r;
 775
 776         assert(m);
 777
 778         (void) mkdir_p_label(mount_entry_path(m), 0755);
 779
 780         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 781         if (r < 0)
 782                 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
 783         if (r > 0) /* make this a NOP if /proc is already a mount point */
 784                 return 0;
 785
 786         /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
 787         if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
 788                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 789
 790         return 1;
 791 }
 792
 793 static int mount_tmpfs(const MountEntry *m) {
 794         assert(m);
 795
 796         /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
 797
 798         (void) mkdir_p_label(mount_entry_path(m), 0755);
 799         (void) umount_recursive(mount_entry_path(m), 0);
 800
 801         if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
 802                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 803
 804         return 1;
 805 }
 806
 807 static int mount_entry_chase(
 808                 const char *root_directory,
 809                 const MountEntry *m,
 810                 const char *path,
 811                 bool chase_nonexistent,
 812                 char **location) {
 813
 814         char *chased;
 815         int r;
 816
 817         assert(m);
 818
 819         /* Since mount() will always follow symlinks and we need to take the different root directory into account we
 820          * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
 821          * that applies). The result is stored in "location". */
 822
 823         r = chase_symlinks(path, root_directory, chase_nonexistent ? CHASE_NONEXISTENT : 0, &chased);
 824         if (r == -ENOENT && m->ignore) {
 825                 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
 826                 return 0;
 827         }
 828         if (r < 0)
 829                 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
 830
 831         log_debug("Followed symlinks %s → %s.", path, chased);
 832
 833         free(*location);
 834         *location = chased;
 835
 836         return 1;
 837 }
 838
 839 static int apply_mount(
 840                 const char *root_directory,
 841                 MountEntry *m) {
 842
 843         bool rbind = true, make = false;
 844         const char *what;
 845         int r;
 846
 847         assert(m);
 848
 849         r = mount_entry_chase(root_directory, m, mount_entry_path(m), !IN_SET(m->mode, INACCESSIBLE, READONLY, READWRITE), &m->path_malloc);
 850         if (r <= 0)
 851                 return r;
 852
 853         log_debug("Applying namespace mount on %s", mount_entry_path(m));
 854
 855         switch (m->mode) {
 856
 857         case INACCESSIBLE: {
 858                 struct stat target;
 859
 860                 /* First, get rid of everything that is below if there
 861                  * is anything... Then, overmount it with an
 862                  * inaccessible path. */
 863                 (void) umount_recursive(mount_entry_path(m), 0);
 864
 865                 if (lstat(mount_entry_path(m), &target) < 0)
 866                         return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
 867
 868                 what = mode_to_inaccessible_node(target.st_mode);
 869                 if (!what) {
 870                         log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
 871                         return -ELOOP;
 872                 }
 873                 break;
 874         }
 875
 876         case READONLY:
 877         case READWRITE:
 878                 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
 879                 if (r < 0)
 880                         return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
 881                 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
 882                         return 0;
 883                 /* This isn't a mount point yet, let's make it one. */
 884                 what = mount_entry_path(m);
 885                 break;
 886
 887         case BIND_MOUNT:
 888                 rbind = false;
 889
 890                 _fallthrough_;
 891         case BIND_MOUNT_RECURSIVE:
 892                 /* Also chase the source mount */
 893
 894                 r = mount_entry_chase(root_directory, m, mount_entry_source(m), false, &m->source_malloc);
 895                 if (r <= 0)
 896                         return r;
 897
 898                 what = mount_entry_source(m);
 899                 make = true;
 900                 break;
 901
 902         case EMPTY_DIR:
 903         case TMPFS:
 904                 return mount_tmpfs(m);
 905
 906         case PRIVATE_TMP:
 907                 what = mount_entry_source(m);
 908                 make = true;
 909                 break;
 910
 911         case PRIVATE_DEV:
 912                 return mount_private_dev(m);
 913
 914         case BIND_DEV:
 915                 return mount_bind_dev(m);
 916
 917         case SYSFS:
 918                 return mount_sysfs(m);
 919
 920         case PROCFS:
 921                 return mount_procfs(m);
 922
 923         default:
 924                 assert_not_reached("Unknown mode");
 925         }
 926
 927         assert(what);
 928
 929         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
 930                 bool try_again = false;
 931                 r = -errno;
 932
 933                 if (r == -ENOENT && make) {
 934                         struct stat st;
 935
 936                         /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
 937
 938                         if (stat(what, &st) >= 0) {
 939
 940                                 (void) mkdir_parents(mount_entry_path(m), 0755);
 941
 942                                 if (S_ISDIR(st.st_mode))
 943                                         try_again = mkdir(mount_entry_path(m), 0755) >= 0;
 944                                 else
 945                                         try_again = touch(mount_entry_path(m)) >= 0;
 946                         }
 947                 }
 948
 949                 if (try_again) {
 950                         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
 951                                 r = -errno;
 952                         else
 953                                 r = 0;
 954                 }
 955
 956                 if (r < 0)
 957                         return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
 958         }
 959
 960         log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
 961         return 0;
 962 }
 963
 964 static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
 965         int r = 0;
 966
 967         assert(m);
 968         assert(proc_self_mountinfo);
 969
 970         if (mount_entry_read_only(m)) {
 971                 if (IN_SET(m->mode, EMPTY_DIR, TMPFS)) {
 972                         /* Make superblock readonly */
 973                         if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT | MS_RDONLY | m->flags, mount_entry_options(m)) < 0)
 974                                 r = -errno;
 975                 } else
 976                         r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
 977         } else if (m->mode == PRIVATE_DEV) {
 978                 /* Superblock can be readonly but the submounts can't */
 979                 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
 980                         r = -errno;
 981         } else
 982                 return 0;
 983
 984         /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
 985          * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
 986          * read-only mounts already applied. */
 987
 988         if (r == -ENOENT && m->ignore)
 989                 r = 0;
 990
 991         return r;
 992 }
 993
 994 static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
 995         assert(ns_info);
 996
 997         /*
 998          * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
 999          * since to protect the API VFS mounts, they need to be around in the
1000          * first place... and RootDirectory= or RootImage= need to be set.
1001          */
1002
1003         /* root_directory should point to a mount point */
1004         return root_directory &&
1005                 (ns_info->mount_apivfs ||
1006                  ns_info->protect_control_groups ||
1007                  ns_info->protect_kernel_tunables);
1008 }
1009
1010 static unsigned namespace_calculate_mounts(
1011                 const char* root_directory,
1012                 const NamespaceInfo *ns_info,
1013                 char** read_write_paths,
1014                 char** read_only_paths,
1015                 char** inaccessible_paths,
1016                 char** empty_directories,
1017                 unsigned n_bind_mounts,
1018                 unsigned n_temporary_filesystems,
1019                 const char* tmp_dir,
1020                 const char* var_tmp_dir,
1021                 ProtectHome protect_home,
1022                 ProtectSystem protect_system) {
1023
1024         unsigned protect_home_cnt;
1025         unsigned protect_system_cnt =
1026                 (protect_system == PROTECT_SYSTEM_STRICT ?
1027                  ELEMENTSOF(protect_system_strict_table) :
1028                  ((protect_system == PROTECT_SYSTEM_FULL) ?
1029                   ELEMENTSOF(protect_system_full_table) :
1030                   ((protect_system == PROTECT_SYSTEM_YES) ?
1031                    ELEMENTSOF(protect_system_yes_table) : 0)));
1032
1033         protect_home_cnt =
1034                 (protect_home == PROTECT_HOME_YES ?
1035                  ELEMENTSOF(protect_home_yes_table) :
1036                  ((protect_home == PROTECT_HOME_READ_ONLY) ?
1037                   ELEMENTSOF(protect_home_read_only_table) :
1038                   ((protect_home == PROTECT_HOME_TMPFS) ?
1039                    ELEMENTSOF(protect_home_tmpfs_table) : 0)));
1040
1041         return !!tmp_dir + !!var_tmp_dir +
1042                 strv_length(read_write_paths) +
1043                 strv_length(read_only_paths) +
1044                 strv_length(inaccessible_paths) +
1045                 strv_length(empty_directories) +
1046                 n_bind_mounts +
1047                 n_temporary_filesystems +
1048                 ns_info->private_dev +
1049                 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1050                 (ns_info->protect_control_groups ? 1 : 0) +
1051                 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
1052                 protect_home_cnt + protect_system_cnt +
1053                 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
1054 }
1055
1056 int setup_namespace(
1057                 const char* root_directory,
1058                 const char* root_image,
1059                 const NamespaceInfo *ns_info,
1060                 char** read_write_paths,
1061                 char** read_only_paths,
1062                 char** inaccessible_paths,
1063                 char** empty_directories,
1064                 const BindMount *bind_mounts,
1065                 unsigned n_bind_mounts,
1066                 const TemporaryFileSystem *temporary_filesystems,
1067                 unsigned n_temporary_filesystems,
1068                 const char* tmp_dir,
1069                 const char* var_tmp_dir,
1070                 ProtectHome protect_home,
1071                 ProtectSystem protect_system,
1072                 unsigned long mount_flags,
1073                 DissectImageFlags dissect_image_flags) {
1074
1075         _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
1076         _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
1077         _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
1078         _cleanup_free_ void *root_hash = NULL;
1079         MountEntry *m, *mounts = NULL;
1080         size_t root_hash_size = 0;
1081         bool make_slave = false;
1082         const char *root;
1083         unsigned n_mounts;
1084         bool require_prefix = false;
1085         int r = 0;
1086
1087         assert(ns_info);
1088
1089         if (mount_flags == 0)
1090                 mount_flags = MS_SHARED;
1091
1092         if (root_image) {
1093                 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1094
1095                 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
1096                         dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1097
1098                 r = loop_device_make_by_path(root_image,
1099                                              dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1100                                              &loop_device);
1101                 if (r < 0)
1102                         return r;
1103
1104                 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1105                 if (r < 0)
1106                         return r;
1107
1108                 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1109                 if (r < 0)
1110                         return r;
1111
1112                 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
1113                 if (r < 0)
1114                         return r;
1115         }
1116
1117         if (root_directory)
1118                 root = root_directory;
1119         else if (root_image || n_bind_mounts > 0 || n_temporary_filesystems > 0) {
1120
1121                 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1122                  * the same mount point for all images, which is safe, since they all live in their own namespaces
1123                  * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1124                  * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1125                  * while we are applying them. */
1126
1127                 root = "/run/systemd/unit-root";
1128                 (void) mkdir_label(root, 0700);
1129                 require_prefix = true;
1130         } else
1131                 root = NULL;
1132
1133         n_mounts = namespace_calculate_mounts(
1134                         root,
1135                         ns_info,
1136                         read_write_paths,
1137                         read_only_paths,
1138                         inaccessible_paths,
1139                         empty_directories,
1140                         n_bind_mounts,
1141                         n_temporary_filesystems,
1142                         tmp_dir, var_tmp_dir,
1143                         protect_home, protect_system);
1144
1145         /* Set mount slave mode */
1146         if (root || n_mounts > 0)
1147                 make_slave = true;
1148
1149         if (n_mounts > 0) {
1150                 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
1151                 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
1152                 if (r < 0)
1153                         goto finish;
1154
1155                 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
1156                 if (r < 0)
1157                         goto finish;
1158
1159                 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
1160                 if (r < 0)
1161                         goto finish;
1162
1163                 r = append_empty_dir_mounts(&m, empty_directories);
1164                 if (r < 0)
1165                         goto finish;
1166
1167                 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1168                 if (r < 0)
1169                         goto finish;
1170
1171                 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1172                 if (r < 0)
1173                         goto finish;
1174
1175                 if (tmp_dir) {
1176                         *(m++) = (MountEntry) {
1177                                 .path_const = "/tmp",
1178                                 .mode = PRIVATE_TMP,
1179                                 .source_const = tmp_dir,
1180                         };
1181                 }
1182
1183                 if (var_tmp_dir) {
1184                         *(m++) = (MountEntry) {
1185                                 .path_const = "/var/tmp",
1186                                 .mode = PRIVATE_TMP,
1187                                 .source_const = var_tmp_dir,
1188                         };
1189                 }
1190
1191                 if (ns_info->private_dev) {
1192                         *(m++) = (MountEntry) {
1193                                 .path_const = "/dev",
1194                                 .mode = PRIVATE_DEV,
1195                         };
1196                 }
1197
1198                 if (ns_info->protect_kernel_tunables) {
1199                         r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
1200                         if (r < 0)
1201                                 goto finish;
1202                 }
1203
1204                 if (ns_info->protect_kernel_modules) {
1205                         r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
1206                         if (r < 0)
1207                                 goto finish;
1208                 }
1209
1210                 if (ns_info->protect_control_groups) {
1211                         *(m++) = (MountEntry) {
1212                                 .path_const = "/sys/fs/cgroup",
1213                                 .mode = READONLY,
1214                         };
1215                 }
1216
1217                 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
1218                 if (r < 0)
1219                         goto finish;
1220
1221                 r = append_protect_system(&m, protect_system, false);
1222                 if (r < 0)
1223                         goto finish;
1224
1225                 if (namespace_info_mount_apivfs(root, ns_info)) {
1226                         r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1227                         if (r < 0)
1228                                 goto finish;
1229                 }
1230
1231                 assert(mounts + n_mounts == m);
1232
1233                 /* Prepend the root directory where that's necessary */
1234                 r = prefix_where_needed(mounts, n_mounts, root);
1235                 if (r < 0)
1236                         goto finish;
1237
1238                 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
1239
1240                 drop_duplicates(mounts, &n_mounts);
1241                 drop_outside_root(root, mounts, &n_mounts);
1242                 drop_inaccessible(mounts, &n_mounts);
1243                 drop_nop(mounts, &n_mounts);
1244         }
1245
1246         if (unshare(CLONE_NEWNS) < 0) {
1247                 r = -errno;
1248                 goto finish;
1249         }
1250
1251         if (make_slave) {
1252                 /* Remount / as SLAVE so that nothing now mounted in the namespace
1253                    shows up in the parent */
1254                 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1255                         r = -errno;
1256                         goto finish;
1257                 }
1258         }
1259
1260         if (root_image) {
1261                 /* A root image is specified, mount it to the right place */
1262                 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
1263                 if (r < 0)
1264                         goto finish;
1265
1266                 if (decrypted_image) {
1267                         r = decrypted_image_relinquish(decrypted_image);
1268                         if (r < 0)
1269                                 goto finish;
1270                 }
1271
1272                 loop_device_relinquish(loop_device);
1273
1274         } else if (root_directory) {
1275
1276                 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1277                 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
1278                 if (r < 0)
1279                         goto finish;
1280                 if (r == 0) {
1281                         if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1282                                 r = -errno;
1283                                 goto finish;
1284                         }
1285                 }
1286
1287         } else if (root) {
1288
1289                 /* Let's mount the main root directory to the root directory to use */
1290                 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1291                         r = -errno;
1292                         goto finish;
1293                 }
1294         }
1295
1296         /* Try to set up the new root directory before mounting anything else there. */
1297         if (root_image || root_directory)
1298                 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1299
1300         if (n_mounts > 0) {
1301                 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
1302                 char **blacklist;
1303                 unsigned j;
1304
1305                 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1306                  * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1307                 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1308                 if (!proc_self_mountinfo) {
1309                         r = -errno;
1310                         goto finish;
1311                 }
1312
1313                 /* First round, add in all special mounts we need */
1314                 for (m = mounts; m < mounts + n_mounts; ++m) {
1315                         r = apply_mount(root, m);
1316                         if (r < 0)
1317                                 goto finish;
1318                 }
1319
1320                 /* Create a blacklist we can pass to bind_mount_recursive() */
1321                 blacklist = newa(char*, n_mounts+1);
1322                 for (j = 0; j < n_mounts; j++)
1323                         blacklist[j] = (char*) mount_entry_path(mounts+j);
1324                 blacklist[j] = NULL;
1325
1326                 /* Second round, flip the ro bits if necessary. */
1327                 for (m = mounts; m < mounts + n_mounts; ++m) {
1328                         r = make_read_only(m, blacklist, proc_self_mountinfo);
1329                         if (r < 0)
1330                                 goto finish;
1331                 }
1332         }
1333
1334         if (root) {
1335                 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1336                 r = mount_move_root(root);
1337                 if (r < 0)
1338                         goto finish;
1339         }
1340
1341         /* Remount / as the desired mode. Note that this will not
1342          * reestablish propagation from our side to the host, since
1343          * what's disconnected is disconnected. */
1344         if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1345                 r = -errno;
1346                 goto finish;
1347         }
1348
1349         r = 0;
1350
1351 finish:
1352         for (m = mounts; m < mounts + n_mounts; m++)
1353                 mount_entry_done(m);
1354
1355         return r;
1356 }
1357
1358 void bind_mount_free_many(BindMount *b, unsigned n) {
1359         unsigned i;
1360
1361         assert(b || n == 0);
1362
1363         for (i = 0; i < n; i++) {
1364                 free(b[i].source);
1365                 free(b[i].destination);
1366         }
1367
1368         free(b);
1369 }
1370
1371 int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1372         _cleanup_free_ char *s = NULL, *d = NULL;
1373         BindMount *c;
1374
1375         assert(b);
1376         assert(n);
1377         assert(item);
1378
1379         s = strdup(item->source);
1380         if (!s)
1381                 return -ENOMEM;
1382
1383         d = strdup(item->destination);
1384         if (!d)
1385                 return -ENOMEM;
1386
1387         c = reallocarray(*b, *n + 1, sizeof(BindMount));
1388         if (!c)
1389                 return -ENOMEM;
1390
1391         *b = c;
1392
1393         c[(*n) ++] = (BindMount) {
1394                 .source = s,
1395                 .destination = d,
1396                 .read_only = item->read_only,
1397                 .recursive = item->recursive,
1398                 .ignore_enoent = item->ignore_enoent,
1399         };
1400
1401         s = d = NULL;
1402         return 0;
1403 }
1404
1405 void temporary_filesystem_free_many(TemporaryFileSystem *t, unsigned n) {
1406         unsigned i;
1407
1408         assert(t || n == 0);
1409
1410         for (i = 0; i < n; i++) {
1411                 free(t[i].path);
1412                 free(t[i].options);
1413         }
1414
1415         free(t);
1416 }
1417
1418 int temporary_filesystem_add(
1419                 TemporaryFileSystem **t,
1420                 unsigned *n,
1421                 const char *path,
1422                 const char *options) {
1423
1424         _cleanup_free_ char *p = NULL, *o = NULL;
1425         TemporaryFileSystem *c;
1426
1427         assert(t);
1428         assert(n);
1429         assert(path);
1430
1431         p = strdup(path);
1432         if (!p)
1433                 return -ENOMEM;
1434
1435         if (!isempty(options)) {
1436                 o = strdup(options);
1437                 if (!o)
1438                         return -ENOMEM;
1439         }
1440
1441         c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
1442         if (!c)
1443                 return -ENOMEM;
1444
1445         *t = c;
1446
1447         c[(*n) ++] = (TemporaryFileSystem) {
1448                 .path = p,
1449                 .options = o,
1450         };
1451
1452         p = o = NULL;
1453         return 0;
1454 }
1455
1456 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1457         _cleanup_free_ char *x = NULL;
1458         char bid[SD_ID128_STRING_MAX];
1459         sd_id128_t boot_id;
1460         int r;
1461
1462         assert(id);
1463         assert(prefix);
1464         assert(path);
1465
1466         /* We include the boot id in the directory so that after a
1467          * reboot we can easily identify obsolete directories. */
1468
1469         r = sd_id128_get_boot(&boot_id);
1470         if (r < 0)
1471                 return r;
1472
1473         x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1474         if (!x)
1475                 return -ENOMEM;
1476
1477         RUN_WITH_UMASK(0077)
1478                 if (!mkdtemp(x))
1479                         return -errno;
1480
1481         RUN_WITH_UMASK(0000) {
1482                 char *y;
1483
1484                 y = strjoina(x, "/tmp");
1485
1486                 if (mkdir(y, 0777 | S_ISVTX) < 0)
1487                         return -errno;
1488         }
1489
1490         *path = x;
1491         x = NULL;
1492
1493         return 0;
1494 }
1495
/* Creates the pair of private temporary directories for "id", one below /tmp and one below /var/tmp, via
 * setup_one_tmp_dir(). On success 0 is returned and the two paths are stored in *tmp_dir and *var_tmp_dir. If the
 * second directory cannot be set up, the first one is removed again and the error is returned. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *nested;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Roll back the first directory: the inner "tmp" has to go before the outer directory can be removed */
        nested = strjoina(private_tmp, "/tmp");
        rmdir(nested);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1525
1526 int setup_netns(int netns_storage_socket[2]) {
1527         _cleanup_close_ int netns = -1;
1528         int r, q;
1529
1530         assert(netns_storage_socket);
1531         assert(netns_storage_socket[0] >= 0);
1532         assert(netns_storage_socket[1] >= 0);
1533
1534         /* We use the passed socketpair as a storage buffer for our
1535          * namespace reference fd. Whatever process runs this first
1536          * shall create a new namespace, all others should just join
1537          * it. To serialize that we use a file lock on the socket
1538          * pair.
1539          *
1540          * It's a bit crazy, but hey, works great! */
1541
1542         if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1543                 return -errno;
1544
1545         netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1546         if (netns == -EAGAIN) {
1547                 /* Nothing stored yet, so let's create a new namespace */
1548
1549                 if (unshare(CLONE_NEWNET) < 0) {
1550                         r = -errno;
1551                         goto fail;
1552                 }
1553
1554                 loopback_setup();
1555
1556                 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1557                 if (netns < 0) {
1558                         r = -errno;
1559                         goto fail;
1560                 }
1561
1562                 r = 1;
1563
1564         } else if (netns < 0) {
1565                 r = netns;
1566                 goto fail;
1567
1568         } else {
1569                 /* Yay, found something, so let's join the namespace */
1570                 if (setns(netns, CLONE_NEWNET) < 0) {
1571                         r = -errno;
1572                         goto fail;
1573                 }
1574
1575                 r = 0;
1576         }
1577
1578         q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1579         if (q < 0) {
1580                 r = q;
1581                 goto fail;
1582         }
1583
1584 fail:
1585         (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1586         return r;
1587 }
1588
1589 bool ns_type_supported(NamespaceType type) {
1590         const char *t, *ns_proc;
1591
1592         t = namespace_type_to_string(type);
1593         if (!t) /* Don't know how to translate this? Then it's not supported */
1594                 return false;
1595
1596         ns_proc = strjoina("/proc/self/ns/", t);
1597         return access(ns_proc, F_OK) == 0;
1598 }
1599
1600 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1601         [PROTECT_HOME_NO] = "no",
1602         [PROTECT_HOME_YES] = "yes",
1603         [PROTECT_HOME_READ_ONLY] = "read-only",
1604         [PROTECT_HOME_TMPFS] = "tmpfs",
1605 };
1606
1607 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1608
1609 ProtectHome parse_protect_home_or_bool(const char *s) {
1610         int r;
1611
1612         r = parse_boolean(s);
1613         if (r > 0)
1614                 return PROTECT_HOME_YES;
1615         if (r == 0)
1616                 return PROTECT_HOME_NO;
1617
1618         return protect_home_from_string(s);
1619 }
1620
1621 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1622         [PROTECT_SYSTEM_NO] = "no",
1623         [PROTECT_SYSTEM_YES] = "yes",
1624         [PROTECT_SYSTEM_FULL] = "full",
1625         [PROTECT_SYSTEM_STRICT] = "strict",
1626 };
1627
1628 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
1629
1630 ProtectSystem parse_protect_system_or_bool(const char *s) {
1631         int r;
1632
1633         r = parse_boolean(s);
1634         if (r > 0)
1635                 return PROTECT_SYSTEM_YES;
1636         if (r == 0)
1637                 return PROTECT_SYSTEM_NO;
1638
1639         return protect_system_from_string(s);
1640 }
1641
1642 static const char* const namespace_type_table[] = {
1643         [NAMESPACE_MOUNT] = "mnt",
1644         [NAMESPACE_CGROUP] = "cgroup",
1645         [NAMESPACE_UTS] = "uts",
1646         [NAMESPACE_IPC] = "ipc",
1647         [NAMESPACE_USER] = "user",
1648         [NAMESPACE_PID] = "pid",
1649         [NAMESPACE_NET] = "net",
1650 };
1651
1652 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);