src/core/namespace.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <errno.h>
  22 #include <sched.h>
  23 #include <stdio.h>
  24 #include <string.h>
  25 #include <sys/mount.h>
  26 #include <sys/stat.h>
  27 #include <unistd.h>
  28 #include <linux/fs.h>
  29
  30 #include "alloc-util.h"
  31 #include "base-filesystem.h"
  32 #include "dev-setup.h"
  33 #include "fd-util.h"
  34 #include "fs-util.h"
  35 #include "label.h"
  36 #include "loop-util.h"
  37 #include "loopback-setup.h"
  38 #include "missing.h"
  39 #include "mkdir.h"
  40 #include "mount-util.h"
  41 #include "namespace.h"
  42 #include "path-util.h"
  43 #include "selinux-util.h"
  44 #include "socket-util.h"
  45 #include "stat-util.h"
  46 #include "string-table.h"
  47 #include "string-util.h"
  48 #include "strv.h"
  49 #include "umask-util.h"
  50 #include "user-util.h"
  51 #include "util.h"
  52
/* Mount flags used by mount_private_dev() when mounting the temporary tmpfs that becomes the private /dev. */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
  54
  55 typedef enum MountMode {
  56         /* This is ordered by priority! */
  57         INACCESSIBLE,
  58         BIND_MOUNT,
  59         BIND_MOUNT_RECURSIVE,
  60         PRIVATE_TMP,
  61         PRIVATE_DEV,
  62         BIND_DEV,
  63         EMPTY_DIR,
  64         SYSFS,
  65         PROCFS,
  66         READONLY,
  67         READWRITE,
  68         TMPFS,
  69 } MountMode;
  70
/* Describes a single mount operation to apply to a path.
 *
 * The string members come in "_const"/"_malloc" pairs: the mount_entry_path(), mount_entry_source() and
 * mount_entry_options() accessors return the "_malloc" variant if it is set and fall back to the "_const" one
 * otherwise. Only the "_malloc" variants are released by mount_entry_done().
 *
 * Note that the static tables in this file initialize the first three members (path_const, mode, ignore)
 * positionally, hence the order of these members must not be changed. */
typedef struct MountEntry {
        const char *path_const;   /* Memory allocated on stack or static */
        MountMode mode:5;
        bool ignore:1;            /* Ignore if path does not exist? */
        bool has_prefix:1;        /* Already is prefixed by the root dir? */
        bool read_only:1;         /* Shall this mount point be read-only? */
        char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
        const char *source_const; /* The source path, for bind mounts */
        char *source_malloc;      /* Use this instead of 'source_const' if we had to allocate memory */
        const char *options_const;/* Mount options for tmpfs */
        char *options_malloc;     /* Use this instead of 'options_const' if we had to allocate memory */
        unsigned long flags;      /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
} MountEntry;
  84
  85 /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
  86  * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
  87 static const MountEntry apivfs_table[] = {
  88         { "/proc",               PROCFS,       false },
  89         { "/dev",                BIND_DEV,     false },
  90         { "/sys",                SYSFS,        false },
  91 };
  92
  93 /* ProtectKernelTunables= option and the related filesystem APIs */
  94 static const MountEntry protect_kernel_tunables_table[] = {
  95         { "/proc/sys",           READONLY,     false },
  96         { "/proc/sysrq-trigger", READONLY,     true  },
  97         { "/proc/latency_stats", READONLY,     true  },
  98         { "/proc/mtrr",          READONLY,     true  },
  99         { "/proc/apm",           READONLY,     true  }, /* Obsolete API, there's no point in permitting access to this, ever */
 100         { "/proc/acpi",          READONLY,     true  },
 101         { "/proc/timer_stats",   READONLY,     true  },
 102         { "/proc/asound",        READONLY,     true  },
 103         { "/proc/bus",           READONLY,     true  },
 104         { "/proc/fs",            READONLY,     true  },
 105         { "/proc/irq",           READONLY,     true  },
 106         { "/sys",                READONLY,     false },
 107         { "/sys/kernel/debug",   READONLY,     true  },
 108         { "/sys/kernel/tracing", READONLY,     true  },
 109         { "/sys/fs/bpf",         READONLY,     true  },
 110         { "/sys/fs/cgroup",      READWRITE,    false }, /* READONLY is set by ProtectControlGroups= option */
 111         { "/sys/fs/selinux",     READWRITE,    true  },
 112 };
 113
 114 /* ProtectKernelModules= option */
 115 static const MountEntry protect_kernel_modules_table[] = {
 116 #if HAVE_SPLIT_USR
 117         { "/lib/modules",        INACCESSIBLE, true  },
 118 #endif
 119         { "/usr/lib/modules",    INACCESSIBLE, true  },
 120 };
 121
 122 /*
 123  * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
 124  * system should be protected by ProtectSystem=
 125  */
 126 static const MountEntry protect_home_read_only_table[] = {
 127         { "/home",               READONLY,     true  },
 128         { "/run/user",           READONLY,     true  },
 129         { "/root",               READONLY,     true  },
 130 };
 131
 132 /* ProtectHome=tmpfs table */
 133 static const MountEntry protect_home_tmpfs_table[] = {
 134         { "/home",               TMPFS,        true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
 135         { "/run/user",           TMPFS,        true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
 136         { "/root",               TMPFS,        true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
 137 };
 138
 139 /* ProtectHome=yes table */
 140 static const MountEntry protect_home_yes_table[] = {
 141         { "/home",               INACCESSIBLE, true  },
 142         { "/run/user",           INACCESSIBLE, true  },
 143         { "/root",               INACCESSIBLE, true  },
 144 };
 145
 146 /* ProtectSystem=yes table */
 147 static const MountEntry protect_system_yes_table[] = {
 148         { "/usr",                READONLY,     false },
 149         { "/boot",               READONLY,     true  },
 150         { "/efi",                READONLY,     true  },
 151 };
 152
 153 /* ProtectSystem=full includes ProtectSystem=yes */
 154 static const MountEntry protect_system_full_table[] = {
 155         { "/usr",                READONLY,     false },
 156         { "/boot",               READONLY,     true  },
 157         { "/efi",                READONLY,     true  },
 158         { "/etc",                READONLY,     false },
 159 };
 160
 161 /*
 162  * ProtectSystem=strict table. In this strict mode, we mount everything
 163  * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
 164  * which are left writable, but PrivateDevices= + ProtectKernelTunables=
 165  * protect those, and these options should be fully orthogonal.
 166  * (And of course /home and friends are also left writable, as ProtectHome=
 167  * shall manage those, orthogonally).
 168  */
 169 static const MountEntry protect_system_strict_table[] = {
 170         { "/",                   READONLY,     false },
 171         { "/proc",               READWRITE,    false },      /* ProtectKernelTunables= */
 172         { "/sys",                READWRITE,    false },      /* ProtectKernelTunables= */
 173         { "/dev",                READWRITE,    false },      /* PrivateDevices= */
 174         { "/home",               READWRITE,    true  },      /* ProtectHome= */
 175         { "/run/user",           READWRITE,    true  },      /* ProtectHome= */
 176         { "/root",               READWRITE,    true  },      /* ProtectHome= */
 177 };
 178
 179 static const char *mount_entry_path(const MountEntry *p) {
 180         assert(p);
 181
 182         /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
 183          * otherwise the stack/static ->path field is returned. */
 184
 185         return p->path_malloc ?: p->path_const;
 186 }
 187
 188 static bool mount_entry_read_only(const MountEntry *p) {
 189         assert(p);
 190
 191         return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
 192 }
 193
 194 static const char *mount_entry_source(const MountEntry *p) {
 195         assert(p);
 196
 197         return p->source_malloc ?: p->source_const;
 198 }
 199
 200 static const char *mount_entry_options(const MountEntry *p) {
 201         assert(p);
 202
 203         return p->options_malloc ?: p->options_const;
 204 }
 205
 206 static void mount_entry_done(MountEntry *p) {
 207         assert(p);
 208
 209         p->path_malloc = mfree(p->path_malloc);
 210         p->source_malloc = mfree(p->source_malloc);
 211         p->options_malloc = mfree(p->options_malloc);
 212 }
 213
 214 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
 215         char **i;
 216
 217         assert(p);
 218
 219         /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
 220
 221         STRV_FOREACH(i, strv) {
 222                 bool ignore = false, needs_prefix = false;
 223                 const char *e = *i;
 224
 225                 /* Look for any prefixes */
 226                 if (startswith(e, "-")) {
 227                         e++;
 228                         ignore = true;
 229                 }
 230                 if (startswith(e, "+")) {
 231                         e++;
 232                         needs_prefix = true;
 233                 }
 234
 235                 if (!path_is_absolute(e))
 236                         return -EINVAL;
 237
 238                 *((*p)++) = (MountEntry) {
 239                         .path_const = e,
 240                         .mode = mode,
 241                         .ignore = ignore,
 242                         .has_prefix = !needs_prefix && !forcibly_require_prefix,
 243                 };
 244         }
 245
 246         return 0;
 247 }
 248
 249 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
 250         char **i;
 251
 252         assert(p);
 253
 254         /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
 255          * "/private/" boundary directories for DynamicUser=1. */
 256
 257         STRV_FOREACH(i, strv) {
 258
 259                 *((*p)++) = (MountEntry) {
 260                         .path_const = *i,
 261                         .mode = EMPTY_DIR,
 262                         .ignore = false,
 263                         .has_prefix = false,
 264                         .read_only = true,
 265                         .options_const = "mode=755",
 266                         .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
 267                 };
 268         }
 269
 270         return 0;
 271 }
 272
 273 static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
 274         unsigned i;
 275
 276         assert(p);
 277
 278         for (i = 0; i < n; i++) {
 279                 const BindMount *b = binds + i;
 280
 281                 *((*p)++) = (MountEntry) {
 282                         .path_const = b->destination,
 283                         .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
 284                         .read_only = b->read_only,
 285                         .source_const = b->source,
 286                         .ignore = b->ignore_enoent,
 287                 };
 288         }
 289
 290         return 0;
 291 }
 292
 293 static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, unsigned n) {
 294         unsigned i;
 295         int r;
 296
 297         assert(p);
 298
 299         for (i = 0; i < n; i++) {
 300                 const TemporaryFileSystem *t = tmpfs + i;
 301                 _cleanup_free_ char *o = NULL, *str = NULL;
 302                 unsigned long flags = MS_NODEV|MS_STRICTATIME;
 303                 bool ro = false;
 304
 305                 if (!path_is_absolute(t->path))
 306                         return -EINVAL;
 307
 308                 if (!isempty(t->options)) {
 309                         str = strjoin("mode=0755,", t->options);
 310                         if (!str)
 311                                 return -ENOMEM;
 312
 313                         r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
 314                         if (r < 0)
 315                                 return r;
 316
 317                         ro = !!(flags & MS_RDONLY);
 318                         if (ro)
 319                                 flags ^= MS_RDONLY;
 320                 }
 321
 322                 *((*p)++) = (MountEntry) {
 323                         .path_const = t->path,
 324                         .mode = TMPFS,
 325                         .read_only = ro,
 326                         .options_malloc = o,
 327                         .flags = flags,
 328                 };
 329
 330                 o = NULL;
 331         }
 332
 333         return 0;
 334 }
 335
 336 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
 337         unsigned i;
 338
 339         assert(p);
 340         assert(mounts);
 341
 342         /* Adds a list of static pre-defined entries */
 343
 344         for (i = 0; i < n; i++)
 345                 *((*p)++) = (MountEntry) {
 346                         .path_const = mount_entry_path(mounts+i),
 347                         .mode = mounts[i].mode,
 348                         .ignore = mounts[i].ignore || ignore_protect,
 349                 };
 350
 351         return 0;
 352 }
 353
 354 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
 355         assert(p);
 356
 357         switch (protect_home) {
 358
 359         case PROTECT_HOME_NO:
 360                 return 0;
 361
 362         case PROTECT_HOME_READ_ONLY:
 363                 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
 364
 365         case PROTECT_HOME_TMPFS:
 366                 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
 367
 368         case PROTECT_HOME_YES:
 369                 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
 370
 371         default:
 372                 assert_not_reached("Unexpected ProtectHome= value");
 373         }
 374 }
 375
 376 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
 377         assert(p);
 378
 379         switch (protect_system) {
 380
 381         case PROTECT_SYSTEM_NO:
 382                 return 0;
 383
 384         case PROTECT_SYSTEM_STRICT:
 385                 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
 386
 387         case PROTECT_SYSTEM_YES:
 388                 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
 389
 390         case PROTECT_SYSTEM_FULL:
 391                 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
 392
 393         default:
 394                 assert_not_reached("Unexpected ProtectSystem= value");
 395         }
 396 }
 397
 398 static int mount_path_compare(const void *a, const void *b) {
 399         const MountEntry *p = a, *q = b;
 400         int d;
 401
 402         /* If the paths are not equal, then order prefixes first */
 403         d = path_compare(mount_entry_path(p), mount_entry_path(q));
 404         if (d != 0)
 405                 return d;
 406
 407         /* If the paths are equal, check the mode */
 408         if (p->mode < q->mode)
 409                 return -1;
 410
 411         if (p->mode > q->mode)
 412                 return 1;
 413
 414         return 0;
 415 }
 416
 417 static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
 418         unsigned i;
 419
 420         /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
 421          * that. */
 422
 423         if (!root_directory)
 424                 return 0;
 425
 426         for (i = 0; i < n; i++) {
 427                 char *s;
 428
 429                 if (m[i].has_prefix)
 430                         continue;
 431
 432                 s = prefix_root(root_directory, mount_entry_path(m+i));
 433                 if (!s)
 434                         return -ENOMEM;
 435
 436                 free_and_replace(m[i].path_malloc, s);
 437                 m[i].has_prefix = true;
 438         }
 439
 440         return 0;
 441 }
 442
 443 static void drop_duplicates(MountEntry *m, unsigned *n) {
 444         MountEntry *f, *t, *previous;
 445
 446         assert(m);
 447         assert(n);
 448
 449         /* Drops duplicate entries. Expects that the array is properly ordered already. */
 450
 451         for (f = m, t = m, previous = NULL; f < m + *n; f++) {
 452
 453                 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
 454                  * above. */
 455                 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
 456                         log_debug("%s is duplicate.", mount_entry_path(f));
 457                         previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
 458                         mount_entry_done(f);
 459                         continue;
 460                 }
 461
 462                 *t = *f;
 463                 previous = t;
 464                 t++;
 465         }
 466
 467         *n = t - m;
 468 }
 469
 470 static void drop_inaccessible(MountEntry *m, unsigned *n) {
 471         MountEntry *f, *t;
 472         const char *clear = NULL;
 473
 474         assert(m);
 475         assert(n);
 476
 477         /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
 478          * ordered already. */
 479
 480         for (f = m, t = m; f < m + *n; f++) {
 481
 482                 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
 483                  * it, as inaccessible paths really should drop the entire subtree. */
 484                 if (clear && path_startswith(mount_entry_path(f), clear)) {
 485                         log_debug("%s is masked by %s.", mount_entry_path(f), clear);
 486                         mount_entry_done(f);
 487                         continue;
 488                 }
 489
 490                 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
 491
 492                 *t = *f;
 493                 t++;
 494         }
 495
 496         *n = t - m;
 497 }
 498
 499 static void drop_nop(MountEntry *m, unsigned *n) {
 500         MountEntry *f, *t;
 501
 502         assert(m);
 503         assert(n);
 504
 505         /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
 506          * list is ordered by prefixes. */
 507
 508         for (f = m, t = m; f < m + *n; f++) {
 509
 510                 /* Only suppress such subtrees for READONLY and READWRITE entries */
 511                 if (IN_SET(f->mode, READONLY, READWRITE)) {
 512                         MountEntry *p;
 513                         bool found = false;
 514
 515                         /* Now let's find the first parent of the entry we are looking at. */
 516                         for (p = t-1; p >= m; p--) {
 517                                 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
 518                                         found = true;
 519                                         break;
 520                                 }
 521                         }
 522
 523                         /* We found it, let's see if it's the same mode, if so, we can drop this entry */
 524                         if (found && p->mode == f->mode) {
 525                                 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
 526                                 mount_entry_done(f);
 527                                 continue;
 528                         }
 529                 }
 530
 531                 *t = *f;
 532                 t++;
 533         }
 534
 535         *n = t - m;
 536 }
 537
 538 static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
 539         MountEntry *f, *t;
 540
 541         assert(m);
 542         assert(n);
 543
 544         /* Nothing to do */
 545         if (!root_directory)
 546                 return;
 547
 548         /* Drops all mounts that are outside of the root directory. */
 549
 550         for (f = m, t = m; f < m + *n; f++) {
 551
 552                 if (!path_startswith(mount_entry_path(f), root_directory)) {
 553                         log_debug("%s is outside of root directory.", mount_entry_path(f));
 554                         mount_entry_done(f);
 555                         continue;
 556                 }
 557
 558                 *t = *f;
 559                 t++;
 560         }
 561
 562         *n = t - m;
 563 }
 564
 565 static int clone_device_node(const char *d, const char *temporary_mount) {
 566         const char *dn;
 567         struct stat st;
 568         int r;
 569
 570         if (stat(d, &st) < 0) {
 571                 if (errno == ENOENT)
 572                         return 0;
 573                 return -errno;
 574         }
 575
 576         if (!S_ISBLK(st.st_mode) &&
 577             !S_ISCHR(st.st_mode))
 578                 return -EINVAL;
 579
 580         if (st.st_rdev == 0)
 581                 return 0;
 582
 583         dn = strjoina(temporary_mount, d);
 584
 585         mac_selinux_create_file_prepare(d, st.st_mode);
 586         r = mknod(dn, st.st_mode, st.st_rdev);
 587         mac_selinux_create_file_clear();
 588         if (r < 0)
 589                 return log_debug_errno(errno, "mknod failed for %s: %m", d);
 590
 591         return 1;
 592 }
 593
/* Sets up a private /dev at the path of entry 'm'.
 *
 * A fresh tmpfs is populated below a temporary directory first: it gets bind mounts of the host's /dev/pts and
 * /dev/shm (both required), bind mounts of /dev/mqueue and /dev/hugepages (best effort, failures are ignored), a
 * /dev/ptmx symlink or device node, a /dev/log symlink, and copies of the device nodes listed in 'devnodes'. Finally,
 * whatever is mounted at the destination is unmounted and the prepared tmpfs is moved there.
 *
 * Returns 0 on success and a negative errno-style error on failure, in which case the temporary mounts and
 * directories are removed again. */
static int mount_private_dev(MountEntry *m) {
        /* NUL-separated list of the device nodes to clone into the new /dev */
        static const char devnodes[] =
                "/dev/null\0"
                "/dev/zero\0"
                "/dev/full\0"
                "/dev/random\0"
                "/dev/urandom\0"
                "/dev/tty\0";

        char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
        const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
        _cleanup_umask_ mode_t u; /* NOTE(review): presumably restores the previous umask when going out of scope — confirm */
        int r;

        assert(m);

        /* Clear the umask, so that the modes passed to mkdir() and mknod() below are applied unmodified */
        u = umask(0000);

        if (!mkdtemp(temporary_mount))
                return -errno;

        dev = strjoina(temporary_mount, "/dev");
        (void) mkdir(dev, 0755);
        if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
                r = -errno;
                goto fail;
        }

        devpts = strjoina(temporary_mount, "/dev/pts");
        (void) mkdir(devpts, 0755);
        if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
                r = -errno;
                goto fail;
        }

        /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
         * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
         * thus, in that case make a clone
         *
         * in nspawn and other containers it will be a symlink, in that case make it a symlink
         */
        r = is_symlink("/dev/ptmx");
        if (r < 0)
                goto fail;
        if (r > 0) {
                devptmx = strjoina(temporary_mount, "/dev/ptmx");
                if (symlink("pts/ptmx", devptmx) < 0) {
                        r = -errno;
                        goto fail;
                }
        } else {
                r = clone_device_node("/dev/ptmx", temporary_mount);
                if (r < 0)
                        goto fail;
                if (r == 0) {
                        /* clone_device_node() created nothing, but we need /dev/ptmx */
                        r = -ENXIO;
                        goto fail;
                }
        }

        devshm = strjoina(temporary_mount, "/dev/shm");
        (void) mkdir(devshm, 0755);
        r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        /* The following two bind mounts are best effort: failures are ignored */
        devmqueue = strjoina(temporary_mount, "/dev/mqueue");
        (void) mkdir(devmqueue, 0755);
        (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);

        devhugepages = strjoina(temporary_mount, "/dev/hugepages");
        (void) mkdir(devhugepages, 0755);
        (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);

        devlog = strjoina(temporary_mount, "/dev/log");
        (void) symlink("/run/systemd/journal/dev-log", devlog);

        /* Clone the basic device nodes. Nodes missing on the host are skipped, since clone_device_node() returns 0
         * for those. */
        NULSTR_FOREACH(d, devnodes) {
                r = clone_device_node(d, temporary_mount);
                if (r < 0)
                        goto fail;
        }

        dev_setup(temporary_mount, UID_INVALID, GID_INVALID);

        /* Create the /dev directory if missing. It is more likely to be
         * missing when the service is started with RootDirectory. This is
         * consistent with mount units creating the mount points when missing.
         */
        (void) mkdir_p_label(mount_entry_path(m), 0755);

        /* Unmount everything in old /dev */
        umount_recursive(mount_entry_path(m), 0);
        if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
                r = -errno;
                goto fail;
        }

        /* The tmpfs has been moved to its final place, remove the now unused temporary directories */
        rmdir(dev);
        rmdir(temporary_mount);

        return 0;

fail:
        /* Undo whatever we set up so far. A path variable that is still NULL means we never got to that step. */
        if (devpts)
                umount(devpts);

        if (devshm)
                umount(devshm);

        if (devhugepages)
                umount(devhugepages);

        if (devmqueue)
                umount(devmqueue);

        umount(dev);
        rmdir(dev);
        rmdir(temporary_mount);

        return r;
}
 718
 719 static int mount_bind_dev(const MountEntry *m) {
 720         int r;
 721
 722         assert(m);
 723
 724         /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
 725          * /dev. This is only used when RootDirectory= is set. */
 726
 727         (void) mkdir_p_label(mount_entry_path(m), 0755);
 728
 729         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 730         if (r < 0)
 731                 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
 732         if (r > 0) /* make this a NOP if /dev is already a mount point */
 733                 return 0;
 734
 735         if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 736                 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
 737
 738         return 1;
 739 }
 740
 741 static int mount_sysfs(const MountEntry *m) {
 742         int r;
 743
 744         assert(m);
 745
 746         (void) mkdir_p_label(mount_entry_path(m), 0755);
 747
 748         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 749         if (r < 0)
 750                 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
 751         if (r > 0) /* make this a NOP if /sys is already a mount point */
 752                 return 0;
 753
 754         /* Bind mount the host's version so that we get all child mounts of it, too. */
 755         if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 756                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 757
 758         return 1;
 759 }
 760
 761 static int mount_procfs(const MountEntry *m) {
 762         int r;
 763
 764         assert(m);
 765
 766         (void) mkdir_p_label(mount_entry_path(m), 0755);
 767
 768         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 769         if (r < 0)
 770                 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
 771         if (r > 0) /* make this a NOP if /proc is already a mount point */
 772                 return 0;
 773
 774         /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
 775         if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
 776                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 777
 778         return 1;
 779 }
 780
 781 static int mount_tmpfs(const MountEntry *m) {
 782         assert(m);
 783
 784         /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
 785
 786         (void) mkdir_p_label(mount_entry_path(m), 0755);
 787         (void) umount_recursive(mount_entry_path(m), 0);
 788
 789         if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
 790                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 791
 792         return 1;
 793 }
 794
 795 static int mount_entry_chase(
 796                 const char *root_directory,
 797                 const MountEntry *m,
 798                 const char *path,
 799                 bool chase_nonexistent,
 800                 char **location) {
 801
 802         char *chased;
 803         int r;
 804
 805         assert(m);
 806
 807         /* Since mount() will always follow symlinks and we need to take the different root directory into account we
 808          * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
 809          * that applies). The result is stored in "location". */
 810
 811         r = chase_symlinks(path, root_directory, chase_nonexistent ? CHASE_NONEXISTENT : 0, &chased);
 812         if (r == -ENOENT && m->ignore) {
 813                 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
 814                 return 0;
 815         }
 816         if (r < 0)
 817                 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
 818
 819         log_debug("Followed symlinks %s → %s.", path, chased);
 820
 821         free(*location);
 822         *location = chased;
 823
 824         return 1;
 825 }
 826
 827 static int apply_mount(
 828                 const char *root_directory,
 829                 MountEntry *m) {
 830
 831         bool rbind = true, make = false;
 832         const char *what;
 833         int r;
 834
 835         assert(m);
 836
 837         r = mount_entry_chase(root_directory, m, mount_entry_path(m), !IN_SET(m->mode, INACCESSIBLE, READONLY, READWRITE), &m->path_malloc);
 838         if (r <= 0)
 839                 return r;
 840
 841         log_debug("Applying namespace mount on %s", mount_entry_path(m));
 842
 843         switch (m->mode) {
 844
 845         case INACCESSIBLE: {
 846                 struct stat target;
 847
 848                 /* First, get rid of everything that is below if there
 849                  * is anything... Then, overmount it with an
 850                  * inaccessible path. */
 851                 (void) umount_recursive(mount_entry_path(m), 0);
 852
 853                 if (lstat(mount_entry_path(m), &target) < 0)
 854                         return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
 855
 856                 what = mode_to_inaccessible_node(target.st_mode);
 857                 if (!what) {
 858                         log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
 859                         return -ELOOP;
 860                 }
 861                 break;
 862         }
 863
 864         case READONLY:
 865         case READWRITE:
 866                 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
 867                 if (r < 0)
 868                         return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
 869                 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
 870                         return 0;
 871                 /* This isn't a mount point yet, let's make it one. */
 872                 what = mount_entry_path(m);
 873                 break;
 874
 875         case BIND_MOUNT:
 876                 rbind = false;
 877
 878                 _fallthrough_;
 879         case BIND_MOUNT_RECURSIVE:
 880                 /* Also chase the source mount */
 881
 882                 r = mount_entry_chase(root_directory, m, mount_entry_source(m), false, &m->source_malloc);
 883                 if (r <= 0)
 884                         return r;
 885
 886                 what = mount_entry_source(m);
 887                 make = true;
 888                 break;
 889
 890         case EMPTY_DIR:
 891         case TMPFS:
 892                 return mount_tmpfs(m);
 893
 894         case PRIVATE_TMP:
 895                 what = mount_entry_source(m);
 896                 make = true;
 897                 break;
 898
 899         case PRIVATE_DEV:
 900                 return mount_private_dev(m);
 901
 902         case BIND_DEV:
 903                 return mount_bind_dev(m);
 904
 905         case SYSFS:
 906                 return mount_sysfs(m);
 907
 908         case PROCFS:
 909                 return mount_procfs(m);
 910
 911         default:
 912                 assert_not_reached("Unknown mode");
 913         }
 914
 915         assert(what);
 916
 917         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
 918                 bool try_again = false;
 919                 r = -errno;
 920
 921                 if (r == -ENOENT && make) {
 922                         struct stat st;
 923
 924                         /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
 925
 926                         if (stat(what, &st) >= 0) {
 927
 928                                 (void) mkdir_parents(mount_entry_path(m), 0755);
 929
 930                                 if (S_ISDIR(st.st_mode))
 931                                         try_again = mkdir(mount_entry_path(m), 0755) >= 0;
 932                                 else
 933                                         try_again = touch(mount_entry_path(m)) >= 0;
 934                         }
 935                 }
 936
 937                 if (try_again) {
 938                         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
 939                                 r = -errno;
 940                         else
 941                                 r = 0;
 942                 }
 943
 944                 if (r < 0)
 945                         return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
 946         }
 947
 948         log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
 949         return 0;
 950 }
 951
 952 static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
 953         int r = 0;
 954
 955         assert(m);
 956         assert(proc_self_mountinfo);
 957
 958         if (mount_entry_read_only(m)) {
 959                 if (IN_SET(m->mode, EMPTY_DIR, TMPFS)) {
 960                         /* Make superblock readonly */
 961                         if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT | MS_RDONLY | m->flags, mount_entry_options(m)) < 0)
 962                                 r = -errno;
 963                 } else
 964                         r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
 965         } else if (m->mode == PRIVATE_DEV) {
 966                 /* Superblock can be readonly but the submounts can't */
 967                 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
 968                         r = -errno;
 969         } else
 970                 return 0;
 971
 972         /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
 973          * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
 974          * read-only mounts already applied. */
 975
 976         if (r == -ENOENT && m->ignore)
 977                 r = 0;
 978
 979         return r;
 980 }
 981
 982 static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
 983         assert(ns_info);
 984
 985         /*
 986          * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
 987          * since to protect the API VFS mounts, they need to be around in the
 988          * first place... and RootDirectory= or RootImage= need to be set.
 989          */
 990
 991         /* root_directory should point to a mount point */
 992         return root_directory &&
 993                 (ns_info->mount_apivfs ||
 994                  ns_info->protect_control_groups ||
 995                  ns_info->protect_kernel_tunables);
 996 }
 997
 998 static unsigned namespace_calculate_mounts(
 999                 const char* root_directory,
1000                 const NamespaceInfo *ns_info,
1001                 char** read_write_paths,
1002                 char** read_only_paths,
1003                 char** inaccessible_paths,
1004                 char** empty_directories,
1005                 unsigned n_bind_mounts,
1006                 unsigned n_temporary_filesystems,
1007                 const char* tmp_dir,
1008                 const char* var_tmp_dir,
1009                 ProtectHome protect_home,
1010                 ProtectSystem protect_system) {
1011
1012         unsigned protect_home_cnt;
1013         unsigned protect_system_cnt =
1014                 (protect_system == PROTECT_SYSTEM_STRICT ?
1015                  ELEMENTSOF(protect_system_strict_table) :
1016                  ((protect_system == PROTECT_SYSTEM_FULL) ?
1017                   ELEMENTSOF(protect_system_full_table) :
1018                   ((protect_system == PROTECT_SYSTEM_YES) ?
1019                    ELEMENTSOF(protect_system_yes_table) : 0)));
1020
1021         protect_home_cnt =
1022                 (protect_home == PROTECT_HOME_YES ?
1023                  ELEMENTSOF(protect_home_yes_table) :
1024                  ((protect_home == PROTECT_HOME_READ_ONLY) ?
1025                   ELEMENTSOF(protect_home_read_only_table) :
1026                   ((protect_home == PROTECT_HOME_TMPFS) ?
1027                    ELEMENTSOF(protect_home_tmpfs_table) : 0)));
1028
1029         return !!tmp_dir + !!var_tmp_dir +
1030                 strv_length(read_write_paths) +
1031                 strv_length(read_only_paths) +
1032                 strv_length(inaccessible_paths) +
1033                 strv_length(empty_directories) +
1034                 n_bind_mounts +
1035                 n_temporary_filesystems +
1036                 ns_info->private_dev +
1037                 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1038                 (ns_info->protect_control_groups ? 1 : 0) +
1039                 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
1040                 protect_home_cnt + protect_system_cnt +
1041                 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
1042 }
1043
/* Sets up the mount namespace of the calling process according to the passed configuration.
 *
 * Overall sequence, as implemented below:
 *
 *   1. If root_image is set: attach it to a loop device, load its root hash, dissect it and set up decryption.
 *   2. Pick the directory "root" the new root file system is assembled in: root_directory if set, otherwise
 *      /run/systemd/unit-root if a root image, bind mounts or temporary file systems are configured, otherwise none.
 *   3. Build the table of mount entries from the path lists and protection settings, prefix it with the root where
 *      needed, sort it and drop redundant entries.
 *   4. unshare(CLONE_NEWNS), and remount / as MS_SLAVE|MS_REC if anything is going to be mounted.
 *   5. Mount the image, or turn the root directory into a mount point, or bind mount / onto the root.
 *   6. Apply all mount entries, then in a second round flip the read-only bits.
 *   7. If a root is in use, move it into place via mount_move_root(), then remount / with mount_flags|MS_REC.
 *
 * mount_flags of 0 is treated as MS_SHARED. Returns 0 on success and a negative error value on failure. */
int setup_namespace(
                const char* root_directory,
                const char* root_image,
                const NamespaceInfo *ns_info,
                char** read_write_paths,
                char** read_only_paths,
                char** inaccessible_paths,
                char** empty_directories,
                const BindMount *bind_mounts,
                unsigned n_bind_mounts,
                const TemporaryFileSystem *temporary_filesystems,
                unsigned n_temporary_filesystems,
                const char* tmp_dir,
                const char* var_tmp_dir,
                ProtectHome protect_home,
                ProtectSystem protect_system,
                unsigned long mount_flags,
                DissectImageFlags dissect_image_flags) {

        _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
        _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
        _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
        _cleanup_free_ void *root_hash = NULL;
        MountEntry *m, *mounts = NULL;
        size_t root_hash_size = 0;
        bool make_slave = false;
        const char *root;
        unsigned n_mounts;
        bool require_prefix = false;
        int r = 0;

        assert(ns_info);

        if (mount_flags == 0)
                mount_flags = MS_SHARED;

        if (root_image) {
                dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;

                /* With ProtectSystem=strict and no writable paths configured, the image can be set up read-only */
                if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
                        dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;

                /* No mount entries exist yet at this point, hence plain "return" (rather than "goto finish") is fine
                 * for the failures in this block. */
                r = loop_device_make_by_path(root_image,
                                             dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
                                             &loop_device);
                if (r < 0)
                        return r;

                r = root_hash_load(root_image, &root_hash, &root_hash_size);
                if (r < 0)
                        return r;

                r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
                if (r < 0)
                        return r;

                r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
                if (r < 0)
                        return r;
        }

        if (root_directory)
                root = root_directory;
        else if (root_image || n_bind_mounts > 0 || n_temporary_filesystems > 0) {

                /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
                 * the same mount point for all images, which is safe, since they all live in their own namespaces
                 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
                 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
                 * while we are applying them. */

                root = "/run/systemd/unit-root";
                (void) mkdir_label(root, 0700);
                require_prefix = true;
        } else
                root = NULL;

        n_mounts = namespace_calculate_mounts(
                        root,
                        ns_info,
                        read_write_paths,
                        read_only_paths,
                        inaccessible_paths,
                        empty_directories,
                        n_bind_mounts,
                        n_temporary_filesystems,
                        tmp_dir, var_tmp_dir,
                        protect_home, protect_system);

        /* Set mount slave mode */
        if (root || n_mounts > 0)
                make_slave = true;

        if (n_mounts > 0) {
                /* The table lives on the stack. NOTE(review): alloca0() presumably zero-initializes it, which is what
                 * makes the cleanup loop at "finish" safe for a partially filled table — confirm. The append order
                 * below must add up to exactly what namespace_calculate_mounts() counted, see the assert() further
                 * down. */
                m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
                r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
                if (r < 0)
                        goto finish;

                r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
                if (r < 0)
                        goto finish;

                r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
                if (r < 0)
                        goto finish;

                r = append_empty_dir_mounts(&m, empty_directories);
                if (r < 0)
                        goto finish;

                r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
                if (r < 0)
                        goto finish;

                r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
                if (r < 0)
                        goto finish;

                if (tmp_dir) {
                        *(m++) = (MountEntry) {
                                .path_const = "/tmp",
                                .mode = PRIVATE_TMP,
                                .source_const = tmp_dir,
                        };
                }

                if (var_tmp_dir) {
                        *(m++) = (MountEntry) {
                                .path_const = "/var/tmp",
                                .mode = PRIVATE_TMP,
                                .source_const = var_tmp_dir,
                        };
                }

                if (ns_info->private_dev) {
                        *(m++) = (MountEntry) {
                                .path_const = "/dev",
                                .mode = PRIVATE_DEV,
                        };
                }

                if (ns_info->protect_kernel_tunables) {
                        r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
                        if (r < 0)
                                goto finish;
                }

                if (ns_info->protect_kernel_modules) {
                        r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
                        if (r < 0)
                                goto finish;
                }

                if (ns_info->protect_control_groups) {
                        *(m++) = (MountEntry) {
                                .path_const = "/sys/fs/cgroup",
                                .mode = READONLY,
                        };
                }

                r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
                if (r < 0)
                        goto finish;

                r = append_protect_system(&m, protect_system, false);
                if (r < 0)
                        goto finish;

                if (namespace_info_mount_apivfs(root, ns_info)) {
                        r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
                        if (r < 0)
                                goto finish;
                }

                /* Verify that the table was filled with exactly as many entries as were counted up front */
                assert(mounts + n_mounts == m);

                /* Prepend the root directory where that's necessary */
                r = prefix_where_needed(mounts, n_mounts, root);
                if (r < 0)
                        goto finish;

                qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);

                /* The drop_xyz() calls get a pointer to n_mounts and may hence shrink the table */
                drop_duplicates(mounts, &n_mounts);
                drop_outside_root(root, mounts, &n_mounts);
                drop_inaccessible(mounts, &n_mounts);
                drop_nop(mounts, &n_mounts);
        }

        if (unshare(CLONE_NEWNS) < 0) {
                r = -errno;
                goto finish;
        }

        if (make_slave) {
                /* Remount / as SLAVE so that nothing now mounted in the namespace
                   shows up in the parent */
                if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
                        r = -errno;
                        goto finish;
                }
        }

        if (root_image) {
                /* A root image is specified, mount it to the right place */
                r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
                if (r < 0)
                        goto finish;

                if (decrypted_image) {
                        r = decrypted_image_relinquish(decrypted_image);
                        if (r < 0)
                                goto finish;
                }

                loop_device_relinquish(loop_device);

        } else if (root_directory) {

                /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
                r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
                if (r < 0)
                        goto finish;
                if (r == 0) {
                        if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
                                r = -errno;
                                goto finish;
                        }
                }

        } else if (root) {

                /* Let's mount the main root directory to the root directory to use */
                if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
                        r = -errno;
                        goto finish;
                }
        }

        /* Try to set up the new root directory before mounting anything else there. */
        if (root_image || root_directory)
                (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);

        if (n_mounts > 0) {
                _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
                char **blacklist;
                unsigned j;

                /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
                 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
                proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
                if (!proc_self_mountinfo) {
                        r = -errno;
                        goto finish;
                }

                /* First round, add in all special mounts we need */
                for (m = mounts; m < mounts + n_mounts; ++m) {
                        r = apply_mount(root, m);
                        if (r < 0)
                                goto finish;
                }

                /* Create a blacklist we can pass to bind_mount_recursive() */
                /* The list is a NULL-terminated array holding the destination path of every mount entry; the strings
                 * themselves remain owned by the entries. */
                blacklist = newa(char*, n_mounts+1);
                for (j = 0; j < n_mounts; j++)
                        blacklist[j] = (char*) mount_entry_path(mounts+j);
                blacklist[j] = NULL;

                /* Second round, flip the ro bits if necessary. */
                for (m = mounts; m < mounts + n_mounts; ++m) {
                        r = make_read_only(m, blacklist, proc_self_mountinfo);
                        if (r < 0)
                                goto finish;
                }
        }

        if (root) {
                /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
                r = mount_move_root(root);
                if (r < 0)
                        goto finish;
        }

        /* Remount / as the desired mode. Note that this will not
         * reestablish propagation from our side to the host, since
         * what's disconnected is disconnected. */
        if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
                r = -errno;
                goto finish;
        }

        r = 0;

finish:
        /* Release whatever the mount entries own. This runs on both the success and the failure path. */
        for (m = mounts; m < mounts + n_mounts; m++)
                mount_entry_done(m);

        return r;
}
1345
1346 void bind_mount_free_many(BindMount *b, unsigned n) {
1347         unsigned i;
1348
1349         assert(b || n == 0);
1350
1351         for (i = 0; i < n; i++) {
1352                 free(b[i].source);
1353                 free(b[i].destination);
1354         }
1355
1356         free(b);
1357 }
1358
1359 int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1360         _cleanup_free_ char *s = NULL, *d = NULL;
1361         BindMount *c;
1362
1363         assert(b);
1364         assert(n);
1365         assert(item);
1366
1367         s = strdup(item->source);
1368         if (!s)
1369                 return -ENOMEM;
1370
1371         d = strdup(item->destination);
1372         if (!d)
1373                 return -ENOMEM;
1374
1375         c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1376         if (!c)
1377                 return -ENOMEM;
1378
1379         *b = c;
1380
1381         c[(*n) ++] = (BindMount) {
1382                 .source = s,
1383                 .destination = d,
1384                 .read_only = item->read_only,
1385                 .recursive = item->recursive,
1386                 .ignore_enoent = item->ignore_enoent,
1387         };
1388
1389         s = d = NULL;
1390         return 0;
1391 }
1392
1393 void temporary_filesystem_free_many(TemporaryFileSystem *t, unsigned n) {
1394         unsigned i;
1395
1396         assert(t || n == 0);
1397
1398         for (i = 0; i < n; i++) {
1399                 free(t[i].path);
1400                 free(t[i].options);
1401         }
1402
1403         free(t);
1404 }
1405
1406 int temporary_filesystem_add(
1407                 TemporaryFileSystem **t,
1408                 unsigned *n,
1409                 const char *path,
1410                 const char *options) {
1411
1412         _cleanup_free_ char *p = NULL, *o = NULL;
1413         TemporaryFileSystem *c;
1414
1415         assert(t);
1416         assert(n);
1417         assert(path);
1418
1419         p = strdup(path);
1420         if (!p)
1421                 return -ENOMEM;
1422
1423         if (!isempty(options)) {
1424                 o = strdup(options);
1425                 if (!o)
1426                         return -ENOMEM;
1427         }
1428
1429         c = realloc_multiply(*t, sizeof(TemporaryFileSystem), *n + 1);
1430         if (!c)
1431                 return -ENOMEM;
1432
1433         *t = c;
1434
1435         c[(*n) ++] = (TemporaryFileSystem) {
1436                 .path = p,
1437                 .options = o,
1438         };
1439
1440         p = o = NULL;
1441         return 0;
1442 }
1443
1444 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1445         _cleanup_free_ char *x = NULL;
1446         char bid[SD_ID128_STRING_MAX];
1447         sd_id128_t boot_id;
1448         int r;
1449
1450         assert(id);
1451         assert(prefix);
1452         assert(path);
1453
1454         /* We include the boot id in the directory so that after a
1455          * reboot we can easily identify obsolete directories. */
1456
1457         r = sd_id128_get_boot(&boot_id);
1458         if (r < 0)
1459                 return r;
1460
1461         x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1462         if (!x)
1463                 return -ENOMEM;
1464
1465         RUN_WITH_UMASK(0077)
1466                 if (!mkdtemp(x))
1467                         return -errno;
1468
1469         RUN_WITH_UMASK(0000) {
1470                 char *y;
1471
1472                 y = strjoina(x, "/tmp");
1473
1474                 if (mkdir(y, 0777 | S_ISVTX) < 0)
1475                         return -errno;
1476         }
1477
1478         *path = x;
1479         x = NULL;
1480
1481         return 0;
1482 }
1483
/* Creates the pair of private temporary directories for the given id, one below /tmp and one below /var/tmp. On
 * success returns 0 and stores both paths in *tmp_dir and *var_tmp_dir. On failure returns a negative error value and
 * leaves both output parameters untouched. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* The second directory could not be created: remove the first one again (inner "tmp" directory first), on
         * a best-effort basis, and report the original error. */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1513
1514 int setup_netns(int netns_storage_socket[2]) {
1515         _cleanup_close_ int netns = -1;
1516         int r, q;
1517
1518         assert(netns_storage_socket);
1519         assert(netns_storage_socket[0] >= 0);
1520         assert(netns_storage_socket[1] >= 0);
1521
1522         /* We use the passed socketpair as a storage buffer for our
1523          * namespace reference fd. Whatever process runs this first
1524          * shall create a new namespace, all others should just join
1525          * it. To serialize that we use a file lock on the socket
1526          * pair.
1527          *
1528          * It's a bit crazy, but hey, works great! */
1529
1530         if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1531                 return -errno;
1532
1533         netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1534         if (netns == -EAGAIN) {
1535                 /* Nothing stored yet, so let's create a new namespace */
1536
1537                 if (unshare(CLONE_NEWNET) < 0) {
1538                         r = -errno;
1539                         goto fail;
1540                 }
1541
1542                 loopback_setup();
1543
1544                 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1545                 if (netns < 0) {
1546                         r = -errno;
1547                         goto fail;
1548                 }
1549
1550                 r = 1;
1551
1552         } else if (netns < 0) {
1553                 r = netns;
1554                 goto fail;
1555
1556         } else {
1557                 /* Yay, found something, so let's join the namespace */
1558                 if (setns(netns, CLONE_NEWNET) < 0) {
1559                         r = -errno;
1560                         goto fail;
1561                 }
1562
1563                 r = 0;
1564         }
1565
1566         q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1567         if (q < 0) {
1568                 r = q;
1569                 goto fail;
1570         }
1571
1572 fail:
1573         (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1574         return r;
1575 }
1576
1577 bool ns_type_supported(NamespaceType type) {
1578         const char *t, *ns_proc;
1579
1580         t = namespace_type_to_string(type);
1581         if (!t) /* Don't know how to translate this? Then it's not supported */
1582                 return false;
1583
1584         ns_proc = strjoina("/proc/self/ns/", t);
1585         return access(ns_proc, F_OK) == 0;
1586 }
1587
/* String names of the ProtectHome enum values, indexed by enum value. */
static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
        [PROTECT_HOME_NO] = "no",
        [PROTECT_HOME_YES] = "yes",
        [PROTECT_HOME_READ_ONLY] = "read-only",
        [PROTECT_HOME_TMPFS] = "tmpfs",
};

/* NOTE(review): presumably generates the protect_home_to_string()/protect_home_from_string() lookup functions on
 * top of the table above; the macro is defined outside of this file — confirm. */
DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1596
1597 ProtectHome parse_protect_home_or_bool(const char *s) {
1598         int r;
1599
1600         r = parse_boolean(s);
1601         if (r > 0)
1602                 return PROTECT_HOME_YES;
1603         if (r == 0)
1604                 return PROTECT_HOME_NO;
1605
1606         return protect_home_from_string(s);
1607 }
1608
/* String names of the ProtectSystem enum values, indexed by enum value. */
static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
        [PROTECT_SYSTEM_NO] = "no",
        [PROTECT_SYSTEM_YES] = "yes",
        [PROTECT_SYSTEM_FULL] = "full",
        [PROTECT_SYSTEM_STRICT] = "strict",
};

/* NOTE(review): presumably generates the protect_system_to_string()/protect_system_from_string() lookup functions
 * on top of the table above; the macro is defined outside of this file — confirm. */
DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
1617
1618 ProtectSystem parse_protect_system_or_bool(const char *s) {
1619         int r;
1620
1621         r = parse_boolean(s);
1622         if (r > 0)
1623                 return PROTECT_SYSTEM_YES;
1624         if (r == 0)
1625                 return PROTECT_SYSTEM_NO;
1626
1627         return protect_system_from_string(s);
1628 }
1629
/* String names of the NamespaceType enum values, indexed by enum value. ns_type_supported() appends these names to
 * "/proc/self/ns/" to probe for kernel support. Unlike the ProtectHome/ProtectSystem tables this array is declared
 * without an explicit size. */
static const char* const namespace_type_table[] = {
        [NAMESPACE_MOUNT] = "mnt",
        [NAMESPACE_CGROUP] = "cgroup",
        [NAMESPACE_UTS] = "uts",
        [NAMESPACE_IPC] = "ipc",
        [NAMESPACE_USER] = "user",
        [NAMESPACE_PID] = "pid",
        [NAMESPACE_NET] = "net",
};

/* NOTE(review): presumably generates the namespace_type_to_string()/namespace_type_from_string() lookup functions
 * on top of the table above; the macro is defined outside of this file — confirm. */
DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);