src/core/namespace.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <errno.h>
  22 #include <sched.h>
  23 #include <stdio.h>
  24 #include <string.h>
  25 #include <sys/mount.h>
  26 #include <sys/stat.h>
  27 #include <unistd.h>
  28 #include <linux/fs.h>
  29
  30 #include "alloc-util.h"
  31 #include "base-filesystem.h"
  32 #include "dev-setup.h"
  33 #include "fd-util.h"
  34 #include "fs-util.h"
  35 #include "label.h"
  36 #include "loop-util.h"
  37 #include "loopback-setup.h"
  38 #include "missing.h"
  39 #include "mkdir.h"
  40 #include "mount-util.h"
  41 #include "namespace.h"
  42 #include "path-util.h"
  43 #include "selinux-util.h"
  44 #include "socket-util.h"
  45 #include "stat-util.h"
  46 #include "string-table.h"
  47 #include "string-util.h"
  48 #include "strv.h"
  49 #include "umask-util.h"
  50 #include "user-util.h"
  51 #include "util.h"
  52
  53 #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
  54
  55 typedef enum MountMode {
  56         /* This is ordered by priority! */
  57         INACCESSIBLE,
  58         BIND_MOUNT,
  59         BIND_MOUNT_RECURSIVE,
  60         PRIVATE_TMP,
  61         PRIVATE_DEV,
  62         BIND_DEV,
  63         EMPTY_DIR,
  64         SYSFS,
  65         PROCFS,
  66         READONLY,
  67         READWRITE,
  68         TMPFS,
  69 } MountMode;
  70
  71 typedef struct MountEntry {
  72         const char *path_const;   /* Memory allocated on stack or static */
  73         MountMode mode:5;
  74         bool ignore:1;            /* Ignore if path does not exist? */
  75         bool has_prefix:1;        /* Already is prefixed by the root dir? */
  76         bool read_only:1;         /* Shall this mount point be read-only? */
  77         char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
  78         const char *source_const; /* The source path, for bind mounts */
  79         char *source_malloc;
  80         const char *options_const;/* Mount options for tmpfs */
  81         char *options_malloc;
  82         unsigned long flags;      /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
  83 } MountEntry;
  84
  85 /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
  86  * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
  87 static const MountEntry apivfs_table[] = {
  88         { "/proc",               PROCFS,       false },
  89         { "/dev",                BIND_DEV,     false },
  90         { "/sys",                SYSFS,        false },
  91 };
  92
  93 /* ProtectKernelTunables= option and the related filesystem APIs */
  94 static const MountEntry protect_kernel_tunables_table[] = {
  95         { "/proc/sys",           READONLY,     false },
  96         { "/proc/sysrq-trigger", READONLY,     true  },
  97         { "/proc/latency_stats", READONLY,     true  },
  98         { "/proc/mtrr",          READONLY,     true  },
  99         { "/proc/apm",           READONLY,     true  }, /* Obsolete API, there's no point in permitting access to this, ever */
 100         { "/proc/acpi",          READONLY,     true  },
 101         { "/proc/timer_stats",   READONLY,     true  },
 102         { "/proc/asound",        READONLY,     true  },
 103         { "/proc/bus",           READONLY,     true  },
 104         { "/proc/fs",            READONLY,     true  },
 105         { "/proc/irq",           READONLY,     true  },
 106         { "/sys",                READONLY,     false },
 107         { "/sys/kernel/debug",   READONLY,     true  },
 108         { "/sys/kernel/tracing", READONLY,     true  },
 109         { "/sys/fs/cgroup",      READWRITE,    false }, /* READONLY is set by ProtectControlGroups= option */
 110         { "/sys/fs/selinux",     READWRITE,    true  },
 111 };
 112
 113 /* ProtectKernelModules= option */
 114 static const MountEntry protect_kernel_modules_table[] = {
 115 #if HAVE_SPLIT_USR
 116         { "/lib/modules",        INACCESSIBLE, true  },
 117 #endif
 118         { "/usr/lib/modules",    INACCESSIBLE, true  },
 119 };
 120
 121 /*
 122  * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
 123  * system should be protected by ProtectSystem=
 124  */
 125 static const MountEntry protect_home_read_only_table[] = {
 126         { "/home",               READONLY,     true  },
 127         { "/run/user",           READONLY,     true  },
 128         { "/root",               READONLY,     true  },
 129 };
 130
 131 /* ProtectHome=tmpfs table */
 132 static const MountEntry protect_home_tmpfs_table[] = {
 133         { "/home",               TMPFS,        true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
 134         { "/run/user",           TMPFS,        true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
 135         { "/root",               TMPFS,        true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
 136 };
 137
 138 /* ProtectHome=yes table */
 139 static const MountEntry protect_home_yes_table[] = {
 140         { "/home",               INACCESSIBLE, true  },
 141         { "/run/user",           INACCESSIBLE, true  },
 142         { "/root",               INACCESSIBLE, true  },
 143 };
 144
 145 /* ProtectSystem=yes table */
 146 static const MountEntry protect_system_yes_table[] = {
 147         { "/usr",                READONLY,     false },
 148         { "/boot",               READONLY,     true  },
 149         { "/efi",                READONLY,     true  },
 150 };
 151
 152 /* ProtectSystem=full includes ProtectSystem=yes */
 153 static const MountEntry protect_system_full_table[] = {
 154         { "/usr",                READONLY,     false },
 155         { "/boot",               READONLY,     true  },
 156         { "/efi",                READONLY,     true  },
 157         { "/etc",                READONLY,     false },
 158 };
 159
 160 /*
 161  * ProtectSystem=strict table. In this strict mode, we mount everything
 162  * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
 163  * which are left writable, but PrivateDevices= + ProtectKernelTunables=
 164  * protect those, and these options should be fully orthogonal.
 165  * (And of course /home and friends are also left writable, as ProtectHome=
 166  * shall manage those, orthogonally).
 167  */
 168 static const MountEntry protect_system_strict_table[] = {
 169         { "/",                   READONLY,     false },
 170         { "/proc",               READWRITE,    false },      /* ProtectKernelTunables= */
 171         { "/sys",                READWRITE,    false },      /* ProtectKernelTunables= */
 172         { "/dev",                READWRITE,    false },      /* PrivateDevices= */
 173         { "/home",               READWRITE,    true  },      /* ProtectHome= */
 174         { "/run/user",           READWRITE,    true  },      /* ProtectHome= */
 175         { "/root",               READWRITE,    true  },      /* ProtectHome= */
 176 };
 177
 178 static const char *mount_entry_path(const MountEntry *p) {
 179         assert(p);
 180
 181         /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
 182          * otherwise the stack/static ->path field is returned. */
 183
 184         return p->path_malloc ?: p->path_const;
 185 }
 186
 187 static bool mount_entry_read_only(const MountEntry *p) {
 188         assert(p);
 189
 190         return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
 191 }
 192
 193 static const char *mount_entry_source(const MountEntry *p) {
 194         assert(p);
 195
 196         return p->source_malloc ?: p->source_const;
 197 }
 198
 199 static const char *mount_entry_options(const MountEntry *p) {
 200         assert(p);
 201
 202         return p->options_malloc ?: p->options_const;
 203 }
 204
 205 static void mount_entry_done(MountEntry *p) {
 206         assert(p);
 207
 208         p->path_malloc = mfree(p->path_malloc);
 209         p->source_malloc = mfree(p->source_malloc);
 210         p->options_malloc = mfree(p->options_malloc);
 211 }
 212
 213 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
 214         char **i;
 215
 216         assert(p);
 217
 218         /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
 219
 220         STRV_FOREACH(i, strv) {
 221                 bool ignore = false, needs_prefix = false;
 222                 const char *e = *i;
 223
 224                 /* Look for any prefixes */
 225                 if (startswith(e, "-")) {
 226                         e++;
 227                         ignore = true;
 228                 }
 229                 if (startswith(e, "+")) {
 230                         e++;
 231                         needs_prefix = true;
 232                 }
 233
 234                 if (!path_is_absolute(e))
 235                         return -EINVAL;
 236
 237                 *((*p)++) = (MountEntry) {
 238                         .path_const = e,
 239                         .mode = mode,
 240                         .ignore = ignore,
 241                         .has_prefix = !needs_prefix && !forcibly_require_prefix,
 242                 };
 243         }
 244
 245         return 0;
 246 }
 247
 248 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
 249         char **i;
 250
 251         assert(p);
 252
 253         /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
 254          * "/private/" boundary directories for DynamicUser=1. */
 255
 256         STRV_FOREACH(i, strv) {
 257
 258                 *((*p)++) = (MountEntry) {
 259                         .path_const = *i,
 260                         .mode = EMPTY_DIR,
 261                         .ignore = false,
 262                         .has_prefix = false,
 263                         .read_only = true,
 264                         .options_const = "mode=755",
 265                         .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
 266                 };
 267         }
 268
 269         return 0;
 270 }
 271
 272 static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
 273         unsigned i;
 274
 275         assert(p);
 276
 277         for (i = 0; i < n; i++) {
 278                 const BindMount *b = binds + i;
 279
 280                 *((*p)++) = (MountEntry) {
 281                         .path_const = b->destination,
 282                         .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
 283                         .read_only = b->read_only,
 284                         .source_const = b->source,
 285                         .ignore = b->ignore_enoent,
 286                 };
 287         }
 288
 289         return 0;
 290 }
 291
 292 static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, unsigned n) {
 293         unsigned i;
 294         int r;
 295
 296         assert(p);
 297
 298         for (i = 0; i < n; i++) {
 299                 const TemporaryFileSystem *t = tmpfs + i;
 300                 _cleanup_free_ char *o = NULL, *str = NULL;
 301                 unsigned long flags = MS_NODEV|MS_STRICTATIME;
 302                 bool ro = false;
 303
 304                 if (!path_is_absolute(t->path))
 305                         return -EINVAL;
 306
 307                 if (!isempty(t->options)) {
 308                         str = strjoin("mode=0755,", t->options);
 309                         if (!str)
 310                                 return -ENOMEM;
 311
 312                         r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
 313                         if (r < 0)
 314                                 return r;
 315
 316                         ro = !!(flags & MS_RDONLY);
 317                         if (ro)
 318                                 flags ^= MS_RDONLY;
 319                 }
 320
 321                 *((*p)++) = (MountEntry) {
 322                         .path_const = t->path,
 323                         .mode = TMPFS,
 324                         .read_only = ro,
 325                         .options_malloc = o,
 326                         .flags = flags,
 327                 };
 328
 329                 o = NULL;
 330         }
 331
 332         return 0;
 333 }
 334
 335 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
 336         unsigned i;
 337
 338         assert(p);
 339         assert(mounts);
 340
 341         /* Adds a list of static pre-defined entries */
 342
 343         for (i = 0; i < n; i++)
 344                 *((*p)++) = (MountEntry) {
 345                         .path_const = mount_entry_path(mounts+i),
 346                         .mode = mounts[i].mode,
 347                         .ignore = mounts[i].ignore || ignore_protect,
 348                 };
 349
 350         return 0;
 351 }
 352
 353 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
 354         assert(p);
 355
 356         switch (protect_home) {
 357
 358         case PROTECT_HOME_NO:
 359                 return 0;
 360
 361         case PROTECT_HOME_READ_ONLY:
 362                 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
 363
 364         case PROTECT_HOME_TMPFS:
 365                 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
 366
 367         case PROTECT_HOME_YES:
 368                 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
 369
 370         default:
 371                 assert_not_reached("Unexpected ProtectHome= value");
 372         }
 373 }
 374
 375 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
 376         assert(p);
 377
 378         switch (protect_system) {
 379
 380         case PROTECT_SYSTEM_NO:
 381                 return 0;
 382
 383         case PROTECT_SYSTEM_STRICT:
 384                 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
 385
 386         case PROTECT_SYSTEM_YES:
 387                 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
 388
 389         case PROTECT_SYSTEM_FULL:
 390                 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
 391
 392         default:
 393                 assert_not_reached("Unexpected ProtectSystem= value");
 394         }
 395 }
 396
 397 static int mount_path_compare(const void *a, const void *b) {
 398         const MountEntry *p = a, *q = b;
 399         int d;
 400
 401         /* If the paths are not equal, then order prefixes first */
 402         d = path_compare(mount_entry_path(p), mount_entry_path(q));
 403         if (d != 0)
 404                 return d;
 405
 406         /* If the paths are equal, check the mode */
 407         if (p->mode < q->mode)
 408                 return -1;
 409
 410         if (p->mode > q->mode)
 411                 return 1;
 412
 413         return 0;
 414 }
 415
 416 static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
 417         unsigned i;
 418
 419         /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
 420          * that. */
 421
 422         if (!root_directory)
 423                 return 0;
 424
 425         for (i = 0; i < n; i++) {
 426                 char *s;
 427
 428                 if (m[i].has_prefix)
 429                         continue;
 430
 431                 s = prefix_root(root_directory, mount_entry_path(m+i));
 432                 if (!s)
 433                         return -ENOMEM;
 434
 435                 free_and_replace(m[i].path_malloc, s);
 436                 m[i].has_prefix = true;
 437         }
 438
 439         return 0;
 440 }
 441
 442 static void drop_duplicates(MountEntry *m, unsigned *n) {
 443         MountEntry *f, *t, *previous;
 444
 445         assert(m);
 446         assert(n);
 447
 448         /* Drops duplicate entries. Expects that the array is properly ordered already. */
 449
 450         for (f = m, t = m, previous = NULL; f < m + *n; f++) {
 451
 452                 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
 453                  * above. */
 454                 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
 455                         log_debug("%s is duplicate.", mount_entry_path(f));
 456                         previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
 457                         mount_entry_done(f);
 458                         continue;
 459                 }
 460
 461                 *t = *f;
 462                 previous = t;
 463                 t++;
 464         }
 465
 466         *n = t - m;
 467 }
 468
 469 static void drop_inaccessible(MountEntry *m, unsigned *n) {
 470         MountEntry *f, *t;
 471         const char *clear = NULL;
 472
 473         assert(m);
 474         assert(n);
 475
 476         /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
 477          * ordered already. */
 478
 479         for (f = m, t = m; f < m + *n; f++) {
 480
 481                 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
 482                  * it, as inaccessible paths really should drop the entire subtree. */
 483                 if (clear && path_startswith(mount_entry_path(f), clear)) {
 484                         log_debug("%s is masked by %s.", mount_entry_path(f), clear);
 485                         mount_entry_done(f);
 486                         continue;
 487                 }
 488
 489                 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
 490
 491                 *t = *f;
 492                 t++;
 493         }
 494
 495         *n = t - m;
 496 }
 497
 498 static void drop_nop(MountEntry *m, unsigned *n) {
 499         MountEntry *f, *t;
 500
 501         assert(m);
 502         assert(n);
 503
 504         /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
 505          * list is ordered by prefixes. */
 506
 507         for (f = m, t = m; f < m + *n; f++) {
 508
 509                 /* Only suppress such subtrees for READONLY and READWRITE entries */
 510                 if (IN_SET(f->mode, READONLY, READWRITE)) {
 511                         MountEntry *p;
 512                         bool found = false;
 513
 514                         /* Now let's find the first parent of the entry we are looking at. */
 515                         for (p = t-1; p >= m; p--) {
 516                                 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
 517                                         found = true;
 518                                         break;
 519                                 }
 520                         }
 521
 522                         /* We found it, let's see if it's the same mode, if so, we can drop this entry */
 523                         if (found && p->mode == f->mode) {
 524                                 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
 525                                 mount_entry_done(f);
 526                                 continue;
 527                         }
 528                 }
 529
 530                 *t = *f;
 531                 t++;
 532         }
 533
 534         *n = t - m;
 535 }
 536
 537 static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
 538         MountEntry *f, *t;
 539
 540         assert(m);
 541         assert(n);
 542
 543         /* Nothing to do */
 544         if (!root_directory)
 545                 return;
 546
 547         /* Drops all mounts that are outside of the root directory. */
 548
 549         for (f = m, t = m; f < m + *n; f++) {
 550
 551                 if (!path_startswith(mount_entry_path(f), root_directory)) {
 552                         log_debug("%s is outside of root directory.", mount_entry_path(f));
 553                         mount_entry_done(f);
 554                         continue;
 555                 }
 556
 557                 *t = *f;
 558                 t++;
 559         }
 560
 561         *n = t - m;
 562 }
 563
 564 static int clone_device_node(const char *d, const char *temporary_mount) {
 565         const char *dn;
 566         struct stat st;
 567         int r;
 568
 569         if (stat(d, &st) < 0) {
 570                 if (errno == ENOENT)
 571                         return 0;
 572                 return -errno;
 573         }
 574
 575         if (!S_ISBLK(st.st_mode) &&
 576             !S_ISCHR(st.st_mode))
 577                 return -EINVAL;
 578
 579         if (st.st_rdev == 0)
 580                 return 0;
 581
 582         dn = strjoina(temporary_mount, d);
 583
 584         mac_selinux_create_file_prepare(d, st.st_mode);
 585         r = mknod(dn, st.st_mode, st.st_rdev);
 586         mac_selinux_create_file_clear();
 587         if (r < 0)
 588                 return log_debug_errno(errno, "mknod failed for %s: %m", d);
 589
 590         return 1;
 591 }
 592
 593 static int mount_private_dev(MountEntry *m) {
 594         static const char devnodes[] =
 595                 "/dev/null\0"
 596                 "/dev/zero\0"
 597                 "/dev/full\0"
 598                 "/dev/random\0"
 599                 "/dev/urandom\0"
 600                 "/dev/tty\0";
 601
 602         char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
 603         const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
 604         _cleanup_umask_ mode_t u;
 605         int r;
 606
 607         assert(m);
 608
 609         u = umask(0000);
 610
 611         if (!mkdtemp(temporary_mount))
 612                 return -errno;
 613
 614         dev = strjoina(temporary_mount, "/dev");
 615         (void) mkdir(dev, 0755);
 616         if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
 617                 r = -errno;
 618                 goto fail;
 619         }
 620
 621         devpts = strjoina(temporary_mount, "/dev/pts");
 622         (void) mkdir(devpts, 0755);
 623         if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
 624                 r = -errno;
 625                 goto fail;
 626         }
 627
 628         /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
 629          * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
 630          * thus, in that case make a clone
 631          *
 632          * in nspawn and other containers it will be a symlink, in that case make it a symlink
 633          */
 634         r = is_symlink("/dev/ptmx");
 635         if (r < 0)
 636                 goto fail;
 637         if (r > 0) {
 638                 devptmx = strjoina(temporary_mount, "/dev/ptmx");
 639                 if (symlink("pts/ptmx", devptmx) < 0) {
 640                         r = -errno;
 641                         goto fail;
 642                 }
 643         } else {
 644                 r = clone_device_node("/dev/ptmx", temporary_mount);
 645                 if (r < 0)
 646                         goto fail;
 647                 if (r == 0) {
 648                         r = -ENXIO;
 649                         goto fail;
 650                 }
 651         }
 652
 653         devshm = strjoina(temporary_mount, "/dev/shm");
 654         (void) mkdir(devshm, 0755);
 655         r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
 656         if (r < 0) {
 657                 r = -errno;
 658                 goto fail;
 659         }
 660
 661         devmqueue = strjoina(temporary_mount, "/dev/mqueue");
 662         (void) mkdir(devmqueue, 0755);
 663         (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
 664
 665         devhugepages = strjoina(temporary_mount, "/dev/hugepages");
 666         (void) mkdir(devhugepages, 0755);
 667         (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
 668
 669         devlog = strjoina(temporary_mount, "/dev/log");
 670         (void) symlink("/run/systemd/journal/dev-log", devlog);
 671
 672         NULSTR_FOREACH(d, devnodes) {
 673                 r = clone_device_node(d, temporary_mount);
 674                 if (r < 0)
 675                         goto fail;
 676         }
 677
 678         dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
 679
 680         /* Create the /dev directory if missing. It is more likely to be
 681          * missing when the service is started with RootDirectory. This is
 682          * consistent with mount units creating the mount points when missing.
 683          */
 684         (void) mkdir_p_label(mount_entry_path(m), 0755);
 685
 686         /* Unmount everything in old /dev */
 687         umount_recursive(mount_entry_path(m), 0);
 688         if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
 689                 r = -errno;
 690                 goto fail;
 691         }
 692
 693         rmdir(dev);
 694         rmdir(temporary_mount);
 695
 696         return 0;
 697
 698 fail:
 699         if (devpts)
 700                 umount(devpts);
 701
 702         if (devshm)
 703                 umount(devshm);
 704
 705         if (devhugepages)
 706                 umount(devhugepages);
 707
 708         if (devmqueue)
 709                 umount(devmqueue);
 710
 711         umount(dev);
 712         rmdir(dev);
 713         rmdir(temporary_mount);
 714
 715         return r;
 716 }
 717
 718 static int mount_bind_dev(const MountEntry *m) {
 719         int r;
 720
 721         assert(m);
 722
 723         /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
 724          * /dev. This is only used when RootDirectory= is set. */
 725
 726         (void) mkdir_p_label(mount_entry_path(m), 0755);
 727
 728         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 729         if (r < 0)
 730                 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
 731         if (r > 0) /* make this a NOP if /dev is already a mount point */
 732                 return 0;
 733
 734         if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 735                 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
 736
 737         return 1;
 738 }
 739
 740 static int mount_sysfs(const MountEntry *m) {
 741         int r;
 742
 743         assert(m);
 744
 745         (void) mkdir_p_label(mount_entry_path(m), 0755);
 746
 747         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 748         if (r < 0)
 749                 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
 750         if (r > 0) /* make this a NOP if /sys is already a mount point */
 751                 return 0;
 752
 753         /* Bind mount the host's version so that we get all child mounts of it, too. */
 754         if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 755                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 756
 757         return 1;
 758 }
 759
 760 static int mount_procfs(const MountEntry *m) {
 761         int r;
 762
 763         assert(m);
 764
 765         (void) mkdir_p_label(mount_entry_path(m), 0755);
 766
 767         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 768         if (r < 0)
 769                 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
 770         if (r > 0) /* make this a NOP if /proc is already a mount point */
 771                 return 0;
 772
 773         /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
 774         if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
 775                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 776
 777         return 1;
 778 }
 779
 780 static int mount_tmpfs(const MountEntry *m) {
 781         assert(m);
 782
 783         /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
 784
 785         (void) mkdir_p_label(mount_entry_path(m), 0755);
 786         (void) umount_recursive(mount_entry_path(m), 0);
 787
 788         if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
 789                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 790
 791         return 1;
 792 }
 793
 794 static int mount_entry_chase(
 795                 const char *root_directory,
 796                 const MountEntry *m,
 797                 const char *path,
 798                 bool chase_nonexistent,
 799                 char **location) {
 800
 801         char *chased;
 802         int r;
 803
 804         assert(m);
 805
 806         /* Since mount() will always follow symlinks and we need to take the different root directory into account we
 807          * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
 808          * that applies). The result is stored in "location". */
 809
 810         r = chase_symlinks(path, root_directory, chase_nonexistent ? CHASE_NONEXISTENT : 0, &chased);
 811         if (r == -ENOENT && m->ignore) {
 812                 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
 813                 return 0;
 814         }
 815         if (r < 0)
 816                 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
 817
 818         log_debug("Followed symlinks %s → %s.", path, chased);
 819
 820         free(*location);
 821         *location = chased;
 822
 823         return 1;
 824 }
 825
 826 static int apply_mount(
 827                 const char *root_directory,
 828                 MountEntry *m) {
 829
 830         bool rbind = true, make = false;
 831         const char *what;
 832         int r;
 833
 834         assert(m);
 835
 836         r = mount_entry_chase(root_directory, m, mount_entry_path(m), !IN_SET(m->mode, INACCESSIBLE, READONLY, READWRITE), &m->path_malloc);
 837         if (r <= 0)
 838                 return r;
 839
 840         log_debug("Applying namespace mount on %s", mount_entry_path(m));
 841
 842         switch (m->mode) {
 843
 844         case INACCESSIBLE: {
 845                 struct stat target;
 846
 847                 /* First, get rid of everything that is below if there
 848                  * is anything... Then, overmount it with an
 849                  * inaccessible path. */
 850                 (void) umount_recursive(mount_entry_path(m), 0);
 851
 852                 if (lstat(mount_entry_path(m), &target) < 0)
 853                         return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
 854
 855                 what = mode_to_inaccessible_node(target.st_mode);
 856                 if (!what) {
 857                         log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
 858                         return -ELOOP;
 859                 }
 860                 break;
 861         }
 862
 863         case READONLY:
 864         case READWRITE:
 865                 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
 866                 if (r < 0)
 867                         return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
 868                 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
 869                         return 0;
 870                 /* This isn't a mount point yet, let's make it one. */
 871                 what = mount_entry_path(m);
 872                 break;
 873
 874         case BIND_MOUNT:
 875                 rbind = false;
 876
 877                 _fallthrough_;
 878         case BIND_MOUNT_RECURSIVE:
 879                 /* Also chase the source mount */
 880
 881                 r = mount_entry_chase(root_directory, m, mount_entry_source(m), false, &m->source_malloc);
 882                 if (r <= 0)
 883                         return r;
 884
 885                 what = mount_entry_source(m);
 886                 make = true;
 887                 break;
 888
 889         case EMPTY_DIR:
 890         case TMPFS:
 891                 return mount_tmpfs(m);
 892
 893         case PRIVATE_TMP:
 894                 what = mount_entry_source(m);
 895                 make = true;
 896                 break;
 897
 898         case PRIVATE_DEV:
 899                 return mount_private_dev(m);
 900
 901         case BIND_DEV:
 902                 return mount_bind_dev(m);
 903
 904         case SYSFS:
 905                 return mount_sysfs(m);
 906
 907         case PROCFS:
 908                 return mount_procfs(m);
 909
 910         default:
 911                 assert_not_reached("Unknown mode");
 912         }
 913
 914         assert(what);
 915
 916         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
 917                 bool try_again = false;
 918                 r = -errno;
 919
 920                 if (r == -ENOENT && make) {
 921                         struct stat st;
 922
 923                         /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
 924
 925                         if (stat(what, &st) >= 0) {
 926
 927                                 (void) mkdir_parents(mount_entry_path(m), 0755);
 928
 929                                 if (S_ISDIR(st.st_mode))
 930                                         try_again = mkdir(mount_entry_path(m), 0755) >= 0;
 931                                 else
 932                                         try_again = touch(mount_entry_path(m)) >= 0;
 933                         }
 934                 }
 935
 936                 if (try_again) {
 937                         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
 938                                 r = -errno;
 939                         else
 940                                 r = 0;
 941                 }
 942
 943                 if (r < 0)
 944                         return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
 945         }
 946
 947         log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
 948         return 0;
 949 }
 950
 951 static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
 952         int r = 0;
 953
 954         assert(m);
 955         assert(proc_self_mountinfo);
 956
 957         if (mount_entry_read_only(m)) {
 958                 if (IN_SET(m->mode, EMPTY_DIR, TMPFS)) {
 959                         /* Make superblock readonly */
 960                         if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT | MS_RDONLY | m->flags, mount_entry_options(m)) < 0)
 961                                 r = -errno;
 962                 } else
 963                         r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
 964         } else if (m->mode == PRIVATE_DEV) {
 965                 /* Superblock can be readonly but the submounts can't */
 966                 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
 967                         r = -errno;
 968         } else
 969                 return 0;
 970
 971         /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
 972          * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
 973          * read-only mounts already applied. */
 974
 975         if (r == -ENOENT && m->ignore)
 976                 r = 0;
 977
 978         return r;
 979 }
 980
 981 static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
 982         assert(ns_info);
 983
 984         /*
 985          * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
 986          * since to protect the API VFS mounts, they need to be around in the
 987          * first place... and RootDirectory= or RootImage= need to be set.
 988          */
 989
 990         /* root_directory should point to a mount point */
 991         return root_directory &&
 992                 (ns_info->mount_apivfs ||
 993                  ns_info->protect_control_groups ||
 994                  ns_info->protect_kernel_tunables);
 995 }
 996
 997 static unsigned namespace_calculate_mounts(
 998                 const char* root_directory,
 999                 const NamespaceInfo *ns_info,
1000                 char** read_write_paths,
1001                 char** read_only_paths,
1002                 char** inaccessible_paths,
1003                 char** empty_directories,
1004                 unsigned n_bind_mounts,
1005                 unsigned n_temporary_filesystems,
1006                 const char* tmp_dir,
1007                 const char* var_tmp_dir,
1008                 ProtectHome protect_home,
1009                 ProtectSystem protect_system) {
1010
1011         unsigned protect_home_cnt;
1012         unsigned protect_system_cnt =
1013                 (protect_system == PROTECT_SYSTEM_STRICT ?
1014                  ELEMENTSOF(protect_system_strict_table) :
1015                  ((protect_system == PROTECT_SYSTEM_FULL) ?
1016                   ELEMENTSOF(protect_system_full_table) :
1017                   ((protect_system == PROTECT_SYSTEM_YES) ?
1018                    ELEMENTSOF(protect_system_yes_table) : 0)));
1019
1020         protect_home_cnt =
1021                 (protect_home == PROTECT_HOME_YES ?
1022                  ELEMENTSOF(protect_home_yes_table) :
1023                  ((protect_home == PROTECT_HOME_READ_ONLY) ?
1024                   ELEMENTSOF(protect_home_read_only_table) :
1025                   ((protect_home == PROTECT_HOME_TMPFS) ?
1026                    ELEMENTSOF(protect_home_tmpfs_table) : 0)));
1027
1028         return !!tmp_dir + !!var_tmp_dir +
1029                 strv_length(read_write_paths) +
1030                 strv_length(read_only_paths) +
1031                 strv_length(inaccessible_paths) +
1032                 strv_length(empty_directories) +
1033                 n_bind_mounts +
1034                 n_temporary_filesystems +
1035                 ns_info->private_dev +
1036                 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1037                 (ns_info->protect_control_groups ? 1 : 0) +
1038                 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
1039                 protect_home_cnt + protect_system_cnt +
1040                 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
1041 }
1042
/* Sets up the mount namespace of the calling process according to the given configuration.
 *
 * Steps, in order: if root_image is set, a loop device is created for it and the image is dissected and
 * decrypted; the root directory to operate on is chosen; the list of mount entries is assembled, prefixed
 * with the root where necessary, sorted and pruned; a new mount namespace is unshared; the root
 * image/directory is mounted; all entries are applied and then made read-only where requested; the root is
 * moved into place (if one is used) and "/" is finally remounted with mount_flags (MS_SHARED when 0 is
 * passed).
 *
 * Returns 0 on success and a negative errno-style value on failure. */
1043 int setup_namespace(
1044                 const char* root_directory,
1045                 const char* root_image,
1046                 const NamespaceInfo *ns_info,
1047                 char** read_write_paths,
1048                 char** read_only_paths,
1049                 char** inaccessible_paths,
1050                 char** empty_directories,
1051                 const BindMount *bind_mounts,
1052                 unsigned n_bind_mounts,
1053                 const TemporaryFileSystem *temporary_filesystems,
1054                 unsigned n_temporary_filesystems,
1055                 const char* tmp_dir,
1056                 const char* var_tmp_dir,
1057                 ProtectHome protect_home,
1058                 ProtectSystem protect_system,
1059                 unsigned long mount_flags,
1060                 DissectImageFlags dissect_image_flags) {
1061
1062         _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
1063         _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
1064         _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
1065         _cleanup_free_ void *root_hash = NULL;
1066         MountEntry *m, *mounts = NULL;
1067         size_t root_hash_size = 0;
1068         bool make_slave = false;
1069         const char *root;
1070         unsigned n_mounts;
1071         bool require_prefix = false;
1072         int r = 0;
1073
1074         assert(ns_info);
1075
             /* No propagation mode requested: use MS_SHARED for the final remount of "/" at the end */
1076         if (mount_flags == 0)
1077                 mount_flags = MS_SHARED;
1078
1079         if (root_image) {
1080                 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1081
                     /* With strict system protection and no writable paths the image can be opened read-only */
1082                 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
1083                         dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1084
1085                 r = loop_device_make_by_path(root_image,
1086                                              dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1087                                              &loop_device);
1088                 if (r < 0)
1089                         return r;
1090
1091                 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1092                 if (r < 0)
1093                         return r;
1094
1095                 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1096                 if (r < 0)
1097                         return r;
1098
1099                 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
1100                 if (r < 0)
1101                         return r;
1102         }
1103
1104         if (root_directory)
1105                 root = root_directory;
1106         else if (root_image || n_bind_mounts > 0 || n_temporary_filesystems > 0) {
1107
1108                 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1109                  * the same mount point for all images, which is safe, since they all live in their own namespaces
1110                  * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1111                  * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1112                  * while we are applying them. */
1113
1114                 root = "/run/systemd/unit-root";
1115                 (void) mkdir_label(root, 0700);
1116                 require_prefix = true;
1117         } else
1118                 root = NULL;
1119
             /* Upper bound of entries needed; the assert() further down checks that exactly this many are filled in */
1120         n_mounts = namespace_calculate_mounts(
1121                         root,
1122                         ns_info,
1123                         read_write_paths,
1124                         read_only_paths,
1125                         inaccessible_paths,
1126                         empty_directories,
1127                         n_bind_mounts,
1128                         n_temporary_filesystems,
1129                         tmp_dir, var_tmp_dir,
1130                         protect_home, protect_system);
1131
1132         /* Set mount slave mode */
1133         if (root || n_mounts > 0)
1134                 make_slave = true;
1135
1136         if (n_mounts > 0) {
                     /* The entry array is allocated with alloca0(), i.e. on the stack; "m" is the write cursor */
1137                 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
1138                 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
1139                 if (r < 0)
1140                         goto finish;
1141
1142                 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
1143                 if (r < 0)
1144                         goto finish;
1145
1146                 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
1147                 if (r < 0)
1148                         goto finish;
1149
1150                 r = append_empty_dir_mounts(&m, empty_directories);
1151                 if (r < 0)
1152                         goto finish;
1153
1154                 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1155                 if (r < 0)
1156                         goto finish;
1157
1158                 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1159                 if (r < 0)
1160                         goto finish;
1161
1162                 if (tmp_dir) {
1163                         *(m++) = (MountEntry) {
1164                                 .path_const = "/tmp",
1165                                 .mode = PRIVATE_TMP,
1166                                 .source_const = tmp_dir,
1167                         };
1168                 }
1169
1170                 if (var_tmp_dir) {
1171                         *(m++) = (MountEntry) {
1172                                 .path_const = "/var/tmp",
1173                                 .mode = PRIVATE_TMP,
1174                                 .source_const = var_tmp_dir,
1175                         };
1176                 }
1177
1178                 if (ns_info->private_dev) {
1179                         *(m++) = (MountEntry) {
1180                                 .path_const = "/dev",
1181                                 .mode = PRIVATE_DEV,
1182                         };
1183                 }
1184
1185                 if (ns_info->protect_kernel_tunables) {
1186                         r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
1187                         if (r < 0)
1188                                 goto finish;
1189                 }
1190
1191                 if (ns_info->protect_kernel_modules) {
1192                         r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
1193                         if (r < 0)
1194                                 goto finish;
1195                 }
1196
1197                 if (ns_info->protect_control_groups) {
1198                         *(m++) = (MountEntry) {
1199                                 .path_const = "/sys/fs/cgroup",
1200                                 .mode = READONLY,
1201                         };
1202                 }
1203
1204                 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
1205                 if (r < 0)
1206                         goto finish;
1207
1208                 r = append_protect_system(&m, protect_system, false);
1209                 if (r < 0)
1210                         goto finish;
1211
1212                 if (namespace_info_mount_apivfs(root, ns_info)) {
1213                         r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1214                         if (r < 0)
1215                                 goto finish;
1216                 }
1217
                     /* The number of entries written must match what namespace_calculate_mounts() predicted */
1218                 assert(mounts + n_mounts == m);
1219
1220                 /* Prepend the root directory where that's necessary */
1221                 r = prefix_where_needed(mounts, n_mounts, root);
1222                 if (r < 0)
1223                         goto finish;
1224
1225                 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
1226
                     /* The drop_*() helpers receive &n_mounts, so the entry count may shrink here */
1227                 drop_duplicates(mounts, &n_mounts);
1228                 drop_outside_root(root, mounts, &n_mounts);
1229                 drop_inaccessible(mounts, &n_mounts);
1230                 drop_nop(mounts, &n_mounts);
1231         }
1232
1233         if (unshare(CLONE_NEWNS) < 0) {
1234                 r = -errno;
1235                 goto finish;
1236         }
1237
1238         if (make_slave) {
1239                 /* Remount / as SLAVE so that nothing now mounted in the namespace
1240                    shows up in the parent */
1241                 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1242                         r = -errno;
1243                         goto finish;
1244                 }
1245         }
1246
1247         if (root_image) {
1248                 /* A root image is specified, mount it to the right place */
1249                 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
1250                 if (r < 0)
1251                         goto finish;
1252
1253                 if (decrypted_image) {
1254                         r = decrypted_image_relinquish(decrypted_image);
1255                         if (r < 0)
1256                                 goto finish;
1257                 }
1258
1259                 loop_device_relinquish(loop_device);
1260
1261         } else if (root_directory) {
1262
1263                 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1264                 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
1265                 if (r < 0)
1266                         goto finish;
1267                 if (r == 0) {
1268                         if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1269                                 r = -errno;
1270                                 goto finish;
1271                         }
1272                 }
1273
1274         } else if (root) {
1275
1276                 /* Let's mount the main root directory to the root directory to use */
1277                 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1278                         r = -errno;
1279                         goto finish;
1280                 }
1281         }
1282
1283         /* Try to set up the new root directory before mounting anything else there. */
1284         if (root_image || root_directory)
1285                 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1286
1287         if (n_mounts > 0) {
1288                 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
1289                 char **blacklist;
1290                 unsigned j;
1291
1292                 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1293                  * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1294                 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1295                 if (!proc_self_mountinfo) {
1296                         r = -errno;
1297                         goto finish;
1298                 }
1299
1300                 /* First round, add in all special mounts we need */
1301                 for (m = mounts; m < mounts + n_mounts; ++m) {
1302                         r = apply_mount(root, m);
1303                         if (r < 0)
1304                                 goto finish;
1305                 }
1306
1307                 /* Create a blacklist we can pass to bind_mount_recursive() */
1308                 blacklist = newa(char*, n_mounts+1);
1309                 for (j = 0; j < n_mounts; j++)
1310                         blacklist[j] = (char*) mount_entry_path(mounts+j);
1311                 blacklist[j] = NULL;
1312
1313                 /* Second round, flip the ro bits if necessary. */
1314                 for (m = mounts; m < mounts + n_mounts; ++m) {
1315                         r = make_read_only(m, blacklist, proc_self_mountinfo);
1316                         if (r < 0)
1317                                 goto finish;
1318                 }
1319         }
1320
1321         if (root) {
1322                 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1323                 r = mount_move_root(root);
1324                 if (r < 0)
1325                         goto finish;
1326         }
1327
1328         /* Remount / as the desired mode. Note that this will not
1329          * reestablish propagation from our side to the host, since
1330          * what's disconnected is disconnected. */
1331         if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1332                 r = -errno;
1333                 goto finish;
1334         }
1335
1336         r = 0;
1337
1338 finish:
             /* Common exit for success and failure: run mount_entry_done() on every remaining entry */
1339         for (m = mounts; m < mounts + n_mounts; m++)
1340                 mount_entry_done(m);
1341
1342         return r;
1343 }
1344
1345 void bind_mount_free_many(BindMount *b, unsigned n) {
1346         unsigned i;
1347
1348         assert(b || n == 0);
1349
1350         for (i = 0; i < n; i++) {
1351                 free(b[i].source);
1352                 free(b[i].destination);
1353         }
1354
1355         free(b);
1356 }
1357
1358 int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1359         _cleanup_free_ char *s = NULL, *d = NULL;
1360         BindMount *c;
1361
1362         assert(b);
1363         assert(n);
1364         assert(item);
1365
1366         s = strdup(item->source);
1367         if (!s)
1368                 return -ENOMEM;
1369
1370         d = strdup(item->destination);
1371         if (!d)
1372                 return -ENOMEM;
1373
1374         c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1375         if (!c)
1376                 return -ENOMEM;
1377
1378         *b = c;
1379
1380         c[(*n) ++] = (BindMount) {
1381                 .source = s,
1382                 .destination = d,
1383                 .read_only = item->read_only,
1384                 .recursive = item->recursive,
1385                 .ignore_enoent = item->ignore_enoent,
1386         };
1387
1388         s = d = NULL;
1389         return 0;
1390 }
1391
1392 void temporary_filesystem_free_many(TemporaryFileSystem *t, unsigned n) {
1393         unsigned i;
1394
1395         assert(t || n == 0);
1396
1397         for (i = 0; i < n; i++) {
1398                 free(t[i].path);
1399                 free(t[i].options);
1400         }
1401
1402         free(t);
1403 }
1404
1405 int temporary_filesystem_add(
1406                 TemporaryFileSystem **t,
1407                 unsigned *n,
1408                 const char *path,
1409                 const char *options) {
1410
1411         _cleanup_free_ char *p = NULL, *o = NULL;
1412         TemporaryFileSystem *c;
1413
1414         assert(t);
1415         assert(n);
1416         assert(path);
1417
1418         p = strdup(path);
1419         if (!p)
1420                 return -ENOMEM;
1421
1422         if (!isempty(options)) {
1423                 o = strdup(options);
1424                 if (!o)
1425                         return -ENOMEM;
1426         }
1427
1428         c = realloc_multiply(*t, sizeof(TemporaryFileSystem), *n + 1);
1429         if (!c)
1430                 return -ENOMEM;
1431
1432         *t = c;
1433
1434         c[(*n) ++] = (TemporaryFileSystem) {
1435                 .path = p,
1436                 .options = o,
1437         };
1438
1439         p = o = NULL;
1440         return 0;
1441 }
1442
1443 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1444         _cleanup_free_ char *x = NULL;
1445         char bid[SD_ID128_STRING_MAX];
1446         sd_id128_t boot_id;
1447         int r;
1448
1449         assert(id);
1450         assert(prefix);
1451         assert(path);
1452
1453         /* We include the boot id in the directory so that after a
1454          * reboot we can easily identify obsolete directories. */
1455
1456         r = sd_id128_get_boot(&boot_id);
1457         if (r < 0)
1458                 return r;
1459
1460         x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1461         if (!x)
1462                 return -ENOMEM;
1463
1464         RUN_WITH_UMASK(0077)
1465                 if (!mkdtemp(x))
1466                         return -errno;
1467
1468         RUN_WITH_UMASK(0000) {
1469                 char *y;
1470
1471                 y = strjoina(x, "/tmp");
1472
1473                 if (mkdir(y, 0777 | S_ISVTX) < 0)
1474                         return -errno;
1475         }
1476
1477         *path = x;
1478         x = NULL;
1479
1480         return 0;
1481 }
1482
/* Creates the private directories backing /tmp and /var/tmp for the given id and returns their paths in
 * *tmp_dir and *var_tmp_dir. If the second directory cannot be created the first one is removed again and
 * neither output parameter is written. Returns 0 on success, a negative errno-style value on failure. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Roll back the first directory: the inner "tmp" directory first, then the outer one */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1512
1513 int setup_netns(int netns_storage_socket[2]) {
1514         _cleanup_close_ int netns = -1;
1515         int r, q;
1516
1517         assert(netns_storage_socket);
1518         assert(netns_storage_socket[0] >= 0);
1519         assert(netns_storage_socket[1] >= 0);
1520
1521         /* We use the passed socketpair as a storage buffer for our
1522          * namespace reference fd. Whatever process runs this first
1523          * shall create a new namespace, all others should just join
1524          * it. To serialize that we use a file lock on the socket
1525          * pair.
1526          *
1527          * It's a bit crazy, but hey, works great! */
1528
1529         if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1530                 return -errno;
1531
1532         netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1533         if (netns == -EAGAIN) {
1534                 /* Nothing stored yet, so let's create a new namespace */
1535
1536                 if (unshare(CLONE_NEWNET) < 0) {
1537                         r = -errno;
1538                         goto fail;
1539                 }
1540
1541                 loopback_setup();
1542
1543                 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1544                 if (netns < 0) {
1545                         r = -errno;
1546                         goto fail;
1547                 }
1548
1549                 r = 1;
1550
1551         } else if (netns < 0) {
1552                 r = netns;
1553                 goto fail;
1554
1555         } else {
1556                 /* Yay, found something, so let's join the namespace */
1557                 if (setns(netns, CLONE_NEWNET) < 0) {
1558                         r = -errno;
1559                         goto fail;
1560                 }
1561
1562                 r = 0;
1563         }
1564
1565         q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1566         if (q < 0) {
1567                 r = q;
1568                 goto fail;
1569         }
1570
1571 fail:
1572         (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1573         return r;
1574 }
1575
1576 bool ns_type_supported(NamespaceType type) {
1577         const char *t, *ns_proc;
1578
1579         t = namespace_type_to_string(type);
1580         if (!t) /* Don't know how to translate this? Then it's not supported */
1581                 return false;
1582
1583         ns_proc = strjoina("/proc/self/ns/", t);
1584         return access(ns_proc, F_OK) == 0;
1585 }
1586
/* String names of the ProtectHome values, indexed by enum constant. The DEFINE_STRING_TABLE_LOOKUP()
 * invocation below supplies the lookup helpers for this table, among them the protect_home_from_string()
 * that parse_protect_home_or_bool() relies on. */
1587 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1588         [PROTECT_HOME_NO] = "no",
1589         [PROTECT_HOME_YES] = "yes",
1590         [PROTECT_HOME_READ_ONLY] = "read-only",
1591         [PROTECT_HOME_TMPFS] = "tmpfs",
1592 };
1593
1594 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1595
1596 ProtectHome parse_protect_home_or_bool(const char *s) {
1597         int r;
1598
1599         r = parse_boolean(s);
1600         if (r > 0)
1601                 return PROTECT_HOME_YES;
1602         if (r == 0)
1603                 return PROTECT_HOME_NO;
1604
1605         return protect_home_from_string(s);
1606 }
1607
/* String names of the ProtectSystem values, indexed by enum constant. The DEFINE_STRING_TABLE_LOOKUP()
 * invocation below supplies the lookup helpers for this table, among them the protect_system_from_string()
 * that parse_protect_system_or_bool() relies on. */
1608 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1609         [PROTECT_SYSTEM_NO] = "no",
1610         [PROTECT_SYSTEM_YES] = "yes",
1611         [PROTECT_SYSTEM_FULL] = "full",
1612         [PROTECT_SYSTEM_STRICT] = "strict",
1613 };
1614
1615 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
1616
1617 ProtectSystem parse_protect_system_or_bool(const char *s) {
1618         int r;
1619
1620         r = parse_boolean(s);
1621         if (r > 0)
1622                 return PROTECT_SYSTEM_YES;
1623         if (r == 0)
1624                 return PROTECT_SYSTEM_NO;
1625
1626         return protect_system_from_string(s);
1627 }
1628
/* Short names of the namespace types, indexed by NamespaceType constant. ns_type_supported() appends
 * these names to "/proc/self/ns/" to probe for a namespace type. The DEFINE_STRING_TABLE_LOOKUP()
 * invocation below supplies the lookup helpers, among them the namespace_type_to_string() used there. */
1629 static const char* const namespace_type_table[] = {
1630         [NAMESPACE_MOUNT] = "mnt",
1631         [NAMESPACE_CGROUP] = "cgroup",
1632         [NAMESPACE_UTS] = "uts",
1633         [NAMESPACE_IPC] = "ipc",
1634         [NAMESPACE_USER] = "user",
1635         [NAMESPACE_PID] = "pid",
1636         [NAMESPACE_NET] = "net",
1637 };
1638
1639 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);