src/core/namespace.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <errno.h>
  22 #include <sched.h>
  23 #include <stdio.h>
  24 #include <string.h>
  25 #include <sys/mount.h>
  26 #include <sys/stat.h>
  27 #include <unistd.h>
  28 #include <linux/fs.h>
  29
  30 #include "alloc-util.h"
  31 #include "base-filesystem.h"
  32 #include "dev-setup.h"
  33 #include "fd-util.h"
  34 #include "fs-util.h"
  35 #include "label.h"
  36 #include "loop-util.h"
  37 #include "loopback-setup.h"
  38 #include "missing.h"
  39 #include "mkdir.h"
  40 #include "mount-util.h"
  41 #include "namespace.h"
  42 #include "path-util.h"
  43 #include "selinux-util.h"
  44 #include "socket-util.h"
  45 #include "stat-util.h"
  46 #include "string-table.h"
  47 #include "string-util.h"
  48 #include "strv.h"
  49 #include "umask-util.h"
  50 #include "user-util.h"
  51 #include "util.h"
  52
  53 #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
  54
/* The kind of mount a MountEntry describes. */
typedef enum MountMode {
        /* This is ordered by priority! mount_path_compare() sorts entries that share a path by this value, and
         * drop_duplicates() keeps only the first one, so a smaller value wins over a larger one. */
        INACCESSIBLE,           /* Path is overmounted with an inaccessible node */
        BIND_MOUNT,             /* Bind mount, see append_bind_mounts() */
        BIND_MOUNT_RECURSIVE,   /* Same, chosen when the BindMount is marked recursive */
        PRIVATE_TMP,
        PRIVATE_DEV,            /* Presumably handled by mount_private_dev() -- the dispatch is not in this part of the file */
        BIND_DEV,               /* Used by the MountAPIVFS= table for /dev, see mount_bind_dev() */
        EMPTY_DIR,              /* tmpfs providing a readable but empty directory, see append_empty_dir_mounts() */
        SYSFS,                  /* Used by the MountAPIVFS= table for /sys */
        PROCFS,                 /* Used by the MountAPIVFS= table for /proc */
        READONLY,
        READWRITE,
        TMPFS,                  /* tmpfs mount, see append_tmpfs_mounts() and the ProtectHome=tmpfs table */
} MountMode;
  70
/* One entry in the list of mounts to set up. Each string comes as a pair: a "_const" pointer that is not owned by
 * the entry, and a "_malloc" pointer that is owned and released by mount_entry_done(). The accessor helpers prefer
 * the "_malloc" variant when it is set.
 *
 * Note that the static tables in this file initialize the first three members (path_const, mode, ignore)
 * positionally, hence the order of these members must not be changed. */
typedef struct MountEntry {
        const char *path_const;   /* Memory allocated on stack or static */
        MountMode mode:5;
        bool ignore:1;            /* Ignore if path does not exist? */
        bool has_prefix:1;        /* Already is prefixed by the root dir? */
        bool read_only:1;         /* Shall this mount point be read-only? */
        char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
        const char *source_const; /* The source path, for bind mounts */
        char *source_malloc;      /* Use this instead of 'source_const' if set */
        const char *options_const;/* Mount options for tmpfs */
        char *options_malloc;     /* Use this instead of 'options_const' if set */
        unsigned long flags;      /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
} MountEntry;
  84
  85 /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
  86  * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
  87 static const MountEntry apivfs_table[] = {
  88         { "/proc",               PROCFS,       false },
  89         { "/dev",                BIND_DEV,     false },
  90         { "/sys",                SYSFS,        false },
  91 };
  92
  93 /* ProtectKernelTunables= option and the related filesystem APIs */
  94 static const MountEntry protect_kernel_tunables_table[] = {
  95         { "/proc/sys",           READONLY,     false },
  96         { "/proc/sysrq-trigger", READONLY,     true  },
  97         { "/proc/latency_stats", READONLY,     true  },
  98         { "/proc/mtrr",          READONLY,     true  },
  99         { "/proc/apm",           READONLY,     true  }, /* Obsolete API, there's no point in permitting access to this, ever */
 100         { "/proc/acpi",          READONLY,     true  },
 101         { "/proc/timer_stats",   READONLY,     true  },
 102         { "/proc/asound",        READONLY,     true  },
 103         { "/proc/bus",           READONLY,     true  },
 104         { "/proc/fs",            READONLY,     true  },
 105         { "/proc/irq",           READONLY,     true  },
 106         { "/sys",                READONLY,     false },
 107         { "/sys/kernel/debug",   READONLY,     true  },
 108         { "/sys/kernel/tracing", READONLY,     true  },
 109         { "/sys/fs/bpf",         READONLY,     true  },
 110         { "/sys/fs/cgroup",      READWRITE,    false }, /* READONLY is set by ProtectControlGroups= option */
 111         { "/sys/fs/selinux",     READWRITE,    true  },
 112 };
 113
 114 /* ProtectKernelModules= option */
 115 static const MountEntry protect_kernel_modules_table[] = {
 116 #if HAVE_SPLIT_USR
 117         { "/lib/modules",        INACCESSIBLE, true  },
 118 #endif
 119         { "/usr/lib/modules",    INACCESSIBLE, true  },
 120 };
 121
 122 /*
 123  * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
 124  * system should be protected by ProtectSystem=
 125  */
 126 static const MountEntry protect_home_read_only_table[] = {
 127         { "/home",               READONLY,     true  },
 128         { "/run/user",           READONLY,     true  },
 129         { "/root",               READONLY,     true  },
 130 };
 131
 132 /* ProtectHome=tmpfs table */
 133 static const MountEntry protect_home_tmpfs_table[] = {
 134         { "/home",               TMPFS,        true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
 135         { "/run/user",           TMPFS,        true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
 136         { "/root",               TMPFS,        true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
 137 };
 138
 139 /* ProtectHome=yes table */
 140 static const MountEntry protect_home_yes_table[] = {
 141         { "/home",               INACCESSIBLE, true  },
 142         { "/run/user",           INACCESSIBLE, true  },
 143         { "/root",               INACCESSIBLE, true  },
 144 };
 145
 146 /* ProtectSystem=yes table */
 147 static const MountEntry protect_system_yes_table[] = {
 148         { "/usr",                READONLY,     false },
 149         { "/boot",               READONLY,     true  },
 150         { "/efi",                READONLY,     true  },
 151 #if HAVE_SPLIT_USR
 152         { "/lib",                READONLY,     true  },
 153         { "/lib64",              READONLY,     true  },
 154         { "/bin",                READONLY,     true  },
 155 #  if HAVE_SPLIT_BIN
 156         { "/sbin",               READONLY,     true  },
 157 #  endif
 158 #endif
 159 };
 160
 161 /* ProtectSystem=full includes ProtectSystem=yes */
 162 static const MountEntry protect_system_full_table[] = {
 163         { "/usr",                READONLY,     false },
 164         { "/boot",               READONLY,     true  },
 165         { "/efi",                READONLY,     true  },
 166         { "/etc",                READONLY,     false },
 167 #if HAVE_SPLIT_USR
 168         { "/lib",                READONLY,     true  },
 169         { "/lib64",              READONLY,     true  },
 170         { "/bin",                READONLY,     true  },
 171 #  if HAVE_SPLIT_BIN
 172         { "/sbin",               READONLY,     true  },
 173 #  endif
 174 #endif
 175 };
 176
 177 /*
 178  * ProtectSystem=strict table. In this strict mode, we mount everything
 179  * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
 180  * which are left writable, but PrivateDevices= + ProtectKernelTunables=
 181  * protect those, and these options should be fully orthogonal.
 182  * (And of course /home and friends are also left writable, as ProtectHome=
 183  * shall manage those, orthogonally).
 184  */
 185 static const MountEntry protect_system_strict_table[] = {
 186         { "/",                   READONLY,     false },
 187         { "/proc",               READWRITE,    false },      /* ProtectKernelTunables= */
 188         { "/sys",                READWRITE,    false },      /* ProtectKernelTunables= */
 189         { "/dev",                READWRITE,    false },      /* PrivateDevices= */
 190         { "/home",               READWRITE,    true  },      /* ProtectHome= */
 191         { "/run/user",           READWRITE,    true  },      /* ProtectHome= */
 192         { "/root",               READWRITE,    true  },      /* ProtectHome= */
 193 };
 194
 195 static const char *mount_entry_path(const MountEntry *p) {
 196         assert(p);
 197
 198         /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
 199          * otherwise the stack/static ->path field is returned. */
 200
 201         return p->path_malloc ?: p->path_const;
 202 }
 203
 204 static bool mount_entry_read_only(const MountEntry *p) {
 205         assert(p);
 206
 207         return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
 208 }
 209
 210 static const char *mount_entry_source(const MountEntry *p) {
 211         assert(p);
 212
 213         return p->source_malloc ?: p->source_const;
 214 }
 215
 216 static const char *mount_entry_options(const MountEntry *p) {
 217         assert(p);
 218
 219         return p->options_malloc ?: p->options_const;
 220 }
 221
 222 static void mount_entry_done(MountEntry *p) {
 223         assert(p);
 224
 225         p->path_malloc = mfree(p->path_malloc);
 226         p->source_malloc = mfree(p->source_malloc);
 227         p->options_malloc = mfree(p->options_malloc);
 228 }
 229
 230 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
 231         char **i;
 232
 233         assert(p);
 234
 235         /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
 236
 237         STRV_FOREACH(i, strv) {
 238                 bool ignore = false, needs_prefix = false;
 239                 const char *e = *i;
 240
 241                 /* Look for any prefixes */
 242                 if (startswith(e, "-")) {
 243                         e++;
 244                         ignore = true;
 245                 }
 246                 if (startswith(e, "+")) {
 247                         e++;
 248                         needs_prefix = true;
 249                 }
 250
 251                 if (!path_is_absolute(e))
 252                         return -EINVAL;
 253
 254                 *((*p)++) = (MountEntry) {
 255                         .path_const = e,
 256                         .mode = mode,
 257                         .ignore = ignore,
 258                         .has_prefix = !needs_prefix && !forcibly_require_prefix,
 259                 };
 260         }
 261
 262         return 0;
 263 }
 264
 265 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
 266         char **i;
 267
 268         assert(p);
 269
 270         /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
 271          * "/private/" boundary directories for DynamicUser=1. */
 272
 273         STRV_FOREACH(i, strv) {
 274
 275                 *((*p)++) = (MountEntry) {
 276                         .path_const = *i,
 277                         .mode = EMPTY_DIR,
 278                         .ignore = false,
 279                         .has_prefix = false,
 280                         .read_only = true,
 281                         .options_const = "mode=755",
 282                         .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
 283                 };
 284         }
 285
 286         return 0;
 287 }
 288
 289 static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
 290         unsigned i;
 291
 292         assert(p);
 293
 294         for (i = 0; i < n; i++) {
 295                 const BindMount *b = binds + i;
 296
 297                 *((*p)++) = (MountEntry) {
 298                         .path_const = b->destination,
 299                         .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
 300                         .read_only = b->read_only,
 301                         .source_const = b->source,
 302                         .ignore = b->ignore_enoent,
 303                 };
 304         }
 305
 306         return 0;
 307 }
 308
 309 static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, unsigned n) {
 310         unsigned i;
 311         int r;
 312
 313         assert(p);
 314
 315         for (i = 0; i < n; i++) {
 316                 const TemporaryFileSystem *t = tmpfs + i;
 317                 _cleanup_free_ char *o = NULL, *str = NULL;
 318                 unsigned long flags = MS_NODEV|MS_STRICTATIME;
 319                 bool ro = false;
 320
 321                 if (!path_is_absolute(t->path))
 322                         return -EINVAL;
 323
 324                 if (!isempty(t->options)) {
 325                         str = strjoin("mode=0755,", t->options);
 326                         if (!str)
 327                                 return -ENOMEM;
 328
 329                         r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
 330                         if (r < 0)
 331                                 return r;
 332
 333                         ro = !!(flags & MS_RDONLY);
 334                         if (ro)
 335                                 flags ^= MS_RDONLY;
 336                 }
 337
 338                 *((*p)++) = (MountEntry) {
 339                         .path_const = t->path,
 340                         .mode = TMPFS,
 341                         .read_only = ro,
 342                         .options_malloc = o,
 343                         .flags = flags,
 344                 };
 345
 346                 o = NULL;
 347         }
 348
 349         return 0;
 350 }
 351
 352 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
 353         unsigned i;
 354
 355         assert(p);
 356         assert(mounts);
 357
 358         /* Adds a list of static pre-defined entries */
 359
 360         for (i = 0; i < n; i++)
 361                 *((*p)++) = (MountEntry) {
 362                         .path_const = mount_entry_path(mounts+i),
 363                         .mode = mounts[i].mode,
 364                         .ignore = mounts[i].ignore || ignore_protect,
 365                 };
 366
 367         return 0;
 368 }
 369
 370 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
 371         assert(p);
 372
 373         switch (protect_home) {
 374
 375         case PROTECT_HOME_NO:
 376                 return 0;
 377
 378         case PROTECT_HOME_READ_ONLY:
 379                 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
 380
 381         case PROTECT_HOME_TMPFS:
 382                 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
 383
 384         case PROTECT_HOME_YES:
 385                 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
 386
 387         default:
 388                 assert_not_reached("Unexpected ProtectHome= value");
 389         }
 390 }
 391
 392 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
 393         assert(p);
 394
 395         switch (protect_system) {
 396
 397         case PROTECT_SYSTEM_NO:
 398                 return 0;
 399
 400         case PROTECT_SYSTEM_STRICT:
 401                 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
 402
 403         case PROTECT_SYSTEM_YES:
 404                 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
 405
 406         case PROTECT_SYSTEM_FULL:
 407                 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
 408
 409         default:
 410                 assert_not_reached("Unexpected ProtectSystem= value");
 411         }
 412 }
 413
 414 static int mount_path_compare(const void *a, const void *b) {
 415         const MountEntry *p = a, *q = b;
 416         int d;
 417
 418         /* If the paths are not equal, then order prefixes first */
 419         d = path_compare(mount_entry_path(p), mount_entry_path(q));
 420         if (d != 0)
 421                 return d;
 422
 423         /* If the paths are equal, check the mode */
 424         if (p->mode < q->mode)
 425                 return -1;
 426
 427         if (p->mode > q->mode)
 428                 return 1;
 429
 430         return 0;
 431 }
 432
 433 static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
 434         unsigned i;
 435
 436         /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
 437          * that. */
 438
 439         if (!root_directory)
 440                 return 0;
 441
 442         for (i = 0; i < n; i++) {
 443                 char *s;
 444
 445                 if (m[i].has_prefix)
 446                         continue;
 447
 448                 s = prefix_root(root_directory, mount_entry_path(m+i));
 449                 if (!s)
 450                         return -ENOMEM;
 451
 452                 free_and_replace(m[i].path_malloc, s);
 453                 m[i].has_prefix = true;
 454         }
 455
 456         return 0;
 457 }
 458
 459 static void drop_duplicates(MountEntry *m, unsigned *n) {
 460         MountEntry *f, *t, *previous;
 461
 462         assert(m);
 463         assert(n);
 464
 465         /* Drops duplicate entries. Expects that the array is properly ordered already. */
 466
 467         for (f = m, t = m, previous = NULL; f < m + *n; f++) {
 468
 469                 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
 470                  * above. */
 471                 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
 472                         log_debug("%s is duplicate.", mount_entry_path(f));
 473                         previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
 474                         mount_entry_done(f);
 475                         continue;
 476                 }
 477
 478                 *t = *f;
 479                 previous = t;
 480                 t++;
 481         }
 482
 483         *n = t - m;
 484 }
 485
 486 static void drop_inaccessible(MountEntry *m, unsigned *n) {
 487         MountEntry *f, *t;
 488         const char *clear = NULL;
 489
 490         assert(m);
 491         assert(n);
 492
 493         /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
 494          * ordered already. */
 495
 496         for (f = m, t = m; f < m + *n; f++) {
 497
 498                 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
 499                  * it, as inaccessible paths really should drop the entire subtree. */
 500                 if (clear && path_startswith(mount_entry_path(f), clear)) {
 501                         log_debug("%s is masked by %s.", mount_entry_path(f), clear);
 502                         mount_entry_done(f);
 503                         continue;
 504                 }
 505
 506                 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
 507
 508                 *t = *f;
 509                 t++;
 510         }
 511
 512         *n = t - m;
 513 }
 514
 515 static void drop_nop(MountEntry *m, unsigned *n) {
 516         MountEntry *f, *t;
 517
 518         assert(m);
 519         assert(n);
 520
 521         /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
 522          * list is ordered by prefixes. */
 523
 524         for (f = m, t = m; f < m + *n; f++) {
 525
 526                 /* Only suppress such subtrees for READONLY and READWRITE entries */
 527                 if (IN_SET(f->mode, READONLY, READWRITE)) {
 528                         MountEntry *p;
 529                         bool found = false;
 530
 531                         /* Now let's find the first parent of the entry we are looking at. */
 532                         for (p = t-1; p >= m; p--) {
 533                                 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
 534                                         found = true;
 535                                         break;
 536                                 }
 537                         }
 538
 539                         /* We found it, let's see if it's the same mode, if so, we can drop this entry */
 540                         if (found && p->mode == f->mode) {
 541                                 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
 542                                 mount_entry_done(f);
 543                                 continue;
 544                         }
 545                 }
 546
 547                 *t = *f;
 548                 t++;
 549         }
 550
 551         *n = t - m;
 552 }
 553
 554 static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
 555         MountEntry *f, *t;
 556
 557         assert(m);
 558         assert(n);
 559
 560         /* Nothing to do */
 561         if (!root_directory)
 562                 return;
 563
 564         /* Drops all mounts that are outside of the root directory. */
 565
 566         for (f = m, t = m; f < m + *n; f++) {
 567
 568                 if (!path_startswith(mount_entry_path(f), root_directory)) {
 569                         log_debug("%s is outside of root directory.", mount_entry_path(f));
 570                         mount_entry_done(f);
 571                         continue;
 572                 }
 573
 574                 *t = *f;
 575                 t++;
 576         }
 577
 578         *n = t - m;
 579 }
 580
 581 static int clone_device_node(const char *d, const char *temporary_mount) {
 582         const char *dn;
 583         struct stat st;
 584         int r;
 585
 586         if (stat(d, &st) < 0) {
 587                 if (errno == ENOENT)
 588                         return 0;
 589                 return -errno;
 590         }
 591
 592         if (!S_ISBLK(st.st_mode) &&
 593             !S_ISCHR(st.st_mode))
 594                 return -EINVAL;
 595
 596         if (st.st_rdev == 0)
 597                 return 0;
 598
 599         dn = strjoina(temporary_mount, d);
 600
 601         mac_selinux_create_file_prepare(d, st.st_mode);
 602         r = mknod(dn, st.st_mode, st.st_rdev);
 603         mac_selinux_create_file_clear();
 604         if (r < 0)
 605                 return log_debug_errno(errno, "mknod failed for %s: %m", d);
 606
 607         return 1;
 608 }
 609
 610 static int mount_private_dev(MountEntry *m) {
 611         static const char devnodes[] =
 612                 "/dev/null\0"
 613                 "/dev/zero\0"
 614                 "/dev/full\0"
 615                 "/dev/random\0"
 616                 "/dev/urandom\0"
 617                 "/dev/tty\0";
 618
 619         char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
 620         const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
 621         _cleanup_umask_ mode_t u;
 622         int r;
 623
 624         assert(m);
 625
 626         u = umask(0000);
 627
 628         if (!mkdtemp(temporary_mount))
 629                 return -errno;
 630
 631         dev = strjoina(temporary_mount, "/dev");
 632         (void) mkdir(dev, 0755);
 633         if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
 634                 r = -errno;
 635                 goto fail;
 636         }
 637
 638         devpts = strjoina(temporary_mount, "/dev/pts");
 639         (void) mkdir(devpts, 0755);
 640         if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
 641                 r = -errno;
 642                 goto fail;
 643         }
 644
 645         /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
 646          * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
 647          * thus, in that case make a clone
 648          *
 649          * in nspawn and other containers it will be a symlink, in that case make it a symlink
 650          */
 651         r = is_symlink("/dev/ptmx");
 652         if (r < 0)
 653                 goto fail;
 654         if (r > 0) {
 655                 devptmx = strjoina(temporary_mount, "/dev/ptmx");
 656                 if (symlink("pts/ptmx", devptmx) < 0) {
 657                         r = -errno;
 658                         goto fail;
 659                 }
 660         } else {
 661                 r = clone_device_node("/dev/ptmx", temporary_mount);
 662                 if (r < 0)
 663                         goto fail;
 664                 if (r == 0) {
 665                         r = -ENXIO;
 666                         goto fail;
 667                 }
 668         }
 669
 670         devshm = strjoina(temporary_mount, "/dev/shm");
 671         (void) mkdir(devshm, 0755);
 672         r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
 673         if (r < 0) {
 674                 r = -errno;
 675                 goto fail;
 676         }
 677
 678         devmqueue = strjoina(temporary_mount, "/dev/mqueue");
 679         (void) mkdir(devmqueue, 0755);
 680         (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
 681
 682         devhugepages = strjoina(temporary_mount, "/dev/hugepages");
 683         (void) mkdir(devhugepages, 0755);
 684         (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
 685
 686         devlog = strjoina(temporary_mount, "/dev/log");
 687         (void) symlink("/run/systemd/journal/dev-log", devlog);
 688
 689         NULSTR_FOREACH(d, devnodes) {
 690                 r = clone_device_node(d, temporary_mount);
 691                 if (r < 0)
 692                         goto fail;
 693         }
 694
 695         dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
 696
 697         /* Create the /dev directory if missing. It is more likely to be
 698          * missing when the service is started with RootDirectory. This is
 699          * consistent with mount units creating the mount points when missing.
 700          */
 701         (void) mkdir_p_label(mount_entry_path(m), 0755);
 702
 703         /* Unmount everything in old /dev */
 704         umount_recursive(mount_entry_path(m), 0);
 705         if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
 706                 r = -errno;
 707                 goto fail;
 708         }
 709
 710         rmdir(dev);
 711         rmdir(temporary_mount);
 712
 713         return 0;
 714
 715 fail:
 716         if (devpts)
 717                 umount(devpts);
 718
 719         if (devshm)
 720                 umount(devshm);
 721
 722         if (devhugepages)
 723                 umount(devhugepages);
 724
 725         if (devmqueue)
 726                 umount(devmqueue);
 727
 728         umount(dev);
 729         rmdir(dev);
 730         rmdir(temporary_mount);
 731
 732         return r;
 733 }
 734
 735 static int mount_bind_dev(const MountEntry *m) {
 736         int r;
 737
 738         assert(m);
 739
 740         /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
 741          * /dev. This is only used when RootDirectory= is set. */
 742
 743         (void) mkdir_p_label(mount_entry_path(m), 0755);
 744
 745         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 746         if (r < 0)
 747                 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
 748         if (r > 0) /* make this a NOP if /dev is already a mount point */
 749                 return 0;
 750
 751         if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 752                 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
 753
 754         return 1;
 755 }
 756
 757 static int mount_sysfs(const MountEntry *m) {
 758         int r;
 759
 760         assert(m);
 761
 762         (void) mkdir_p_label(mount_entry_path(m), 0755);
 763
 764         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 765         if (r < 0)
 766                 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
 767         if (r > 0) /* make this a NOP if /sys is already a mount point */
 768                 return 0;
 769
 770         /* Bind mount the host's version so that we get all child mounts of it, too. */
 771         if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 772                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 773
 774         return 1;
 775 }
 776
 777 static int mount_procfs(const MountEntry *m) {
 778         int r;
 779
 780         assert(m);
 781
 782         (void) mkdir_p_label(mount_entry_path(m), 0755);
 783
 784         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 785         if (r < 0)
 786                 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
 787         if (r > 0) /* make this a NOP if /proc is already a mount point */
 788                 return 0;
 789
 790         /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
 791         if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
 792                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 793
 794         return 1;
 795 }
 796
 797 static int mount_tmpfs(const MountEntry *m) {
 798         assert(m);
 799
 800         /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
 801
 802         (void) mkdir_p_label(mount_entry_path(m), 0755);
 803         (void) umount_recursive(mount_entry_path(m), 0);
 804
 805         if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
 806                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 807
 808         return 1;
 809 }
 810
 811 static int mount_entry_chase(
 812                 const char *root_directory,
 813                 const MountEntry *m,
 814                 const char *path,
 815                 bool chase_nonexistent,
 816                 char **location) {
 817
 818         char *chased;
 819         int r;
 820
 821         assert(m);
 822
 823         /* Since mount() will always follow symlinks and we need to take the different root directory into account we
 824          * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
 825          * that applies). The result is stored in "location". */
 826
 827         r = chase_symlinks(path, root_directory, CHASE_TRAIL_SLASH | (chase_nonexistent ? CHASE_NONEXISTENT : 0), &chased);
 828         if (r == -ENOENT && m->ignore) {
 829                 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
 830                 return 0;
 831         }
 832         if (r < 0)
 833                 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
 834
 835         log_debug("Followed symlinks %s → %s.", path, chased);
 836
 837         free(*location);
 838         *location = chased;
 839
 840         return 1;
 841 }
 842
 843 static int apply_mount(
 844                 const char *root_directory,
 845                 MountEntry *m) {
 846
 847         bool rbind = true, make = false;
 848         const char *what;
 849         int r;
 850
 851         assert(m);
 852
 853         r = mount_entry_chase(root_directory, m, mount_entry_path(m), !IN_SET(m->mode, INACCESSIBLE, READONLY, READWRITE), &m->path_malloc);
 854         if (r <= 0)
 855                 return r;
 856
 857         log_debug("Applying namespace mount on %s", mount_entry_path(m));
 858
 859         switch (m->mode) {
 860
 861         case INACCESSIBLE: {
 862                 struct stat target;
 863
 864                 /* First, get rid of everything that is below if there
 865                  * is anything... Then, overmount it with an
 866                  * inaccessible path. */
 867                 (void) umount_recursive(mount_entry_path(m), 0);
 868
 869                 if (lstat(mount_entry_path(m), &target) < 0)
 870                         return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
 871
 872                 what = mode_to_inaccessible_node(target.st_mode);
 873                 if (!what) {
 874                         log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
 875                         return -ELOOP;
 876                 }
 877                 break;
 878         }
 879
 880         case READONLY:
 881         case READWRITE:
 882                 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
 883                 if (r < 0)
 884                         return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
 885                 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
 886                         return 0;
 887                 /* This isn't a mount point yet, let's make it one. */
 888                 what = mount_entry_path(m);
 889                 break;
 890
 891         case BIND_MOUNT:
 892                 rbind = false;
 893
 894                 _fallthrough_;
 895         case BIND_MOUNT_RECURSIVE:
 896                 /* Also chase the source mount */
 897
 898                 r = mount_entry_chase(root_directory, m, mount_entry_source(m), false, &m->source_malloc);
 899                 if (r <= 0)
 900                         return r;
 901
 902                 what = mount_entry_source(m);
 903                 make = true;
 904                 break;
 905
 906         case EMPTY_DIR:
 907         case TMPFS:
 908                 return mount_tmpfs(m);
 909
 910         case PRIVATE_TMP:
 911                 what = mount_entry_source(m);
 912                 make = true;
 913                 break;
 914
 915         case PRIVATE_DEV:
 916                 return mount_private_dev(m);
 917
 918         case BIND_DEV:
 919                 return mount_bind_dev(m);
 920
 921         case SYSFS:
 922                 return mount_sysfs(m);
 923
 924         case PROCFS:
 925                 return mount_procfs(m);
 926
 927         default:
 928                 assert_not_reached("Unknown mode");
 929         }
 930
 931         assert(what);
 932
 933         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
 934                 bool try_again = false;
 935                 r = -errno;
 936
 937                 if (r == -ENOENT && make) {
 938                         struct stat st;
 939
 940                         /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
 941
 942                         if (stat(what, &st) >= 0) {
 943
 944                                 (void) mkdir_parents(mount_entry_path(m), 0755);
 945
 946                                 if (S_ISDIR(st.st_mode))
 947                                         try_again = mkdir(mount_entry_path(m), 0755) >= 0;
 948                                 else
 949                                         try_again = touch(mount_entry_path(m)) >= 0;
 950                         }
 951                 }
 952
 953                 if (try_again) {
 954                         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
 955                                 r = -errno;
 956                         else
 957                                 r = 0;
 958                 }
 959
 960                 if (r < 0)
 961                         return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
 962         }
 963
 964         log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
 965         return 0;
 966 }
 967
 968 static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
 969         int r = 0;
 970
 971         assert(m);
 972         assert(proc_self_mountinfo);
 973
 974         if (mount_entry_read_only(m)) {
 975                 if (IN_SET(m->mode, EMPTY_DIR, TMPFS)) {
 976                         /* Make superblock readonly */
 977                         if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT | MS_RDONLY | m->flags, mount_entry_options(m)) < 0)
 978                                 r = -errno;
 979                 } else
 980                         r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
 981         } else if (m->mode == PRIVATE_DEV) {
 982                 /* Superblock can be readonly but the submounts can't */
 983                 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
 984                         r = -errno;
 985         } else
 986                 return 0;
 987
 988         /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
 989          * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
 990          * read-only mounts already applied. */
 991
 992         if (r == -ENOENT && m->ignore)
 993                 r = 0;
 994
 995         return r;
 996 }
 997
 998 static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
 999         assert(ns_info);
1000
1001         /*
1002          * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1003          * since to protect the API VFS mounts, they need to be around in the
1004          * first place... and RootDirectory= or RootImage= need to be set.
1005          */
1006
1007         /* root_directory should point to a mount point */
1008         return root_directory &&
1009                 (ns_info->mount_apivfs ||
1010                  ns_info->protect_control_groups ||
1011                  ns_info->protect_kernel_tunables);
1012 }
1013
1014 static unsigned namespace_calculate_mounts(
1015                 const char* root_directory,
1016                 const NamespaceInfo *ns_info,
1017                 char** read_write_paths,
1018                 char** read_only_paths,
1019                 char** inaccessible_paths,
1020                 char** empty_directories,
1021                 unsigned n_bind_mounts,
1022                 unsigned n_temporary_filesystems,
1023                 const char* tmp_dir,
1024                 const char* var_tmp_dir,
1025                 ProtectHome protect_home,
1026                 ProtectSystem protect_system) {
1027
1028         unsigned protect_home_cnt;
1029         unsigned protect_system_cnt =
1030                 (protect_system == PROTECT_SYSTEM_STRICT ?
1031                  ELEMENTSOF(protect_system_strict_table) :
1032                  ((protect_system == PROTECT_SYSTEM_FULL) ?
1033                   ELEMENTSOF(protect_system_full_table) :
1034                   ((protect_system == PROTECT_SYSTEM_YES) ?
1035                    ELEMENTSOF(protect_system_yes_table) : 0)));
1036
1037         protect_home_cnt =
1038                 (protect_home == PROTECT_HOME_YES ?
1039                  ELEMENTSOF(protect_home_yes_table) :
1040                  ((protect_home == PROTECT_HOME_READ_ONLY) ?
1041                   ELEMENTSOF(protect_home_read_only_table) :
1042                   ((protect_home == PROTECT_HOME_TMPFS) ?
1043                    ELEMENTSOF(protect_home_tmpfs_table) : 0)));
1044
1045         return !!tmp_dir + !!var_tmp_dir +
1046                 strv_length(read_write_paths) +
1047                 strv_length(read_only_paths) +
1048                 strv_length(inaccessible_paths) +
1049                 strv_length(empty_directories) +
1050                 n_bind_mounts +
1051                 n_temporary_filesystems +
1052                 ns_info->private_dev +
1053                 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1054                 (ns_info->protect_control_groups ? 1 : 0) +
1055                 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
1056                 protect_home_cnt + protect_system_cnt +
1057                 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
1058 }
1059
/* Sets up a new mount namespace for the calling process according to the passed configuration.
 *
 * The steps, in order:
 *   1. If root_image is set, attach it to a loop device, dissect it and (if needed) decrypt it.
 *   2. Pick the directory the new root is assembled in ("root"), if any.
 *   3. Build, prefix, sort and prune the list of mount entries.
 *   4. unshare() the mount namespace and optionally mark "/" as MS_SLAVE.
 *   5. Mount the root image / bind mount the root directory / bind mount "/" to the temporary root.
 *   6. Apply all mount entries, then flip read-only bits in a second pass.
 *   7. Move the new root into place and remount "/" with the requested propagation flags.
 *
 * Returns 0 on success, and a negative error value on failure. Errors before the mount list is built return
 * directly; later ones go through "finish" so that the per-entry resources are released. */
int setup_namespace(
                const char* root_directory,
                const char* root_image,
                const NamespaceInfo *ns_info,
                char** read_write_paths,
                char** read_only_paths,
                char** inaccessible_paths,
                char** empty_directories,
                const BindMount *bind_mounts,
                unsigned n_bind_mounts,
                const TemporaryFileSystem *temporary_filesystems,
                unsigned n_temporary_filesystems,
                const char* tmp_dir,
                const char* var_tmp_dir,
                ProtectHome protect_home,
                ProtectSystem protect_system,
                unsigned long mount_flags,
                DissectImageFlags dissect_image_flags) {

        _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
        _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
        _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
        _cleanup_free_ void *root_hash = NULL;
        MountEntry *m, *mounts = NULL;
        size_t root_hash_size = 0;
        bool make_slave = false;
        const char *root;
        unsigned n_mounts;
        bool require_prefix = false;
        int r = 0;

        assert(ns_info);

        /* Default to shared propagation if the caller did not request anything specific */
        if (mount_flags == 0)
                mount_flags = MS_SHARED;

        if (root_image) {
                dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;

                /* With strict system protection and no writable paths configured, the image is opened
                 * read-only. */
                if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
                        dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;

                r = loop_device_make_by_path(root_image,
                                             dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
                                             &loop_device);
                if (r < 0)
                        return r;

                r = root_hash_load(root_image, &root_hash, &root_hash_size);
                if (r < 0)
                        return r;

                r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
                if (r < 0)
                        return r;

                r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
                if (r < 0)
                        return r;
        }

        if (root_directory)
                root = root_directory;
        else if (root_image || n_bind_mounts > 0 || n_temporary_filesystems > 0) {

                /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
                 * the same mount point for all images, which is safe, since they all live in their own namespaces
                 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
                 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
                 * while we are applying them. */

                root = "/run/systemd/unit-root";
                (void) mkdir_label(root, 0700);
                require_prefix = true;
        } else
                root = NULL;

        /* Determine the number of MountEntry slots needed; this must match exactly what gets appended below
         * (see the assert() after the append calls). */
        n_mounts = namespace_calculate_mounts(
                        root,
                        ns_info,
                        read_write_paths,
                        read_only_paths,
                        inaccessible_paths,
                        empty_directories,
                        n_bind_mounts,
                        n_temporary_filesystems,
                        tmp_dir, var_tmp_dir,
                        protect_home, protect_system);

        /* Set mount slave mode */
        if (root || n_mounts > 0)
                make_slave = true;

        if (n_mounts > 0) {
                /* The entry array is zero-initialized and lives on the stack; "m" is the write cursor that
                 * the append helpers advance. */
                m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
                r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
                if (r < 0)
                        goto finish;

                r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
                if (r < 0)
                        goto finish;

                r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
                if (r < 0)
                        goto finish;

                r = append_empty_dir_mounts(&m, empty_directories);
                if (r < 0)
                        goto finish;

                r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
                if (r < 0)
                        goto finish;

                r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
                if (r < 0)
                        goto finish;

                if (tmp_dir) {
                        *(m++) = (MountEntry) {
                                .path_const = "/tmp",
                                .mode = PRIVATE_TMP,
                                .source_const = tmp_dir,
                        };
                }

                if (var_tmp_dir) {
                        *(m++) = (MountEntry) {
                                .path_const = "/var/tmp",
                                .mode = PRIVATE_TMP,
                                .source_const = var_tmp_dir,
                        };
                }

                if (ns_info->private_dev) {
                        *(m++) = (MountEntry) {
                                .path_const = "/dev",
                                .mode = PRIVATE_DEV,
                        };
                }

                if (ns_info->protect_kernel_tunables) {
                        r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
                        if (r < 0)
                                goto finish;
                }

                if (ns_info->protect_kernel_modules) {
                        r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
                        if (r < 0)
                                goto finish;
                }

                if (ns_info->protect_control_groups) {
                        *(m++) = (MountEntry) {
                                .path_const = "/sys/fs/cgroup",
                                .mode = READONLY,
                        };
                }

                r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
                if (r < 0)
                        goto finish;

                r = append_protect_system(&m, protect_system, false);
                if (r < 0)
                        goto finish;

                if (namespace_info_mount_apivfs(root, ns_info)) {
                        r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
                        if (r < 0)
                                goto finish;
                }

                /* Sanity check: we must have filled exactly as many slots as we calculated above */
                assert(mounts + n_mounts == m);

                /* Prepend the root directory where that's necessary */
                r = prefix_where_needed(mounts, n_mounts, root);
                if (r < 0)
                        goto finish;

                /* Sort the entries, then prune the list; each drop_*() call may reduce n_mounts */
                qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);

                drop_duplicates(mounts, &n_mounts);
                drop_outside_root(root, mounts, &n_mounts);
                drop_inaccessible(mounts, &n_mounts);
                drop_nop(mounts, &n_mounts);
        }

        if (unshare(CLONE_NEWNS) < 0) {
                r = -errno;
                goto finish;
        }

        if (make_slave) {
                /* Remount / as SLAVE so that nothing now mounted in the namespace
                   shows up in the parent */
                if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
                        r = -errno;
                        goto finish;
                }
        }

        if (root_image) {
                /* A root image is specified, mount it to the right place */
                r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
                if (r < 0)
                        goto finish;

                if (decrypted_image) {
                        r = decrypted_image_relinquish(decrypted_image);
                        if (r < 0)
                                goto finish;
                }

                loop_device_relinquish(loop_device);

        } else if (root_directory) {

                /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
                r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
                if (r < 0)
                        goto finish;
                if (r == 0) {
                        if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
                                r = -errno;
                                goto finish;
                        }
                }

        } else if (root) {

                /* Let's mount the main root directory to the root directory to use */
                if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
                        r = -errno;
                        goto finish;
                }
        }

        /* Try to set up the new root directory before mounting anything else there. */
        if (root_image || root_directory)
                (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);

        if (n_mounts > 0) {
                _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
                char **blacklist;
                unsigned j;

                /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
                 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
                proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
                if (!proc_self_mountinfo) {
                        r = -errno;
                        goto finish;
                }

                /* First round, add in all special mounts we need */
                for (m = mounts; m < mounts + n_mounts; ++m) {
                        r = apply_mount(root, m);
                        if (r < 0)
                                goto finish;
                }

                /* Create a blacklist we can pass to bind_mount_recursive(). It is a NULL-terminated,
                 * stack-allocated array that merely references the entries' paths. */
                blacklist = newa(char*, n_mounts+1);
                for (j = 0; j < n_mounts; j++)
                        blacklist[j] = (char*) mount_entry_path(mounts+j);
                blacklist[j] = NULL;

                /* Second round, flip the ro bits if necessary. */
                for (m = mounts; m < mounts + n_mounts; ++m) {
                        r = make_read_only(m, blacklist, proc_self_mountinfo);
                        if (r < 0)
                                goto finish;
                }
        }

        if (root) {
                /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
                r = mount_move_root(root);
                if (r < 0)
                        goto finish;
        }

        /* Remount / as the desired mode. Note that this will not
         * reestablish propagation from our side to the host, since
         * what's disconnected is disconnected. */
        if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
                r = -errno;
                goto finish;
        }

        r = 0;

finish:
        /* Release per-entry resources. If no entries were allocated, mounts is NULL and n_mounts is 0, so
         * the loop body never runs. */
        for (m = mounts; m < mounts + n_mounts; m++)
                mount_entry_done(m);

        return r;
}
1361
1362 void bind_mount_free_many(BindMount *b, unsigned n) {
1363         unsigned i;
1364
1365         assert(b || n == 0);
1366
1367         for (i = 0; i < n; i++) {
1368                 free(b[i].source);
1369                 free(b[i].destination);
1370         }
1371
1372         free(b);
1373 }
1374
1375 int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1376         _cleanup_free_ char *s = NULL, *d = NULL;
1377         BindMount *c;
1378
1379         assert(b);
1380         assert(n);
1381         assert(item);
1382
1383         s = strdup(item->source);
1384         if (!s)
1385                 return -ENOMEM;
1386
1387         d = strdup(item->destination);
1388         if (!d)
1389                 return -ENOMEM;
1390
1391         c = reallocarray(*b, *n + 1, sizeof(BindMount));
1392         if (!c)
1393                 return -ENOMEM;
1394
1395         *b = c;
1396
1397         c[(*n) ++] = (BindMount) {
1398                 .source = TAKE_PTR(s),
1399                 .destination = TAKE_PTR(d),
1400                 .read_only = item->read_only,
1401                 .recursive = item->recursive,
1402                 .ignore_enoent = item->ignore_enoent,
1403         };
1404
1405         return 0;
1406 }
1407
1408 void temporary_filesystem_free_many(TemporaryFileSystem *t, unsigned n) {
1409         unsigned i;
1410
1411         assert(t || n == 0);
1412
1413         for (i = 0; i < n; i++) {
1414                 free(t[i].path);
1415                 free(t[i].options);
1416         }
1417
1418         free(t);
1419 }
1420
1421 int temporary_filesystem_add(
1422                 TemporaryFileSystem **t,
1423                 unsigned *n,
1424                 const char *path,
1425                 const char *options) {
1426
1427         _cleanup_free_ char *p = NULL, *o = NULL;
1428         TemporaryFileSystem *c;
1429
1430         assert(t);
1431         assert(n);
1432         assert(path);
1433
1434         p = strdup(path);
1435         if (!p)
1436                 return -ENOMEM;
1437
1438         if (!isempty(options)) {
1439                 o = strdup(options);
1440                 if (!o)
1441                         return -ENOMEM;
1442         }
1443
1444         c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
1445         if (!c)
1446                 return -ENOMEM;
1447
1448         *t = c;
1449
1450         c[(*n) ++] = (TemporaryFileSystem) {
1451                 .path = TAKE_PTR(p),
1452                 .options = TAKE_PTR(o),
1453         };
1454
1455         return 0;
1456 }
1457
1458 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1459         _cleanup_free_ char *x = NULL;
1460         char bid[SD_ID128_STRING_MAX];
1461         sd_id128_t boot_id;
1462         int r;
1463
1464         assert(id);
1465         assert(prefix);
1466         assert(path);
1467
1468         /* We include the boot id in the directory so that after a
1469          * reboot we can easily identify obsolete directories. */
1470
1471         r = sd_id128_get_boot(&boot_id);
1472         if (r < 0)
1473                 return r;
1474
1475         x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1476         if (!x)
1477                 return -ENOMEM;
1478
1479         RUN_WITH_UMASK(0077)
1480                 if (!mkdtemp(x))
1481                         return -errno;
1482
1483         RUN_WITH_UMASK(0000) {
1484                 char *y;
1485
1486                 y = strjoina(x, "/tmp");
1487
1488                 if (mkdir(y, 0777 | S_ISVTX) < 0)
1489                         return -errno;
1490         }
1491
1492         *path = TAKE_PTR(x);
1493
1494         return 0;
1495 }
1496
/* Creates the pair of private temporary directories for "id", one below /tmp and one below /var/tmp.
 * On success 0 is returned and *tmp_dir and *var_tmp_dir are set to newly allocated path strings. On
 * failure a negative error value is returned and the output parameters are left untouched. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* The second directory could not be set up, hence roll back the first one: remove its inner "tmp"
         * directory, then the outer directory, then release the path string. */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1526
1527 int setup_netns(int netns_storage_socket[2]) {
1528         _cleanup_close_ int netns = -1;
1529         int r, q;
1530
1531         assert(netns_storage_socket);
1532         assert(netns_storage_socket[0] >= 0);
1533         assert(netns_storage_socket[1] >= 0);
1534
1535         /* We use the passed socketpair as a storage buffer for our
1536          * namespace reference fd. Whatever process runs this first
1537          * shall create a new namespace, all others should just join
1538          * it. To serialize that we use a file lock on the socket
1539          * pair.
1540          *
1541          * It's a bit crazy, but hey, works great! */
1542
1543         if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1544                 return -errno;
1545
1546         netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1547         if (netns == -EAGAIN) {
1548                 /* Nothing stored yet, so let's create a new namespace */
1549
1550                 if (unshare(CLONE_NEWNET) < 0) {
1551                         r = -errno;
1552                         goto fail;
1553                 }
1554
1555                 loopback_setup();
1556
1557                 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1558                 if (netns < 0) {
1559                         r = -errno;
1560                         goto fail;
1561                 }
1562
1563                 r = 1;
1564
1565         } else if (netns < 0) {
1566                 r = netns;
1567                 goto fail;
1568
1569         } else {
1570                 /* Yay, found something, so let's join the namespace */
1571                 if (setns(netns, CLONE_NEWNET) < 0) {
1572                         r = -errno;
1573                         goto fail;
1574                 }
1575
1576                 r = 0;
1577         }
1578
1579         q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1580         if (q < 0) {
1581                 r = q;
1582                 goto fail;
1583         }
1584
1585 fail:
1586         (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1587         return r;
1588 }
1589
1590 bool ns_type_supported(NamespaceType type) {
1591         const char *t, *ns_proc;
1592
1593         t = namespace_type_to_string(type);
1594         if (!t) /* Don't know how to translate this? Then it's not supported */
1595                 return false;
1596
1597         ns_proc = strjoina("/proc/self/ns/", t);
1598         return access(ns_proc, F_OK) == 0;
1599 }
1600
/* String names of the ProtectHome settings, indexed by the enum value. */
static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
        [PROTECT_HOME_NO] = "no",
        [PROTECT_HOME_YES] = "yes",
        [PROTECT_HOME_READ_ONLY] = "read-only",
        [PROTECT_HOME_TMPFS] = "tmpfs",
};

/* Generates the string lookup functions for the table above; protect_home_from_string() is one of them
 * (presumably along with a matching to-string counterpart — see the macro definition). */
DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1609
1610 ProtectHome parse_protect_home_or_bool(const char *s) {
1611         int r;
1612
1613         r = parse_boolean(s);
1614         if (r > 0)
1615                 return PROTECT_HOME_YES;
1616         if (r == 0)
1617                 return PROTECT_HOME_NO;
1618
1619         return protect_home_from_string(s);
1620 }
1621
/* String names of the ProtectSystem settings, indexed by the enum value. */
static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
        [PROTECT_SYSTEM_NO] = "no",
        [PROTECT_SYSTEM_YES] = "yes",
        [PROTECT_SYSTEM_FULL] = "full",
        [PROTECT_SYSTEM_STRICT] = "strict",
};

/* Generates the string lookup functions for the table above; protect_system_from_string() is one of them
 * (presumably along with a matching to-string counterpart — see the macro definition). */
DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
1630
1631 ProtectSystem parse_protect_system_or_bool(const char *s) {
1632         int r;
1633
1634         r = parse_boolean(s);
1635         if (r > 0)
1636                 return PROTECT_SYSTEM_YES;
1637         if (r == 0)
1638                 return PROTECT_SYSTEM_NO;
1639
1640         return protect_system_from_string(s);
1641 }
1642
/* String names of the namespace types, indexed by the enum value. ns_type_supported() appends these names
 * to "/proc/self/ns/", hence they need to match the entry names found in that directory. */
static const char* const namespace_type_table[] = {
        [NAMESPACE_MOUNT] = "mnt",
        [NAMESPACE_CGROUP] = "cgroup",
        [NAMESPACE_UTS] = "uts",
        [NAMESPACE_IPC] = "ipc",
        [NAMESPACE_USER] = "user",
        [NAMESPACE_PID] = "pid",
        [NAMESPACE_NET] = "net",
};

/* Generates the string lookup functions for the table above; namespace_type_to_string() is one of them
 * (presumably along with a matching from-string counterpart — see the macro definition). */
DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);