src/core/namespace.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <errno.h>
  22 #include <sched.h>
  23 #include <stdio.h>
  24 #include <string.h>
  25 #include <sys/mount.h>
  26 #include <sys/stat.h>
  27 #include <unistd.h>
  28 #include <linux/fs.h>
  29
  30 #include "alloc-util.h"
  31 #include "base-filesystem.h"
  32 #include "dev-setup.h"
  33 #include "fd-util.h"
  34 #include "fs-util.h"
  35 #include "label.h"
  36 #include "loop-util.h"
  37 #include "loopback-setup.h"
  38 #include "missing.h"
  39 #include "mkdir.h"
  40 #include "mount-util.h"
  41 #include "namespace.h"
  42 #include "path-util.h"
  43 #include "selinux-util.h"
  44 #include "socket-util.h"
  45 #include "stat-util.h"
  46 #include "string-table.h"
  47 #include "string-util.h"
  48 #include "strv.h"
  49 #include "umask-util.h"
  50 #include "user-util.h"
  51 #include "util.h"
  52
  53 #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
  54
  55 typedef enum MountMode {
  56         /* This is ordered by priority! */
  57         INACCESSIBLE,
  58         BIND_MOUNT,
  59         BIND_MOUNT_RECURSIVE,
  60         PRIVATE_TMP,
  61         PRIVATE_DEV,
  62         BIND_DEV,
  63         EMPTY_DIR,
  64         SYSFS,
  65         PROCFS,
  66         READONLY,
  67         READWRITE,
  68 } MountMode;
  69
  70 typedef struct MountEntry {
  71         const char *path_const;   /* Memory allocated on stack or static */
  72         MountMode mode:5;
  73         bool ignore:1;            /* Ignore if path does not exist? */
  74         bool has_prefix:1;        /* Already is prefixed by the root dir? */
  75         bool read_only:1;         /* Shall this mount point be read-only? */
  76         char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
  77         const char *source_const; /* The source path, for bind mounts */
  78         char *source_malloc;
  79 } MountEntry;
  80
  81 /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
  82  * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
  83 static const MountEntry apivfs_table[] = {
  84         { "/proc",               PROCFS,       false },
  85         { "/dev",                BIND_DEV,     false },
  86         { "/sys",                SYSFS,        false },
  87 };
  88
  89 /* ProtectKernelTunables= option and the related filesystem APIs */
  90 static const MountEntry protect_kernel_tunables_table[] = {
  91         { "/proc/sys",           READONLY,     false },
  92         { "/proc/sysrq-trigger", READONLY,     true  },
  93         { "/proc/latency_stats", READONLY,     true  },
  94         { "/proc/mtrr",          READONLY,     true  },
  95         { "/proc/apm",           READONLY,     true  }, /* Obsolete API, there's no point in permitting access to this, ever */
  96         { "/proc/acpi",          READONLY,     true  },
  97         { "/proc/timer_stats",   READONLY,     true  },
  98         { "/proc/asound",        READONLY,     true  },
  99         { "/proc/bus",           READONLY,     true  },
 100         { "/proc/fs",            READONLY,     true  },
 101         { "/proc/irq",           READONLY,     true  },
 102         { "/sys",                READONLY,     false },
 103         { "/sys/kernel/debug",   READONLY,     true  },
 104         { "/sys/kernel/tracing", READONLY,     true  },
 105         { "/sys/fs/cgroup",      READWRITE,    false }, /* READONLY is set by ProtectControlGroups= option */
 106         { "/sys/fs/selinux",     READWRITE,    true  },
 107 };
 108
 109 /* ProtectKernelModules= option */
 110 static const MountEntry protect_kernel_modules_table[] = {
 111 #if HAVE_SPLIT_USR
 112         { "/lib/modules",        INACCESSIBLE, true  },
 113 #endif
 114         { "/usr/lib/modules",    INACCESSIBLE, true  },
 115 };
 116
 117 /*
 118  * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
 119  * system should be protected by ProtectSystem=
 120  */
 121 static const MountEntry protect_home_read_only_table[] = {
 122         { "/home",               READONLY,     true  },
 123         { "/run/user",           READONLY,     true  },
 124         { "/root",               READONLY,     true  },
 125 };
 126
 127 /* ProtectHome=yes table */
 128 static const MountEntry protect_home_yes_table[] = {
 129         { "/home",               INACCESSIBLE, true  },
 130         { "/run/user",           INACCESSIBLE, true  },
 131         { "/root",               INACCESSIBLE, true  },
 132 };
 133
 134 /* ProtectSystem=yes table */
 135 static const MountEntry protect_system_yes_table[] = {
 136         { "/usr",                READONLY,     false },
 137         { "/boot",               READONLY,     true  },
 138         { "/efi",                READONLY,     true  },
 139 };
 140
 141 /* ProtectSystem=full includes ProtectSystem=yes */
 142 static const MountEntry protect_system_full_table[] = {
 143         { "/usr",                READONLY,     false },
 144         { "/boot",               READONLY,     true  },
 145         { "/efi",                READONLY,     true  },
 146         { "/etc",                READONLY,     false },
 147 };
 148
 149 /*
 150  * ProtectSystem=strict table. In this strict mode, we mount everything
 151  * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
 152  * which are left writable, but PrivateDevices= + ProtectKernelTunables=
 153  * protect those, and these options should be fully orthogonal.
 154  * (And of course /home and friends are also left writable, as ProtectHome=
 155  * shall manage those, orthogonally).
 156  */
 157 static const MountEntry protect_system_strict_table[] = {
 158         { "/",                   READONLY,     false },
 159         { "/proc",               READWRITE,    false },      /* ProtectKernelTunables= */
 160         { "/sys",                READWRITE,    false },      /* ProtectKernelTunables= */
 161         { "/dev",                READWRITE,    false },      /* PrivateDevices= */
 162         { "/home",               READWRITE,    true  },      /* ProtectHome= */
 163         { "/run/user",           READWRITE,    true  },      /* ProtectHome= */
 164         { "/root",               READWRITE,    true  },      /* ProtectHome= */
 165 };
 166
 167 static const char *mount_entry_path(const MountEntry *p) {
 168         assert(p);
 169
 170         /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
 171          * otherwise the stack/static ->path field is returned. */
 172
 173         return p->path_malloc ?: p->path_const;
 174 }
 175
 176 static bool mount_entry_read_only(const MountEntry *p) {
 177         assert(p);
 178
 179         return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
 180 }
 181
 182 static const char *mount_entry_source(const MountEntry *p) {
 183         assert(p);
 184
 185         return p->source_malloc ?: p->source_const;
 186 }
 187
 188 static void mount_entry_done(MountEntry *p) {
 189         assert(p);
 190
 191         p->path_malloc = mfree(p->path_malloc);
 192         p->source_malloc = mfree(p->source_malloc);
 193 }
 194
 195 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
 196         char **i;
 197
 198         assert(p);
 199
 200         /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
 201
 202         STRV_FOREACH(i, strv) {
 203                 bool ignore = false, needs_prefix = false;
 204                 const char *e = *i;
 205
 206                 /* Look for any prefixes */
 207                 if (startswith(e, "-")) {
 208                         e++;
 209                         ignore = true;
 210                 }
 211                 if (startswith(e, "+")) {
 212                         e++;
 213                         needs_prefix = true;
 214                 }
 215
 216                 if (!path_is_absolute(e))
 217                         return -EINVAL;
 218
 219                 *((*p)++) = (MountEntry) {
 220                         .path_const = e,
 221                         .mode = mode,
 222                         .ignore = ignore,
 223                         .has_prefix = !needs_prefix && !forcibly_require_prefix,
 224                 };
 225         }
 226
 227         return 0;
 228 }
 229
 230 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
 231         char **i;
 232
 233         assert(p);
 234
 235         /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
 236          * "/private/" boundary directories for DynamicUser=1. */
 237
 238         STRV_FOREACH(i, strv) {
 239
 240                 *((*p)++) = (MountEntry) {
 241                         .path_const = *i,
 242                         .mode = EMPTY_DIR,
 243                         .ignore = false,
 244                         .has_prefix = false,
 245                         .read_only = true,
 246                 };
 247         }
 248
 249         return 0;
 250 }
 251
 252 static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
 253         unsigned i;
 254
 255         assert(p);
 256
 257         for (i = 0; i < n; i++) {
 258                 const BindMount *b = binds + i;
 259
 260                 *((*p)++) = (MountEntry) {
 261                         .path_const = b->destination,
 262                         .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
 263                         .read_only = b->read_only,
 264                         .source_const = b->source,
 265                 };
 266         }
 267
 268         return 0;
 269 }
 270
 271 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
 272         unsigned i;
 273
 274         assert(p);
 275         assert(mounts);
 276
 277         /* Adds a list of static pre-defined entries */
 278
 279         for (i = 0; i < n; i++)
 280                 *((*p)++) = (MountEntry) {
 281                         .path_const = mount_entry_path(mounts+i),
 282                         .mode = mounts[i].mode,
 283                         .ignore = mounts[i].ignore || ignore_protect,
 284                 };
 285
 286         return 0;
 287 }
 288
 289 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
 290         assert(p);
 291
 292         switch (protect_home) {
 293
 294         case PROTECT_HOME_NO:
 295                 return 0;
 296
 297         case PROTECT_HOME_READ_ONLY:
 298                 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
 299
 300         case PROTECT_HOME_YES:
 301                 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
 302
 303         default:
 304                 assert_not_reached("Unexpected ProtectHome= value");
 305         }
 306 }
 307
 308 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
 309         assert(p);
 310
 311         switch (protect_system) {
 312
 313         case PROTECT_SYSTEM_NO:
 314                 return 0;
 315
 316         case PROTECT_SYSTEM_STRICT:
 317                 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
 318
 319         case PROTECT_SYSTEM_YES:
 320                 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
 321
 322         case PROTECT_SYSTEM_FULL:
 323                 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
 324
 325         default:
 326                 assert_not_reached("Unexpected ProtectSystem= value");
 327         }
 328 }
 329
 330 static int mount_path_compare(const void *a, const void *b) {
 331         const MountEntry *p = a, *q = b;
 332         int d;
 333
 334         /* If the paths are not equal, then order prefixes first */
 335         d = path_compare(mount_entry_path(p), mount_entry_path(q));
 336         if (d != 0)
 337                 return d;
 338
 339         /* If the paths are equal, check the mode */
 340         if (p->mode < q->mode)
 341                 return -1;
 342
 343         if (p->mode > q->mode)
 344                 return 1;
 345
 346         return 0;
 347 }
 348
 349 static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
 350         unsigned i;
 351
 352         /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
 353          * that. */
 354
 355         if (!root_directory)
 356                 return 0;
 357
 358         for (i = 0; i < n; i++) {
 359                 char *s;
 360
 361                 if (m[i].has_prefix)
 362                         continue;
 363
 364                 s = prefix_root(root_directory, mount_entry_path(m+i));
 365                 if (!s)
 366                         return -ENOMEM;
 367
 368                 free(m[i].path_malloc);
 369                 m[i].path_malloc = s;
 370
 371                 m[i].has_prefix = true;
 372         }
 373
 374         return 0;
 375 }
 376
 377 static void drop_duplicates(MountEntry *m, unsigned *n) {
 378         MountEntry *f, *t, *previous;
 379
 380         assert(m);
 381         assert(n);
 382
 383         /* Drops duplicate entries. Expects that the array is properly ordered already. */
 384
 385         for (f = m, t = m, previous = NULL; f < m + *n; f++) {
 386
 387                 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
 388                  * above. */
 389                 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
 390                         log_debug("%s is duplicate.", mount_entry_path(f));
 391                         previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
 392                         mount_entry_done(f);
 393                         continue;
 394                 }
 395
 396                 *t = *f;
 397                 previous = t;
 398                 t++;
 399         }
 400
 401         *n = t - m;
 402 }
 403
 404 static void drop_inaccessible(MountEntry *m, unsigned *n) {
 405         MountEntry *f, *t;
 406         const char *clear = NULL;
 407
 408         assert(m);
 409         assert(n);
 410
 411         /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
 412          * ordered already. */
 413
 414         for (f = m, t = m; f < m + *n; f++) {
 415
 416                 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
 417                  * it, as inaccessible paths really should drop the entire subtree. */
 418                 if (clear && path_startswith(mount_entry_path(f), clear)) {
 419                         log_debug("%s is masked by %s.", mount_entry_path(f), clear);
 420                         mount_entry_done(f);
 421                         continue;
 422                 }
 423
 424                 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
 425
 426                 *t = *f;
 427                 t++;
 428         }
 429
 430         *n = t - m;
 431 }
 432
 433 static void drop_nop(MountEntry *m, unsigned *n) {
 434         MountEntry *f, *t;
 435
 436         assert(m);
 437         assert(n);
 438
 439         /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
 440          * list is ordered by prefixes. */
 441
 442         for (f = m, t = m; f < m + *n; f++) {
 443
 444                 /* Only suppress such subtrees for READONLY and READWRITE entries */
 445                 if (IN_SET(f->mode, READONLY, READWRITE)) {
 446                         MountEntry *p;
 447                         bool found = false;
 448
 449                         /* Now let's find the first parent of the entry we are looking at. */
 450                         for (p = t-1; p >= m; p--) {
 451                                 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
 452                                         found = true;
 453                                         break;
 454                                 }
 455                         }
 456
 457                         /* We found it, let's see if it's the same mode, if so, we can drop this entry */
 458                         if (found && p->mode == f->mode) {
 459                                 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
 460                                 mount_entry_done(f);
 461                                 continue;
 462                         }
 463                 }
 464
 465                 *t = *f;
 466                 t++;
 467         }
 468
 469         *n = t - m;
 470 }
 471
 472 static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
 473         MountEntry *f, *t;
 474
 475         assert(m);
 476         assert(n);
 477
 478         /* Nothing to do */
 479         if (!root_directory)
 480                 return;
 481
 482         /* Drops all mounts that are outside of the root directory. */
 483
 484         for (f = m, t = m; f < m + *n; f++) {
 485
 486                 if (!path_startswith(mount_entry_path(f), root_directory)) {
 487                         log_debug("%s is outside of root directory.", mount_entry_path(f));
 488                         mount_entry_done(f);
 489                         continue;
 490                 }
 491
 492                 *t = *f;
 493                 t++;
 494         }
 495
 496         *n = t - m;
 497 }
 498
 499 static int clone_device_node(const char *d, const char *temporary_mount) {
 500         const char *dn;
 501         struct stat st;
 502         int r;
 503
 504         if (stat(d, &st) < 0) {
 505                 if (errno == ENOENT)
 506                         return 0;
 507                 return -errno;
 508         }
 509
 510         if (!S_ISBLK(st.st_mode) &&
 511             !S_ISCHR(st.st_mode))
 512                 return -EINVAL;
 513
 514         if (st.st_rdev == 0)
 515                 return 0;
 516
 517         dn = strjoina(temporary_mount, d);
 518
 519         mac_selinux_create_file_prepare(d, st.st_mode);
 520         r = mknod(dn, st.st_mode, st.st_rdev);
 521         mac_selinux_create_file_clear();
 522         if (r < 0)
 523                 return log_debug_errno(errno, "mknod failed for %s: %m", d);
 524
 525         return 1;
 526 }
 527
 528 static int mount_private_dev(MountEntry *m) {
 529         static const char devnodes[] =
 530                 "/dev/null\0"
 531                 "/dev/zero\0"
 532                 "/dev/full\0"
 533                 "/dev/random\0"
 534                 "/dev/urandom\0"
 535                 "/dev/tty\0";
 536
 537         char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
 538         const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
 539         _cleanup_umask_ mode_t u;
 540         int r;
 541
 542         assert(m);
 543
 544         u = umask(0000);
 545
 546         if (!mkdtemp(temporary_mount))
 547                 return -errno;
 548
 549         dev = strjoina(temporary_mount, "/dev");
 550         (void) mkdir(dev, 0755);
 551         if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
 552                 r = -errno;
 553                 goto fail;
 554         }
 555
 556         devpts = strjoina(temporary_mount, "/dev/pts");
 557         (void) mkdir(devpts, 0755);
 558         if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
 559                 r = -errno;
 560                 goto fail;
 561         }
 562
 563         /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
 564          * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
 565          * thus, in that case make a clone
 566          *
 567          * in nspawn and other containers it will be a symlink, in that case make it a symlink
 568          */
 569         r = is_symlink("/dev/ptmx");
 570         if (r < 0)
 571                 goto fail;
 572         if (r > 0) {
 573                 devptmx = strjoina(temporary_mount, "/dev/ptmx");
 574                 if (symlink("pts/ptmx", devptmx) < 0) {
 575                         r = -errno;
 576                         goto fail;
 577                 }
 578         } else {
 579                 r = clone_device_node("/dev/ptmx", temporary_mount);
 580                 if (r < 0)
 581                         goto fail;
 582                 if (r == 0) {
 583                         r = -ENXIO;
 584                         goto fail;
 585                 }
 586         }
 587
 588         devshm = strjoina(temporary_mount, "/dev/shm");
 589         (void) mkdir(devshm, 0755);
 590         r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
 591         if (r < 0) {
 592                 r = -errno;
 593                 goto fail;
 594         }
 595
 596         devmqueue = strjoina(temporary_mount, "/dev/mqueue");
 597         (void) mkdir(devmqueue, 0755);
 598         (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
 599
 600         devhugepages = strjoina(temporary_mount, "/dev/hugepages");
 601         (void) mkdir(devhugepages, 0755);
 602         (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
 603
 604         devlog = strjoina(temporary_mount, "/dev/log");
 605         (void) symlink("/run/systemd/journal/dev-log", devlog);
 606
 607         NULSTR_FOREACH(d, devnodes) {
 608                 r = clone_device_node(d, temporary_mount);
 609                 if (r < 0)
 610                         goto fail;
 611         }
 612
 613         dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
 614
 615         /* Create the /dev directory if missing. It is more likely to be
 616          * missing when the service is started with RootDirectory. This is
 617          * consistent with mount units creating the mount points when missing.
 618          */
 619         (void) mkdir_p_label(mount_entry_path(m), 0755);
 620
 621         /* Unmount everything in old /dev */
 622         umount_recursive(mount_entry_path(m), 0);
 623         if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
 624                 r = -errno;
 625                 goto fail;
 626         }
 627
 628         rmdir(dev);
 629         rmdir(temporary_mount);
 630
 631         return 0;
 632
 633 fail:
 634         if (devpts)
 635                 umount(devpts);
 636
 637         if (devshm)
 638                 umount(devshm);
 639
 640         if (devhugepages)
 641                 umount(devhugepages);
 642
 643         if (devmqueue)
 644                 umount(devmqueue);
 645
 646         umount(dev);
 647         rmdir(dev);
 648         rmdir(temporary_mount);
 649
 650         return r;
 651 }
 652
 653 static int mount_bind_dev(const MountEntry *m) {
 654         int r;
 655
 656         assert(m);
 657
 658         /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
 659          * /dev. This is only used when RootDirectory= is set. */
 660
 661         (void) mkdir_p_label(mount_entry_path(m), 0755);
 662
 663         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 664         if (r < 0)
 665                 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
 666         if (r > 0) /* make this a NOP if /dev is already a mount point */
 667                 return 0;
 668
 669         if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 670                 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
 671
 672         return 1;
 673 }
 674
 675 static int mount_sysfs(const MountEntry *m) {
 676         int r;
 677
 678         assert(m);
 679
 680         (void) mkdir_p_label(mount_entry_path(m), 0755);
 681
 682         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 683         if (r < 0)
 684                 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
 685         if (r > 0) /* make this a NOP if /sys is already a mount point */
 686                 return 0;
 687
 688         /* Bind mount the host's version so that we get all child mounts of it, too. */
 689         if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 690                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 691
 692         return 1;
 693 }
 694
 695 static int mount_procfs(const MountEntry *m) {
 696         int r;
 697
 698         assert(m);
 699
 700         (void) mkdir_p_label(mount_entry_path(m), 0755);
 701
 702         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 703         if (r < 0)
 704                 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
 705         if (r > 0) /* make this a NOP if /proc is already a mount point */
 706                 return 0;
 707
 708         /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
 709         if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
 710                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 711
 712         return 1;
 713 }
 714
 715 static int mount_empty_dir(const MountEntry *m) {
 716         assert(m);
 717
 718         /* First, get rid of everything that is below if there is anything. Then, overmount with our new empty dir */
 719
 720         (void) mkdir_p_label(mount_entry_path(m), 0755);
 721         (void) umount_recursive(mount_entry_path(m), 0);
 722
 723         if (mount("tmpfs", mount_entry_path(m), "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
 724                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 725
 726         return 1;
 727 }
 728
 729 static int mount_entry_chase(
 730                 const char *root_directory,
 731                 const MountEntry *m,
 732                 const char *path,
 733                 char **location) {
 734
 735         char *chased;
 736         int r;
 737         unsigned flags = 0;
 738
 739         assert(m);
 740
 741         /* Since mount() will always follow symlinks and we need to take the different root directory into account we
 742          * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
 743          * that applies). The result is stored in "location". */
 744
 745         if (IN_SET(m->mode,
 746                    BIND_MOUNT,
 747                    BIND_MOUNT_RECURSIVE,
 748                    PRIVATE_TMP,
 749                    PRIVATE_DEV,
 750                    BIND_DEV,
 751                    EMPTY_DIR,
 752                    SYSFS,
 753                    PROCFS))
 754                 flags |= CHASE_NONEXISTENT;
 755
 756         r = chase_symlinks(path, root_directory, flags, &chased);
 757         if (r == -ENOENT && m->ignore) {
 758                 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
 759                 return 0;
 760         }
 761         if (r < 0)
 762                 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
 763
 764         log_debug("Followed symlinks %s → %s.", path, chased);
 765
 766         free(*location);
 767         *location = chased;
 768
 769         return 1;
 770 }
 771
 772 static int apply_mount(
 773                 const char *root_directory,
 774                 MountEntry *m) {
 775
 776         bool rbind = true, make = false;
 777         const char *what;
 778         int r;
 779
 780         assert(m);
 781
 782         r = mount_entry_chase(root_directory, m, mount_entry_path(m), &m->path_malloc);
 783         if (r <= 0)
 784                 return r;
 785
 786         log_debug("Applying namespace mount on %s", mount_entry_path(m));
 787
 788         switch (m->mode) {
 789
 790         case INACCESSIBLE: {
 791                 struct stat target;
 792
 793                 /* First, get rid of everything that is below if there
 794                  * is anything... Then, overmount it with an
 795                  * inaccessible path. */
 796                 (void) umount_recursive(mount_entry_path(m), 0);
 797
 798                 if (lstat(mount_entry_path(m), &target) < 0)
 799                         return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
 800
 801                 what = mode_to_inaccessible_node(target.st_mode);
 802                 if (!what) {
 803                         log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
 804                         return -ELOOP;
 805                 }
 806                 break;
 807         }
 808
 809         case READONLY:
 810         case READWRITE:
 811                 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
 812                 if (r < 0)
 813                         return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
 814                 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
 815                         return 0;
 816                 /* This isn't a mount point yet, let's make it one. */
 817                 what = mount_entry_path(m);
 818                 break;
 819
 820         case BIND_MOUNT:
 821                 rbind = false;
 822
 823                 _fallthrough_;
 824         case BIND_MOUNT_RECURSIVE:
 825                 /* Also chase the source mount */
 826
 827                 r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
 828                 if (r <= 0)
 829                         return r;
 830
 831                 what = mount_entry_source(m);
 832                 make = true;
 833                 break;
 834
 835         case EMPTY_DIR:
 836                 return mount_empty_dir(m);
 837
 838         case PRIVATE_TMP:
 839                 what = mount_entry_source(m);
 840                 make = true;
 841                 break;
 842
 843         case PRIVATE_DEV:
 844                 return mount_private_dev(m);
 845
 846         case BIND_DEV:
 847                 return mount_bind_dev(m);
 848
 849         case SYSFS:
 850                 return mount_sysfs(m);
 851
 852         case PROCFS:
 853                 return mount_procfs(m);
 854
 855         default:
 856                 assert_not_reached("Unknown mode");
 857         }
 858
 859         assert(what);
 860
 861         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
 862                 bool try_again = false;
 863                 r = -errno;
 864
 865                 if (r == -ENOENT && make) {
 866                         struct stat st;
 867
 868                         /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
 869
 870                         if (stat(what, &st) >= 0) {
 871
 872                                 (void) mkdir_parents(mount_entry_path(m), 0755);
 873
 874                                 if (S_ISDIR(st.st_mode))
 875                                         try_again = mkdir(mount_entry_path(m), 0755) >= 0;
 876                                 else
 877                                         try_again = touch(mount_entry_path(m)) >= 0;
 878                         }
 879                 }
 880
 881                 if (try_again) {
 882                         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
 883                                 r = -errno;
 884                         else
 885                                 r = 0;
 886                 }
 887
 888                 if (r < 0)
 889                         return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
 890         }
 891
 892         log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
 893         return 0;
 894 }
 895
 896 static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
 897         int r = 0;
 898
 899         assert(m);
 900         assert(proc_self_mountinfo);
 901
 902         if (mount_entry_read_only(m))
 903                 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
 904         else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't */
 905                 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
 906                         r = -errno;
 907         } else
 908                 return 0;
 909
 910         /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
 911          * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
 912          * read-only mounts already applied. */
 913
 914         if (r == -ENOENT && m->ignore)
 915                 r = 0;
 916
 917         return r;
 918 }
 919
 920 static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
 921         assert(ns_info);
 922
 923         /*
 924          * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
 925          * since to protect the API VFS mounts, they need to be around in the
 926          * first place... and RootDirectory= or RootImage= need to be set.
 927          */
 928
 929         /* root_directory should point to a mount point */
 930         return root_directory &&
 931                 (ns_info->mount_apivfs ||
 932                  ns_info->protect_control_groups ||
 933                  ns_info->protect_kernel_tunables);
 934 }
 935
 936 static unsigned namespace_calculate_mounts(
 937                 const char* root_directory,
 938                 const NamespaceInfo *ns_info,
 939                 char** read_write_paths,
 940                 char** read_only_paths,
 941                 char** inaccessible_paths,
 942                 char** empty_directories,
 943                 const BindMount *bind_mounts,
 944                 unsigned n_bind_mounts,
 945                 const char* tmp_dir,
 946                 const char* var_tmp_dir,
 947                 ProtectHome protect_home,
 948                 ProtectSystem protect_system) {
 949
 950         unsigned protect_home_cnt;
 951         unsigned protect_system_cnt =
 952                 (protect_system == PROTECT_SYSTEM_STRICT ?
 953                  ELEMENTSOF(protect_system_strict_table) :
 954                  ((protect_system == PROTECT_SYSTEM_FULL) ?
 955                   ELEMENTSOF(protect_system_full_table) :
 956                   ((protect_system == PROTECT_SYSTEM_YES) ?
 957                    ELEMENTSOF(protect_system_yes_table) : 0)));
 958
 959         protect_home_cnt =
 960                 (protect_home == PROTECT_HOME_YES ?
 961                  ELEMENTSOF(protect_home_yes_table) :
 962                  ((protect_home == PROTECT_HOME_READ_ONLY) ?
 963                   ELEMENTSOF(protect_home_read_only_table) : 0));
 964
 965         return !!tmp_dir + !!var_tmp_dir +
 966                 strv_length(read_write_paths) +
 967                 strv_length(read_only_paths) +
 968                 strv_length(inaccessible_paths) +
 969                 strv_length(empty_directories) +
 970                 n_bind_mounts +
 971                 ns_info->private_dev +
 972                 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
 973                 (ns_info->protect_control_groups ? 1 : 0) +
 974                 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
 975                 protect_home_cnt + protect_system_cnt +
 976                 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
 977 }
 978
 979 int setup_namespace(
 980                 const char* root_directory,
 981                 const char* root_image,
 982                 const NamespaceInfo *ns_info,
 983                 char** read_write_paths,
 984                 char** read_only_paths,
 985                 char** inaccessible_paths,
 986                 char** empty_directories,
 987                 const BindMount *bind_mounts,
 988                 unsigned n_bind_mounts,
 989                 const char* tmp_dir,
 990                 const char* var_tmp_dir,
 991                 ProtectHome protect_home,
 992                 ProtectSystem protect_system,
 993                 unsigned long mount_flags,
 994                 DissectImageFlags dissect_image_flags) {
 995
 996         _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
 997         _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
 998         _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
 999         _cleanup_free_ void *root_hash = NULL;
1000         MountEntry *m, *mounts = NULL;
1001         size_t root_hash_size = 0;
1002         bool make_slave = false;
1003         const char *root;
1004         unsigned n_mounts;
1005         bool require_prefix = false;
1006         int r = 0;
1007
1008         assert(ns_info);
1009
1010         if (mount_flags == 0)
1011                 mount_flags = MS_SHARED;
1012
1013         if (root_image) {
1014                 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1015
1016                 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
1017                         dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1018
1019                 r = loop_device_make_by_path(root_image,
1020                                              dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1021                                              &loop_device);
1022                 if (r < 0)
1023                         return r;
1024
1025                 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1026                 if (r < 0)
1027                         return r;
1028
1029                 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1030                 if (r < 0)
1031                         return r;
1032
1033                 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
1034                 if (r < 0)
1035                         return r;
1036         }
1037
1038         if (root_directory)
1039                 root = root_directory;
1040         else if (root_image || n_bind_mounts > 0) {
1041
1042                 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1043                  * the same mount point for all images, which is safe, since they all live in their own namespaces
1044                  * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1045                  * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1046                  * while we are applying them. */
1047
1048                 root = "/run/systemd/unit-root";
1049                 (void) mkdir_label(root, 0700);
1050                 require_prefix = true;
1051         } else
1052                 root = NULL;
1053
1054         n_mounts = namespace_calculate_mounts(
1055                         root,
1056                         ns_info,
1057                         read_write_paths,
1058                         read_only_paths,
1059                         inaccessible_paths,
1060                         empty_directories,
1061                         bind_mounts, n_bind_mounts,
1062                         tmp_dir, var_tmp_dir,
1063                         protect_home, protect_system);
1064
1065         /* Set mount slave mode */
1066         if (root || n_mounts > 0)
1067                 make_slave = true;
1068
1069         if (n_mounts > 0) {
1070                 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
1071                 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
1072                 if (r < 0)
1073                         goto finish;
1074
1075                 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
1076                 if (r < 0)
1077                         goto finish;
1078
1079                 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
1080                 if (r < 0)
1081                         goto finish;
1082
1083                 r = append_empty_dir_mounts(&m, empty_directories);
1084                 if (r < 0)
1085                         goto finish;
1086
1087                 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1088                 if (r < 0)
1089                         goto finish;
1090
1091                 if (tmp_dir) {
1092                         *(m++) = (MountEntry) {
1093                                 .path_const = "/tmp",
1094                                 .mode = PRIVATE_TMP,
1095                                 .source_const = tmp_dir,
1096                         };
1097                 }
1098
1099                 if (var_tmp_dir) {
1100                         *(m++) = (MountEntry) {
1101                                 .path_const = "/var/tmp",
1102                                 .mode = PRIVATE_TMP,
1103                                 .source_const = var_tmp_dir,
1104                         };
1105                 }
1106
1107                 if (ns_info->private_dev) {
1108                         *(m++) = (MountEntry) {
1109                                 .path_const = "/dev",
1110                                 .mode = PRIVATE_DEV,
1111                         };
1112                 }
1113
1114                 if (ns_info->protect_kernel_tunables) {
1115                         r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
1116                         if (r < 0)
1117                                 goto finish;
1118                 }
1119
1120                 if (ns_info->protect_kernel_modules) {
1121                         r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
1122                         if (r < 0)
1123                                 goto finish;
1124                 }
1125
1126                 if (ns_info->protect_control_groups) {
1127                         *(m++) = (MountEntry) {
1128                                 .path_const = "/sys/fs/cgroup",
1129                                 .mode = READONLY,
1130                         };
1131                 }
1132
1133                 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
1134                 if (r < 0)
1135                         goto finish;
1136
1137                 r = append_protect_system(&m, protect_system, false);
1138                 if (r < 0)
1139                         goto finish;
1140
1141                 if (namespace_info_mount_apivfs(root, ns_info)) {
1142                         r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1143                         if (r < 0)
1144                                 goto finish;
1145                 }
1146
1147                 assert(mounts + n_mounts == m);
1148
1149                 /* Prepend the root directory where that's necessary */
1150                 r = prefix_where_needed(mounts, n_mounts, root);
1151                 if (r < 0)
1152                         goto finish;
1153
1154                 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
1155
1156                 drop_duplicates(mounts, &n_mounts);
1157                 drop_outside_root(root, mounts, &n_mounts);
1158                 drop_inaccessible(mounts, &n_mounts);
1159                 drop_nop(mounts, &n_mounts);
1160         }
1161
1162         if (unshare(CLONE_NEWNS) < 0) {
1163                 r = -errno;
1164                 goto finish;
1165         }
1166
1167         if (make_slave) {
1168                 /* Remount / as SLAVE so that nothing now mounted in the namespace
1169                    shows up in the parent */
1170                 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1171                         r = -errno;
1172                         goto finish;
1173                 }
1174         }
1175
1176         if (root_image) {
1177                 /* A root image is specified, mount it to the right place */
1178                 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
1179                 if (r < 0)
1180                         goto finish;
1181
1182                 if (decrypted_image) {
1183                         r = decrypted_image_relinquish(decrypted_image);
1184                         if (r < 0)
1185                                 goto finish;
1186                 }
1187
1188                 loop_device_relinquish(loop_device);
1189
1190         } else if (root_directory) {
1191
1192                 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1193                 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
1194                 if (r < 0)
1195                         goto finish;
1196                 if (r == 0) {
1197                         if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1198                                 r = -errno;
1199                                 goto finish;
1200                         }
1201                 }
1202
1203         } else if (root) {
1204
1205                 /* Let's mount the main root directory to the root directory to use */
1206                 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1207                         r = -errno;
1208                         goto finish;
1209                 }
1210         }
1211
1212         /* Try to set up the new root directory before mounting anything else there. */
1213         if (root_image || root_directory)
1214                 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1215
1216         if (n_mounts > 0) {
1217                 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
1218                 char **blacklist;
1219                 unsigned j;
1220
1221                 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1222                  * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1223                 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1224                 if (!proc_self_mountinfo) {
1225                         r = -errno;
1226                         goto finish;
1227                 }
1228
1229                 /* First round, add in all special mounts we need */
1230                 for (m = mounts; m < mounts + n_mounts; ++m) {
1231                         r = apply_mount(root, m);
1232                         if (r < 0)
1233                                 goto finish;
1234                 }
1235
1236                 /* Create a blacklist we can pass to bind_mount_recursive() */
1237                 blacklist = newa(char*, n_mounts+1);
1238                 for (j = 0; j < n_mounts; j++)
1239                         blacklist[j] = (char*) mount_entry_path(mounts+j);
1240                 blacklist[j] = NULL;
1241
1242                 /* Second round, flip the ro bits if necessary. */
1243                 for (m = mounts; m < mounts + n_mounts; ++m) {
1244                         r = make_read_only(m, blacklist, proc_self_mountinfo);
1245                         if (r < 0)
1246                                 goto finish;
1247                 }
1248         }
1249
1250         if (root) {
1251                 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1252                 r = mount_move_root(root);
1253                 if (r < 0)
1254                         goto finish;
1255         }
1256
1257         /* Remount / as the desired mode. Note that this will not
1258          * reestablish propagation from our side to the host, since
1259          * what's disconnected is disconnected. */
1260         if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1261                 r = -errno;
1262                 goto finish;
1263         }
1264
1265         r = 0;
1266
1267 finish:
1268         for (m = mounts; m < mounts + n_mounts; m++)
1269                 mount_entry_done(m);
1270
1271         return r;
1272 }
1273
1274 void bind_mount_free_many(BindMount *b, unsigned n) {
1275         unsigned i;
1276
1277         assert(b || n == 0);
1278
1279         for (i = 0; i < n; i++) {
1280                 free(b[i].source);
1281                 free(b[i].destination);
1282         }
1283
1284         free(b);
1285 }
1286
1287 int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1288         _cleanup_free_ char *s = NULL, *d = NULL;
1289         BindMount *c;
1290
1291         assert(b);
1292         assert(n);
1293         assert(item);
1294
1295         s = strdup(item->source);
1296         if (!s)
1297                 return -ENOMEM;
1298
1299         d = strdup(item->destination);
1300         if (!d)
1301                 return -ENOMEM;
1302
1303         c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1304         if (!c)
1305                 return -ENOMEM;
1306
1307         *b = c;
1308
1309         c[(*n) ++] = (BindMount) {
1310                 .source = s,
1311                 .destination = d,
1312                 .read_only = item->read_only,
1313                 .recursive = item->recursive,
1314                 .ignore_enoent = item->ignore_enoent,
1315         };
1316
1317         s = d = NULL;
1318         return 0;
1319 }
1320
1321 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1322         _cleanup_free_ char *x = NULL;
1323         char bid[SD_ID128_STRING_MAX];
1324         sd_id128_t boot_id;
1325         int r;
1326
1327         assert(id);
1328         assert(prefix);
1329         assert(path);
1330
1331         /* We include the boot id in the directory so that after a
1332          * reboot we can easily identify obsolete directories. */
1333
1334         r = sd_id128_get_boot(&boot_id);
1335         if (r < 0)
1336                 return r;
1337
1338         x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1339         if (!x)
1340                 return -ENOMEM;
1341
1342         RUN_WITH_UMASK(0077)
1343                 if (!mkdtemp(x))
1344                         return -errno;
1345
1346         RUN_WITH_UMASK(0000) {
1347                 char *y;
1348
1349                 y = strjoina(x, "/tmp");
1350
1351                 if (mkdir(y, 0777 | S_ISVTX) < 0)
1352                         return -errno;
1353         }
1354
1355         *path = x;
1356         x = NULL;
1357
1358         return 0;
1359 }
1360
1361 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
1362         char *a, *b;
1363         int r;
1364
1365         assert(id);
1366         assert(tmp_dir);
1367         assert(var_tmp_dir);
1368
1369         r = setup_one_tmp_dir(id, "/tmp", &a);
1370         if (r < 0)
1371                 return r;
1372
1373         r = setup_one_tmp_dir(id, "/var/tmp", &b);
1374         if (r < 0) {
1375                 char *t;
1376
1377                 t = strjoina(a, "/tmp");
1378                 rmdir(t);
1379                 rmdir(a);
1380
1381                 free(a);
1382                 return r;
1383         }
1384
1385         *tmp_dir = a;
1386         *var_tmp_dir = b;
1387
1388         return 0;
1389 }
1390
1391 int setup_netns(int netns_storage_socket[2]) {
1392         _cleanup_close_ int netns = -1;
1393         int r, q;
1394
1395         assert(netns_storage_socket);
1396         assert(netns_storage_socket[0] >= 0);
1397         assert(netns_storage_socket[1] >= 0);
1398
1399         /* We use the passed socketpair as a storage buffer for our
1400          * namespace reference fd. Whatever process runs this first
1401          * shall create a new namespace, all others should just join
1402          * it. To serialize that we use a file lock on the socket
1403          * pair.
1404          *
1405          * It's a bit crazy, but hey, works great! */
1406
1407         if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1408                 return -errno;
1409
1410         netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1411         if (netns == -EAGAIN) {
1412                 /* Nothing stored yet, so let's create a new namespace */
1413
1414                 if (unshare(CLONE_NEWNET) < 0) {
1415                         r = -errno;
1416                         goto fail;
1417                 }
1418
1419                 loopback_setup();
1420
1421                 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1422                 if (netns < 0) {
1423                         r = -errno;
1424                         goto fail;
1425                 }
1426
1427                 r = 1;
1428
1429         } else if (netns < 0) {
1430                 r = netns;
1431                 goto fail;
1432
1433         } else {
1434                 /* Yay, found something, so let's join the namespace */
1435                 if (setns(netns, CLONE_NEWNET) < 0) {
1436                         r = -errno;
1437                         goto fail;
1438                 }
1439
1440                 r = 0;
1441         }
1442
1443         q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1444         if (q < 0) {
1445                 r = q;
1446                 goto fail;
1447         }
1448
1449 fail:
1450         (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1451         return r;
1452 }
1453
1454 bool ns_type_supported(NamespaceType type) {
1455         const char *t, *ns_proc;
1456
1457         t = namespace_type_to_string(type);
1458         if (!t) /* Don't know how to translate this? Then it's not supported */
1459                 return false;
1460
1461         ns_proc = strjoina("/proc/self/ns/", t);
1462         return access(ns_proc, F_OK) == 0;
1463 }
1464
1465 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1466         [PROTECT_HOME_NO] = "no",
1467         [PROTECT_HOME_YES] = "yes",
1468         [PROTECT_HOME_READ_ONLY] = "read-only",
1469 };
1470
1471 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1472
1473 ProtectHome parse_protect_home_or_bool(const char *s) {
1474         int r;
1475
1476         r = parse_boolean(s);
1477         if (r > 0)
1478                 return PROTECT_HOME_YES;
1479         if (r == 0)
1480                 return PROTECT_HOME_NO;
1481
1482         return protect_home_from_string(s);
1483 }
1484
1485 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1486         [PROTECT_SYSTEM_NO] = "no",
1487         [PROTECT_SYSTEM_YES] = "yes",
1488         [PROTECT_SYSTEM_FULL] = "full",
1489         [PROTECT_SYSTEM_STRICT] = "strict",
1490 };
1491
1492 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
1493
1494 ProtectSystem parse_protect_system_or_bool(const char *s) {
1495         int r;
1496
1497         r = parse_boolean(s);
1498         if (r > 0)
1499                 return PROTECT_SYSTEM_YES;
1500         if (r == 0)
1501                 return PROTECT_SYSTEM_NO;
1502
1503         return protect_system_from_string(s);
1504 }
1505
1506 static const char* const namespace_type_table[] = {
1507         [NAMESPACE_MOUNT] = "mnt",
1508         [NAMESPACE_CGROUP] = "cgroup",
1509         [NAMESPACE_UTS] = "uts",
1510         [NAMESPACE_IPC] = "ipc",
1511         [NAMESPACE_USER] = "user",
1512         [NAMESPACE_PID] = "pid",
1513         [NAMESPACE_NET] = "net",
1514 };
1515
1516 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);