src/core/namespace.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <errno.h>
  22 #include <sched.h>
  23 #include <stdio.h>
  24 #include <string.h>
  25 #include <sys/mount.h>
  26 #include <sys/stat.h>
  27 #include <unistd.h>
  28 #include <linux/fs.h>
  29
  30 #include "alloc-util.h"
  31 #include "base-filesystem.h"
  32 #include "dev-setup.h"
  33 #include "fd-util.h"
  34 #include "fs-util.h"
  35 #include "label.h"
  36 #include "loop-util.h"
  37 #include "loopback-setup.h"
  38 #include "missing.h"
  39 #include "mkdir.h"
  40 #include "mount-util.h"
  41 #include "namespace.h"
  42 #include "path-util.h"
  43 #include "selinux-util.h"
  44 #include "socket-util.h"
  45 #include "stat-util.h"
  46 #include "string-table.h"
  47 #include "string-util.h"
  48 #include "strv.h"
  49 #include "umask-util.h"
  50 #include "user-util.h"
  51 #include "util.h"
  52
  53 #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
  54
  55 typedef enum MountMode {
  56         /* This is ordered by priority! */
  57         INACCESSIBLE,
  58         BIND_MOUNT,
  59         BIND_MOUNT_RECURSIVE,
  60         PRIVATE_TMP,
  61         PRIVATE_DEV,
  62         BIND_DEV,
  63         EMPTY_DIR,
  64         SYSFS,
  65         PROCFS,
  66         READONLY,
  67         READWRITE,
  68 } MountMode;
  69
  70 typedef struct MountEntry {
  71         const char *path_const;   /* Memory allocated on stack or static */
  72         MountMode mode:5;
  73         bool ignore:1;            /* Ignore if path does not exist? */
  74         bool has_prefix:1;        /* Already is prefixed by the root dir? */
  75         bool read_only:1;         /* Shall this mount point be read-only? */
  76         char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
  77         const char *source_const; /* The source path, for bind mounts */
  78         char *source_malloc;
  79 } MountEntry;
  80
  81 /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
  82  * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
  83 static const MountEntry apivfs_table[] = {
  84         { "/proc",               PROCFS,       false },
  85         { "/dev",                BIND_DEV,     false },
  86         { "/sys",                SYSFS,        false },
  87 };
  88
  89 /* ProtectKernelTunables= option and the related filesystem APIs */
  90 static const MountEntry protect_kernel_tunables_table[] = {
  91         { "/proc/sys",           READONLY,     false },
  92         { "/proc/sysrq-trigger", READONLY,     true  },
  93         { "/proc/latency_stats", READONLY,     true  },
  94         { "/proc/mtrr",          READONLY,     true  },
  95         { "/proc/apm",           READONLY,     true  }, /* Obsolete API, there's no point in permitting access to this, ever */
  96         { "/proc/acpi",          READONLY,     true  },
  97         { "/proc/timer_stats",   READONLY,     true  },
  98         { "/proc/asound",        READONLY,     true  },
  99         { "/proc/bus",           READONLY,     true  },
 100         { "/proc/fs",            READONLY,     true  },
 101         { "/proc/irq",           READONLY,     true  },
 102         { "/sys",                READONLY,     false },
 103         { "/sys/kernel/debug",   READONLY,     true  },
 104         { "/sys/kernel/tracing", READONLY,     true  },
 105         { "/sys/fs/cgroup",      READWRITE,    false }, /* READONLY is set by ProtectControlGroups= option */
 106         { "/sys/fs/selinux",     READWRITE,    true  },
 107 };
 108
 109 /* ProtectKernelModules= option */
 110 static const MountEntry protect_kernel_modules_table[] = {
 111 #if HAVE_SPLIT_USR
 112         { "/lib/modules",        INACCESSIBLE, true  },
 113 #endif
 114         { "/usr/lib/modules",    INACCESSIBLE, true  },
 115 };
 116
 117 /*
 118  * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
 119  * system should be protected by ProtectSystem=
 120  */
 121 static const MountEntry protect_home_read_only_table[] = {
 122         { "/home",               READONLY,     true  },
 123         { "/run/user",           READONLY,     true  },
 124         { "/root",               READONLY,     true  },
 125 };
 126
 127 /* ProtectHome=yes table */
 128 static const MountEntry protect_home_yes_table[] = {
 129         { "/home",               INACCESSIBLE, true  },
 130         { "/run/user",           INACCESSIBLE, true  },
 131         { "/root",               INACCESSIBLE, true  },
 132 };
 133
 134 /* ProtectSystem=yes table */
 135 static const MountEntry protect_system_yes_table[] = {
 136         { "/usr",                READONLY,     false },
 137         { "/boot",               READONLY,     true  },
 138         { "/efi",                READONLY,     true  },
 139 };
 140
 141 /* ProtectSystem=full includes ProtectSystem=yes */
 142 static const MountEntry protect_system_full_table[] = {
 143         { "/usr",                READONLY,     false },
 144         { "/boot",               READONLY,     true  },
 145         { "/efi",                READONLY,     true  },
 146         { "/etc",                READONLY,     false },
 147 };
 148
 149 /*
 150  * ProtectSystem=strict table. In this strict mode, we mount everything
 151  * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
 152  * which are left writable, but PrivateDevices= + ProtectKernelTunables=
 153  * protect those, and these options should be fully orthogonal.
 154  * (And of course /home and friends are also left writable, as ProtectHome=
 155  * shall manage those, orthogonally).
 156  */
 157 static const MountEntry protect_system_strict_table[] = {
 158         { "/",                   READONLY,     false },
 159         { "/proc",               READWRITE,    false },      /* ProtectKernelTunables= */
 160         { "/sys",                READWRITE,    false },      /* ProtectKernelTunables= */
 161         { "/dev",                READWRITE,    false },      /* PrivateDevices= */
 162         { "/home",               READWRITE,    true  },      /* ProtectHome= */
 163         { "/run/user",           READWRITE,    true  },      /* ProtectHome= */
 164         { "/root",               READWRITE,    true  },      /* ProtectHome= */
 165 };
 166
 167 static const char *mount_entry_path(const MountEntry *p) {
 168         assert(p);
 169
 170         /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
 171          * otherwise the stack/static ->path field is returned. */
 172
 173         return p->path_malloc ?: p->path_const;
 174 }
 175
 176 static bool mount_entry_read_only(const MountEntry *p) {
 177         assert(p);
 178
 179         return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
 180 }
 181
 182 static const char *mount_entry_source(const MountEntry *p) {
 183         assert(p);
 184
 185         return p->source_malloc ?: p->source_const;
 186 }
 187
 188 static void mount_entry_done(MountEntry *p) {
 189         assert(p);
 190
 191         p->path_malloc = mfree(p->path_malloc);
 192         p->source_malloc = mfree(p->source_malloc);
 193 }
 194
 195 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
 196         char **i;
 197
 198         assert(p);
 199
 200         /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
 201
 202         STRV_FOREACH(i, strv) {
 203                 bool ignore = false, needs_prefix = false;
 204                 const char *e = *i;
 205
 206                 /* Look for any prefixes */
 207                 if (startswith(e, "-")) {
 208                         e++;
 209                         ignore = true;
 210                 }
 211                 if (startswith(e, "+")) {
 212                         e++;
 213                         needs_prefix = true;
 214                 }
 215
 216                 if (!path_is_absolute(e))
 217                         return -EINVAL;
 218
 219                 *((*p)++) = (MountEntry) {
 220                         .path_const = e,
 221                         .mode = mode,
 222                         .ignore = ignore,
 223                         .has_prefix = !needs_prefix && !forcibly_require_prefix,
 224                 };
 225         }
 226
 227         return 0;
 228 }
 229
 230 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
 231         char **i;
 232
 233         assert(p);
 234
 235         /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
 236          * "/private/" boundary directories for DynamicUser=1. */
 237
 238         STRV_FOREACH(i, strv) {
 239
 240                 *((*p)++) = (MountEntry) {
 241                         .path_const = *i,
 242                         .mode = EMPTY_DIR,
 243                         .ignore = false,
 244                         .has_prefix = false,
 245                         .read_only = true,
 246                 };
 247         }
 248
 249         return 0;
 250 }
 251
 252 static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
 253         unsigned i;
 254
 255         assert(p);
 256
 257         for (i = 0; i < n; i++) {
 258                 const BindMount *b = binds + i;
 259
 260                 *((*p)++) = (MountEntry) {
 261                         .path_const = b->destination,
 262                         .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
 263                         .read_only = b->read_only,
 264                         .source_const = b->source,
 265                 };
 266         }
 267
 268         return 0;
 269 }
 270
 271 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
 272         unsigned i;
 273
 274         assert(p);
 275         assert(mounts);
 276
 277         /* Adds a list of static pre-defined entries */
 278
 279         for (i = 0; i < n; i++)
 280                 *((*p)++) = (MountEntry) {
 281                         .path_const = mount_entry_path(mounts+i),
 282                         .mode = mounts[i].mode,
 283                         .ignore = mounts[i].ignore || ignore_protect,
 284                 };
 285
 286         return 0;
 287 }
 288
 289 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
 290         assert(p);
 291
 292         switch (protect_home) {
 293
 294         case PROTECT_HOME_NO:
 295                 return 0;
 296
 297         case PROTECT_HOME_READ_ONLY:
 298                 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
 299
 300         case PROTECT_HOME_YES:
 301                 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
 302
 303         default:
 304                 assert_not_reached("Unexpected ProtectHome= value");
 305         }
 306 }
 307
 308 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
 309         assert(p);
 310
 311         switch (protect_system) {
 312
 313         case PROTECT_SYSTEM_NO:
 314                 return 0;
 315
 316         case PROTECT_SYSTEM_STRICT:
 317                 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
 318
 319         case PROTECT_SYSTEM_YES:
 320                 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
 321
 322         case PROTECT_SYSTEM_FULL:
 323                 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
 324
 325         default:
 326                 assert_not_reached("Unexpected ProtectSystem= value");
 327         }
 328 }
 329
 330 static int mount_path_compare(const void *a, const void *b) {
 331         const MountEntry *p = a, *q = b;
 332         int d;
 333
 334         /* If the paths are not equal, then order prefixes first */
 335         d = path_compare(mount_entry_path(p), mount_entry_path(q));
 336         if (d != 0)
 337                 return d;
 338
 339         /* If the paths are equal, check the mode */
 340         if (p->mode < q->mode)
 341                 return -1;
 342
 343         if (p->mode > q->mode)
 344                 return 1;
 345
 346         return 0;
 347 }
 348
 349 static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
 350         unsigned i;
 351
 352         /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
 353          * that. */
 354
 355         if (!root_directory)
 356                 return 0;
 357
 358         for (i = 0; i < n; i++) {
 359                 char *s;
 360
 361                 if (m[i].has_prefix)
 362                         continue;
 363
 364                 s = prefix_root(root_directory, mount_entry_path(m+i));
 365                 if (!s)
 366                         return -ENOMEM;
 367
 368                 free_and_replace(m[i].path_malloc, s);
 369                 m[i].has_prefix = true;
 370         }
 371
 372         return 0;
 373 }
 374
 375 static void drop_duplicates(MountEntry *m, unsigned *n) {
 376         MountEntry *f, *t, *previous;
 377
 378         assert(m);
 379         assert(n);
 380
 381         /* Drops duplicate entries. Expects that the array is properly ordered already. */
 382
 383         for (f = m, t = m, previous = NULL; f < m + *n; f++) {
 384
 385                 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
 386                  * above. */
 387                 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
 388                         log_debug("%s is duplicate.", mount_entry_path(f));
 389                         previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
 390                         mount_entry_done(f);
 391                         continue;
 392                 }
 393
 394                 *t = *f;
 395                 previous = t;
 396                 t++;
 397         }
 398
 399         *n = t - m;
 400 }
 401
 402 static void drop_inaccessible(MountEntry *m, unsigned *n) {
 403         MountEntry *f, *t;
 404         const char *clear = NULL;
 405
 406         assert(m);
 407         assert(n);
 408
 409         /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
 410          * ordered already. */
 411
 412         for (f = m, t = m; f < m + *n; f++) {
 413
 414                 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
 415                  * it, as inaccessible paths really should drop the entire subtree. */
 416                 if (clear && path_startswith(mount_entry_path(f), clear)) {
 417                         log_debug("%s is masked by %s.", mount_entry_path(f), clear);
 418                         mount_entry_done(f);
 419                         continue;
 420                 }
 421
 422                 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
 423
 424                 *t = *f;
 425                 t++;
 426         }
 427
 428         *n = t - m;
 429 }
 430
 431 static void drop_nop(MountEntry *m, unsigned *n) {
 432         MountEntry *f, *t;
 433
 434         assert(m);
 435         assert(n);
 436
 437         /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
 438          * list is ordered by prefixes. */
 439
 440         for (f = m, t = m; f < m + *n; f++) {
 441
 442                 /* Only suppress such subtrees for READONLY and READWRITE entries */
 443                 if (IN_SET(f->mode, READONLY, READWRITE)) {
 444                         MountEntry *p;
 445                         bool found = false;
 446
 447                         /* Now let's find the first parent of the entry we are looking at. */
 448                         for (p = t-1; p >= m; p--) {
 449                                 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
 450                                         found = true;
 451                                         break;
 452                                 }
 453                         }
 454
 455                         /* We found it, let's see if it's the same mode, if so, we can drop this entry */
 456                         if (found && p->mode == f->mode) {
 457                                 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
 458                                 mount_entry_done(f);
 459                                 continue;
 460                         }
 461                 }
 462
 463                 *t = *f;
 464                 t++;
 465         }
 466
 467         *n = t - m;
 468 }
 469
 470 static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
 471         MountEntry *f, *t;
 472
 473         assert(m);
 474         assert(n);
 475
 476         /* Nothing to do */
 477         if (!root_directory)
 478                 return;
 479
 480         /* Drops all mounts that are outside of the root directory. */
 481
 482         for (f = m, t = m; f < m + *n; f++) {
 483
 484                 if (!path_startswith(mount_entry_path(f), root_directory)) {
 485                         log_debug("%s is outside of root directory.", mount_entry_path(f));
 486                         mount_entry_done(f);
 487                         continue;
 488                 }
 489
 490                 *t = *f;
 491                 t++;
 492         }
 493
 494         *n = t - m;
 495 }
 496
 497 static int clone_device_node(const char *d, const char *temporary_mount) {
 498         const char *dn;
 499         struct stat st;
 500         int r;
 501
 502         if (stat(d, &st) < 0) {
 503                 if (errno == ENOENT)
 504                         return 0;
 505                 return -errno;
 506         }
 507
 508         if (!S_ISBLK(st.st_mode) &&
 509             !S_ISCHR(st.st_mode))
 510                 return -EINVAL;
 511
 512         if (st.st_rdev == 0)
 513                 return 0;
 514
 515         dn = strjoina(temporary_mount, d);
 516
 517         mac_selinux_create_file_prepare(d, st.st_mode);
 518         r = mknod(dn, st.st_mode, st.st_rdev);
 519         mac_selinux_create_file_clear();
 520         if (r < 0)
 521                 return log_debug_errno(errno, "mknod failed for %s: %m", d);
 522
 523         return 1;
 524 }
 525
 526 static int mount_private_dev(MountEntry *m) {
 527         static const char devnodes[] =
 528                 "/dev/null\0"
 529                 "/dev/zero\0"
 530                 "/dev/full\0"
 531                 "/dev/random\0"
 532                 "/dev/urandom\0"
 533                 "/dev/tty\0";
 534
 535         char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
 536         const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
 537         _cleanup_umask_ mode_t u;
 538         int r;
 539
 540         assert(m);
 541
 542         u = umask(0000);
 543
 544         if (!mkdtemp(temporary_mount))
 545                 return -errno;
 546
 547         dev = strjoina(temporary_mount, "/dev");
 548         (void) mkdir(dev, 0755);
 549         if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
 550                 r = -errno;
 551                 goto fail;
 552         }
 553
 554         devpts = strjoina(temporary_mount, "/dev/pts");
 555         (void) mkdir(devpts, 0755);
 556         if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
 557                 r = -errno;
 558                 goto fail;
 559         }
 560
 561         /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
 562          * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
 563          * thus, in that case make a clone
 564          *
 565          * in nspawn and other containers it will be a symlink, in that case make it a symlink
 566          */
 567         r = is_symlink("/dev/ptmx");
 568         if (r < 0)
 569                 goto fail;
 570         if (r > 0) {
 571                 devptmx = strjoina(temporary_mount, "/dev/ptmx");
 572                 if (symlink("pts/ptmx", devptmx) < 0) {
 573                         r = -errno;
 574                         goto fail;
 575                 }
 576         } else {
 577                 r = clone_device_node("/dev/ptmx", temporary_mount);
 578                 if (r < 0)
 579                         goto fail;
 580                 if (r == 0) {
 581                         r = -ENXIO;
 582                         goto fail;
 583                 }
 584         }
 585
 586         devshm = strjoina(temporary_mount, "/dev/shm");
 587         (void) mkdir(devshm, 0755);
 588         r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
 589         if (r < 0) {
 590                 r = -errno;
 591                 goto fail;
 592         }
 593
 594         devmqueue = strjoina(temporary_mount, "/dev/mqueue");
 595         (void) mkdir(devmqueue, 0755);
 596         (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
 597
 598         devhugepages = strjoina(temporary_mount, "/dev/hugepages");
 599         (void) mkdir(devhugepages, 0755);
 600         (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
 601
 602         devlog = strjoina(temporary_mount, "/dev/log");
 603         (void) symlink("/run/systemd/journal/dev-log", devlog);
 604
 605         NULSTR_FOREACH(d, devnodes) {
 606                 r = clone_device_node(d, temporary_mount);
 607                 if (r < 0)
 608                         goto fail;
 609         }
 610
 611         dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
 612
 613         /* Create the /dev directory if missing. It is more likely to be
 614          * missing when the service is started with RootDirectory. This is
 615          * consistent with mount units creating the mount points when missing.
 616          */
 617         (void) mkdir_p_label(mount_entry_path(m), 0755);
 618
 619         /* Unmount everything in old /dev */
 620         umount_recursive(mount_entry_path(m), 0);
 621         if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
 622                 r = -errno;
 623                 goto fail;
 624         }
 625
 626         rmdir(dev);
 627         rmdir(temporary_mount);
 628
 629         return 0;
 630
 631 fail:
 632         if (devpts)
 633                 umount(devpts);
 634
 635         if (devshm)
 636                 umount(devshm);
 637
 638         if (devhugepages)
 639                 umount(devhugepages);
 640
 641         if (devmqueue)
 642                 umount(devmqueue);
 643
 644         umount(dev);
 645         rmdir(dev);
 646         rmdir(temporary_mount);
 647
 648         return r;
 649 }
 650
 651 static int mount_bind_dev(const MountEntry *m) {
 652         int r;
 653
 654         assert(m);
 655
 656         /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
 657          * /dev. This is only used when RootDirectory= is set. */
 658
 659         (void) mkdir_p_label(mount_entry_path(m), 0755);
 660
 661         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 662         if (r < 0)
 663                 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
 664         if (r > 0) /* make this a NOP if /dev is already a mount point */
 665                 return 0;
 666
 667         if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 668                 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
 669
 670         return 1;
 671 }
 672
 673 static int mount_sysfs(const MountEntry *m) {
 674         int r;
 675
 676         assert(m);
 677
 678         (void) mkdir_p_label(mount_entry_path(m), 0755);
 679
 680         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 681         if (r < 0)
 682                 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
 683         if (r > 0) /* make this a NOP if /sys is already a mount point */
 684                 return 0;
 685
 686         /* Bind mount the host's version so that we get all child mounts of it, too. */
 687         if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 688                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 689
 690         return 1;
 691 }
 692
 693 static int mount_procfs(const MountEntry *m) {
 694         int r;
 695
 696         assert(m);
 697
 698         (void) mkdir_p_label(mount_entry_path(m), 0755);
 699
 700         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 701         if (r < 0)
 702                 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
 703         if (r > 0) /* make this a NOP if /proc is already a mount point */
 704                 return 0;
 705
 706         /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
 707         if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
 708                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 709
 710         return 1;
 711 }
 712
 713 static int mount_empty_dir(const MountEntry *m) {
 714         assert(m);
 715
 716         /* First, get rid of everything that is below if there is anything. Then, overmount with our new empty dir */
 717
 718         (void) mkdir_p_label(mount_entry_path(m), 0755);
 719         (void) umount_recursive(mount_entry_path(m), 0);
 720
 721         if (mount("tmpfs", mount_entry_path(m), "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
 722                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 723
 724         return 1;
 725 }
 726
 727 static int mount_entry_chase(
 728                 const char *root_directory,
 729                 const MountEntry *m,
 730                 const char *path,
 731                 char **location) {
 732
 733         char *chased;
 734         int r;
 735         unsigned flags = 0;
 736
 737         assert(m);
 738
 739         /* Since mount() will always follow symlinks and we need to take the different root directory into account we
 740          * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
 741          * that applies). The result is stored in "location". */
 742
 743         if (IN_SET(m->mode,
 744                    BIND_MOUNT,
 745                    BIND_MOUNT_RECURSIVE,
 746                    PRIVATE_TMP,
 747                    PRIVATE_DEV,
 748                    BIND_DEV,
 749                    EMPTY_DIR,
 750                    SYSFS,
 751                    PROCFS))
 752                 flags |= CHASE_NONEXISTENT;
 753
 754         r = chase_symlinks(path, root_directory, flags, &chased);
 755         if (r == -ENOENT && m->ignore) {
 756                 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
 757                 return 0;
 758         }
 759         if (r < 0)
 760                 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
 761
 762         log_debug("Followed symlinks %s → %s.", path, chased);
 763
 764         free(*location);
 765         *location = chased;
 766
 767         return 1;
 768 }
 769
 770 static int apply_mount(
 771                 const char *root_directory,
 772                 MountEntry *m) {
 773
 774         bool rbind = true, make = false;
 775         const char *what;
 776         int r;
 777
 778         assert(m);
 779
 780         r = mount_entry_chase(root_directory, m, mount_entry_path(m), &m->path_malloc);
 781         if (r <= 0)
 782                 return r;
 783
 784         log_debug("Applying namespace mount on %s", mount_entry_path(m));
 785
 786         switch (m->mode) {
 787
 788         case INACCESSIBLE: {
 789                 struct stat target;
 790
 791                 /* First, get rid of everything that is below if there
 792                  * is anything... Then, overmount it with an
 793                  * inaccessible path. */
 794                 (void) umount_recursive(mount_entry_path(m), 0);
 795
 796                 if (lstat(mount_entry_path(m), &target) < 0)
 797                         return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
 798
 799                 what = mode_to_inaccessible_node(target.st_mode);
 800                 if (!what) {
 801                         log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
 802                         return -ELOOP;
 803                 }
 804                 break;
 805         }
 806
 807         case READONLY:
 808         case READWRITE:
 809                 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
 810                 if (r < 0)
 811                         return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
 812                 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
 813                         return 0;
 814                 /* This isn't a mount point yet, let's make it one. */
 815                 what = mount_entry_path(m);
 816                 break;
 817
 818         case BIND_MOUNT:
 819                 rbind = false;
 820
 821                 _fallthrough_;
 822         case BIND_MOUNT_RECURSIVE:
 823                 /* Also chase the source mount */
 824
 825                 r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
 826                 if (r <= 0)
 827                         return r;
 828
 829                 what = mount_entry_source(m);
 830                 make = true;
 831                 break;
 832
 833         case EMPTY_DIR:
 834                 return mount_empty_dir(m);
 835
 836         case PRIVATE_TMP:
 837                 what = mount_entry_source(m);
 838                 make = true;
 839                 break;
 840
 841         case PRIVATE_DEV:
 842                 return mount_private_dev(m);
 843
 844         case BIND_DEV:
 845                 return mount_bind_dev(m);
 846
 847         case SYSFS:
 848                 return mount_sysfs(m);
 849
 850         case PROCFS:
 851                 return mount_procfs(m);
 852
 853         default:
 854                 assert_not_reached("Unknown mode");
 855         }
 856
 857         assert(what);
 858
 859         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
 860                 bool try_again = false;
 861                 r = -errno;
 862
 863                 if (r == -ENOENT && make) {
 864                         struct stat st;
 865
 866                         /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
 867
 868                         if (stat(what, &st) >= 0) {
 869
 870                                 (void) mkdir_parents(mount_entry_path(m), 0755);
 871
 872                                 if (S_ISDIR(st.st_mode))
 873                                         try_again = mkdir(mount_entry_path(m), 0755) >= 0;
 874                                 else
 875                                         try_again = touch(mount_entry_path(m)) >= 0;
 876                         }
 877                 }
 878
 879                 if (try_again) {
 880                         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
 881                                 r = -errno;
 882                         else
 883                                 r = 0;
 884                 }
 885
 886                 if (r < 0)
 887                         return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
 888         }
 889
 890         log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
 891         return 0;
 892 }
 893
 894 static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
 895         int r = 0;
 896
 897         assert(m);
 898         assert(proc_self_mountinfo);
 899
 900         if (mount_entry_read_only(m))
 901                 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
 902         else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't */
 903                 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
 904                         r = -errno;
 905         } else
 906                 return 0;
 907
 908         /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
 909          * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
 910          * read-only mounts already applied. */
 911
 912         if (r == -ENOENT && m->ignore)
 913                 r = 0;
 914
 915         return r;
 916 }
 917
 918 static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
 919         assert(ns_info);
 920
 921         /*
 922          * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
 923          * since to protect the API VFS mounts, they need to be around in the
 924          * first place... and RootDirectory= or RootImage= need to be set.
 925          */
 926
 927         /* root_directory should point to a mount point */
 928         return root_directory &&
 929                 (ns_info->mount_apivfs ||
 930                  ns_info->protect_control_groups ||
 931                  ns_info->protect_kernel_tunables);
 932 }
 933
 934 static unsigned namespace_calculate_mounts(
 935                 const char* root_directory,
 936                 const NamespaceInfo *ns_info,
 937                 char** read_write_paths,
 938                 char** read_only_paths,
 939                 char** inaccessible_paths,
 940                 char** empty_directories,
 941                 const BindMount *bind_mounts,
 942                 unsigned n_bind_mounts,
 943                 const char* tmp_dir,
 944                 const char* var_tmp_dir,
 945                 ProtectHome protect_home,
 946                 ProtectSystem protect_system) {
 947
 948         unsigned protect_home_cnt;
 949         unsigned protect_system_cnt =
 950                 (protect_system == PROTECT_SYSTEM_STRICT ?
 951                  ELEMENTSOF(protect_system_strict_table) :
 952                  ((protect_system == PROTECT_SYSTEM_FULL) ?
 953                   ELEMENTSOF(protect_system_full_table) :
 954                   ((protect_system == PROTECT_SYSTEM_YES) ?
 955                    ELEMENTSOF(protect_system_yes_table) : 0)));
 956
 957         protect_home_cnt =
 958                 (protect_home == PROTECT_HOME_YES ?
 959                  ELEMENTSOF(protect_home_yes_table) :
 960                  ((protect_home == PROTECT_HOME_READ_ONLY) ?
 961                   ELEMENTSOF(protect_home_read_only_table) : 0));
 962
 963         return !!tmp_dir + !!var_tmp_dir +
 964                 strv_length(read_write_paths) +
 965                 strv_length(read_only_paths) +
 966                 strv_length(inaccessible_paths) +
 967                 strv_length(empty_directories) +
 968                 n_bind_mounts +
 969                 ns_info->private_dev +
 970                 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
 971                 (ns_info->protect_control_groups ? 1 : 0) +
 972                 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
 973                 protect_home_cnt + protect_system_cnt +
 974                 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
 975 }
 976
/* Sets up a new mount namespace for the calling process according to the passed configuration.
 *
 * The function proceeds in this order:
 *   1. If 'root_image' is set, attach it to a loop device, dissect and (if needed) decrypt it.
 *   2. Pick the root to operate on: 'root_directory' if set, "/run/systemd/unit-root" if an image or
 *      bind mounts are in use, NULL otherwise.
 *   3. Build and sort the list of mount entries derived from all parameters, dropping redundant ones.
 *   4. unshare(CLONE_NEWNS), optionally remount "/" as MS_SLAVE|MS_REC.
 *   5. Mount the image / bind mount the root directory / bind mount "/" onto the chosen root.
 *   6. Apply all mount entries, then flip the read-only bits where requested.
 *   7. Move the root (if any) and finally remount "/" with 'mount_flags' (MS_SHARED if 0 was passed).
 *
 * Returns 0 on success, a negative errno-style value on failure. */
int setup_namespace(
                const char* root_directory,
                const char* root_image,
                const NamespaceInfo *ns_info,
                char** read_write_paths,
                char** read_only_paths,
                char** inaccessible_paths,
                char** empty_directories,
                const BindMount *bind_mounts,
                unsigned n_bind_mounts,
                const char* tmp_dir,
                const char* var_tmp_dir,
                ProtectHome protect_home,
                ProtectSystem protect_system,
                unsigned long mount_flags,
                DissectImageFlags dissect_image_flags) {

        _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
        _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
        _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
        _cleanup_free_ void *root_hash = NULL;
        MountEntry *m, *mounts = NULL;
        size_t root_hash_size = 0;
        bool make_slave = false;
        const char *root;
        unsigned n_mounts;
        bool require_prefix = false;
        int r = 0;

        assert(ns_info);

        /* No propagation mode requested by the caller: fall back to MS_SHARED for the final remount of "/" */
        if (mount_flags == 0)
                mount_flags = MS_SHARED;

        if (root_image) {
                dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;

                /* With strict system protection and no writable paths the image can be opened read-only */
                if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
                        dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;

                r = loop_device_make_by_path(root_image,
                                             dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
                                             &loop_device);
                if (r < 0)
                        return r;

                r = root_hash_load(root_image, &root_hash, &root_hash_size);
                if (r < 0)
                        return r;

                r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
                if (r < 0)
                        return r;

                r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
                if (r < 0)
                        return r;
        }

        if (root_directory)
                root = root_directory;
        else if (root_image || n_bind_mounts > 0) {

                /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
                 * the same mount point for all images, which is safe, since they all live in their own namespaces
                 * after all, and hence won't see each other. We also use such a root directory whenever there are bind
                 * mounts configured, so that their source mounts are never obstructed by mounts we already applied
                 * while we are applying them. */

                root = "/run/systemd/unit-root";
                (void) mkdir_label(root, 0700);
                require_prefix = true;
        } else
                root = NULL;

        /* Upper bound on the number of entries; the drop_*() calls below may reduce n_mounts again */
        n_mounts = namespace_calculate_mounts(
                        root,
                        ns_info,
                        read_write_paths,
                        read_only_paths,
                        inaccessible_paths,
                        empty_directories,
                        bind_mounts, n_bind_mounts,
                        tmp_dir, var_tmp_dir,
                        protect_home, protect_system);

        /* Set mount slave mode */
        if (root || n_mounts > 0)
                make_slave = true;

        if (n_mounts > 0) {
                /* 'm' is the write cursor into 'mounts'; every append below advances it */
                m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
                r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
                if (r < 0)
                        goto finish;

                r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
                if (r < 0)
                        goto finish;

                r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
                if (r < 0)
                        goto finish;

                r = append_empty_dir_mounts(&m, empty_directories);
                if (r < 0)
                        goto finish;

                r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
                if (r < 0)
                        goto finish;

                if (tmp_dir) {
                        *(m++) = (MountEntry) {
                                .path_const = "/tmp",
                                .mode = PRIVATE_TMP,
                                .source_const = tmp_dir,
                        };
                }

                if (var_tmp_dir) {
                        *(m++) = (MountEntry) {
                                .path_const = "/var/tmp",
                                .mode = PRIVATE_TMP,
                                .source_const = var_tmp_dir,
                        };
                }

                if (ns_info->private_dev) {
                        *(m++) = (MountEntry) {
                                .path_const = "/dev",
                                .mode = PRIVATE_DEV,
                        };
                }

                if (ns_info->protect_kernel_tunables) {
                        r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
                        if (r < 0)
                                goto finish;
                }

                if (ns_info->protect_kernel_modules) {
                        r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
                        if (r < 0)
                                goto finish;
                }

                if (ns_info->protect_control_groups) {
                        *(m++) = (MountEntry) {
                                .path_const = "/sys/fs/cgroup",
                                .mode = READONLY,
                        };
                }

                r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
                if (r < 0)
                        goto finish;

                r = append_protect_system(&m, protect_system, false);
                if (r < 0)
                        goto finish;

                if (namespace_info_mount_apivfs(root, ns_info)) {
                        r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
                        if (r < 0)
                                goto finish;
                }

                /* The number of entries appended must match what namespace_calculate_mounts() predicted */
                assert(mounts + n_mounts == m);

                /* Prepend the root directory where that's necessary */
                r = prefix_where_needed(mounts, n_mounts, root);
                if (r < 0)
                        goto finish;

                qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);

                /* These helpers take n_mounts by pointer and may lower it */
                drop_duplicates(mounts, &n_mounts);
                drop_outside_root(root, mounts, &n_mounts);
                drop_inaccessible(mounts, &n_mounts);
                drop_nop(mounts, &n_mounts);
        }

        if (unshare(CLONE_NEWNS) < 0) {
                r = -errno;
                goto finish;
        }

        if (make_slave) {
                /* Remount / as SLAVE so that nothing now mounted in the namespace
                   shows up in the parent */
                if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
                        r = -errno;
                        goto finish;
                }
        }

        if (root_image) {
                /* A root image is specified, mount it to the right place */
                r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
                if (r < 0)
                        goto finish;

                if (decrypted_image) {
                        r = decrypted_image_relinquish(decrypted_image);
                        if (r < 0)
                                goto finish;
                }

                loop_device_relinquish(loop_device);

        } else if (root_directory) {

                /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
                r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
                if (r < 0)
                        goto finish;
                if (r == 0) {
                        if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
                                r = -errno;
                                goto finish;
                        }
                }

        } else if (root) {

                /* Let's mount the main root directory to the root directory to use */
                if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
                        r = -errno;
                        goto finish;
                }
        }

        /* Try to set up the new root directory before mounting anything else there. */
        if (root_image || root_directory)
                (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);

        if (n_mounts > 0) {
                _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
                char **blacklist;
                unsigned j;

                /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
                 * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
                proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
                if (!proc_self_mountinfo) {
                        r = -errno;
                        goto finish;
                }

                /* First round, add in all special mounts we need */
                for (m = mounts; m < mounts + n_mounts; ++m) {
                        r = apply_mount(root, m);
                        if (r < 0)
                                goto finish;
                }

                /* Create a NULL-terminated blacklist of all our mount paths, which we pass to make_read_only()
                 * (and from there on to bind_remount_recursive_with_mountinfo()) */
                blacklist = newa(char*, n_mounts+1);
                for (j = 0; j < n_mounts; j++)
                        blacklist[j] = (char*) mount_entry_path(mounts+j);
                blacklist[j] = NULL;

                /* Second round, flip the ro bits if necessary. */
                for (m = mounts; m < mounts + n_mounts; ++m) {
                        r = make_read_only(m, blacklist, proc_self_mountinfo);
                        if (r < 0)
                                goto finish;
                }
        }

        if (root) {
                /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
                r = mount_move_root(root);
                if (r < 0)
                        goto finish;
        }

        /* Remount / as the desired mode. Note that this will not
         * reestablish propagation from our side to the host, since
         * what's disconnected is disconnected. */
        if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
                r = -errno;
                goto finish;
        }

        r = 0;

finish:
        /* Release per-entry resources on both the success and the failure path */
        for (m = mounts; m < mounts + n_mounts; m++)
                mount_entry_done(m);

        return r;
}
1271
1272 void bind_mount_free_many(BindMount *b, unsigned n) {
1273         unsigned i;
1274
1275         assert(b || n == 0);
1276
1277         for (i = 0; i < n; i++) {
1278                 free(b[i].source);
1279                 free(b[i].destination);
1280         }
1281
1282         free(b);
1283 }
1284
1285 int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1286         _cleanup_free_ char *s = NULL, *d = NULL;
1287         BindMount *c;
1288
1289         assert(b);
1290         assert(n);
1291         assert(item);
1292
1293         s = strdup(item->source);
1294         if (!s)
1295                 return -ENOMEM;
1296
1297         d = strdup(item->destination);
1298         if (!d)
1299                 return -ENOMEM;
1300
1301         c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1302         if (!c)
1303                 return -ENOMEM;
1304
1305         *b = c;
1306
1307         c[(*n) ++] = (BindMount) {
1308                 .source = s,
1309                 .destination = d,
1310                 .read_only = item->read_only,
1311                 .recursive = item->recursive,
1312                 .ignore_enoent = item->ignore_enoent,
1313         };
1314
1315         s = d = NULL;
1316         return 0;
1317 }
1318
1319 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1320         _cleanup_free_ char *x = NULL;
1321         char bid[SD_ID128_STRING_MAX];
1322         sd_id128_t boot_id;
1323         int r;
1324
1325         assert(id);
1326         assert(prefix);
1327         assert(path);
1328
1329         /* We include the boot id in the directory so that after a
1330          * reboot we can easily identify obsolete directories. */
1331
1332         r = sd_id128_get_boot(&boot_id);
1333         if (r < 0)
1334                 return r;
1335
1336         x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1337         if (!x)
1338                 return -ENOMEM;
1339
1340         RUN_WITH_UMASK(0077)
1341                 if (!mkdtemp(x))
1342                         return -errno;
1343
1344         RUN_WITH_UMASK(0000) {
1345                 char *y;
1346
1347                 y = strjoina(x, "/tmp");
1348
1349                 if (mkdir(y, 0777 | S_ISVTX) < 0)
1350                         return -errno;
1351         }
1352
1353         *path = x;
1354         x = NULL;
1355
1356         return 0;
1357 }
1358
/* Creates the pair of private temporary directories for the unit 'id', one below /tmp and one below
 * /var/tmp. On success both paths are returned in '*tmp_dir' and '*var_tmp_dir' and 0 is returned.
 * If the second directory cannot be created the first one is removed again, and the negative
 * errno-style error is returned without touching the output parameters. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back the directory below /tmp: inner "tmp" sub-directory first, then the outer one */
        {
                char *inner;

                inner = strjoina(private_tmp, "/tmp");
                rmdir(inner);
                rmdir(private_tmp);
        }

        free(private_tmp);
        return r;
}
1388
1389 int setup_netns(int netns_storage_socket[2]) {
1390         _cleanup_close_ int netns = -1;
1391         int r, q;
1392
1393         assert(netns_storage_socket);
1394         assert(netns_storage_socket[0] >= 0);
1395         assert(netns_storage_socket[1] >= 0);
1396
1397         /* We use the passed socketpair as a storage buffer for our
1398          * namespace reference fd. Whatever process runs this first
1399          * shall create a new namespace, all others should just join
1400          * it. To serialize that we use a file lock on the socket
1401          * pair.
1402          *
1403          * It's a bit crazy, but hey, works great! */
1404
1405         if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1406                 return -errno;
1407
1408         netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1409         if (netns == -EAGAIN) {
1410                 /* Nothing stored yet, so let's create a new namespace */
1411
1412                 if (unshare(CLONE_NEWNET) < 0) {
1413                         r = -errno;
1414                         goto fail;
1415                 }
1416
1417                 loopback_setup();
1418
1419                 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1420                 if (netns < 0) {
1421                         r = -errno;
1422                         goto fail;
1423                 }
1424
1425                 r = 1;
1426
1427         } else if (netns < 0) {
1428                 r = netns;
1429                 goto fail;
1430
1431         } else {
1432                 /* Yay, found something, so let's join the namespace */
1433                 if (setns(netns, CLONE_NEWNET) < 0) {
1434                         r = -errno;
1435                         goto fail;
1436                 }
1437
1438                 r = 0;
1439         }
1440
1441         q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1442         if (q < 0) {
1443                 r = q;
1444                 goto fail;
1445         }
1446
1447 fail:
1448         (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1449         return r;
1450 }
1451
1452 bool ns_type_supported(NamespaceType type) {
1453         const char *t, *ns_proc;
1454
1455         t = namespace_type_to_string(type);
1456         if (!t) /* Don't know how to translate this? Then it's not supported */
1457                 return false;
1458
1459         ns_proc = strjoina("/proc/self/ns/", t);
1460         return access(ns_proc, F_OK) == 0;
1461 }
1462
1463 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1464         [PROTECT_HOME_NO] = "no",
1465         [PROTECT_HOME_YES] = "yes",
1466         [PROTECT_HOME_READ_ONLY] = "read-only",
1467 };
1468
1469 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1470
1471 ProtectHome parse_protect_home_or_bool(const char *s) {
1472         int r;
1473
1474         r = parse_boolean(s);
1475         if (r > 0)
1476                 return PROTECT_HOME_YES;
1477         if (r == 0)
1478                 return PROTECT_HOME_NO;
1479
1480         return protect_home_from_string(s);
1481 }
1482
1483 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1484         [PROTECT_SYSTEM_NO] = "no",
1485         [PROTECT_SYSTEM_YES] = "yes",
1486         [PROTECT_SYSTEM_FULL] = "full",
1487         [PROTECT_SYSTEM_STRICT] = "strict",
1488 };
1489
1490 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
1491
1492 ProtectSystem parse_protect_system_or_bool(const char *s) {
1493         int r;
1494
1495         r = parse_boolean(s);
1496         if (r > 0)
1497                 return PROTECT_SYSTEM_YES;
1498         if (r == 0)
1499                 return PROTECT_SYSTEM_NO;
1500
1501         return protect_system_from_string(s);
1502 }
1503
1504 static const char* const namespace_type_table[] = {
1505         [NAMESPACE_MOUNT] = "mnt",
1506         [NAMESPACE_CGROUP] = "cgroup",
1507         [NAMESPACE_UTS] = "uts",
1508         [NAMESPACE_IPC] = "ipc",
1509         [NAMESPACE_USER] = "user",
1510         [NAMESPACE_PID] = "pid",
1511         [NAMESPACE_NET] = "net",
1512 };
1513
1514 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);