src/core/namespace.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <errno.h>
  22 #include <sched.h>
  23 #include <stdio.h>
  24 #include <string.h>
  25 #include <sys/mount.h>
  26 #include <sys/stat.h>
  27 #include <unistd.h>
  28 #include <linux/fs.h>
  29
  30 #include "alloc-util.h"
  31 #include "base-filesystem.h"
  32 #include "dev-setup.h"
  33 #include "fd-util.h"
  34 #include "fs-util.h"
  35 #include "label.h"
  36 #include "loop-util.h"
  37 #include "loopback-setup.h"
  38 #include "missing.h"
  39 #include "mkdir.h"
  40 #include "mount-util.h"
  41 #include "namespace.h"
  42 #include "path-util.h"
  43 #include "selinux-util.h"
  44 #include "socket-util.h"
  45 #include "stat-util.h"
  46 #include "string-table.h"
  47 #include "string-util.h"
  48 #include "strv.h"
  49 #include "umask-util.h"
  50 #include "user-util.h"
  51 #include "util.h"
  52
   53 #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC) /* Mount flags for the tmpfs that backs a private /dev, see mount_private_dev() */
  54
   55 typedef enum MountMode {
   56         /* This is ordered by priority! */
              /* When two entries name the same path, mount_path_compare() sorts the lower value first and
               * drop_duplicates() keeps only that first entry. */
   57         INACCESSIBLE,         /* Overmounted with an inaccessible node matching the file type */
   58         BIND_MOUNT,           /* Non-recursive bind mount of the entry's source */
   59         BIND_MOUNT_RECURSIVE, /* Recursive bind mount of the entry's source */
   60         PRIVATE_TMP,          /* Recursive bind mount of the entry's source, destination created if missing */
   61         PRIVATE_DEV,          /* Freshly assembled /dev on a tmpfs, see mount_private_dev() */
   62         BIND_DEV,             /* Recursive bind mount of the host's /dev, see mount_bind_dev() */
   63         EMPTY_DIR,            /* Empty tmpfs instance, see mount_empty_dir() */
   64         SYSFS,                /* Recursive bind mount of the host's /sys, see mount_sysfs() */
   65         PROCFS,               /* New procfs instance, see mount_procfs() */
   66         READONLY,             /* Path is turned into a mount point if necessary; the mode implies read-only */
   67         READWRITE,            /* Path is turned into a mount point if necessary; the mode does not imply read-only */
   68 } MountMode;
  69
   70 typedef struct MountEntry {
   71         const char *path_const;   /* Memory allocated on stack or static */
   72         MountMode mode:5;         /* What to do with the path, see MountMode */
   73         bool ignore:1;            /* Ignore if path does not exist? */
   74         bool has_prefix:1;        /* Already is prefixed by the root dir? */
   75         bool read_only:1;         /* Shall this mount point be read-only? */
   76         char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
   77         const char *source_const; /* The source path, for bind mounts */
   78         char *source_malloc;      /* Use this instead of 'source_const' if we had to allocate memory */
   79 } MountEntry;
  80
  81 /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
  82  * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
  83 static const MountEntry apivfs_table[] = {
  84         { "/proc",               PROCFS,       false },
  85         { "/dev",                BIND_DEV,     false },
  86         { "/sys",                SYSFS,        false },
  87 };
  88
  89 /* ProtectKernelTunables= option and the related filesystem APIs */
  90 static const MountEntry protect_kernel_tunables_table[] = {
  91         { "/proc/sys",           READONLY,     false },
  92         { "/proc/sysrq-trigger", READONLY,     true  },
  93         { "/proc/latency_stats", READONLY,     true  },
  94         { "/proc/mtrr",          READONLY,     true  },
  95         { "/proc/apm",           READONLY,     true  }, /* Obsolete API, there's no point in permitting access to this, ever */
  96         { "/proc/acpi",          READONLY,     true  },
  97         { "/proc/timer_stats",   READONLY,     true  },
  98         { "/proc/asound",        READONLY,     true  },
  99         { "/proc/bus",           READONLY,     true  },
 100         { "/proc/fs",            READONLY,     true  },
 101         { "/proc/irq",           READONLY,     true  },
 102         { "/sys",                READONLY,     false },
 103         { "/sys/kernel/debug",   READONLY,     true  },
 104         { "/sys/kernel/tracing", READONLY,     true  },
 105         { "/sys/fs/cgroup",      READWRITE,    false }, /* READONLY is set by ProtectControlGroups= option */
 106         { "/sys/fs/selinux",     READWRITE,    true  },
 107 };
 108
 109 /* ProtectKernelModules= option */
 110 static const MountEntry protect_kernel_modules_table[] = {
 111 #if HAVE_SPLIT_USR
 112         { "/lib/modules",        INACCESSIBLE, true  },
 113 #endif
 114         { "/usr/lib/modules",    INACCESSIBLE, true  },
 115 };
 116
 117 /*
 118  * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
 119  * system should be protected by ProtectSystem=
 120  */
 121 static const MountEntry protect_home_read_only_table[] = {
 122         { "/home",               READONLY,     true  },
 123         { "/run/user",           READONLY,     true  },
 124         { "/root",               READONLY,     true  },
 125 };
 126
 127 /* ProtectHome=yes table */
 128 static const MountEntry protect_home_yes_table[] = {
 129         { "/home",               INACCESSIBLE, true  },
 130         { "/run/user",           INACCESSIBLE, true  },
 131         { "/root",               INACCESSIBLE, true  },
 132 };
 133
 134 /* ProtectSystem=yes table */
 135 static const MountEntry protect_system_yes_table[] = {
 136         { "/usr",                READONLY,     false },
 137         { "/boot",               READONLY,     true  },
 138         { "/efi",                READONLY,     true  },
 139 };
 140
 141 /* ProtectSystem=full includes ProtectSystem=yes */
 142 static const MountEntry protect_system_full_table[] = {
 143         { "/usr",                READONLY,     false },
 144         { "/boot",               READONLY,     true  },
 145         { "/efi",                READONLY,     true  },
 146         { "/etc",                READONLY,     false },
 147 };
 148
 149 /*
 150  * ProtectSystem=strict table. In this strict mode, we mount everything
 151  * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
 152  * which are left writable, but PrivateDevices= + ProtectKernelTunables=
 153  * protect those, and these options should be fully orthogonal.
 154  * (And of course /home and friends are also left writable, as ProtectHome=
 155  * shall manage those, orthogonally).
 156  */
 157 static const MountEntry protect_system_strict_table[] = {
 158         { "/",                   READONLY,     false },
 159         { "/proc",               READWRITE,    false },      /* ProtectKernelTunables= */
 160         { "/sys",                READWRITE,    false },      /* ProtectKernelTunables= */
 161         { "/dev",                READWRITE,    false },      /* PrivateDevices= */
 162         { "/home",               READWRITE,    true  },      /* ProtectHome= */
 163         { "/run/user",           READWRITE,    true  },      /* ProtectHome= */
 164         { "/root",               READWRITE,    true  },      /* ProtectHome= */
 165 };
 166
 167 static const char *mount_entry_path(const MountEntry *p) {
 168         assert(p);
 169
 170         /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
 171          * otherwise the stack/static ->path field is returned. */
 172
 173         return p->path_malloc ?: p->path_const;
 174 }
 175
 176 static bool mount_entry_read_only(const MountEntry *p) {
 177         assert(p);
 178
 179         return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
 180 }
 181
 182 static const char *mount_entry_source(const MountEntry *p) {
 183         assert(p);
 184
 185         return p->source_malloc ?: p->source_const;
 186 }
 187
 188 static void mount_entry_done(MountEntry *p) {
 189         assert(p);
 190
 191         p->path_malloc = mfree(p->path_malloc);
 192         p->source_malloc = mfree(p->source_malloc);
 193 }
 194
 195 static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
 196         char **i;
 197
 198         assert(p);
 199
 200         /* Adds a list of user-supplied READWRITE/READONLY/INACCESSIBLE entries */
 201
 202         STRV_FOREACH(i, strv) {
 203                 bool ignore = false, needs_prefix = false;
 204                 const char *e = *i;
 205
 206                 /* Look for any prefixes */
 207                 if (startswith(e, "-")) {
 208                         e++;
 209                         ignore = true;
 210                 }
 211                 if (startswith(e, "+")) {
 212                         e++;
 213                         needs_prefix = true;
 214                 }
 215
 216                 if (!path_is_absolute(e))
 217                         return -EINVAL;
 218
 219                 *((*p)++) = (MountEntry) {
 220                         .path_const = e,
 221                         .mode = mode,
 222                         .ignore = ignore,
 223                         .has_prefix = !needs_prefix && !forcibly_require_prefix,
 224                 };
 225         }
 226
 227         return 0;
 228 }
 229
 230 static int append_empty_dir_mounts(MountEntry **p, char **strv) {
 231         char **i;
 232
 233         assert(p);
 234
 235         /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
 236          * "/private/" boundary directories for DynamicUser=1. */
 237
 238         STRV_FOREACH(i, strv) {
 239
 240                 *((*p)++) = (MountEntry) {
 241                         .path_const = *i,
 242                         .mode = EMPTY_DIR,
 243                         .ignore = false,
 244                         .has_prefix = false,
 245                         .read_only = true,
 246                 };
 247         }
 248
 249         return 0;
 250 }
 251
 252 static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
 253         unsigned i;
 254
 255         assert(p);
 256
 257         for (i = 0; i < n; i++) {
 258                 const BindMount *b = binds + i;
 259
 260                 *((*p)++) = (MountEntry) {
 261                         .path_const = b->destination,
 262                         .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
 263                         .read_only = b->read_only,
 264                         .source_const = b->source,
 265                         .ignore = b->ignore_enoent,
 266                 };
 267         }
 268
 269         return 0;
 270 }
 271
 272 static int append_static_mounts(MountEntry **p, const MountEntry *mounts, unsigned n, bool ignore_protect) {
 273         unsigned i;
 274
 275         assert(p);
 276         assert(mounts);
 277
 278         /* Adds a list of static pre-defined entries */
 279
 280         for (i = 0; i < n; i++)
 281                 *((*p)++) = (MountEntry) {
 282                         .path_const = mount_entry_path(mounts+i),
 283                         .mode = mounts[i].mode,
 284                         .ignore = mounts[i].ignore || ignore_protect,
 285                 };
 286
 287         return 0;
 288 }
 289
 290 static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
 291         assert(p);
 292
 293         switch (protect_home) {
 294
 295         case PROTECT_HOME_NO:
 296                 return 0;
 297
 298         case PROTECT_HOME_READ_ONLY:
 299                 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
 300
 301         case PROTECT_HOME_YES:
 302                 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
 303
 304         default:
 305                 assert_not_reached("Unexpected ProtectHome= value");
 306         }
 307 }
 308
 309 static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
 310         assert(p);
 311
 312         switch (protect_system) {
 313
 314         case PROTECT_SYSTEM_NO:
 315                 return 0;
 316
 317         case PROTECT_SYSTEM_STRICT:
 318                 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
 319
 320         case PROTECT_SYSTEM_YES:
 321                 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
 322
 323         case PROTECT_SYSTEM_FULL:
 324                 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
 325
 326         default:
 327                 assert_not_reached("Unexpected ProtectSystem= value");
 328         }
 329 }
 330
 331 static int mount_path_compare(const void *a, const void *b) {
 332         const MountEntry *p = a, *q = b;
 333         int d;
 334
 335         /* If the paths are not equal, then order prefixes first */
 336         d = path_compare(mount_entry_path(p), mount_entry_path(q));
 337         if (d != 0)
 338                 return d;
 339
 340         /* If the paths are equal, check the mode */
 341         if (p->mode < q->mode)
 342                 return -1;
 343
 344         if (p->mode > q->mode)
 345                 return 1;
 346
 347         return 0;
 348 }
 349
 350 static int prefix_where_needed(MountEntry *m, unsigned n, const char *root_directory) {
 351         unsigned i;
 352
 353         /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs
 354          * that. */
 355
 356         if (!root_directory)
 357                 return 0;
 358
 359         for (i = 0; i < n; i++) {
 360                 char *s;
 361
 362                 if (m[i].has_prefix)
 363                         continue;
 364
 365                 s = prefix_root(root_directory, mount_entry_path(m+i));
 366                 if (!s)
 367                         return -ENOMEM;
 368
 369                 free_and_replace(m[i].path_malloc, s);
 370                 m[i].has_prefix = true;
 371         }
 372
 373         return 0;
 374 }
 375
 376 static void drop_duplicates(MountEntry *m, unsigned *n) {
 377         MountEntry *f, *t, *previous;
 378
 379         assert(m);
 380         assert(n);
 381
 382         /* Drops duplicate entries. Expects that the array is properly ordered already. */
 383
 384         for (f = m, t = m, previous = NULL; f < m + *n; f++) {
 385
 386                 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
 387                  * above. */
 388                 if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
 389                         log_debug("%s is duplicate.", mount_entry_path(f));
 390                         previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
 391                         mount_entry_done(f);
 392                         continue;
 393                 }
 394
 395                 *t = *f;
 396                 previous = t;
 397                 t++;
 398         }
 399
 400         *n = t - m;
 401 }
 402
 403 static void drop_inaccessible(MountEntry *m, unsigned *n) {
 404         MountEntry *f, *t;
 405         const char *clear = NULL;
 406
 407         assert(m);
 408         assert(n);
 409
 410         /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
 411          * ordered already. */
 412
 413         for (f = m, t = m; f < m + *n; f++) {
 414
 415                 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
 416                  * it, as inaccessible paths really should drop the entire subtree. */
 417                 if (clear && path_startswith(mount_entry_path(f), clear)) {
 418                         log_debug("%s is masked by %s.", mount_entry_path(f), clear);
 419                         mount_entry_done(f);
 420                         continue;
 421                 }
 422
 423                 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
 424
 425                 *t = *f;
 426                 t++;
 427         }
 428
 429         *n = t - m;
 430 }
 431
 432 static void drop_nop(MountEntry *m, unsigned *n) {
 433         MountEntry *f, *t;
 434
 435         assert(m);
 436         assert(n);
 437
 438         /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
 439          * list is ordered by prefixes. */
 440
 441         for (f = m, t = m; f < m + *n; f++) {
 442
 443                 /* Only suppress such subtrees for READONLY and READWRITE entries */
 444                 if (IN_SET(f->mode, READONLY, READWRITE)) {
 445                         MountEntry *p;
 446                         bool found = false;
 447
 448                         /* Now let's find the first parent of the entry we are looking at. */
 449                         for (p = t-1; p >= m; p--) {
 450                                 if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
 451                                         found = true;
 452                                         break;
 453                                 }
 454                         }
 455
 456                         /* We found it, let's see if it's the same mode, if so, we can drop this entry */
 457                         if (found && p->mode == f->mode) {
 458                                 log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
 459                                 mount_entry_done(f);
 460                                 continue;
 461                         }
 462                 }
 463
 464                 *t = *f;
 465                 t++;
 466         }
 467
 468         *n = t - m;
 469 }
 470
 471 static void drop_outside_root(const char *root_directory, MountEntry *m, unsigned *n) {
 472         MountEntry *f, *t;
 473
 474         assert(m);
 475         assert(n);
 476
 477         /* Nothing to do */
 478         if (!root_directory)
 479                 return;
 480
 481         /* Drops all mounts that are outside of the root directory. */
 482
 483         for (f = m, t = m; f < m + *n; f++) {
 484
 485                 if (!path_startswith(mount_entry_path(f), root_directory)) {
 486                         log_debug("%s is outside of root directory.", mount_entry_path(f));
 487                         mount_entry_done(f);
 488                         continue;
 489                 }
 490
 491                 *t = *f;
 492                 t++;
 493         }
 494
 495         *n = t - m;
 496 }
 497
 498 static int clone_device_node(const char *d, const char *temporary_mount) {
 499         const char *dn;
 500         struct stat st;
 501         int r;
 502
 503         if (stat(d, &st) < 0) {
 504                 if (errno == ENOENT)
 505                         return 0;
 506                 return -errno;
 507         }
 508
 509         if (!S_ISBLK(st.st_mode) &&
 510             !S_ISCHR(st.st_mode))
 511                 return -EINVAL;
 512
 513         if (st.st_rdev == 0)
 514                 return 0;
 515
 516         dn = strjoina(temporary_mount, d);
 517
 518         mac_selinux_create_file_prepare(d, st.st_mode);
 519         r = mknod(dn, st.st_mode, st.st_rdev);
 520         mac_selinux_create_file_clear();
 521         if (r < 0)
 522                 return log_debug_errno(errno, "mknod failed for %s: %m", d);
 523
 524         return 1;
 525 }
 526
  527 static int mount_private_dev(MountEntry *m) {
  528         static const char devnodes[] =
  529                 "/dev/null\0"
  530                 "/dev/zero\0"
  531                 "/dev/full\0"
  532                 "/dev/random\0"
  533                 "/dev/urandom\0"
  534                 "/dev/tty\0";
  535
  536         char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
  537         const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
  538         _cleanup_umask_ mode_t u;
  539         int r;
  540
  541         assert(m);
  542
              /* Assembles a reduced /dev below a temporary directory and then moves it over the path of the mount
               * entry. The tree consists of a tmpfs, bind mounts of the host's /dev/pts and /dev/shm (and, if
               * possible, /dev/mqueue and /dev/hugepages), a /dev/ptmx symlink or device node, a /dev/log symlink
               * and clones of the device nodes listed in devnodes[]. Returns 0 on success and a negative
               * errno-style value on failure, in which case the temporary tree is taken apart again. */

              /* NOTE(review): presumably the umask is cleared so that the modes passed to mkdir()/mknod() below
               * apply unmodified, and _cleanup_umask_ restores the old value on return; confirm in umask-util.h. */
  543         u = umask(0000);
  544
  545         if (!mkdtemp(temporary_mount))
  546                 return -errno;
  547
  548         dev = strjoina(temporary_mount, "/dev");
  549         (void) mkdir(dev, 0755);
  550         if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
  551                 r = -errno;
  552                 goto fail;
  553         }
  554
  555         devpts = strjoina(temporary_mount, "/dev/pts");
  556         (void) mkdir(devpts, 0755);
  557         if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
  558                 r = -errno;
  559                 goto fail;
  560         }
  561
  562         /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx
  563          * when /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible
  564          * thus, in that case make a clone
  565          *
  566          * in nspawn and other containers it will be a symlink, in that case make it a symlink
  567          */
  568         r = is_symlink("/dev/ptmx");
  569         if (r < 0)
  570                 goto fail;
  571         if (r > 0) {
  572                 devptmx = strjoina(temporary_mount, "/dev/ptmx");
  573                 if (symlink("pts/ptmx", devptmx) < 0) {
  574                         r = -errno;
  575                         goto fail;
  576                 }
  577         } else {
  578                 r = clone_device_node("/dev/ptmx", temporary_mount);
  579                 if (r < 0)
  580                         goto fail;
                      /* clone_device_node() returns 0 if there was nothing to clone; /dev/ptmx is required, hence
                       * treat that as an error here. */
  581                 if (r == 0) {
  582                         r = -ENXIO;
  583                         goto fail;
  584                 }
  585         }
  586
  587         devshm = strjoina(temporary_mount, "/dev/shm");
  588         (void) mkdir(devshm, 0755);
  589         r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
  590         if (r < 0) {
  591                 r = -errno;
  592                 goto fail;
  593         }
  594
              /* The following bind mounts and the /dev/log symlink are best-effort: failures are ignored */
  595         devmqueue = strjoina(temporary_mount, "/dev/mqueue");
  596         (void) mkdir(devmqueue, 0755);
  597         (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
  598
  599         devhugepages = strjoina(temporary_mount, "/dev/hugepages");
  600         (void) mkdir(devhugepages, 0755);
  601         (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
  602
  603         devlog = strjoina(temporary_mount, "/dev/log");
  604         (void) symlink("/run/systemd/journal/dev-log", devlog);
  605
              /* Nodes that are missing on the host are skipped (clone_device_node() returns 0 then), any other
               * failure is fatal. */
  606         NULSTR_FOREACH(d, devnodes) {
  607                 r = clone_device_node(d, temporary_mount);
  608                 if (r < 0)
  609                         goto fail;
  610         }
  611
              /* NOTE(review): the result of dev_setup() is not checked; what it creates is defined in
               * dev-setup.h and not visible here. */
  612         dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
  613
  614         /* Create the /dev directory if missing. It is more likely to be
  615          * missing when the service is started with RootDirectory. This is
  616          * consistent with mount units creating the mount points when missing.
  617          */
  618         (void) mkdir_p_label(mount_entry_path(m), 0755);
  619
  620         /* Unmount everything in old /dev */
  621         umount_recursive(mount_entry_path(m), 0);
              /* Move the assembled tree from the temporary location onto its final place */
  622         if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
  623                 r = -errno;
  624                 goto fail;
  625         }
  626
              /* The mount has been moved away, only the now unused temporary directories are left to remove */
  627         rmdir(dev);
  628         rmdir(temporary_mount);
  629
  630         return 0;
  631
  632 fail:
              /* The dev* variables are initialized to NULL and assigned as the respective step is reached, hence
               * only what was set up (or at least attempted) is unmounted here. 'dev' itself is always set when
               * we get here, as every jump to this label happens after its assignment. */
  633         if (devpts)
  634                 umount(devpts);
  635
  636         if (devshm)
  637                 umount(devshm);
  638
  639         if (devhugepages)
  640                 umount(devhugepages);
  641
  642         if (devmqueue)
  643                 umount(devmqueue);
  644
  645         umount(dev);
  646         rmdir(dev);
  647         rmdir(temporary_mount);
  648
  649         return r;
  650 }
 651
 652 static int mount_bind_dev(const MountEntry *m) {
 653         int r;
 654
 655         assert(m);
 656
 657         /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
 658          * /dev. This is only used when RootDirectory= is set. */
 659
 660         (void) mkdir_p_label(mount_entry_path(m), 0755);
 661
 662         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 663         if (r < 0)
 664                 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
 665         if (r > 0) /* make this a NOP if /dev is already a mount point */
 666                 return 0;
 667
 668         if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 669                 return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
 670
 671         return 1;
 672 }
 673
 674 static int mount_sysfs(const MountEntry *m) {
 675         int r;
 676
 677         assert(m);
 678
 679         (void) mkdir_p_label(mount_entry_path(m), 0755);
 680
 681         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 682         if (r < 0)
 683                 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
 684         if (r > 0) /* make this a NOP if /sys is already a mount point */
 685                 return 0;
 686
 687         /* Bind mount the host's version so that we get all child mounts of it, too. */
 688         if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
 689                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 690
 691         return 1;
 692 }
 693
 694 static int mount_procfs(const MountEntry *m) {
 695         int r;
 696
 697         assert(m);
 698
 699         (void) mkdir_p_label(mount_entry_path(m), 0755);
 700
 701         r = path_is_mount_point(mount_entry_path(m), NULL, 0);
 702         if (r < 0)
 703                 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
 704         if (r > 0) /* make this a NOP if /proc is already a mount point */
 705                 return 0;
 706
 707         /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
 708         if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
 709                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 710
 711         return 1;
 712 }
 713
 714 static int mount_empty_dir(const MountEntry *m) {
 715         assert(m);
 716
 717         /* First, get rid of everything that is below if there is anything. Then, overmount with our new empty dir */
 718
 719         (void) mkdir_p_label(mount_entry_path(m), 0755);
 720         (void) umount_recursive(mount_entry_path(m), 0);
 721
 722         if (mount("tmpfs", mount_entry_path(m), "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
 723                 return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
 724
 725         return 1;
 726 }
 727
 728 static int mount_entry_chase(
 729                 const char *root_directory,
 730                 const MountEntry *m,
 731                 const char *path,
 732                 bool chase_nonexistent,
 733                 char **location) {
 734
 735         char *chased;
 736         int r;
 737
 738         assert(m);
 739
 740         /* Since mount() will always follow symlinks and we need to take the different root directory into account we
 741          * chase the symlinks on our own first. This is called for the destination path, as well as the source path (if
 742          * that applies). The result is stored in "location". */
 743
 744         r = chase_symlinks(path, root_directory, chase_nonexistent ? CHASE_NONEXISTENT : 0, &chased);
 745         if (r == -ENOENT && m->ignore) {
 746                 log_debug_errno(r, "Path %s does not exist, ignoring.", path);
 747                 return 0;
 748         }
 749         if (r < 0)
 750                 return log_debug_errno(r, "Failed to follow symlinks on %s: %m", path);
 751
 752         log_debug("Followed symlinks %s → %s.", path, chased);
 753
 754         free(*location);
 755         *location = chased;
 756
 757         return 1;
 758 }
 759
  760 static int apply_mount(
  761                 const char *root_directory,
  762                 MountEntry *m) {
  763
  764         bool rbind = true, make = false;
  765         const char *what;
  766         int r;
  767
  768         assert(m);
  769
              /* Establishes the mount described by a single entry. The destination path is resolved first; the
               * modes with a dedicated helper are then handed off to it, while all others end up as a bind mount
               * of 'what' onto the destination. Returns a negative errno-style value on failure, and 0 or a
               * positive value otherwise (including the case where the entry is skipped). */

              /* For INACCESSIBLE, READONLY and READWRITE the destination has to exist; for all other modes a
               * not yet existing destination is acceptable when resolving the path. */
  770         r = mount_entry_chase(root_directory, m, mount_entry_path(m), !IN_SET(m->mode, INACCESSIBLE, READONLY, READWRITE), &m->path_malloc);
  771         if (r <= 0)
  772                 return r;
  773
  774         log_debug("Applying namespace mount on %s", mount_entry_path(m));
  775
  776         switch (m->mode) {
  777
  778         case INACCESSIBLE: {
  779                 struct stat target;
  780
  781                 /* First, get rid of everything that is below if there
  782                  * is anything... Then, overmount it with an
  783                  * inaccessible path. */
  784                 (void) umount_recursive(mount_entry_path(m), 0);
  785
  786                 if (lstat(mount_entry_path(m), &target) < 0)
  787                         return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
  788
                      /* The node to mount over the path is selected by the file type of the destination */
  789                 what = mode_to_inaccessible_node(target.st_mode);
  790                 if (!what) {
  791                         log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed");
  792                         return -ELOOP;
  793                 }
  794                 break;
  795         }
  796
  797         case READONLY:
  798         case READWRITE:
  799                 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
  800                 if (r < 0)
  801                         return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
  802                 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
  803                         return 0;
  804                 /* This isn't a mount point yet, let's make it one. */
                      /* (by bind mounting the path onto itself) */
  805                 what = mount_entry_path(m);
  806                 break;
  807
  808         case BIND_MOUNT:
  809                 rbind = false;
  810
  811                 _fallthrough_;
  812         case BIND_MOUNT_RECURSIVE:
  813                 /* Also chase the source mount */
  814
  815                 r = mount_entry_chase(root_directory, m, mount_entry_source(m), false, &m->source_malloc);
  816                 if (r <= 0)
  817                         return r;
  818
  819                 what = mount_entry_source(m);
  820                 make = true;
  821                 break;
  822
  823         case EMPTY_DIR:
  824                 return mount_empty_dir(m);
  825
  826         case PRIVATE_TMP:
                      /* Unlike for the bind mount modes above, the source is used as is, without chasing symlinks */
  827                 what = mount_entry_source(m);
  828                 make = true;
  829                 break;
  830
  831         case PRIVATE_DEV:
  832                 return mount_private_dev(m);
  833
  834         case BIND_DEV:
  835                 return mount_bind_dev(m);
  836
  837         case SYSFS:
  838                 return mount_sysfs(m);
  839
  840         case PROCFS:
  841                 return mount_procfs(m);
  842
  843         default:
  844                 assert_not_reached("Unknown mode");
  845         }
  846
  847         assert(what);
  848
              /* Everything that gets here is a bind mount, recursive unless the mode is BIND_MOUNT */
  849         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
  850                 bool try_again = false;
  851                 r = -errno;
  852
                      /* 'make' is only set for the modes that may create a missing destination */
  853                 if (r == -ENOENT && make) {
  854                         struct stat st;
  855
  856                         /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
  857
  858                         if (stat(what, &st) >= 0) {
  859
  860                                 (void) mkdir_parents(mount_entry_path(m), 0755);
  861
                                      /* Create a destination of the same kind as the source: a directory for a
                                       * directory, a file created by touch() for everything else */
  862                                 if (S_ISDIR(st.st_mode))
  863                                         try_again = mkdir(mount_entry_path(m), 0755) >= 0;
  864                                 else
  865                                         try_again = touch(mount_entry_path(m)) >= 0;
  866                         }
  867                 }
  868
  869                 if (try_again) {
  870                         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
  871                                 r = -errno;
  872                         else
  873                                 r = 0;
  874                 }
  875
                      /* Without a retry 'r' still carries the error of the first attempt */
  876                 if (r < 0)
  877                         return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
  878         }
  879
  880         log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
  881         return 0;
  882 }
 883
 884 static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
 885         int r = 0;
 886
 887         assert(m);
 888         assert(proc_self_mountinfo);
 889
 890         if (mount_entry_read_only(m))
 891                 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo);
 892         else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't */
 893                 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
 894                         r = -errno;
 895         } else
 896                 return 0;
 897
 898         /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
 899          * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
 900          * read-only mounts already applied. */
 901
 902         if (r == -ENOENT && m->ignore)
 903                 r = 0;
 904
 905         return r;
 906 }
 907
 908 static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) {
 909         assert(ns_info);
 910
 911         /*
 912          * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
 913          * since to protect the API VFS mounts, they need to be around in the
 914          * first place... and RootDirectory= or RootImage= need to be set.
 915          */
 916
 917         /* root_directory should point to a mount point */
 918         return root_directory &&
 919                 (ns_info->mount_apivfs ||
 920                  ns_info->protect_control_groups ||
 921                  ns_info->protect_kernel_tunables);
 922 }
 923
 924 static unsigned namespace_calculate_mounts(
 925                 const char* root_directory,
 926                 const NamespaceInfo *ns_info,
 927                 char** read_write_paths,
 928                 char** read_only_paths,
 929                 char** inaccessible_paths,
 930                 char** empty_directories,
 931                 unsigned n_bind_mounts,
 932                 const char* tmp_dir,
 933                 const char* var_tmp_dir,
 934                 ProtectHome protect_home,
 935                 ProtectSystem protect_system) {
 936
 937         unsigned protect_home_cnt;
 938         unsigned protect_system_cnt =
 939                 (protect_system == PROTECT_SYSTEM_STRICT ?
 940                  ELEMENTSOF(protect_system_strict_table) :
 941                  ((protect_system == PROTECT_SYSTEM_FULL) ?
 942                   ELEMENTSOF(protect_system_full_table) :
 943                   ((protect_system == PROTECT_SYSTEM_YES) ?
 944                    ELEMENTSOF(protect_system_yes_table) : 0)));
 945
 946         protect_home_cnt =
 947                 (protect_home == PROTECT_HOME_YES ?
 948                  ELEMENTSOF(protect_home_yes_table) :
 949                  ((protect_home == PROTECT_HOME_READ_ONLY) ?
 950                   ELEMENTSOF(protect_home_read_only_table) : 0));
 951
 952         return !!tmp_dir + !!var_tmp_dir +
 953                 strv_length(read_write_paths) +
 954                 strv_length(read_only_paths) +
 955                 strv_length(inaccessible_paths) +
 956                 strv_length(empty_directories) +
 957                 n_bind_mounts +
 958                 ns_info->private_dev +
 959                 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
 960                 (ns_info->protect_control_groups ? 1 : 0) +
 961                 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
 962                 protect_home_cnt + protect_system_cnt +
 963                 (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0);
 964 }
 965
 966 int setup_namespace(
 967                 const char* root_directory,
 968                 const char* root_image,
 969                 const NamespaceInfo *ns_info,
 970                 char** read_write_paths,
 971                 char** read_only_paths,
 972                 char** inaccessible_paths,
 973                 char** empty_directories,
 974                 const BindMount *bind_mounts,
 975                 unsigned n_bind_mounts,
 976                 const char* tmp_dir,
 977                 const char* var_tmp_dir,
 978                 ProtectHome protect_home,
 979                 ProtectSystem protect_system,
 980                 unsigned long mount_flags,
 981                 DissectImageFlags dissect_image_flags) {
 982
 983         _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
 984         _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
 985         _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
 986         _cleanup_free_ void *root_hash = NULL;
 987         MountEntry *m, *mounts = NULL;
 988         size_t root_hash_size = 0;
 989         bool make_slave = false;
 990         const char *root;
 991         unsigned n_mounts;
 992         bool require_prefix = false;
 993         int r = 0;
 994
 995         assert(ns_info);
 996
 997         if (mount_flags == 0)
 998                 mount_flags = MS_SHARED;
 999
1000         if (root_image) {
1001                 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1002
1003                 if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
1004                         dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1005
1006                 r = loop_device_make_by_path(root_image,
1007                                              dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1008                                              &loop_device);
1009                 if (r < 0)
1010                         return r;
1011
1012                 r = root_hash_load(root_image, &root_hash, &root_hash_size);
1013                 if (r < 0)
1014                         return r;
1015
1016                 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1017                 if (r < 0)
1018                         return r;
1019
1020                 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
1021                 if (r < 0)
1022                         return r;
1023         }
1024
1025         if (root_directory)
1026                 root = root_directory;
1027         else if (root_image || n_bind_mounts > 0) {
1028
1029                 /* If we are booting from an image, create a mount point for the image, if it's still missing. We use
1030                  * the same mount point for all images, which is safe, since they all live in their own namespaces
1031                  * after all, and hence won't see each other. We also use such a root directory whenever there are bind
1032                  * mounts configured, so that their source mounts are never obstructed by mounts we already applied
1033                  * while we are applying them. */
1034
1035                 root = "/run/systemd/unit-root";
1036                 (void) mkdir_label(root, 0700);
1037                 require_prefix = true;
1038         } else
1039                 root = NULL;
1040
1041         n_mounts = namespace_calculate_mounts(
1042                         root,
1043                         ns_info,
1044                         read_write_paths,
1045                         read_only_paths,
1046                         inaccessible_paths,
1047                         empty_directories,
1048                         n_bind_mounts,
1049                         tmp_dir, var_tmp_dir,
1050                         protect_home, protect_system);
1051
1052         /* Set mount slave mode */
1053         if (root || n_mounts > 0)
1054                 make_slave = true;
1055
1056         if (n_mounts > 0) {
1057                 m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
1058                 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
1059                 if (r < 0)
1060                         goto finish;
1061
1062                 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
1063                 if (r < 0)
1064                         goto finish;
1065
1066                 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
1067                 if (r < 0)
1068                         goto finish;
1069
1070                 r = append_empty_dir_mounts(&m, empty_directories);
1071                 if (r < 0)
1072                         goto finish;
1073
1074                 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1075                 if (r < 0)
1076                         goto finish;
1077
1078                 if (tmp_dir) {
1079                         *(m++) = (MountEntry) {
1080                                 .path_const = "/tmp",
1081                                 .mode = PRIVATE_TMP,
1082                                 .source_const = tmp_dir,
1083                         };
1084                 }
1085
1086                 if (var_tmp_dir) {
1087                         *(m++) = (MountEntry) {
1088                                 .path_const = "/var/tmp",
1089                                 .mode = PRIVATE_TMP,
1090                                 .source_const = var_tmp_dir,
1091                         };
1092                 }
1093
1094                 if (ns_info->private_dev) {
1095                         *(m++) = (MountEntry) {
1096                                 .path_const = "/dev",
1097                                 .mode = PRIVATE_DEV,
1098                         };
1099                 }
1100
1101                 if (ns_info->protect_kernel_tunables) {
1102                         r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
1103                         if (r < 0)
1104                                 goto finish;
1105                 }
1106
1107                 if (ns_info->protect_kernel_modules) {
1108                         r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
1109                         if (r < 0)
1110                                 goto finish;
1111                 }
1112
1113                 if (ns_info->protect_control_groups) {
1114                         *(m++) = (MountEntry) {
1115                                 .path_const = "/sys/fs/cgroup",
1116                                 .mode = READONLY,
1117                         };
1118                 }
1119
1120                 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
1121                 if (r < 0)
1122                         goto finish;
1123
1124                 r = append_protect_system(&m, protect_system, false);
1125                 if (r < 0)
1126                         goto finish;
1127
1128                 if (namespace_info_mount_apivfs(root, ns_info)) {
1129                         r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1130                         if (r < 0)
1131                                 goto finish;
1132                 }
1133
1134                 assert(mounts + n_mounts == m);
1135
1136                 /* Prepend the root directory where that's necessary */
1137                 r = prefix_where_needed(mounts, n_mounts, root);
1138                 if (r < 0)
1139                         goto finish;
1140
1141                 qsort(mounts, n_mounts, sizeof(MountEntry), mount_path_compare);
1142
1143                 drop_duplicates(mounts, &n_mounts);
1144                 drop_outside_root(root, mounts, &n_mounts);
1145                 drop_inaccessible(mounts, &n_mounts);
1146                 drop_nop(mounts, &n_mounts);
1147         }
1148
1149         if (unshare(CLONE_NEWNS) < 0) {
1150                 r = -errno;
1151                 goto finish;
1152         }
1153
1154         if (make_slave) {
1155                 /* Remount / as SLAVE so that nothing now mounted in the namespace
1156                    shows up in the parent */
1157                 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1158                         r = -errno;
1159                         goto finish;
1160                 }
1161         }
1162
1163         if (root_image) {
1164                 /* A root image is specified, mount it to the right place */
1165                 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
1166                 if (r < 0)
1167                         goto finish;
1168
1169                 if (decrypted_image) {
1170                         r = decrypted_image_relinquish(decrypted_image);
1171                         if (r < 0)
1172                                 goto finish;
1173                 }
1174
1175                 loop_device_relinquish(loop_device);
1176
1177         } else if (root_directory) {
1178
1179                 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1180                 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
1181                 if (r < 0)
1182                         goto finish;
1183                 if (r == 0) {
1184                         if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1185                                 r = -errno;
1186                                 goto finish;
1187                         }
1188                 }
1189
1190         } else if (root) {
1191
1192                 /* Let's mount the main root directory to the root directory to use */
1193                 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1194                         r = -errno;
1195                         goto finish;
1196                 }
1197         }
1198
1199         /* Try to set up the new root directory before mounting anything else there. */
1200         if (root_image || root_directory)
1201                 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1202
1203         if (n_mounts > 0) {
1204                 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
1205                 char **blacklist;
1206                 unsigned j;
1207
1208                 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1209                  * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1210                 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1211                 if (!proc_self_mountinfo) {
1212                         r = -errno;
1213                         goto finish;
1214                 }
1215
1216                 /* First round, add in all special mounts we need */
1217                 for (m = mounts; m < mounts + n_mounts; ++m) {
1218                         r = apply_mount(root, m);
1219                         if (r < 0)
1220                                 goto finish;
1221                 }
1222
1223                 /* Create a blacklist we can pass to bind_mount_recursive() */
1224                 blacklist = newa(char*, n_mounts+1);
1225                 for (j = 0; j < n_mounts; j++)
1226                         blacklist[j] = (char*) mount_entry_path(mounts+j);
1227                 blacklist[j] = NULL;
1228
1229                 /* Second round, flip the ro bits if necessary. */
1230                 for (m = mounts; m < mounts + n_mounts; ++m) {
1231                         r = make_read_only(m, blacklist, proc_self_mountinfo);
1232                         if (r < 0)
1233                                 goto finish;
1234                 }
1235         }
1236
1237         if (root) {
1238                 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1239                 r = mount_move_root(root);
1240                 if (r < 0)
1241                         goto finish;
1242         }
1243
1244         /* Remount / as the desired mode. Note that this will not
1245          * reestablish propagation from our side to the host, since
1246          * what's disconnected is disconnected. */
1247         if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1248                 r = -errno;
1249                 goto finish;
1250         }
1251
1252         r = 0;
1253
1254 finish:
1255         for (m = mounts; m < mounts + n_mounts; m++)
1256                 mount_entry_done(m);
1257
1258         return r;
1259 }
1260
1261 void bind_mount_free_many(BindMount *b, unsigned n) {
1262         unsigned i;
1263
1264         assert(b || n == 0);
1265
1266         for (i = 0; i < n; i++) {
1267                 free(b[i].source);
1268                 free(b[i].destination);
1269         }
1270
1271         free(b);
1272 }
1273
1274 int bind_mount_add(BindMount **b, unsigned *n, const BindMount *item) {
1275         _cleanup_free_ char *s = NULL, *d = NULL;
1276         BindMount *c;
1277
1278         assert(b);
1279         assert(n);
1280         assert(item);
1281
1282         s = strdup(item->source);
1283         if (!s)
1284                 return -ENOMEM;
1285
1286         d = strdup(item->destination);
1287         if (!d)
1288                 return -ENOMEM;
1289
1290         c = realloc_multiply(*b, sizeof(BindMount), *n + 1);
1291         if (!c)
1292                 return -ENOMEM;
1293
1294         *b = c;
1295
1296         c[(*n) ++] = (BindMount) {
1297                 .source = s,
1298                 .destination = d,
1299                 .read_only = item->read_only,
1300                 .recursive = item->recursive,
1301                 .ignore_enoent = item->ignore_enoent,
1302         };
1303
1304         s = d = NULL;
1305         return 0;
1306 }
1307
1308 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1309         _cleanup_free_ char *x = NULL;
1310         char bid[SD_ID128_STRING_MAX];
1311         sd_id128_t boot_id;
1312         int r;
1313
1314         assert(id);
1315         assert(prefix);
1316         assert(path);
1317
1318         /* We include the boot id in the directory so that after a
1319          * reboot we can easily identify obsolete directories. */
1320
1321         r = sd_id128_get_boot(&boot_id);
1322         if (r < 0)
1323                 return r;
1324
1325         x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1326         if (!x)
1327                 return -ENOMEM;
1328
1329         RUN_WITH_UMASK(0077)
1330                 if (!mkdtemp(x))
1331                         return -errno;
1332
1333         RUN_WITH_UMASK(0000) {
1334                 char *y;
1335
1336                 y = strjoina(x, "/tmp");
1337
1338                 if (mkdir(y, 0777 | S_ISVTX) < 0)
1339                         return -errno;
1340         }
1341
1342         *path = x;
1343         x = NULL;
1344
1345         return 0;
1346 }
1347
/* Creates the pair of private temporary directories for "id", one below /tmp and one below /var/tmp.
 *
 * On success 0 is returned and *tmp_dir / *var_tmp_dir receive newly allocated paths owned by the caller.
 * If the second directory cannot be created, the first one is removed again, its path is freed, and the
 * error is returned; the output parameters are not touched in that case. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back the first directory: remove the inner "tmp" directory first, then its parent */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1377
1378 int setup_netns(int netns_storage_socket[2]) {
1379         _cleanup_close_ int netns = -1;
1380         int r, q;
1381
1382         assert(netns_storage_socket);
1383         assert(netns_storage_socket[0] >= 0);
1384         assert(netns_storage_socket[1] >= 0);
1385
1386         /* We use the passed socketpair as a storage buffer for our
1387          * namespace reference fd. Whatever process runs this first
1388          * shall create a new namespace, all others should just join
1389          * it. To serialize that we use a file lock on the socket
1390          * pair.
1391          *
1392          * It's a bit crazy, but hey, works great! */
1393
1394         if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1395                 return -errno;
1396
1397         netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1398         if (netns == -EAGAIN) {
1399                 /* Nothing stored yet, so let's create a new namespace */
1400
1401                 if (unshare(CLONE_NEWNET) < 0) {
1402                         r = -errno;
1403                         goto fail;
1404                 }
1405
1406                 loopback_setup();
1407
1408                 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1409                 if (netns < 0) {
1410                         r = -errno;
1411                         goto fail;
1412                 }
1413
1414                 r = 1;
1415
1416         } else if (netns < 0) {
1417                 r = netns;
1418                 goto fail;
1419
1420         } else {
1421                 /* Yay, found something, so let's join the namespace */
1422                 if (setns(netns, CLONE_NEWNET) < 0) {
1423                         r = -errno;
1424                         goto fail;
1425                 }
1426
1427                 r = 0;
1428         }
1429
1430         q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1431         if (q < 0) {
1432                 r = q;
1433                 goto fail;
1434         }
1435
1436 fail:
1437         (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1438         return r;
1439 }
1440
1441 bool ns_type_supported(NamespaceType type) {
1442         const char *t, *ns_proc;
1443
1444         t = namespace_type_to_string(type);
1445         if (!t) /* Don't know how to translate this? Then it's not supported */
1446                 return false;
1447
1448         ns_proc = strjoina("/proc/self/ns/", t);
1449         return access(ns_proc, F_OK) == 0;
1450 }
1451
1452 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1453         [PROTECT_HOME_NO] = "no",
1454         [PROTECT_HOME_YES] = "yes",
1455         [PROTECT_HOME_READ_ONLY] = "read-only",
1456 };
1457
1458 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
1459
1460 ProtectHome parse_protect_home_or_bool(const char *s) {
1461         int r;
1462
1463         r = parse_boolean(s);
1464         if (r > 0)
1465                 return PROTECT_HOME_YES;
1466         if (r == 0)
1467                 return PROTECT_HOME_NO;
1468
1469         return protect_home_from_string(s);
1470 }
1471
1472 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1473         [PROTECT_SYSTEM_NO] = "no",
1474         [PROTECT_SYSTEM_YES] = "yes",
1475         [PROTECT_SYSTEM_FULL] = "full",
1476         [PROTECT_SYSTEM_STRICT] = "strict",
1477 };
1478
1479 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
1480
1481 ProtectSystem parse_protect_system_or_bool(const char *s) {
1482         int r;
1483
1484         r = parse_boolean(s);
1485         if (r > 0)
1486                 return PROTECT_SYSTEM_YES;
1487         if (r == 0)
1488                 return PROTECT_SYSTEM_NO;
1489
1490         return protect_system_from_string(s);
1491 }
1492
1493 static const char* const namespace_type_table[] = {
1494         [NAMESPACE_MOUNT] = "mnt",
1495         [NAMESPACE_CGROUP] = "cgroup",
1496         [NAMESPACE_UTS] = "uts",
1497         [NAMESPACE_IPC] = "ipc",
1498         [NAMESPACE_USER] = "user",
1499         [NAMESPACE_PID] = "pid",
1500         [NAMESPACE_NET] = "net",
1501 };
1502
1503 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);