src/shared/mount-util.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <linux/loop.h>
   5 #include <stdlib.h>
   6 #include <sys/mount.h>
   7 #include <sys/stat.h>
   8 #include <sys/statvfs.h>
   9 #include <unistd.h>
  10
  11 #include "alloc-util.h"
  12 #include "dissect-image.h"
  13 #include "extract-word.h"
  14 #include "fd-util.h"
  15 #include "fileio.h"
  16 #include "fs-util.h"
  17 #include "hashmap.h"
  18 #include "libmount-util.h"
  19 #include "mkdir.h"
  20 #include "mount-util.h"
  21 #include "mountpoint-util.h"
  22 #include "namespace-util.h"
  23 #include "parse-util.h"
  24 #include "path-util.h"
  25 #include "process-util.h"
  26 #include "set.h"
  27 #include "stat-util.h"
  28 #include "stdio-util.h"
  29 #include "string-util.h"
  30 #include "strv.h"
  31 #include "tmpfile-util.h"
  32 #include "user-util.h"
  33
  34 int mount_fd(const char *source,
  35              int target_fd,
  36              const char *filesystemtype,
  37              unsigned long mountflags,
  38              const void *data) {
  39
  40         char path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
  41
  42         xsprintf(path, "/proc/self/fd/%i", target_fd);
  43         if (mount(source, path, filesystemtype, mountflags, data) < 0) {
  44                 if (errno != ENOENT)
  45                         return -errno;
  46
  47                 /* ENOENT can mean two things: either that the source is missing, or that /proc/ isn't
  48                  * mounted. Check for the latter to generate better error messages. */
  49                 if (proc_mounted() == 0)
  50                         return -ENOSYS;
  51
  52                 return -ENOENT;
  53         }
  54
  55         return 0;
  56 }
  57
  58 int mount_nofollow(
  59                 const char *source,
  60                 const char *target,
  61                 const char *filesystemtype,
  62                 unsigned long mountflags,
  63                 const void *data) {
  64
  65         _cleanup_close_ int fd = -1;
  66
  67         /* In almost all cases we want to manipulate the mount table without following symlinks, hence
  68          * mount_nofollow() is usually the way to go. The only exceptions are environments where /proc/ is
  69          * not available yet, since we need /proc/self/fd/ for this logic to work. i.e. during the early
  70          * initialization of namespacing/container stuff where /proc is not yet mounted (and maybe even the
  71          * fs to mount) we can only use traditional mount() directly.
  72          *
  73          * Note that this disables following only for the final component of the target, i.e symlinks within
  74          * the path of the target are honoured, as are symlinks in the source path everywhere. */
  75
  76         fd = open(target, O_PATH|O_CLOEXEC|O_NOFOLLOW);
  77         if (fd < 0)
  78                 return -errno;
  79
  80         return mount_fd(source, fd, filesystemtype, mountflags, data);
  81 }
  82
  83 int umount_recursive(const char *prefix, int flags) {
  84         int n = 0, r;
  85         bool again;
  86
  87         /* Try to umount everything recursively below a
  88          * directory. Also, take care of stacked mounts, and keep
  89          * unmounting them until they are gone. */
  90
  91         do {
  92                 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
  93                 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
  94
  95                 again = false;
  96
  97                 r = libmount_parse("/proc/self/mountinfo", NULL, &table, &iter);
  98                 if (r < 0)
  99                         return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
 100
 101                 for (;;) {
 102                         struct libmnt_fs *fs;
 103                         const char *path;
 104
 105                         r = mnt_table_next_fs(table, iter, &fs);
 106                         if (r == 1)
 107                                 break;
 108                         if (r < 0)
 109                                 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
 110
 111                         path = mnt_fs_get_target(fs);
 112                         if (!path)
 113                                 continue;
 114
 115                         if (!path_startswith(path, prefix))
 116                                 continue;
 117
 118                         if (umount2(path, flags | UMOUNT_NOFOLLOW) < 0) {
 119                                 log_debug_errno(errno, "Failed to umount %s, ignoring: %m", path);
 120                                 continue;
 121                         }
 122
 123                         log_debug("Successfully unmounted %s", path);
 124
 125                         again = true;
 126                         n++;
 127
 128                         break;
 129                 }
 130         } while (again);
 131
 132         return n;
 133 }
 134
 135 static int get_mount_flags(
 136                 struct libmnt_table *table,
 137                 const char *path,
 138                 unsigned long *ret) {
 139
 140         _cleanup_close_ int fd = -1;
 141         struct libmnt_fs *fs;
 142         struct statvfs buf;
 143         const char *opts;
 144         int r;
 145
 146         /* Get the mount flags for the mountpoint at "path" from "table". We have a fallback using statvfs()
 147          * in place (which provides us with mostly the same info), but it's just a fallback, since using it
 148          * means triggering autofs or NFS mounts, which we'd rather avoid needlessly.
 149          *
 150          * This generally doesn't follow symlinks. */
 151
 152         fs = mnt_table_find_target(table, path, MNT_ITER_FORWARD);
 153         if (!fs) {
 154                 log_debug("Could not find '%s' in mount table, ignoring.", path);
 155                 goto fallback;
 156         }
 157
 158         opts = mnt_fs_get_vfs_options(fs);
 159         if (!opts) {
 160                 *ret = 0;
 161                 return 0;
 162         }
 163
 164         r = mnt_optstr_get_flags(opts, ret, mnt_get_builtin_optmap(MNT_LINUX_MAP));
 165         if (r != 0) {
 166                 log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
 167                 goto fallback;
 168         }
 169
 170         /* MS_RELATIME is default and trying to set it in an unprivileged container causes EPERM */
 171         *ret &= ~MS_RELATIME;
 172         return 0;
 173
 174 fallback:
 175         fd = open(path, O_PATH|O_CLOEXEC|O_NOFOLLOW);
 176         if (fd < 0)
 177                 return -errno;
 178
 179         if (fstatvfs(fd, &buf) < 0)
 180                 return -errno;
 181
 182         /* The statvfs() flags and the mount flags mostly have the same values, but for some cases do
 183          * not. Hence map the flags manually. (Strictly speaking, ST_RELATIME/MS_RELATIME is the most
 184          * prominent one that doesn't match, but that's the one we mask away anyway, see above.) */
 185
 186         *ret =
 187                 FLAGS_SET(buf.f_flag, ST_RDONLY) * MS_RDONLY |
 188                 FLAGS_SET(buf.f_flag, ST_NODEV) * MS_NODEV |
 189                 FLAGS_SET(buf.f_flag, ST_NOEXEC) * MS_NOEXEC |
 190                 FLAGS_SET(buf.f_flag, ST_NOSUID) * MS_NOSUID |
 191                 FLAGS_SET(buf.f_flag, ST_NOATIME) * MS_NOATIME |
 192                 FLAGS_SET(buf.f_flag, ST_NODIRATIME) * MS_NODIRATIME;
 193
 194         return 0;
 195 }
 196
 197 /* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it
 198  * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive. */
 199 int bind_remount_recursive_with_mountinfo(
 200                 const char *prefix,
 201                 unsigned long new_flags,
 202                 unsigned long flags_mask,
 203                 char **deny_list,
 204                 FILE *proc_self_mountinfo) {
 205
 206         _cleanup_set_free_free_ Set *done = NULL;
 207         _cleanup_free_ char *simplified = NULL;
 208         int r;
 209
 210         assert(prefix);
 211         assert(proc_self_mountinfo);
 212
 213         /* Recursively remount a directory (and all its submounts) with desired flags (MS_READONLY,
 214          * MS_NOSUID, MS_NOEXEC). If the directory is already mounted, we reuse the mount and simply mark it
 215          * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write operation), ditto for other flags. If it
 216          * isn't we first make it one. Afterwards we apply (or remove) the flags to all submounts we can
 217          * access, too. When mounts are stacked on the same mount point we only care for each individual
 218          * "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We do
 219          * not have any effect on future submounts that might get propagated, they might be writable
 220          * etc. This includes future submounts that have been triggered via autofs.
 221          *
 222          * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the
 223          * remount operation. Note that we'll ignore the deny list for the top-level path. */
 224
 225         simplified = strdup(prefix);
 226         if (!simplified)
 227                 return -ENOMEM;
 228
 229         path_simplify(simplified, false);
 230
 231         done = set_new(&path_hash_ops);
 232         if (!done)
 233                 return -ENOMEM;
 234
 235         for (;;) {
 236                 _cleanup_set_free_free_ Set *todo = NULL;
 237                 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
 238                 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
 239                 bool top_autofs = false;
 240                 char *x;
 241                 unsigned long orig_flags;
 242
 243                 todo = set_new(&path_hash_ops);
 244                 if (!todo)
 245                         return -ENOMEM;
 246
 247                 rewind(proc_self_mountinfo);
 248
 249                 r = libmount_parse("/proc/self/mountinfo", proc_self_mountinfo, &table, &iter);
 250                 if (r < 0)
 251                         return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
 252
 253                 for (;;) {
 254                         struct libmnt_fs *fs;
 255                         const char *path, *type;
 256
 257                         r = mnt_table_next_fs(table, iter, &fs);
 258                         if (r == 1)
 259                                 break;
 260                         if (r < 0)
 261                                 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
 262
 263                         path = mnt_fs_get_target(fs);
 264                         type = mnt_fs_get_fstype(fs);
 265                         if (!path || !type)
 266                                 continue;
 267
 268                         if (!path_startswith(path, simplified))
 269                                 continue;
 270
 271                         /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount
 272                          * we shall operate on. */
 273                         if (!path_equal(path, simplified)) {
 274                                 bool deny_listed = false;
 275                                 char **i;
 276
 277                                 STRV_FOREACH(i, deny_list) {
 278                                         if (path_equal(*i, simplified))
 279                                                 continue;
 280
 281                                         if (!path_startswith(*i, simplified))
 282                                                 continue;
 283
 284                                         if (path_startswith(path, *i)) {
 285                                                 deny_listed = true;
 286                                                 log_debug("Not remounting %s deny-listed by %s, called for %s",
 287                                                           path, *i, simplified);
 288                                                 break;
 289                                         }
 290                                 }
 291                                 if (deny_listed)
 292                                         continue;
 293                         }
 294
 295                         /* Let's ignore autofs mounts.  If they aren't
 296                          * triggered yet, we want to avoid triggering
 297                          * them, as we don't make any guarantees for
 298                          * future submounts anyway.  If they are
 299                          * already triggered, then we will find
 300                          * another entry for this. */
 301                         if (streq(type, "autofs")) {
 302                                 top_autofs = top_autofs || path_equal(path, simplified);
 303                                 continue;
 304                         }
 305
 306                         if (!set_contains(done, path)) {
 307                                 r = set_put_strdup(&todo, path);
 308                                 if (r < 0)
 309                                         return r;
 310                         }
 311                 }
 312
 313                 /* If we have no submounts to process anymore and if
 314                  * the root is either already done, or an autofs, we
 315                  * are done */
 316                 if (set_isempty(todo) &&
 317                     (top_autofs || set_contains(done, simplified)))
 318                         return 0;
 319
 320                 if (!set_contains(done, simplified) &&
 321                     !set_contains(todo, simplified)) {
 322                         /* The prefix directory itself is not yet a mount, make it one. */
 323                         r = mount_nofollow(simplified, simplified, NULL, MS_BIND|MS_REC, NULL);
 324                         if (r < 0)
 325                                 return r;
 326
 327                         orig_flags = 0;
 328                         (void) get_mount_flags(table, simplified, &orig_flags);
 329
 330                         r = mount_nofollow(NULL, simplified, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL);
 331                         if (r < 0)
 332                                 return r;
 333
 334                         log_debug("Made top-level directory %s a mount point.", prefix);
 335
 336                         r = set_put_strdup(&done, simplified);
 337                         if (r < 0)
 338                                 return r;
 339                 }
 340
 341                 while ((x = set_steal_first(todo))) {
 342
 343                         r = set_consume(done, x);
 344                         if (IN_SET(r, 0, -EEXIST))
 345                                 continue;
 346                         if (r < 0)
 347                                 return r;
 348
 349                         /* Deal with mount points that are obstructed by a later mount */
 350                         r = path_is_mount_point(x, NULL, 0);
 351                         if (IN_SET(r, 0, -ENOENT))
 352                                 continue;
 353                         if (r < 0) {
 354                                 if (!ERRNO_IS_PRIVILEGE(r))
 355                                         return r;
 356
 357                                 /* Even if root user invoke this, submounts under private FUSE or NFS mount points
 358                                  * may not be acceessed. E.g.,
 359                                  *
 360                                  * $ bindfs --no-allow-other ~/mnt/mnt ~/mnt/mnt
 361                                  * $ bindfs --no-allow-other ~/mnt ~/mnt
 362                                  *
 363                                  * Then, root user cannot access the mount point ~/mnt/mnt.
 364                                  * In such cases, the submounts are ignored, as we have no way to manage them. */
 365                                 log_debug_errno(r, "Failed to determine '%s' is mount point or not, ignoring: %m", x);
 366                                 continue;
 367                         }
 368
 369                         /* Try to reuse the original flag set */
 370                         orig_flags = 0;
 371                         (void) get_mount_flags(table, x, &orig_flags);
 372
 373                         r = mount_nofollow(NULL, x, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL);
 374                         if (r < 0)
 375                                 return r;
 376
 377                         log_debug("Remounted %s read-only.", x);
 378                 }
 379         }
 380 }
 381
 382 int bind_remount_recursive(
 383                 const char *prefix,
 384                 unsigned long new_flags,
 385                 unsigned long flags_mask,
 386                 char **deny_list) {
 387
 388         _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
 389         int r;
 390
 391         r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo);
 392         if (r < 0)
 393                 return r;
 394
 395         return bind_remount_recursive_with_mountinfo(prefix, new_flags, flags_mask, deny_list, proc_self_mountinfo);
 396 }
 397
 398 int bind_remount_one_with_mountinfo(
 399                 const char *path,
 400                 unsigned long new_flags,
 401                 unsigned long flags_mask,
 402                 FILE *proc_self_mountinfo) {
 403
 404         _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
 405         unsigned long orig_flags = 0;
 406         int r;
 407
 408         assert(path);
 409         assert(proc_self_mountinfo);
 410
 411         rewind(proc_self_mountinfo);
 412
 413         table = mnt_new_table();
 414         if (!table)
 415                 return -ENOMEM;
 416
 417         r = mnt_table_parse_stream(table, proc_self_mountinfo, "/proc/self/mountinfo");
 418         if (r < 0)
 419                 return r;
 420
 421         /* Try to reuse the original flag set */
 422         (void) get_mount_flags(table, path, &orig_flags);
 423
 424         r = mount_nofollow(NULL, path, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL);
 425         if (r < 0)
 426                 return r;
 427
 428         return 0;
 429 }
 430
 431 int mount_move_root(const char *path) {
 432         assert(path);
 433
 434         if (chdir(path) < 0)
 435                 return -errno;
 436
 437         if (mount(path, "/", NULL, MS_MOVE, NULL) < 0)
 438                 return -errno;
 439
 440         if (chroot(".") < 0)
 441                 return -errno;
 442
 443         if (chdir("/") < 0)
 444                 return -errno;
 445
 446         return 0;
 447 }
 448
 449 int repeat_unmount(const char *path, int flags) {
 450         bool done = false;
 451
 452         assert(path);
 453
 454         /* If there are multiple mounts on a mount point, this
 455          * removes them all */
 456
 457         for (;;) {
 458                 if (umount2(path, flags) < 0) {
 459
 460                         if (errno == EINVAL)
 461                                 return done;
 462
 463                         return -errno;
 464                 }
 465
 466                 done = true;
 467         }
 468 }
 469
 470 int mode_to_inaccessible_node(
 471                 const char *runtime_dir,
 472                 mode_t mode,
 473                 char **ret) {
 474
 475         /* This function maps a node type to a corresponding inaccessible file node. These nodes are created
 476          * during early boot by PID 1. In some cases we lacked the privs to create the character and block
 477          * devices (maybe because we run in an userns environment, or miss CAP_SYS_MKNOD, or run with a
 478          * devices policy that excludes device nodes with major and minor of 0), but that's fine, in that
 479          * case we use an AF_UNIX file node instead, which is not the same, but close enough for most
 480          * uses. And most importantly, the kernel allows bind mounts from socket nodes to any non-directory
 481          * file nodes, and that's the most important thing that matters.
 482          *
 483          * Note that the runtime directory argument shall be the top-level runtime directory, i.e. /run/ if
 484          * we operate in system context and $XDG_RUNTIME_DIR if we operate in user context. */
 485
 486         _cleanup_free_ char *d = NULL;
 487         const char *node = NULL;
 488
 489         assert(ret);
 490
 491         if (!runtime_dir)
 492                 runtime_dir = "/run";
 493
 494         switch(mode & S_IFMT) {
 495                 case S_IFREG:
 496                         node = "/systemd/inaccessible/reg";
 497                         break;
 498
 499                 case S_IFDIR:
 500                         node = "/systemd/inaccessible/dir";
 501                         break;
 502
 503                 case S_IFCHR:
 504                         node = "/systemd/inaccessible/chr";
 505                         break;
 506
 507                 case S_IFBLK:
 508                         node = "/systemd/inaccessible/blk";
 509                         break;
 510
 511                 case S_IFIFO:
 512                         node = "/systemd/inaccessible/fifo";
 513                         break;
 514
 515                 case S_IFSOCK:
 516                         node = "/systemd/inaccessible/sock";
 517                         break;
 518         }
 519         if (!node)
 520                 return -EINVAL;
 521
 522         d = path_join(runtime_dir, node);
 523         if (!d)
 524                 return -ENOMEM;
 525
 526         /* On new kernels unprivileged users are permitted to create 0:0 char device nodes (because they also
 527          * act as whiteout inode for overlayfs), but no other char or block device nodes. On old kernels no
 528          * device node whatsoever may be created by unprivileged processes. Hence, if the caller asks for the
 529          * inaccessible block device node let's see if the block device node actually exists, and if not,
 530          * fall back to the character device node. From there fall back to the socket device node. This means
 531          * in the best case we'll get the right device node type — but if not we'll hopefully at least get a
 532          * device node at all. */
 533
 534         if (S_ISBLK(mode) &&
 535             access(d, F_OK) < 0 && errno == ENOENT) {
 536                 free(d);
 537                 d = path_join(runtime_dir, "/systemd/inaccessible/chr");
 538                 if (!d)
 539                         return -ENOMEM;
 540         }
 541
 542         if (IN_SET(mode & S_IFMT, S_IFBLK, S_IFCHR) &&
 543             access(d, F_OK) < 0 && errno == ENOENT) {
 544                 free(d);
 545                 d = path_join(runtime_dir, "/systemd/inaccessible/sock");
 546                 if (!d)
 547                         return -ENOMEM;
 548         }
 549
 550         *ret = TAKE_PTR(d);
 551         return 0;
 552 }
 553
 554 #define FLAG(name) (flags & name ? STRINGIFY(name) "|" : "")
 555 static char* mount_flags_to_string(long unsigned flags) {
 556         char *x;
 557         _cleanup_free_ char *y = NULL;
 558         long unsigned overflow;
 559
 560         overflow = flags & ~(MS_RDONLY |
 561                              MS_NOSUID |
 562                              MS_NODEV |
 563                              MS_NOEXEC |
 564                              MS_SYNCHRONOUS |
 565                              MS_REMOUNT |
 566                              MS_MANDLOCK |
 567                              MS_DIRSYNC |
 568                              MS_NOATIME |
 569                              MS_NODIRATIME |
 570                              MS_BIND |
 571                              MS_MOVE |
 572                              MS_REC |
 573                              MS_SILENT |
 574                              MS_POSIXACL |
 575                              MS_UNBINDABLE |
 576                              MS_PRIVATE |
 577                              MS_SLAVE |
 578                              MS_SHARED |
 579                              MS_RELATIME |
 580                              MS_KERNMOUNT |
 581                              MS_I_VERSION |
 582                              MS_STRICTATIME |
 583                              MS_LAZYTIME);
 584
 585         if (flags == 0 || overflow != 0)
 586                 if (asprintf(&y, "%lx", overflow) < 0)
 587                         return NULL;
 588
 589         x = strjoin(FLAG(MS_RDONLY),
 590                     FLAG(MS_NOSUID),
 591                     FLAG(MS_NODEV),
 592                     FLAG(MS_NOEXEC),
 593                     FLAG(MS_SYNCHRONOUS),
 594                     FLAG(MS_REMOUNT),
 595                     FLAG(MS_MANDLOCK),
 596                     FLAG(MS_DIRSYNC),
 597                     FLAG(MS_NOATIME),
 598                     FLAG(MS_NODIRATIME),
 599                     FLAG(MS_BIND),
 600                     FLAG(MS_MOVE),
 601                     FLAG(MS_REC),
 602                     FLAG(MS_SILENT),
 603                     FLAG(MS_POSIXACL),
 604                     FLAG(MS_UNBINDABLE),
 605                     FLAG(MS_PRIVATE),
 606                     FLAG(MS_SLAVE),
 607                     FLAG(MS_SHARED),
 608                     FLAG(MS_RELATIME),
 609                     FLAG(MS_KERNMOUNT),
 610                     FLAG(MS_I_VERSION),
 611                     FLAG(MS_STRICTATIME),
 612                     FLAG(MS_LAZYTIME),
 613                     y);
 614         if (!x)
 615                 return NULL;
 616         if (!y)
 617                 x[strlen(x) - 1] = '\0'; /* truncate the last | */
 618         return x;
 619 }
 620
 621 int mount_verbose_full(
 622                 int error_log_level,
 623                 const char *what,
 624                 const char *where,
 625                 const char *type,
 626                 unsigned long flags,
 627                 const char *options,
 628                 bool follow_symlink) {
 629
 630         _cleanup_free_ char *fl = NULL, *o = NULL;
 631         unsigned long f;
 632         int r;
 633
 634         r = mount_option_mangle(options, flags, &f, &o);
 635         if (r < 0)
 636                 return log_full_errno(error_log_level, r,
 637                                       "Failed to mangle mount options %s: %m",
 638                                       strempty(options));
 639
 640         fl = mount_flags_to_string(f);
 641
 642         if ((f & MS_REMOUNT) && !what && !type)
 643                 log_debug("Remounting %s (%s \"%s\")...",
 644                           where, strnull(fl), strempty(o));
 645         else if (!what && !type)
 646                 log_debug("Mounting %s (%s \"%s\")...",
 647                           where, strnull(fl), strempty(o));
 648         else if ((f & MS_BIND) && !type)
 649                 log_debug("Bind-mounting %s on %s (%s \"%s\")...",
 650                           what, where, strnull(fl), strempty(o));
 651         else if (f & MS_MOVE)
 652                 log_debug("Moving mount %s → %s (%s \"%s\")...",
 653                           what, where, strnull(fl), strempty(o));
 654         else
 655                 log_debug("Mounting %s (%s) on %s (%s \"%s\")...",
 656                           strna(what), strna(type), where, strnull(fl), strempty(o));
 657
 658         if (follow_symlink)
 659                 r = mount(what, where, type, f, o) < 0 ? -errno : 0;
 660         else
 661                 r = mount_nofollow(what, where, type, f, o);
 662         if (r < 0)
 663                 return log_full_errno(error_log_level, r,
 664                                       "Failed to mount %s (type %s) on %s (%s \"%s\"): %m",
 665                                       strna(what), strna(type), where, strnull(fl), strempty(o));
 666         return 0;
 667 }
 668
 669 int umount_verbose(
 670                 int error_log_level,
 671                 const char *what,
 672                 int flags) {
 673
 674         assert(what);
 675
 676         log_debug("Umounting %s...", what);
 677
 678         if (umount2(what, flags) < 0)
 679                 return log_full_errno(error_log_level, errno,
 680                                       "Failed to unmount %s: %m", what);
 681
 682         return 0;
 683 }
 684
 685 int mount_option_mangle(
 686                 const char *options,
 687                 unsigned long mount_flags,
 688                 unsigned long *ret_mount_flags,
 689                 char **ret_remaining_options) {
 690
 691         const struct libmnt_optmap *map;
 692         _cleanup_free_ char *ret = NULL;
 693         const char *p;
 694         int r;
 695
 696         /* This extracts mount flags from the mount options, and store
 697          * non-mount-flag options to '*ret_remaining_options'.
 698          * E.g.,
 699          * "rw,nosuid,nodev,relatime,size=1630748k,mode=700,uid=1000,gid=1000"
 700          * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
 701          * "size=1630748k,mode=700,uid=1000,gid=1000".
 702          * See more examples in test-mount-utils.c.
 703          *
 704          * Note that if 'options' does not contain any non-mount-flag options,
 705          * then '*ret_remaining_options' is set to NULL instead of empty string.
 706          * Note that this does not check validity of options stored in
 707          * '*ret_remaining_options'.
 708          * Note that if 'options' is NULL, then this just copies 'mount_flags'
 709          * to '*ret_mount_flags'. */
 710
 711         assert(ret_mount_flags);
 712         assert(ret_remaining_options);
 713
 714         map = mnt_get_builtin_optmap(MNT_LINUX_MAP);
 715         if (!map)
 716                 return -EINVAL;
 717
 718         p = options;
 719         for (;;) {
 720                 _cleanup_free_ char *word = NULL;
 721                 const struct libmnt_optmap *ent;
 722
 723                 r = extract_first_word(&p, &word, ",", EXTRACT_UNQUOTE);
 724                 if (r < 0)
 725                         return r;
 726                 if (r == 0)
 727                         break;
 728
 729                 for (ent = map; ent->name; ent++) {
 730                         /* All entries in MNT_LINUX_MAP do not take any argument.
 731                          * Thus, ent->name does not contain "=" or "[=]". */
 732                         if (!streq(word, ent->name))
 733                                 continue;
 734
 735                         if (!(ent->mask & MNT_INVERT))
 736                                 mount_flags |= ent->id;
 737                         else if (mount_flags & ent->id)
 738                                 mount_flags ^= ent->id;
 739
 740                         break;
 741                 }
 742
 743                 /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
 744                 if (!ent->name && !strextend_with_separator(&ret, ",", word))
 745                         return -ENOMEM;
 746         }
 747
 748         *ret_mount_flags = mount_flags;
 749         *ret_remaining_options = TAKE_PTR(ret);
 750
 751         return 0;
 752 }
 753
 754 static int mount_in_namespace(
 755                 pid_t target,
 756                 const char *propagate_path,
 757                 const char *incoming_path,
 758                 const char *src,
 759                 const char *dest,
 760                 bool read_only,
 761                 bool make_file_or_directory,
 762                 const MountOptions *options,
 763                 bool is_image) {
 764
 765         _cleanup_close_pair_ int errno_pipe_fd[2] = { -1, -1 };
 766         _cleanup_close_ int self_mntns_fd = -1, mntns_fd = -1, root_fd = -1, pidns_fd = -1, chased_src_fd = -1;
 767         char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p,
 768                 chased_src[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
 769         bool mount_slave_created = false, mount_slave_mounted = false,
 770                 mount_tmp_created = false, mount_tmp_mounted = false,
 771                 mount_outside_created = false, mount_outside_mounted = false;
 772         struct stat st, self_mntns_st;
 773         pid_t child;
 774         int r;
 775
 776         assert(target > 0);
 777         assert(propagate_path);
 778         assert(incoming_path);
 779         assert(src);
 780         assert(dest);
 781         assert(!options || is_image);
 782
 783         r = namespace_open(target, &pidns_fd, &mntns_fd, NULL, NULL, &root_fd);
 784         if (r < 0)
 785                 return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m");
 786
 787         if (fstat(mntns_fd, &st) < 0)
 788                 return log_debug_errno(errno, "Failed to fstat mount namespace FD of target process: %m");
 789
 790         r = namespace_open(0, NULL, &self_mntns_fd, NULL, NULL, NULL);
 791         if (r < 0)
 792                 return log_debug_errno(r, "Failed to retrieve FDs of systemd's namespace: %m");
 793
 794         if (fstat(self_mntns_fd, &self_mntns_st) < 0)
 795                 return log_debug_errno(errno, "Failed to fstat mount namespace FD of systemd: %m");
 796
 797         /* We can't add new mounts at runtime if the process wasn't started in a namespace */
 798         if (st.st_ino == self_mntns_st.st_ino && st.st_dev == self_mntns_st.st_dev)
 799                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to activate bind mount in target, not running in a mount namespace");
 800
 801         /* One day, when bind mounting /proc/self/fd/n works across
 802          * namespace boundaries we should rework this logic to make
 803          * use of it... */
 804
 805         p = strjoina(propagate_path, "/");
 806         r = laccess(p, F_OK);
 807         if (r < 0)
 808                 return log_debug_errno(r == -ENOENT ? SYNTHETIC_ERRNO(EOPNOTSUPP) : r, "Target does not allow propagation of mount points");
 809
 810         r = chase_symlinks(src, NULL, CHASE_TRAIL_SLASH, NULL, &chased_src_fd);
 811         if (r < 0)
 812                 return log_debug_errno(r, "Failed to resolve source path of %s: %m", src);
 813         xsprintf(chased_src, "/proc/self/fd/%i", chased_src_fd);
 814
 815         if (fstat(chased_src_fd, &st) < 0)
 816                 return log_debug_errno(errno, "Failed to stat() resolved source path %s: %m", src);
 817         if (S_ISLNK(st.st_mode)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */
 818                 return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Source directory %s can't be a symbolic link", src);
 819
 820         /* Our goal is to install a new bind mount into the container,
 821            possibly read-only. This is irritatingly complex
 822            unfortunately, currently.
 823
 824            First, we start by creating a private playground in /tmp,
 825            that we can mount MS_SLAVE. (Which is necessary, since
 826            MS_MOVE cannot be applied to mounts with MS_SHARED parent
 827            mounts.) */
 828
 829         if (!mkdtemp(mount_slave))
 830                 return log_debug_errno(errno, "Failed to create playground %s: %m", mount_slave);
 831
 832         mount_slave_created = true;
 833
 834         r = mount_nofollow_verbose(LOG_DEBUG, mount_slave, mount_slave, NULL, MS_BIND, NULL);
 835         if (r < 0)
 836                 goto finish;
 837
 838         mount_slave_mounted = true;
 839
 840         r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_slave, NULL, MS_SLAVE, NULL);
 841         if (r < 0)
 842                 goto finish;
 843
 844         /* Second, we mount the source file or directory to a directory inside of our MS_SLAVE playground. */
 845         mount_tmp = strjoina(mount_slave, "/mount");
 846         if (is_image)
 847                 r = mkdir_p(mount_tmp, 0700);
 848         else
 849                 r = make_mount_point_inode_from_stat(&st, mount_tmp, 0700);
 850         if (r < 0) {
 851                 log_debug_errno(r, "Failed to create temporary mount point %s: %m", mount_tmp);
 852                 goto finish;
 853         }
 854
 855         mount_tmp_created = true;
 856
 857         if (is_image)
 858                 r = verity_dissect_and_mount(chased_src, mount_tmp, options, NULL, NULL, NULL);
 859         else
 860                 r = mount_follow_verbose(LOG_DEBUG, chased_src, mount_tmp, NULL, MS_BIND, NULL);
 861         if (r < 0)
 862                 goto finish;
 863
 864         mount_tmp_mounted = true;
 865
 866         /* Third, we remount the new bind mount read-only if requested. */
 867         if (read_only) {
 868                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_tmp, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
 869                 if (r < 0)
 870                         goto finish;
 871         }
 872
 873         /* Fourth, we move the new bind mount into the propagation directory. This way it will appear there read-only
 874          * right-away. */
 875
 876         mount_outside = strjoina(propagate_path, "/XXXXXX");
 877         if (is_image || S_ISDIR(st.st_mode))
 878                 r = mkdtemp(mount_outside) ? 0 : -errno;
 879         else {
 880                 r = mkostemp_safe(mount_outside);
 881                 safe_close(r);
 882         }
 883         if (r < 0) {
 884                 log_debug_errno(r, "Cannot create propagation file or directory %s: %m", mount_outside);
 885                 goto finish;
 886         }
 887
 888         mount_outside_created = true;
 889
 890         r = mount_nofollow_verbose(LOG_DEBUG, mount_tmp, mount_outside, NULL, MS_MOVE, NULL);
 891         if (r < 0)
 892                 goto finish;
 893
 894         mount_outside_mounted = true;
 895         mount_tmp_mounted = false;
 896
 897         if (is_image || S_ISDIR(st.st_mode))
 898                 (void) rmdir(mount_tmp);
 899         else
 900                 (void) unlink(mount_tmp);
 901         mount_tmp_created = false;
 902
 903         (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
 904         mount_slave_mounted = false;
 905
 906         (void) rmdir(mount_slave);
 907         mount_slave_created = false;
 908
 909         if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) {
 910                 log_debug_errno(errno, "Failed to create pipe: %m");
 911                 goto finish;
 912         }
 913
 914         r = namespace_fork("(sd-bindmnt)", "(sd-bindmnt-inner)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG,
 915                            pidns_fd, mntns_fd, -1, -1, root_fd, &child);
 916         if (r < 0)
 917                 goto finish;
 918         if (r == 0) {
 919                 const char *mount_inside;
 920
 921                 errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
 922
 923                 if (make_file_or_directory) {
 924                         if (!is_image) {
 925                                 (void) mkdir_parents(dest, 0755);
 926                                 (void) make_mount_point_inode_from_stat(&st, dest, 0700);
 927                         } else
 928                                 (void) mkdir_p(dest, 0755);
 929                 }
 930
 931                 /* Fifth, move the mount to the right place inside */
 932                 mount_inside = strjoina(incoming_path, basename(mount_outside));
 933                 r = mount_nofollow_verbose(LOG_ERR, mount_inside, dest, NULL, MS_MOVE, NULL);
 934                 if (r < 0)
 935                         goto child_fail;
 936
 937                 _exit(EXIT_SUCCESS);
 938
 939         child_fail:
 940                 (void) write(errno_pipe_fd[1], &r, sizeof(r));
 941                 errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
 942
 943                 _exit(EXIT_FAILURE);
 944         }
 945
 946         errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
 947
 948         r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0);
 949         if (r < 0) {
 950                 log_debug_errno(r, "Failed to wait for child: %m");
 951                 goto finish;
 952         }
 953         if (r != EXIT_SUCCESS) {
 954                 if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r))
 955                         log_debug_errno(r, "Failed to mount: %m");
 956                 else
 957                         log_debug("Child failed.");
 958                 goto finish;
 959         }
 960
 961 finish:
 962         if (mount_outside_mounted)
 963                 (void) umount_verbose(LOG_DEBUG, mount_outside, UMOUNT_NOFOLLOW);
 964         if (mount_outside_created) {
 965                 if (is_image || S_ISDIR(st.st_mode))
 966                         (void) rmdir(mount_outside);
 967                 else
 968                         (void) unlink(mount_outside);
 969         }
 970
 971         if (mount_tmp_mounted)
 972                 (void) umount_verbose(LOG_DEBUG, mount_tmp, UMOUNT_NOFOLLOW);
 973         if (mount_tmp_created) {
 974                 if (is_image || S_ISDIR(st.st_mode))
 975                         (void) rmdir(mount_tmp);
 976                 else
 977                         (void) unlink(mount_tmp);
 978         }
 979
 980         if (mount_slave_mounted)
 981                 (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
 982         if (mount_slave_created)
 983                 (void) rmdir(mount_slave);
 984
 985         return r;
 986 }
 987
 988 int bind_mount_in_namespace(
 989                 pid_t target,
 990                 const char *propagate_path,
 991                 const char *incoming_path,
 992                 const char *src,
 993                 const char *dest,
 994                 bool read_only,
 995                 bool make_file_or_directory) {
 996
 997         return mount_in_namespace(target, propagate_path, incoming_path, src, dest, read_only, make_file_or_directory, NULL, false);
 998 }
 999
1000 int mount_image_in_namespace(
1001                 pid_t target,
1002                 const char *propagate_path,
1003                 const char *incoming_path,
1004                 const char *src,
1005                 const char *dest,
1006                 bool read_only,
1007                 bool make_file_or_directory,
1008                 const MountOptions *options) {
1009
1010         return mount_in_namespace(target, propagate_path, incoming_path, src, dest, read_only, make_file_or_directory, options, true);
1011 }