src/basic/mount-util.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6 ***/
   7
   8 #include <errno.h>
   9 #include <stdio_ext.h>
  10 #include <stdlib.h>
  11 #include <string.h>
  12 #include <sys/mount.h>
  13 #include <sys/stat.h>
  14 #include <sys/statvfs.h>
  15 #include <unistd.h>
  16
  17 /* Include later */
  18 #include <libmount.h>
  19
  20 #include "alloc-util.h"
  21 #include "escape.h"
  22 #include "extract-word.h"
  23 #include "fd-util.h"
  24 #include "fileio.h"
  25 #include "fs-util.h"
  26 #include "hashmap.h"
  27 #include "mount-util.h"
  28 #include "parse-util.h"
  29 #include "path-util.h"
  30 #include "set.h"
  31 #include "stdio-util.h"
  32 #include "string-util.h"
  33 #include "strv.h"
  34
  35 /* This is the original MAX_HANDLE_SZ definition from the kernel, when the API was introduced. We use that in place of
  36  * any more currently defined value to future-proof things: if the size is increased in the API headers, and our code
  37  * is recompiled then it would cease working on old kernels, as those refuse any sizes larger than this value with
  38  * EINVAL right-away. Hence, let's disconnect ourselves from any such API changes, and stick to the original definition
  39  * from when it was introduced. We use it as a start value only anyway (see below), and hence should be able to deal
  40  * with large file handles anyway. */
  41 #define ORIGINAL_MAX_HANDLE_SZ 128
  42
  43 int name_to_handle_at_loop(
  44                 int fd,
  45                 const char *path,
  46                 struct file_handle **ret_handle,
  47                 int *ret_mnt_id,
  48                 int flags) {
  49
  50         _cleanup_free_ struct file_handle *h = NULL;
  51         size_t n = ORIGINAL_MAX_HANDLE_SZ;
  52
  53         /* We need to invoke name_to_handle_at() in a loop, given that it might return EOVERFLOW when the specified
  54          * buffer is too small. Note that in contrast to what the docs might suggest, MAX_HANDLE_SZ is only good as a
  55          * start value, it is not an upper bound on the buffer size required.
  56          *
  57          * This improves on raw name_to_handle_at() also in one other regard: ret_handle and ret_mnt_id can be passed
  58          * as NULL if there's no interest in either. */
  59
  60         for (;;) {
  61                 int mnt_id = -1;
  62
  63                 h = malloc0(offsetof(struct file_handle, f_handle) + n);
  64                 if (!h)
  65                         return -ENOMEM;
  66
  67                 h->handle_bytes = n;
  68
  69                 if (name_to_handle_at(fd, path, h, &mnt_id, flags) >= 0) {
  70
  71                         if (ret_handle)
  72                                 *ret_handle = TAKE_PTR(h);
  73
  74                         if (ret_mnt_id)
  75                                 *ret_mnt_id = mnt_id;
  76
  77                         return 0;
  78                 }
  79                 if (errno != EOVERFLOW)
  80                         return -errno;
  81
  82                 if (!ret_handle && ret_mnt_id && mnt_id >= 0) {
  83
  84                         /* As it appears, name_to_handle_at() fills in mnt_id even when it returns EOVERFLOW when the
  85                          * buffer is too small, but that's undocumented. Hence, let's make use of this if it appears to
  86                          * be filled in, and the caller was interested in only the mount ID an nothing else. */
  87
  88                         *ret_mnt_id = mnt_id;
  89                         return 0;
  90                 }
  91
  92                 /* If name_to_handle_at() didn't increase the byte size, then this EOVERFLOW is caused by something
  93                  * else (apparently EOVERFLOW is returned for untriggered nfs4 mounts sometimes), not by the too small
  94                  * buffer. In that case propagate EOVERFLOW */
  95                 if (h->handle_bytes <= n)
  96                         return -EOVERFLOW;
  97
  98                 /* The buffer was too small. Size the new buffer by what name_to_handle_at() returned. */
  99                 n = h->handle_bytes;
 100                 if (offsetof(struct file_handle, f_handle) + n < n) /* check for addition overflow */
 101                         return -EOVERFLOW;
 102
 103                 h = mfree(h);
 104         }
 105 }
 106
 107 static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *mnt_id) {
 108         char path[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
 109         _cleanup_free_ char *fdinfo = NULL;
 110         _cleanup_close_ int subfd = -1;
 111         char *p;
 112         int r;
 113
 114         if ((flags & AT_EMPTY_PATH) && isempty(filename))
 115                 xsprintf(path, "/proc/self/fdinfo/%i", fd);
 116         else {
 117                 subfd = openat(fd, filename, O_CLOEXEC|O_PATH);
 118                 if (subfd < 0)
 119                         return -errno;
 120
 121                 xsprintf(path, "/proc/self/fdinfo/%i", subfd);
 122         }
 123
 124         r = read_full_file(path, &fdinfo, NULL);
 125         if (r == -ENOENT) /* The fdinfo directory is a relatively new addition */
 126                 return -EOPNOTSUPP;
 127         if (r < 0)
 128                 return r;
 129
 130         p = startswith(fdinfo, "mnt_id:");
 131         if (!p) {
 132                 p = strstr(fdinfo, "\nmnt_id:");
 133                 if (!p) /* The mnt_id field is a relatively new addition */
 134                         return -EOPNOTSUPP;
 135
 136                 p += 8;
 137         }
 138
 139         p += strspn(p, WHITESPACE);
 140         p[strcspn(p, WHITESPACE)] = 0;
 141
 142         return safe_atoi(p, mnt_id);
 143 }
 144
 145 int fd_is_mount_point(int fd, const char *filename, int flags) {
 146         _cleanup_free_ struct file_handle *h = NULL, *h_parent = NULL;
 147         int mount_id = -1, mount_id_parent = -1;
 148         bool nosupp = false, check_st_dev = true;
 149         struct stat a, b;
 150         int r;
 151
 152         assert(fd >= 0);
 153         assert(filename);
 154
 155         /* First we will try the name_to_handle_at() syscall, which
 156          * tells us the mount id and an opaque file "handle". It is
 157          * not supported everywhere though (kernel compile-time
 158          * option, not all file systems are hooked up). If it works
 159          * the mount id is usually good enough to tell us whether
 160          * something is a mount point.
 161          *
 162          * If that didn't work we will try to read the mount id from
 163          * /proc/self/fdinfo/<fd>. This is almost as good as
 164          * name_to_handle_at(), however, does not return the
 165          * opaque file handle. The opaque file handle is pretty useful
 166          * to detect the root directory, which we should always
 167          * consider a mount point. Hence we use this only as
 168          * fallback. Exporting the mnt_id in fdinfo is a pretty recent
 169          * kernel addition.
 170          *
 171          * As last fallback we do traditional fstat() based st_dev
 172          * comparisons. This is how things were traditionally done,
 173          * but unionfs breaks this since it exposes file
 174          * systems with a variety of st_dev reported. Also, btrfs
 175          * subvolumes have different st_dev, even though they aren't
 176          * real mounts of their own. */
 177
 178         r = name_to_handle_at_loop(fd, filename, &h, &mount_id, flags);
 179         if (IN_SET(r, -ENOSYS, -EACCES, -EPERM, -EOVERFLOW, -EINVAL))
 180                 /* This kernel does not support name_to_handle_at() at all (ENOSYS), or the syscall was blocked
 181                  * (EACCES/EPERM; maybe through seccomp, because we are running inside of a container?), or the mount
 182                  * point is not triggered yet (EOVERFLOW, think nfs4), or some general name_to_handle_at() flakiness
 183                  * (EINVAL): fall back to simpler logic. */
 184                 goto fallback_fdinfo;
 185         else if (r == -EOPNOTSUPP)
 186                 /* This kernel or file system does not support name_to_handle_at(), hence let's see if the upper fs
 187                  * supports it (in which case it is a mount point), otherwise fallback to the traditional stat()
 188                  * logic */
 189                 nosupp = true;
 190         else if (r < 0)
 191                 return r;
 192
 193         r = name_to_handle_at_loop(fd, "", &h_parent, &mount_id_parent, AT_EMPTY_PATH);
 194         if (r == -EOPNOTSUPP) {
 195                 if (nosupp)
 196                         /* Neither parent nor child do name_to_handle_at()?  We have no choice but to fall back. */
 197                         goto fallback_fdinfo;
 198                 else
 199                         /* The parent can't do name_to_handle_at() but the directory we are interested in can?  If so,
 200                          * it must be a mount point. */
 201                         return 1;
 202         } else if (r < 0)
 203                 return r;
 204
 205         /* The parent can do name_to_handle_at() but the
 206          * directory we are interested in can't? If so, it
 207          * must be a mount point. */
 208         if (nosupp)
 209                 return 1;
 210
 211         /* If the file handle for the directory we are
 212          * interested in and its parent are identical, we
 213          * assume this is the root directory, which is a mount
 214          * point. */
 215
 216         if (h->handle_bytes == h_parent->handle_bytes &&
 217             h->handle_type == h_parent->handle_type &&
 218             memcmp(h->f_handle, h_parent->f_handle, h->handle_bytes) == 0)
 219                 return 1;
 220
 221         return mount_id != mount_id_parent;
 222
 223 fallback_fdinfo:
 224         r = fd_fdinfo_mnt_id(fd, filename, flags, &mount_id);
 225         if (IN_SET(r, -EOPNOTSUPP, -EACCES, -EPERM))
 226                 goto fallback_fstat;
 227         if (r < 0)
 228                 return r;
 229
 230         r = fd_fdinfo_mnt_id(fd, "", AT_EMPTY_PATH, &mount_id_parent);
 231         if (r < 0)
 232                 return r;
 233
 234         if (mount_id != mount_id_parent)
 235                 return 1;
 236
 237         /* Hmm, so, the mount ids are the same. This leaves one
 238          * special case though for the root file system. For that,
 239          * let's see if the parent directory has the same inode as we
 240          * are interested in. Hence, let's also do fstat() checks now,
 241          * too, but avoid the st_dev comparisons, since they aren't
 242          * that useful on unionfs mounts. */
 243         check_st_dev = false;
 244
 245 fallback_fstat:
 246         /* yay for fstatat() taking a different set of flags than the other
 247          * _at() above */
 248         if (flags & AT_SYMLINK_FOLLOW)
 249                 flags &= ~AT_SYMLINK_FOLLOW;
 250         else
 251                 flags |= AT_SYMLINK_NOFOLLOW;
 252         if (fstatat(fd, filename, &a, flags) < 0)
 253                 return -errno;
 254
 255         if (fstatat(fd, "", &b, AT_EMPTY_PATH) < 0)
 256                 return -errno;
 257
 258         /* A directory with same device and inode as its parent? Must
 259          * be the root directory */
 260         if (a.st_dev == b.st_dev &&
 261             a.st_ino == b.st_ino)
 262                 return 1;
 263
 264         return check_st_dev && (a.st_dev != b.st_dev);
 265 }
 266
 267 /* flags can be AT_SYMLINK_FOLLOW or 0 */
 268 int path_is_mount_point(const char *t, const char *root, int flags) {
 269         _cleanup_free_ char *canonical = NULL, *parent = NULL;
 270         _cleanup_close_ int fd = -1;
 271         int r;
 272
 273         assert(t);
 274         assert((flags & ~AT_SYMLINK_FOLLOW) == 0);
 275
 276         if (path_equal(t, "/"))
 277                 return 1;
 278
 279         /* we need to resolve symlinks manually, we can't just rely on
 280          * fd_is_mount_point() to do that for us; if we have a structure like
 281          * /bin -> /usr/bin/ and /usr is a mount point, then the parent that we
 282          * look at needs to be /usr, not /. */
 283         if (flags & AT_SYMLINK_FOLLOW) {
 284                 r = chase_symlinks(t, root, CHASE_TRAIL_SLASH, &canonical);
 285                 if (r < 0)
 286                         return r;
 287
 288                 t = canonical;
 289         }
 290
 291         parent = dirname_malloc(t);
 292         if (!parent)
 293                 return -ENOMEM;
 294
 295         fd = openat(AT_FDCWD, parent, O_DIRECTORY|O_CLOEXEC|O_PATH);
 296         if (fd < 0)
 297                 return -errno;
 298
 299         return fd_is_mount_point(fd, last_path_component(t), flags);
 300 }
 301
 302 int path_get_mnt_id(const char *path, int *ret) {
 303         int r;
 304
 305         r = name_to_handle_at_loop(AT_FDCWD, path, NULL, ret, 0);
 306         if (IN_SET(r, -EOPNOTSUPP, -ENOSYS, -EACCES, -EPERM, -EOVERFLOW, -EINVAL)) /* kernel/fs don't support this, or seccomp blocks access, or untriggered mount, or name_to_handle_at() is flaky */
 307                 return fd_fdinfo_mnt_id(AT_FDCWD, path, 0, ret);
 308
 309         return r;
 310 }
 311
 312 int umount_recursive(const char *prefix, int flags) {
 313         bool again;
 314         int n = 0, r;
 315
 316         /* Try to umount everything recursively below a
 317          * directory. Also, take care of stacked mounts, and keep
 318          * unmounting them until they are gone. */
 319
 320         do {
 321                 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
 322
 323                 again = false;
 324                 r = 0;
 325
 326                 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
 327                 if (!proc_self_mountinfo)
 328                         return -errno;
 329
 330                 (void) __fsetlocking(proc_self_mountinfo, FSETLOCKING_BYCALLER);
 331
 332                 for (;;) {
 333                         _cleanup_free_ char *path = NULL, *p = NULL;
 334                         int k;
 335
 336                         k = fscanf(proc_self_mountinfo,
 337                                    "%*s "       /* (1) mount id */
 338                                    "%*s "       /* (2) parent id */
 339                                    "%*s "       /* (3) major:minor */
 340                                    "%*s "       /* (4) root */
 341                                    "%ms "       /* (5) mount point */
 342                                    "%*s"        /* (6) mount options */
 343                                    "%*[^-]"     /* (7) optional fields */
 344                                    "- "         /* (8) separator */
 345                                    "%*s "       /* (9) file system type */
 346                                    "%*s"        /* (10) mount source */
 347                                    "%*s"        /* (11) mount options 2 */
 348                                    "%*[^\n]",   /* some rubbish at the end */
 349                                    &path);
 350                         if (k != 1) {
 351                                 if (k == EOF)
 352                                         break;
 353
 354                                 continue;
 355                         }
 356
 357                         r = cunescape(path, UNESCAPE_RELAX, &p);
 358                         if (r < 0)
 359                                 return r;
 360
 361                         if (!path_startswith(p, prefix))
 362                                 continue;
 363
 364                         if (umount2(p, flags) < 0) {
 365                                 r = log_debug_errno(errno, "Failed to umount %s: %m", p);
 366                                 continue;
 367                         }
 368
 369                         log_debug("Successfully unmounted %s", p);
 370
 371                         again = true;
 372                         n++;
 373
 374                         break;
 375                 }
 376
 377         } while (again);
 378
 379         return r ? r : n;
 380 }
 381
 382 static int get_mount_flags(const char *path, unsigned long *flags) {
 383         struct statvfs buf;
 384
 385         if (statvfs(path, &buf) < 0)
 386                 return -errno;
 387         *flags = buf.f_flag;
 388         return 0;
 389 }
 390
 391 /* Use this function only if do you have direct access to /proc/self/mountinfo
 392  * and need the caller to open it for you. This is the case when /proc is
 393  * masked or not mounted. Otherwise, use bind_remount_recursive. */
 394 int bind_remount_recursive_with_mountinfo(const char *prefix, bool ro, char **blacklist, FILE *proc_self_mountinfo) {
 395         _cleanup_set_free_free_ Set *done = NULL;
 396         _cleanup_free_ char *cleaned = NULL;
 397         int r;
 398
 399         assert(proc_self_mountinfo);
 400
 401         /* Recursively remount a directory (and all its submounts) read-only or read-write. If the directory is already
 402          * mounted, we reuse the mount and simply mark it MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write
 403          * operation). If it isn't we first make it one. Afterwards we apply MS_BIND|MS_RDONLY (or remove MS_RDONLY) to
 404          * all submounts we can access, too. When mounts are stacked on the same mount point we only care for each
 405          * individual "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We
 406          * do not have any effect on future submounts that might get propagated, they migt be writable. This includes
 407          * future submounts that have been triggered via autofs.
 408          *
 409          * If the "blacklist" parameter is specified it may contain a list of subtrees to exclude from the
 410          * remount operation. Note that we'll ignore the blacklist for the top-level path. */
 411
 412         cleaned = strdup(prefix);
 413         if (!cleaned)
 414                 return -ENOMEM;
 415
 416         path_kill_slashes(cleaned);
 417
 418         done = set_new(&path_hash_ops);
 419         if (!done)
 420                 return -ENOMEM;
 421
 422         for (;;) {
 423                 _cleanup_set_free_free_ Set *todo = NULL;
 424                 bool top_autofs = false;
 425                 char *x;
 426                 unsigned long orig_flags;
 427
 428                 todo = set_new(&path_hash_ops);
 429                 if (!todo)
 430                         return -ENOMEM;
 431
 432                 rewind(proc_self_mountinfo);
 433
 434                 for (;;) {
 435                         _cleanup_free_ char *path = NULL, *p = NULL, *type = NULL;
 436                         int k;
 437
 438                         k = fscanf(proc_self_mountinfo,
 439                                    "%*s "       /* (1) mount id */
 440                                    "%*s "       /* (2) parent id */
 441                                    "%*s "       /* (3) major:minor */
 442                                    "%*s "       /* (4) root */
 443                                    "%ms "       /* (5) mount point */
 444                                    "%*s"        /* (6) mount options (superblock) */
 445                                    "%*[^-]"     /* (7) optional fields */
 446                                    "- "         /* (8) separator */
 447                                    "%ms "       /* (9) file system type */
 448                                    "%*s"        /* (10) mount source */
 449                                    "%*s"        /* (11) mount options (bind mount) */
 450                                    "%*[^\n]",   /* some rubbish at the end */
 451                                    &path,
 452                                    &type);
 453                         if (k != 2) {
 454                                 if (k == EOF)
 455                                         break;
 456
 457                                 continue;
 458                         }
 459
 460                         r = cunescape(path, UNESCAPE_RELAX, &p);
 461                         if (r < 0)
 462                                 return r;
 463
 464                         if (!path_startswith(p, cleaned))
 465                                 continue;
 466
 467                         /* Ignore this mount if it is blacklisted, but only if it isn't the top-level mount we shall
 468                          * operate on. */
 469                         if (!path_equal(cleaned, p)) {
 470                                 bool blacklisted = false;
 471                                 char **i;
 472
 473                                 STRV_FOREACH(i, blacklist) {
 474
 475                                         if (path_equal(*i, cleaned))
 476                                                 continue;
 477
 478                                         if (!path_startswith(*i, cleaned))
 479                                                 continue;
 480
 481                                         if (path_startswith(p, *i)) {
 482                                                 blacklisted = true;
 483                                                 log_debug("Not remounting %s, because blacklisted by %s, called for %s", p, *i, cleaned);
 484                                                 break;
 485                                         }
 486                                 }
 487                                 if (blacklisted)
 488                                         continue;
 489                         }
 490
 491                         /* Let's ignore autofs mounts.  If they aren't
 492                          * triggered yet, we want to avoid triggering
 493                          * them, as we don't make any guarantees for
 494                          * future submounts anyway.  If they are
 495                          * already triggered, then we will find
 496                          * another entry for this. */
 497                         if (streq(type, "autofs")) {
 498                                 top_autofs = top_autofs || path_equal(cleaned, p);
 499                                 continue;
 500                         }
 501
 502                         if (!set_contains(done, p)) {
 503                                 r = set_consume(todo, p);
 504                                 p = NULL;
 505                                 if (r == -EEXIST)
 506                                         continue;
 507                                 if (r < 0)
 508                                         return r;
 509                         }
 510                 }
 511
 512                 /* If we have no submounts to process anymore and if
 513                  * the root is either already done, or an autofs, we
 514                  * are done */
 515                 if (set_isempty(todo) &&
 516                     (top_autofs || set_contains(done, cleaned)))
 517                         return 0;
 518
 519                 if (!set_contains(done, cleaned) &&
 520                     !set_contains(todo, cleaned)) {
 521                         /* The prefix directory itself is not yet a mount, make it one. */
 522                         if (mount(cleaned, cleaned, NULL, MS_BIND|MS_REC, NULL) < 0)
 523                                 return -errno;
 524
 525                         orig_flags = 0;
 526                         (void) get_mount_flags(cleaned, &orig_flags);
 527                         orig_flags &= ~MS_RDONLY;
 528
 529                         if (mount(NULL, prefix, NULL, orig_flags|MS_BIND|MS_REMOUNT|(ro ? MS_RDONLY : 0), NULL) < 0)
 530                                 return -errno;
 531
 532                         log_debug("Made top-level directory %s a mount point.", prefix);
 533
 534                         x = strdup(cleaned);
 535                         if (!x)
 536                                 return -ENOMEM;
 537
 538                         r = set_consume(done, x);
 539                         if (r < 0)
 540                                 return r;
 541                 }
 542
 543                 while ((x = set_steal_first(todo))) {
 544
 545                         r = set_consume(done, x);
 546                         if (IN_SET(r, 0, -EEXIST))
 547                                 continue;
 548                         if (r < 0)
 549                                 return r;
 550
 551                         /* Deal with mount points that are obstructed by a later mount */
 552                         r = path_is_mount_point(x, NULL, 0);
 553                         if (IN_SET(r, 0, -ENOENT))
 554                                 continue;
 555                         if (r < 0)
 556                                 return r;
 557
 558                         /* Try to reuse the original flag set */
 559                         orig_flags = 0;
 560                         (void) get_mount_flags(x, &orig_flags);
 561                         orig_flags &= ~MS_RDONLY;
 562
 563                         if (mount(NULL, x, NULL, orig_flags|MS_BIND|MS_REMOUNT|(ro ? MS_RDONLY : 0), NULL) < 0)
 564                                 return -errno;
 565
 566                         log_debug("Remounted %s read-only.", x);
 567                 }
 568         }
 569 }
 570
 571 int bind_remount_recursive(const char *prefix, bool ro, char **blacklist) {
 572         _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
 573
 574         proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
 575         if (!proc_self_mountinfo)
 576                 return -errno;
 577
 578         (void) __fsetlocking(proc_self_mountinfo, FSETLOCKING_BYCALLER);
 579
 580         return bind_remount_recursive_with_mountinfo(prefix, ro, blacklist, proc_self_mountinfo);
 581 }
 582
 583 int mount_move_root(const char *path) {
 584         assert(path);
 585
 586         if (chdir(path) < 0)
 587                 return -errno;
 588
 589         if (mount(path, "/", NULL, MS_MOVE, NULL) < 0)
 590                 return -errno;
 591
 592         if (chroot(".") < 0)
 593                 return -errno;
 594
 595         if (chdir("/") < 0)
 596                 return -errno;
 597
 598         return 0;
 599 }
 600
 601 bool fstype_is_network(const char *fstype) {
 602         const char *x;
 603
 604         x = startswith(fstype, "fuse.");
 605         if (x)
 606                 fstype = x;
 607
 608         return STR_IN_SET(fstype,
 609                           "afs",
 610                           "cifs",
 611                           "smbfs",
 612                           "sshfs",
 613                           "ncpfs",
 614                           "ncp",
 615                           "nfs",
 616                           "nfs4",
 617                           "gfs",
 618                           "gfs2",
 619                           "glusterfs",
 620                           "pvfs2", /* OrangeFS */
 621                           "ocfs2",
 622                           "lustre");
 623 }
 624
 625 bool fstype_is_api_vfs(const char *fstype) {
 626         return STR_IN_SET(fstype,
 627                           "autofs",
 628                           "bpf",
 629                           "cgroup",
 630                           "cgroup2",
 631                           "configfs",
 632                           "cpuset",
 633                           "debugfs",
 634                           "devpts",
 635                           "devtmpfs",
 636                           "efivarfs",
 637                           "fusectl",
 638                           "hugetlbfs",
 639                           "mqueue",
 640                           "proc",
 641                           "pstore",
 642                           "ramfs",
 643                           "securityfs",
 644                           "sysfs",
 645                           "tmpfs",
 646                           "tracefs");
 647 }
 648
 649 bool fstype_is_ro(const char *fstype) {
 650         /* All Linux file systems that are necessarily read-only */
 651         return STR_IN_SET(fstype,
 652                           "DM_verity_hash",
 653                           "iso9660",
 654                           "squashfs");
 655 }
 656
 657 bool fstype_can_discard(const char *fstype) {
 658         return STR_IN_SET(fstype,
 659                           "btrfs",
 660                           "ext4",
 661                           "vfat",
 662                           "xfs");
 663 }
 664
 665 bool fstype_can_uid_gid(const char *fstype) {
 666
 667         /* All file systems that have a uid=/gid= mount option that fixates the owners of all files and directories,
 668          * current and future. */
 669
 670         return STR_IN_SET(fstype,
 671                           "adfs",
 672                           "fat",
 673                           "hfs",
 674                           "hpfs",
 675                           "iso9660",
 676                           "msdos",
 677                           "ntfs",
 678                           "vfat");
 679 }
 680
 681 int repeat_unmount(const char *path, int flags) {
 682         bool done = false;
 683
 684         assert(path);
 685
 686         /* If there are multiple mounts on a mount point, this
 687          * removes them all */
 688
 689         for (;;) {
 690                 if (umount2(path, flags) < 0) {
 691
 692                         if (errno == EINVAL)
 693                                 return done;
 694
 695                         return -errno;
 696                 }
 697
 698                 done = true;
 699         }
 700 }
 701
 702 const char* mode_to_inaccessible_node(mode_t mode) {
 703         /* This function maps a node type to a corresponding inaccessible file node. These nodes are created during
 704          * early boot by PID 1. In some cases we lacked the privs to create the character and block devices (maybe
 705          * because we run in an userns environment, or miss CAP_SYS_MKNOD, or run with a devices policy that excludes
 706          * device nodes with major and minor of 0), but that's fine, in that case we use an AF_UNIX file node instead,
 707          * which is not the same, but close enough for most uses. And most importantly, the kernel allows bind mounts
 708          * from socket nodes to any non-directory file nodes, and that's the most important thing that matters. */
 709
 710         switch(mode & S_IFMT) {
 711                 case S_IFREG:
 712                         return "/run/systemd/inaccessible/reg";
 713
 714                 case S_IFDIR:
 715                         return "/run/systemd/inaccessible/dir";
 716
 717                 case S_IFCHR:
 718                         if (access("/run/systemd/inaccessible/chr", F_OK) == 0)
 719                                 return "/run/systemd/inaccessible/chr";
 720                         return "/run/systemd/inaccessible/sock";
 721
 722                 case S_IFBLK:
 723                         if (access("/run/systemd/inaccessible/blk", F_OK) == 0)
 724                                 return "/run/systemd/inaccessible/blk";
 725                         return "/run/systemd/inaccessible/sock";
 726
 727                 case S_IFIFO:
 728                         return "/run/systemd/inaccessible/fifo";
 729
 730                 case S_IFSOCK:
 731                         return "/run/systemd/inaccessible/sock";
 732         }
 733         return NULL;
 734 }
 735
 736 #define FLAG(name) (flags & name ? STRINGIFY(name) "|" : "")
 737 static char* mount_flags_to_string(long unsigned flags) {
 738         char *x;
 739         _cleanup_free_ char *y = NULL;
 740         long unsigned overflow;
 741
 742         overflow = flags & ~(MS_RDONLY |
 743                              MS_NOSUID |
 744                              MS_NODEV |
 745                              MS_NOEXEC |
 746                              MS_SYNCHRONOUS |
 747                              MS_REMOUNT |
 748                              MS_MANDLOCK |
 749                              MS_DIRSYNC |
 750                              MS_NOATIME |
 751                              MS_NODIRATIME |
 752                              MS_BIND |
 753                              MS_MOVE |
 754                              MS_REC |
 755                              MS_SILENT |
 756                              MS_POSIXACL |
 757                              MS_UNBINDABLE |
 758                              MS_PRIVATE |
 759                              MS_SLAVE |
 760                              MS_SHARED |
 761                              MS_RELATIME |
 762                              MS_KERNMOUNT |
 763                              MS_I_VERSION |
 764                              MS_STRICTATIME |
 765                              MS_LAZYTIME);
 766
 767         if (flags == 0 || overflow != 0)
 768                 if (asprintf(&y, "%lx", overflow) < 0)
 769                         return NULL;
 770
 771         x = strjoin(FLAG(MS_RDONLY),
 772                     FLAG(MS_NOSUID),
 773                     FLAG(MS_NODEV),
 774                     FLAG(MS_NOEXEC),
 775                     FLAG(MS_SYNCHRONOUS),
 776                     FLAG(MS_REMOUNT),
 777                     FLAG(MS_MANDLOCK),
 778                     FLAG(MS_DIRSYNC),
 779                     FLAG(MS_NOATIME),
 780                     FLAG(MS_NODIRATIME),
 781                     FLAG(MS_BIND),
 782                     FLAG(MS_MOVE),
 783                     FLAG(MS_REC),
 784                     FLAG(MS_SILENT),
 785                     FLAG(MS_POSIXACL),
 786                     FLAG(MS_UNBINDABLE),
 787                     FLAG(MS_PRIVATE),
 788                     FLAG(MS_SLAVE),
 789                     FLAG(MS_SHARED),
 790                     FLAG(MS_RELATIME),
 791                     FLAG(MS_KERNMOUNT),
 792                     FLAG(MS_I_VERSION),
 793                     FLAG(MS_STRICTATIME),
 794                     FLAG(MS_LAZYTIME),
 795                     y);
 796         if (!x)
 797                 return NULL;
 798         if (!y)
 799                 x[strlen(x) - 1] = '\0'; /* truncate the last | */
 800         return x;
 801 }
 802
 803 int mount_verbose(
 804                 int error_log_level,
 805                 const char *what,
 806                 const char *where,
 807                 const char *type,
 808                 unsigned long flags,
 809                 const char *options) {
 810
 811         _cleanup_free_ char *fl = NULL, *o = NULL;
 812         unsigned long f;
 813         int r;
 814
 815         r = mount_option_mangle(options, flags, &f, &o);
 816         if (r < 0)
 817                 return log_full_errno(error_log_level, r,
 818                                       "Failed to mangle mount options %s: %m",
 819                                       strempty(options));
 820
 821         fl = mount_flags_to_string(f);
 822
 823         if ((f & MS_REMOUNT) && !what && !type)
 824                 log_debug("Remounting %s (%s \"%s\")...",
 825                           where, strnull(fl), strempty(o));
 826         else if (!what && !type)
 827                 log_debug("Mounting %s (%s \"%s\")...",
 828                           where, strnull(fl), strempty(o));
 829         else if ((f & MS_BIND) && !type)
 830                 log_debug("Bind-mounting %s on %s (%s \"%s\")...",
 831                           what, where, strnull(fl), strempty(o));
 832         else if (f & MS_MOVE)
 833                 log_debug("Moving mount %s → %s (%s \"%s\")...",
 834                           what, where, strnull(fl), strempty(o));
 835         else
 836                 log_debug("Mounting %s on %s (%s \"%s\")...",
 837                           strna(type), where, strnull(fl), strempty(o));
 838         if (mount(what, where, type, f, o) < 0)
 839                 return log_full_errno(error_log_level, errno,
 840                                       "Failed to mount %s on %s (%s \"%s\"): %m",
 841                                       strna(type), where, strnull(fl), strempty(o));
 842         return 0;
 843 }
 844
 845 int umount_verbose(const char *what) {
 846         log_debug("Umounting %s...", what);
 847         if (umount(what) < 0)
 848                 return log_error_errno(errno, "Failed to unmount %s: %m", what);
 849         return 0;
 850 }
 851
 852 const char *mount_propagation_flags_to_string(unsigned long flags) {
 853
 854         switch (flags & (MS_SHARED|MS_SLAVE|MS_PRIVATE)) {
 855         case 0:
 856                 return "";
 857         case MS_SHARED:
 858                 return "shared";
 859         case MS_SLAVE:
 860                 return "slave";
 861         case MS_PRIVATE:
 862                 return "private";
 863         }
 864
 865         return NULL;
 866 }
 867
 868 int mount_propagation_flags_from_string(const char *name, unsigned long *ret) {
 869
 870         if (isempty(name))
 871                 *ret = 0;
 872         else if (streq(name, "shared"))
 873                 *ret = MS_SHARED;
 874         else if (streq(name, "slave"))
 875                 *ret = MS_SLAVE;
 876         else if (streq(name, "private"))
 877                 *ret = MS_PRIVATE;
 878         else
 879                 return -EINVAL;
 880         return 0;
 881 }
 882
 883 int mount_option_mangle(
 884                 const char *options,
 885                 unsigned long mount_flags,
 886                 unsigned long *ret_mount_flags,
 887                 char **ret_remaining_options) {
 888
 889         const struct libmnt_optmap *map;
 890         _cleanup_free_ char *ret = NULL;
 891         const char *p;
 892         int r;
 893
 894         /* This extracts mount flags from the mount options, and store
 895          * non-mount-flag options to '*ret_remaining_options'.
 896          * E.g.,
 897          * "rw,nosuid,nodev,relatime,size=1630748k,mode=700,uid=1000,gid=1000"
 898          * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
 899          * "size=1630748k,mode=700,uid=1000,gid=1000".
 900          * See more examples in test-mount-utils.c.
 901          *
 902          * Note that if 'options' does not contain any non-mount-flag options,
 903          * then '*ret_remaining_options' is set to NULL instread of empty string.
 904          * Note that this does not check validity of options stored in
 905          * '*ret_remaining_options'.
 906          * Note that if 'options' is NULL, then this just copies 'mount_flags'
 907          * to '*ret_mount_flags'. */
 908
 909         assert(ret_mount_flags);
 910         assert(ret_remaining_options);
 911
 912         map = mnt_get_builtin_optmap(MNT_LINUX_MAP);
 913         if (!map)
 914                 return -EINVAL;
 915
 916         p = options;
 917         for (;;) {
 918                 _cleanup_free_ char *word = NULL;
 919                 const struct libmnt_optmap *ent;
 920
 921                 r = extract_first_word(&p, &word, ",", EXTRACT_QUOTES);
 922                 if (r < 0)
 923                         return r;
 924                 if (r == 0)
 925                         break;
 926
 927                 for (ent = map; ent->name; ent++) {
 928                         /* All entries in MNT_LINUX_MAP do not take any argument.
 929                          * Thus, ent->name does not contain "=" or "[=]". */
 930                         if (!streq(word, ent->name))
 931                                 continue;
 932
 933                         if (!(ent->mask & MNT_INVERT))
 934                                 mount_flags |= ent->id;
 935                         else if (mount_flags & ent->id)
 936                                 mount_flags ^= ent->id;
 937
 938                         break;
 939                 }
 940
 941                 /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
 942                 if (!ent->name && !strextend_with_separator(&ret, ",", word, NULL))
 943                         return -ENOMEM;
 944         }
 945
 946         *ret_mount_flags = mount_flags;
 947         *ret_remaining_options = TAKE_PTR(ret);
 948
 949         return 0;
 950 }