src/basic/mountpoint-util.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <sys/mount.h>
   6
   7 #include "alloc-util.h"
   8 #include "fd-util.h"
   9 #include "fileio.h"
  10 #include "fs-util.h"
  11 #include "missing_stat.h"
  12 #include "missing_syscall.h"
  13 #include "mountpoint-util.h"
  14 #include "parse-util.h"
  15 #include "path-util.h"
  16 #include "stat-util.h"
  17 #include "stdio-util.h"
  18 #include "strv.h"
  19
/* This is the original MAX_HANDLE_SZ definition from the kernel, as of when the API was introduced. We use it in place
 * of any currently defined value to future-proof things: if the size is increased in the API headers and our code is
 * recompiled, it would cease working on old kernels, as those refuse any size larger than this value with EINVAL
 * right away. Hence, let's disconnect ourselves from any such API changes and stick to the original definition from
 * when the API was introduced. We use it as a start value only (see below), and hence should be able to deal with
 * larger file handles anyway. */
  26 #define ORIGINAL_MAX_HANDLE_SZ 128
  27
  28 int name_to_handle_at_loop(
  29                 int fd,
  30                 const char *path,
  31                 struct file_handle **ret_handle,
  32                 int *ret_mnt_id,
  33                 int flags) {
  34
  35         _cleanup_free_ struct file_handle *h = NULL;
  36         size_t n = ORIGINAL_MAX_HANDLE_SZ;
  37
  38         assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0);
  39
  40         /* We need to invoke name_to_handle_at() in a loop, given that it might return EOVERFLOW when the specified
  41          * buffer is too small. Note that in contrast to what the docs might suggest, MAX_HANDLE_SZ is only good as a
  42          * start value, it is not an upper bound on the buffer size required.
  43          *
  44          * This improves on raw name_to_handle_at() also in one other regard: ret_handle and ret_mnt_id can be passed
  45          * as NULL if there's no interest in either. */
  46
  47         for (;;) {
  48                 int mnt_id = -1;
  49
  50                 h = malloc0(offsetof(struct file_handle, f_handle) + n);
  51                 if (!h)
  52                         return -ENOMEM;
  53
  54                 h->handle_bytes = n;
  55
  56                 if (name_to_handle_at(fd, path, h, &mnt_id, flags) >= 0) {
  57
  58                         if (ret_handle)
  59                                 *ret_handle = TAKE_PTR(h);
  60
  61                         if (ret_mnt_id)
  62                                 *ret_mnt_id = mnt_id;
  63
  64                         return 0;
  65                 }
  66                 if (errno != EOVERFLOW)
  67                         return -errno;
  68
  69                 if (!ret_handle && ret_mnt_id && mnt_id >= 0) {
  70
  71                         /* As it appears, name_to_handle_at() fills in mnt_id even when it returns EOVERFLOW when the
  72                          * buffer is too small, but that's undocumented. Hence, let's make use of this if it appears to
  73                          * be filled in, and the caller was interested in only the mount ID an nothing else. */
  74
  75                         *ret_mnt_id = mnt_id;
  76                         return 0;
  77                 }
  78
  79                 /* If name_to_handle_at() didn't increase the byte size, then this EOVERFLOW is caused by something
  80                  * else (apparently EOVERFLOW is returned for untriggered nfs4 mounts sometimes), not by the too small
  81                  * buffer. In that case propagate EOVERFLOW */
  82                 if (h->handle_bytes <= n)
  83                         return -EOVERFLOW;
  84
  85                 /* The buffer was too small. Size the new buffer by what name_to_handle_at() returned. */
  86                 n = h->handle_bytes;
  87                 if (offsetof(struct file_handle, f_handle) + n < n) /* check for addition overflow */
  88                         return -EOVERFLOW;
  89
  90                 h = mfree(h);
  91         }
  92 }
  93
  94 static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *ret_mnt_id) {
  95         char path[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
  96         _cleanup_free_ char *fdinfo = NULL;
  97         _cleanup_close_ int subfd = -1;
  98         char *p;
  99         int r;
 100
 101         assert(ret_mnt_id);
 102         assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0);
 103
 104         if ((flags & AT_EMPTY_PATH) && isempty(filename))
 105                 xsprintf(path, "/proc/self/fdinfo/%i", fd);
 106         else {
 107                 subfd = openat(fd, filename, O_CLOEXEC|O_PATH|(flags & AT_SYMLINK_FOLLOW ? 0 : O_NOFOLLOW));
 108                 if (subfd < 0)
 109                         return -errno;
 110
 111                 xsprintf(path, "/proc/self/fdinfo/%i", subfd);
 112         }
 113
 114         r = read_full_file(path, &fdinfo, NULL);
 115         if (r == -ENOENT) /* The fdinfo directory is a relatively new addition */
 116                 return -EOPNOTSUPP;
 117         if (r < 0)
 118                 return r;
 119
 120         p = startswith(fdinfo, "mnt_id:");
 121         if (!p) {
 122                 p = strstr(fdinfo, "\nmnt_id:");
 123                 if (!p) /* The mnt_id field is a relatively new addition */
 124                         return -EOPNOTSUPP;
 125
 126                 p += 8;
 127         }
 128
 129         p += strspn(p, WHITESPACE);
 130         p[strcspn(p, WHITESPACE)] = 0;
 131
 132         return safe_atoi(p, ret_mnt_id);
 133 }
 134
 135 int fd_is_mount_point(int fd, const char *filename, int flags) {
 136         _cleanup_free_ struct file_handle *h = NULL, *h_parent = NULL;
 137         int mount_id = -1, mount_id_parent = -1;
 138         bool nosupp = false, check_st_dev = true;
 139         STRUCT_STATX_DEFINE(sx);
 140         struct stat a, b;
 141         int r;
 142
 143         assert(fd >= 0);
 144         assert(filename);
 145         assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0);
 146
 147         /* First we will try statx()' STATX_ATTR_MOUNT_ROOT attribute, which is our ideal API, available
 148          * since kernel 5.8.
 149          *
 150          * If that fails, our second try is the name_to_handle_at() syscall, which tells us the mount id and
 151          * an opaque file "handle". It is not supported everywhere though (kernel compile-time option, not
 152          * all file systems are hooked up). If it works the mount id is usually good enough to tell us
 153          * whether something is a mount point.
 154          *
 155          * If that didn't work we will try to read the mount id from /proc/self/fdinfo/<fd>. This is almost
 156          * as good as name_to_handle_at(), however, does not return the opaque file handle. The opaque file
 157          * handle is pretty useful to detect the root directory, which we should always consider a mount
 158          * point. Hence we use this only as fallback. Exporting the mnt_id in fdinfo is a pretty recent
 159          * kernel addition.
 160          *
 161          * As last fallback we do traditional fstat() based st_dev comparisons. This is how things were
 162          * traditionally done, but unionfs breaks this since it exposes file systems with a variety of st_dev
 163          * reported. Also, btrfs subvolumes have different st_dev, even though they aren't real mounts of
 164          * their own. */
 165
 166         if (statx(fd, filename, (FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 0 : AT_SYMLINK_NOFOLLOW) |
 167                                 (flags & AT_EMPTY_PATH) |
 168                                 AT_NO_AUTOMOUNT, 0, &sx) < 0) {
 169                 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
 170                         return -errno;
 171
 172                 /* If statx() is not available or forbidden, fall back to name_to_handle_at() below */
 173         } else if (FLAGS_SET(sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) /* yay! */
 174                 return FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT);
 175
 176         r = name_to_handle_at_loop(fd, filename, &h, &mount_id, flags);
 177         if (IN_SET(r, -ENOSYS, -EACCES, -EPERM, -EOVERFLOW, -EINVAL))
 178                 /* This kernel does not support name_to_handle_at() at all (ENOSYS), or the syscall was blocked
 179                  * (EACCES/EPERM; maybe through seccomp, because we are running inside of a container?), or the mount
 180                  * point is not triggered yet (EOVERFLOW, think nfs4), or some general name_to_handle_at() flakiness
 181                  * (EINVAL): fall back to simpler logic. */
 182                 goto fallback_fdinfo;
 183         else if (r == -EOPNOTSUPP)
 184                 /* This kernel or file system does not support name_to_handle_at(), hence let's see if the upper fs
 185                  * supports it (in which case it is a mount point), otherwise fall back to the traditional stat()
 186                  * logic */
 187                 nosupp = true;
 188         else if (r < 0)
 189                 return r;
 190
 191         r = name_to_handle_at_loop(fd, "", &h_parent, &mount_id_parent, AT_EMPTY_PATH);
 192         if (r == -EOPNOTSUPP) {
 193                 if (nosupp)
 194                         /* Neither parent nor child do name_to_handle_at()?  We have no choice but to fall back. */
 195                         goto fallback_fdinfo;
 196                 else
 197                         /* The parent can't do name_to_handle_at() but the directory we are interested in can?  If so,
 198                          * it must be a mount point. */
 199                         return 1;
 200         } else if (r < 0)
 201                 return r;
 202
 203         /* The parent can do name_to_handle_at() but the
 204          * directory we are interested in can't? If so, it
 205          * must be a mount point. */
 206         if (nosupp)
 207                 return 1;
 208
 209         /* If the file handle for the directory we are
 210          * interested in and its parent are identical, we
 211          * assume this is the root directory, which is a mount
 212          * point. */
 213
 214         if (h->handle_bytes == h_parent->handle_bytes &&
 215             h->handle_type == h_parent->handle_type &&
 216             memcmp(h->f_handle, h_parent->f_handle, h->handle_bytes) == 0)
 217                 return 1;
 218
 219         return mount_id != mount_id_parent;
 220
 221 fallback_fdinfo:
 222         r = fd_fdinfo_mnt_id(fd, filename, flags, &mount_id);
 223         if (IN_SET(r, -EOPNOTSUPP, -EACCES, -EPERM))
 224                 goto fallback_fstat;
 225         if (r < 0)
 226                 return r;
 227
 228         r = fd_fdinfo_mnt_id(fd, "", AT_EMPTY_PATH, &mount_id_parent);
 229         if (r < 0)
 230                 return r;
 231
 232         if (mount_id != mount_id_parent)
 233                 return 1;
 234
 235         /* Hmm, so, the mount ids are the same. This leaves one
 236          * special case though for the root file system. For that,
 237          * let's see if the parent directory has the same inode as we
 238          * are interested in. Hence, let's also do fstat() checks now,
 239          * too, but avoid the st_dev comparisons, since they aren't
 240          * that useful on unionfs mounts. */
 241         check_st_dev = false;
 242
 243 fallback_fstat:
 244         /* yay for fstatat() taking a different set of flags than the other
 245          * _at() above */
 246         if (flags & AT_SYMLINK_FOLLOW)
 247                 flags &= ~AT_SYMLINK_FOLLOW;
 248         else
 249                 flags |= AT_SYMLINK_NOFOLLOW;
 250         if (fstatat(fd, filename, &a, flags) < 0)
 251                 return -errno;
 252
 253         if (fstatat(fd, "", &b, AT_EMPTY_PATH) < 0)
 254                 return -errno;
 255
 256         /* A directory with same device and inode as its parent? Must
 257          * be the root directory */
 258         if (a.st_dev == b.st_dev &&
 259             a.st_ino == b.st_ino)
 260                 return 1;
 261
 262         return check_st_dev && (a.st_dev != b.st_dev);
 263 }
 264
 265 /* flags can be AT_SYMLINK_FOLLOW or 0 */
 266 int path_is_mount_point(const char *t, const char *root, int flags) {
 267         _cleanup_free_ char *canonical = NULL;
 268         _cleanup_close_ int fd = -1;
 269         int r;
 270
 271         assert(t);
 272         assert((flags & ~AT_SYMLINK_FOLLOW) == 0);
 273
 274         if (path_equal(t, "/"))
 275                 return 1;
 276
 277         /* we need to resolve symlinks manually, we can't just rely on
 278          * fd_is_mount_point() to do that for us; if we have a structure like
 279          * /bin -> /usr/bin/ and /usr is a mount point, then the parent that we
 280          * look at needs to be /usr, not /. */
 281         if (flags & AT_SYMLINK_FOLLOW) {
 282                 r = chase_symlinks(t, root, CHASE_TRAIL_SLASH, &canonical, NULL);
 283                 if (r < 0)
 284                         return r;
 285
 286                 t = canonical;
 287         }
 288
 289         fd = open_parent(t, O_PATH|O_CLOEXEC, 0);
 290         if (fd < 0)
 291                 return fd;
 292
 293         return fd_is_mount_point(fd, last_path_component(t), flags);
 294 }
 295
 296 int path_get_mnt_id(const char *path, int *ret) {
 297         STRUCT_NEW_STATX_DEFINE(buf);
 298         int r;
 299
 300         if (statx(AT_FDCWD, path, AT_SYMLINK_NOFOLLOW|AT_NO_AUTOMOUNT, STATX_MNT_ID, &buf.sx) < 0) {
 301                 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
 302                         return -errno;
 303
 304                 /* Fall back to name_to_handle_at() and then fdinfo if statx is not supported or we lack
 305                  * privileges */
 306
 307         } else if (FLAGS_SET(buf.nsx.stx_mask, STATX_MNT_ID)) {
 308                 *ret = buf.nsx.stx_mnt_id;
 309                 return 0;
 310         }
 311
 312         r = name_to_handle_at_loop(AT_FDCWD, path, NULL, ret, 0);
 313         if (IN_SET(r, -EOPNOTSUPP, -ENOSYS, -EACCES, -EPERM, -EOVERFLOW, -EINVAL)) /* kernel/fs don't support this, or seccomp blocks access, or untriggered mount, or name_to_handle_at() is flaky */
 314                 return fd_fdinfo_mnt_id(AT_FDCWD, path, 0, ret);
 315
 316         return r;
 317 }
 318
 319 bool fstype_is_network(const char *fstype) {
 320         const char *x;
 321
 322         x = startswith(fstype, "fuse.");
 323         if (x)
 324                 fstype = x;
 325
 326         return STR_IN_SET(fstype,
 327                           "afs",
 328                           "ceph",
 329                           "cifs",
 330                           "smb3",
 331                           "smbfs",
 332                           "sshfs",
 333                           "ncpfs",
 334                           "ncp",
 335                           "nfs",
 336                           "nfs4",
 337                           "gfs",
 338                           "gfs2",
 339                           "glusterfs",
 340                           "pvfs2", /* OrangeFS */
 341                           "ocfs2",
 342                           "lustre",
 343                           "davfs");
 344 }
 345
 346 bool fstype_is_api_vfs(const char *fstype) {
 347         return STR_IN_SET(fstype,
 348                           "autofs",
 349                           "bpf",
 350                           "cgroup",
 351                           "cgroup2",
 352                           "configfs",
 353                           "cpuset",
 354                           "debugfs",
 355                           "devpts",
 356                           "devtmpfs",
 357                           "efivarfs",
 358                           "fusectl",
 359                           "hugetlbfs",
 360                           "mqueue",
 361                           "proc",
 362                           "pstore",
 363                           "ramfs",
 364                           "securityfs",
 365                           "sysfs",
 366                           "tmpfs",
 367                           "tracefs");
 368 }
 369
 370 bool fstype_is_blockdev_backed(const char *fstype) {
 371         const char *x;
 372
 373         x = startswith(fstype, "fuse.");
 374         if (x)
 375                 fstype = x;
 376
 377         return !streq(fstype, "9p") && !fstype_is_network(fstype) && !fstype_is_api_vfs(fstype);
 378 }
 379
 380 bool fstype_is_ro(const char *fstype) {
 381         /* All Linux file systems that are necessarily read-only */
 382         return STR_IN_SET(fstype,
 383                           "DM_verity_hash",
 384                           "iso9660",
 385                           "squashfs");
 386 }
 387
 388 bool fstype_can_discard(const char *fstype) {
 389         return STR_IN_SET(fstype,
 390                           "btrfs",
 391                           "ext4",
 392                           "vfat",
 393                           "xfs");
 394 }
 395
 396 bool fstype_can_uid_gid(const char *fstype) {
 397
 398         /* All file systems that have a uid=/gid= mount option that fixates the owners of all files and directories,
 399          * current and future. */
 400
 401         return STR_IN_SET(fstype,
 402                           "adfs",
 403                           "exfat",
 404                           "fat",
 405                           "hfs",
 406                           "hpfs",
 407                           "iso9660",
 408                           "msdos",
 409                           "ntfs",
 410                           "vfat");
 411 }
 412
 413 int dev_is_devtmpfs(void) {
 414         _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
 415         int mount_id, r;
 416         char *e;
 417
 418         r = path_get_mnt_id("/dev", &mount_id);
 419         if (r < 0)
 420                 return r;
 421
 422         r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo);
 423         if (r < 0)
 424                 return r;
 425
 426         for (;;) {
 427                 _cleanup_free_ char *line = NULL;
 428                 int mid;
 429
 430                 r = read_line(proc_self_mountinfo, LONG_LINE_MAX, &line);
 431                 if (r < 0)
 432                         return r;
 433                 if (r == 0)
 434                         break;
 435
 436                 if (sscanf(line, "%i", &mid) != 1)
 437                         continue;
 438
 439                 if (mid != mount_id)
 440                         continue;
 441
 442                 e = strstr(line, " - ");
 443                 if (!e)
 444                         continue;
 445
 446                 /* accept any name that starts with the currently expected type */
 447                 if (startswith(e + 3, "devtmpfs"))
 448                         return true;
 449         }
 450
 451         return false;
 452 }
 453
 454 const char *mount_propagation_flags_to_string(unsigned long flags) {
 455
 456         switch (flags & (MS_SHARED|MS_SLAVE|MS_PRIVATE)) {
 457         case 0:
 458                 return "";
 459         case MS_SHARED:
 460                 return "shared";
 461         case MS_SLAVE:
 462                 return "slave";
 463         case MS_PRIVATE:
 464                 return "private";
 465         }
 466
 467         return NULL;
 468 }
 469
 470 int mount_propagation_flags_from_string(const char *name, unsigned long *ret) {
 471
 472         if (isempty(name))
 473                 *ret = 0;
 474         else if (streq(name, "shared"))
 475                 *ret = MS_SHARED;
 476         else if (streq(name, "slave"))
 477                 *ret = MS_SLAVE;
 478         else if (streq(name, "private"))
 479                 *ret = MS_PRIVATE;
 480         else
 481                 return -EINVAL;
 482         return 0;
 483 }