]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/basic/mountpoint-util.c
update TODO
[thirdparty/systemd.git] / src / basic / mountpoint-util.c
CommitLineData
049af8ad
ZJS
1/* SPDX-License-Identifier: LGPL-2.1+ */
2
3#include <errno.h>
4#include <fcntl.h>
049af8ad
ZJS
5#include <sys/mount.h>
6
7#include "alloc-util.h"
8#include "fd-util.h"
9#include "fileio.h"
10#include "fs-util.h"
049af8ad
ZJS
11#include "mountpoint-util.h"
12#include "parse-util.h"
13#include "path-util.h"
14#include "stdio-util.h"
15#include "strv.h"
16
/* This is the original MAX_HANDLE_SZ definition from the kernel, when the API was introduced. We use that in place of
 * any more currently defined value to future-proof things: if the size is increased in the API headers, and our code
 * is recompiled then it would cease working on old kernels, as those refuse any sizes larger than this value with
 * EINVAL right away. Hence, let's disconnect ourselves from any such API changes, and stick to the original definition
 * from when it was introduced. We use it as a start value only anyway (see below), and hence should be able to deal
 * with large file handles anyway. */
#define ORIGINAL_MAX_HANDLE_SZ 128
24
25int name_to_handle_at_loop(
26 int fd,
27 const char *path,
28 struct file_handle **ret_handle,
29 int *ret_mnt_id,
30 int flags) {
31
32 _cleanup_free_ struct file_handle *h = NULL;
33 size_t n = ORIGINAL_MAX_HANDLE_SZ;
34
35 /* We need to invoke name_to_handle_at() in a loop, given that it might return EOVERFLOW when the specified
36 * buffer is too small. Note that in contrast to what the docs might suggest, MAX_HANDLE_SZ is only good as a
37 * start value, it is not an upper bound on the buffer size required.
38 *
39 * This improves on raw name_to_handle_at() also in one other regard: ret_handle and ret_mnt_id can be passed
40 * as NULL if there's no interest in either. */
41
42 for (;;) {
43 int mnt_id = -1;
44
45 h = malloc0(offsetof(struct file_handle, f_handle) + n);
46 if (!h)
47 return -ENOMEM;
48
49 h->handle_bytes = n;
50
51 if (name_to_handle_at(fd, path, h, &mnt_id, flags) >= 0) {
52
53 if (ret_handle)
54 *ret_handle = TAKE_PTR(h);
55
56 if (ret_mnt_id)
57 *ret_mnt_id = mnt_id;
58
59 return 0;
60 }
61 if (errno != EOVERFLOW)
62 return -errno;
63
64 if (!ret_handle && ret_mnt_id && mnt_id >= 0) {
65
66 /* As it appears, name_to_handle_at() fills in mnt_id even when it returns EOVERFLOW when the
67 * buffer is too small, but that's undocumented. Hence, let's make use of this if it appears to
68 * be filled in, and the caller was interested in only the mount ID an nothing else. */
69
70 *ret_mnt_id = mnt_id;
71 return 0;
72 }
73
74 /* If name_to_handle_at() didn't increase the byte size, then this EOVERFLOW is caused by something
75 * else (apparently EOVERFLOW is returned for untriggered nfs4 mounts sometimes), not by the too small
76 * buffer. In that case propagate EOVERFLOW */
77 if (h->handle_bytes <= n)
78 return -EOVERFLOW;
79
80 /* The buffer was too small. Size the new buffer by what name_to_handle_at() returned. */
81 n = h->handle_bytes;
82 if (offsetof(struct file_handle, f_handle) + n < n) /* check for addition overflow */
83 return -EOVERFLOW;
84
85 h = mfree(h);
86 }
87}
88
89static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *mnt_id) {
90 char path[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
91 _cleanup_free_ char *fdinfo = NULL;
92 _cleanup_close_ int subfd = -1;
93 char *p;
94 int r;
95
96 if ((flags & AT_EMPTY_PATH) && isempty(filename))
97 xsprintf(path, "/proc/self/fdinfo/%i", fd);
98 else {
be24321f 99 subfd = openat(fd, filename, O_CLOEXEC|O_PATH|(flags & AT_SYMLINK_FOLLOW ? 0 : O_NOFOLLOW));
049af8ad
ZJS
100 if (subfd < 0)
101 return -errno;
102
103 xsprintf(path, "/proc/self/fdinfo/%i", subfd);
104 }
105
106 r = read_full_file(path, &fdinfo, NULL);
107 if (r == -ENOENT) /* The fdinfo directory is a relatively new addition */
108 return -EOPNOTSUPP;
109 if (r < 0)
110 return r;
111
112 p = startswith(fdinfo, "mnt_id:");
113 if (!p) {
114 p = strstr(fdinfo, "\nmnt_id:");
115 if (!p) /* The mnt_id field is a relatively new addition */
116 return -EOPNOTSUPP;
117
118 p += 8;
119 }
120
121 p += strspn(p, WHITESPACE);
122 p[strcspn(p, WHITESPACE)] = 0;
123
124 return safe_atoi(p, mnt_id);
125}
126
127int fd_is_mount_point(int fd, const char *filename, int flags) {
128 _cleanup_free_ struct file_handle *h = NULL, *h_parent = NULL;
129 int mount_id = -1, mount_id_parent = -1;
130 bool nosupp = false, check_st_dev = true;
131 struct stat a, b;
132 int r;
133
134 assert(fd >= 0);
135 assert(filename);
136
137 /* First we will try the name_to_handle_at() syscall, which
138 * tells us the mount id and an opaque file "handle". It is
139 * not supported everywhere though (kernel compile-time
140 * option, not all file systems are hooked up). If it works
141 * the mount id is usually good enough to tell us whether
142 * something is a mount point.
143 *
144 * If that didn't work we will try to read the mount id from
145 * /proc/self/fdinfo/<fd>. This is almost as good as
146 * name_to_handle_at(), however, does not return the
147 * opaque file handle. The opaque file handle is pretty useful
148 * to detect the root directory, which we should always
149 * consider a mount point. Hence we use this only as
150 * fallback. Exporting the mnt_id in fdinfo is a pretty recent
151 * kernel addition.
152 *
153 * As last fallback we do traditional fstat() based st_dev
154 * comparisons. This is how things were traditionally done,
155 * but unionfs breaks this since it exposes file
156 * systems with a variety of st_dev reported. Also, btrfs
157 * subvolumes have different st_dev, even though they aren't
158 * real mounts of their own. */
159
160 r = name_to_handle_at_loop(fd, filename, &h, &mount_id, flags);
161 if (IN_SET(r, -ENOSYS, -EACCES, -EPERM, -EOVERFLOW, -EINVAL))
162 /* This kernel does not support name_to_handle_at() at all (ENOSYS), or the syscall was blocked
163 * (EACCES/EPERM; maybe through seccomp, because we are running inside of a container?), or the mount
164 * point is not triggered yet (EOVERFLOW, think nfs4), or some general name_to_handle_at() flakiness
165 * (EINVAL): fall back to simpler logic. */
166 goto fallback_fdinfo;
167 else if (r == -EOPNOTSUPP)
168 /* This kernel or file system does not support name_to_handle_at(), hence let's see if the upper fs
169 * supports it (in which case it is a mount point), otherwise fallback to the traditional stat()
170 * logic */
171 nosupp = true;
172 else if (r < 0)
173 return r;
174
175 r = name_to_handle_at_loop(fd, "", &h_parent, &mount_id_parent, AT_EMPTY_PATH);
176 if (r == -EOPNOTSUPP) {
177 if (nosupp)
178 /* Neither parent nor child do name_to_handle_at()? We have no choice but to fall back. */
179 goto fallback_fdinfo;
180 else
181 /* The parent can't do name_to_handle_at() but the directory we are interested in can? If so,
182 * it must be a mount point. */
183 return 1;
184 } else if (r < 0)
185 return r;
186
187 /* The parent can do name_to_handle_at() but the
188 * directory we are interested in can't? If so, it
189 * must be a mount point. */
190 if (nosupp)
191 return 1;
192
193 /* If the file handle for the directory we are
194 * interested in and its parent are identical, we
195 * assume this is the root directory, which is a mount
196 * point. */
197
198 if (h->handle_bytes == h_parent->handle_bytes &&
199 h->handle_type == h_parent->handle_type &&
200 memcmp(h->f_handle, h_parent->f_handle, h->handle_bytes) == 0)
201 return 1;
202
203 return mount_id != mount_id_parent;
204
205fallback_fdinfo:
206 r = fd_fdinfo_mnt_id(fd, filename, flags, &mount_id);
207 if (IN_SET(r, -EOPNOTSUPP, -EACCES, -EPERM))
208 goto fallback_fstat;
209 if (r < 0)
210 return r;
211
212 r = fd_fdinfo_mnt_id(fd, "", AT_EMPTY_PATH, &mount_id_parent);
213 if (r < 0)
214 return r;
215
216 if (mount_id != mount_id_parent)
217 return 1;
218
219 /* Hmm, so, the mount ids are the same. This leaves one
220 * special case though for the root file system. For that,
221 * let's see if the parent directory has the same inode as we
222 * are interested in. Hence, let's also do fstat() checks now,
223 * too, but avoid the st_dev comparisons, since they aren't
224 * that useful on unionfs mounts. */
225 check_st_dev = false;
226
227fallback_fstat:
228 /* yay for fstatat() taking a different set of flags than the other
229 * _at() above */
230 if (flags & AT_SYMLINK_FOLLOW)
231 flags &= ~AT_SYMLINK_FOLLOW;
232 else
233 flags |= AT_SYMLINK_NOFOLLOW;
234 if (fstatat(fd, filename, &a, flags) < 0)
235 return -errno;
236
237 if (fstatat(fd, "", &b, AT_EMPTY_PATH) < 0)
238 return -errno;
239
240 /* A directory with same device and inode as its parent? Must
241 * be the root directory */
242 if (a.st_dev == b.st_dev &&
243 a.st_ino == b.st_ino)
244 return 1;
245
246 return check_st_dev && (a.st_dev != b.st_dev);
247}
248
249/* flags can be AT_SYMLINK_FOLLOW or 0 */
250int path_is_mount_point(const char *t, const char *root, int flags) {
251 _cleanup_free_ char *canonical = NULL;
252 _cleanup_close_ int fd = -1;
253 int r;
254
255 assert(t);
256 assert((flags & ~AT_SYMLINK_FOLLOW) == 0);
257
258 if (path_equal(t, "/"))
259 return 1;
260
261 /* we need to resolve symlinks manually, we can't just rely on
262 * fd_is_mount_point() to do that for us; if we have a structure like
263 * /bin -> /usr/bin/ and /usr is a mount point, then the parent that we
264 * look at needs to be /usr, not /. */
265 if (flags & AT_SYMLINK_FOLLOW) {
a5648b80 266 r = chase_symlinks(t, root, CHASE_TRAIL_SLASH, &canonical, NULL);
049af8ad
ZJS
267 if (r < 0)
268 return r;
269
270 t = canonical;
271 }
272
273 fd = open_parent(t, O_PATH|O_CLOEXEC, 0);
274 if (fd < 0)
89a5385f 275 return fd;
049af8ad
ZJS
276
277 return fd_is_mount_point(fd, last_path_component(t), flags);
278}
279
280int path_get_mnt_id(const char *path, int *ret) {
281 int r;
282
283 r = name_to_handle_at_loop(AT_FDCWD, path, NULL, ret, 0);
284 if (IN_SET(r, -EOPNOTSUPP, -ENOSYS, -EACCES, -EPERM, -EOVERFLOW, -EINVAL)) /* kernel/fs don't support this, or seccomp blocks access, or untriggered mount, or name_to_handle_at() is flaky */
285 return fd_fdinfo_mnt_id(AT_FDCWD, path, 0, ret);
286
287 return r;
288}
289
/* Returns true if the file system type name "fstype" is one of the network file systems listed below. A leading
 * "fuse." prefix is skipped before the comparison, so that e.g. "fuse.sshfs" is matched as "sshfs". */
bool fstype_is_network(const char *fstype) {
        const char *x;

        x = startswith(fstype, "fuse.");
        if (x)
                fstype = x;

        return STR_IN_SET(fstype,
                          "afs",
                          "ceph",
                          "cifs",
                          "smb3",
                          "smbfs",
                          "sshfs",
                          "ncpfs",
                          "ncp",
                          "nfs",
                          "nfs4",
                          "gfs",
                          "gfs2",
                          "glusterfs",
                          "pvfs2", /* OrangeFS */
                          "ocfs2",
                          "lustre",
                          "davfs");
}
316
/* Returns true if the file system type name "fstype" is one of the API/virtual file systems listed below. The name
 * is compared as-is, no prefix is stripped. */
bool fstype_is_api_vfs(const char *fstype) {
        return STR_IN_SET(fstype,
                          "autofs",
                          "bpf",
                          "cgroup",
                          "cgroup2",
                          "configfs",
                          "cpuset",
                          "debugfs",
                          "devpts",
                          "devtmpfs",
                          "efivarfs",
                          "fusectl",
                          "hugetlbfs",
                          "mqueue",
                          "proc",
                          "pstore",
                          "ramfs",
                          "securityfs",
                          "sysfs",
                          "tmpfs",
                          "tracefs");
}
340
/* Returns true if the file system type name "fstype" is neither "9p", nor a network file system (see
 * fstype_is_network()), nor an API file system (see fstype_is_api_vfs()). A leading "fuse." prefix is skipped before
 * any of these checks. */
bool fstype_is_blockdev_backed(const char *fstype) {
        const char *x;

        x = startswith(fstype, "fuse.");
        if (x)
                fstype = x;

        return !streq(fstype, "9p") && !fstype_is_network(fstype) && !fstype_is_api_vfs(fstype);
}
350
/* Returns true if the file system type name "fstype" is one of the types listed below. */
bool fstype_is_ro(const char *fstype) {
        /* All Linux file systems that are necessarily read-only */
        return STR_IN_SET(fstype,
                          "DM_verity_hash",
                          "iso9660",
                          "squashfs");
}
358
/* Returns true if the file system type name "fstype" is one of the types listed below. */
bool fstype_can_discard(const char *fstype) {
        return STR_IN_SET(fstype,
                          "btrfs",
                          "ext4",
                          "vfat",
                          "xfs");
}
366
/* Returns true if the file system type name "fstype" is one of the types listed below. */
bool fstype_can_uid_gid(const char *fstype) {

        /* All file systems that have a uid=/gid= mount option that fixates the owners of all files and directories,
         * current and future. */

        return STR_IN_SET(fstype,
                          "adfs",
                          "exfat",
                          "fat",
                          "hfs",
                          "hpfs",
                          "iso9660",
                          "msdos",
                          "ntfs",
                          "vfat");
}
383
384int dev_is_devtmpfs(void) {
385 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
386 int mount_id, r;
387 char *e;
388
389 r = path_get_mnt_id("/dev", &mount_id);
390 if (r < 0)
391 return r;
392
fdeea3f4
ZJS
393 r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo);
394 if (r < 0)
395 return r;
049af8ad
ZJS
396
397 for (;;) {
398 _cleanup_free_ char *line = NULL;
399 int mid;
400
401 r = read_line(proc_self_mountinfo, LONG_LINE_MAX, &line);
402 if (r < 0)
403 return r;
404 if (r == 0)
405 break;
406
407 if (sscanf(line, "%i", &mid) != 1)
408 continue;
409
410 if (mid != mount_id)
411 continue;
412
413 e = strstr(line, " - ");
414 if (!e)
415 continue;
416
417 /* accept any name that starts with the currently expected type */
418 if (startswith(e + 3, "devtmpfs"))
419 return true;
420 }
421
422 return false;
423}
424
/* Maps the mount propagation bits of "flags" to a string.
 *
 * Only the MS_SHARED, MS_SLAVE and MS_PRIVATE bits are looked at; all other bits are ignored.
 *
 * Returns "" if none of the three bits is set, "shared", "slave" or "private" if exactly one of them is set, and NULL
 * if more than one is set. The returned strings are literals and must not be freed. */
const char *mount_propagation_flags_to_string(unsigned long flags) {

        switch (flags & (MS_SHARED|MS_SLAVE|MS_PRIVATE)) {
        case 0:
                return "";
        case MS_SHARED:
                return "shared";
        case MS_SLAVE:
                return "slave";
        case MS_PRIVATE:
                return "private";
        default:
                /* A combination of several propagation bits has no string form. */
                return NULL;
        }
}
440
/* Parses a mount propagation name into the matching MS_* flag.
 *
 * name: NULL or "" (yields 0), "shared", "slave" or "private".
 * ret:  receives the flag value; must not be NULL. It is left untouched on failure.
 *
 * Returns 0 on success and -EINVAL if "name" is none of the recognized strings. */
int mount_propagation_flags_from_string(const char *name, unsigned long *ret) {

        /* ret is written unconditionally below, hence catch a NULL pointer early. */
        assert(ret);

        if (isempty(name))
                *ret = 0;
        else if (streq(name, "shared"))
                *ret = MS_SHARED;
        else if (streq(name, "slave"))
                *ret = MS_SLAVE;
        else if (streq(name, "private"))
                *ret = MS_PRIVATE;
        else
                return -EINVAL;
        return 0;
}