]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/mount-setup.c
Add SPDX license identifiers to source files under the LGPL
[thirdparty/systemd.git] / src / core / mount-setup.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <errno.h>
22 #include <ftw.h>
23 #include <stdlib.h>
24 #include <sys/mount.h>
25 #include <unistd.h>
26
27 #include "alloc-util.h"
28 #include "bus-util.h"
29 #include "cgroup-util.h"
30 #include "dev-setup.h"
31 #include "efivars.h"
32 #include "fs-util.h"
33 #include "label.h"
34 #include "log.h"
35 #include "macro.h"
36 #include "missing.h"
37 #include "mkdir.h"
38 #include "mount-setup.h"
39 #include "mount-util.h"
40 #include "path-util.h"
41 #include "set.h"
42 #include "smack-util.h"
43 #include "strv.h"
44 #include "user-util.h"
45 #include "util.h"
46 #include "virt.h"
47
/* Per-entry flags that steer how mount_one() treats a mount point. */
enum MountMode {
        /* No special treatment: failures are not fatal, and the entry is skipped inside containers. */
        MNT_NONE         = 0,

        /* A failure to set up this mount point is logged as an error and reported to the caller. */
        MNT_FATAL        = 1 << 0,

        /* The mount point is set up even if we detect that we are running in a container. */
        MNT_IN_CONTAINER = 1 << 1,
};

typedef enum MountMode MountMode;
53
54 typedef struct MountPoint {
55 const char *what;
56 const char *where;
57 const char *type;
58 const char *options;
59 unsigned long flags;
60 bool (*condition_fn)(void);
61 MountMode mode;
62 } MountPoint;
63
/* Number of leading mount_table entries that mount_setup_early() sets up. The first three
 * entries may be needed before SELinux is up, and the fourth one (securityfs) is needed by IMA
 * to load a custom policy. Everything after that can wait until SELinux and IMA are loaded.
 * If SMACK support is compiled in, smackfs is needed early as well, which makes it five. */
#if !ENABLE_SMACK
#define N_EARLY_MOUNT 4
#else
#define N_EARLY_MOUNT 5
#endif
73
74 static const MountPoint mount_table[] = {
75 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
76 NULL, MNT_FATAL|MNT_IN_CONTAINER },
77 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
78 NULL, MNT_FATAL|MNT_IN_CONTAINER },
79 { "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
80 NULL, MNT_FATAL|MNT_IN_CONTAINER },
81 { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
82 NULL, MNT_NONE },
83 #if ENABLE_SMACK
84 { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
85 mac_smack_use, MNT_FATAL },
86 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
87 mac_smack_use, MNT_FATAL },
88 #endif
89 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
90 NULL, MNT_FATAL|MNT_IN_CONTAINER },
91 { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
92 NULL, MNT_IN_CONTAINER },
93 #if ENABLE_SMACK
94 { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
95 mac_smack_use, MNT_FATAL },
96 #endif
97 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
98 NULL, MNT_FATAL|MNT_IN_CONTAINER },
99 { "cgroup", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
100 cg_is_unified_wanted, MNT_IN_CONTAINER },
101 { "cgroup", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
102 cg_is_unified_wanted, MNT_IN_CONTAINER },
103 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
104 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
105 { "cgroup", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
106 cg_is_hybrid_wanted, MNT_IN_CONTAINER },
107 { "cgroup", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
108 cg_is_hybrid_wanted, MNT_IN_CONTAINER },
109 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
110 cg_is_legacy_wanted, MNT_IN_CONTAINER },
111 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
112 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
113 { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
114 NULL, MNT_NONE },
115 #if ENABLE_EFI
116 { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
117 is_efi_boot, MNT_NONE },
118 #endif
119 };
120
/* API file systems and bind mounts that other software might set up. We only list them here so
 * that we know to ignore them. The list is a sequence of NUL-terminated strings, ended by an
 * empty string (which the implicit terminator of the literal provides). */
static const char ignore_paths[] =
        "/sys/fs/selinux\0"     /* SELinux file system */
        "/proc/sys\0"           /* container bind mount */
        "/dev/console\0"        /* container bind mount */
        "/proc/kmsg\0";         /* container bind mount */
131
132 bool mount_point_is_api(const char *path) {
133 unsigned i;
134
135 /* Checks if this mount point is considered "API", and hence
136 * should be ignored */
137
138 for (i = 0; i < ELEMENTSOF(mount_table); i ++)
139 if (path_equal(path, mount_table[i].where))
140 return true;
141
142 return path_startswith(path, "/sys/fs/cgroup/");
143 }
144
145 bool mount_point_ignore(const char *path) {
146 const char *i;
147
148 NULSTR_FOREACH(i, ignore_paths)
149 if (path_equal(path, i))
150 return true;
151
152 return false;
153 }
154
155 static int mount_one(const MountPoint *p, bool relabel) {
156 int r;
157
158 assert(p);
159
160 if (p->condition_fn && !p->condition_fn())
161 return 0;
162
163 /* Relabel first, just in case */
164 if (relabel)
165 (void) label_fix(p->where, true, true);
166
167 r = path_is_mount_point(p->where, NULL, AT_SYMLINK_FOLLOW);
168 if (r < 0 && r != -ENOENT) {
169 log_full_errno((p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG, r, "Failed to determine whether %s is a mount point: %m", p->where);
170 return (p->mode & MNT_FATAL) ? r : 0;
171 }
172 if (r > 0)
173 return 0;
174
175 /* Skip securityfs in a container */
176 if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0)
177 return 0;
178
179 /* The access mode here doesn't really matter too much, since
180 * the mounted file system will take precedence anyway. */
181 if (relabel)
182 (void) mkdir_p_label(p->where, 0755);
183 else
184 (void) mkdir_p(p->where, 0755);
185
186 log_debug("Mounting %s to %s of type %s with options %s.",
187 p->what,
188 p->where,
189 p->type,
190 strna(p->options));
191
192 if (mount(p->what,
193 p->where,
194 p->type,
195 p->flags,
196 p->options) < 0) {
197 log_full_errno((p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG, errno, "Failed to mount %s at %s: %m", p->type, p->where);
198 return (p->mode & MNT_FATAL) ? -errno : 0;
199 }
200
201 /* Relabel again, since we now mounted something fresh here */
202 if (relabel)
203 (void) label_fix(p->where, false, false);
204
205 return 1;
206 }
207
208 static int mount_points_setup(unsigned n, bool loaded_policy) {
209 unsigned i;
210 int r = 0;
211
212 for (i = 0; i < n; i ++) {
213 int j;
214
215 j = mount_one(mount_table + i, loaded_policy);
216 if (j != 0 && r >= 0)
217 r = j;
218 }
219
220 return r;
221 }
222
223 int mount_setup_early(void) {
224 assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
225
226 /* Do a minimal mount of /proc and friends to enable the most
227 * basic stuff, such as SELinux */
228 return mount_points_setup(N_EARLY_MOUNT, false);
229 }
230
231 int mount_cgroup_controllers(char ***join_controllers) {
232 _cleanup_set_free_free_ Set *controllers = NULL;
233 int r;
234
235 if (!cg_is_legacy_wanted())
236 return 0;
237
238 /* Mount all available cgroup controllers that are built into the kernel. */
239
240 controllers = set_new(&string_hash_ops);
241 if (!controllers)
242 return log_oom();
243
244 r = cg_kernel_controllers(controllers);
245 if (r < 0)
246 return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
247
248 for (;;) {
249 _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
250 MountPoint p = {
251 .what = "cgroup",
252 .type = "cgroup",
253 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
254 .mode = MNT_IN_CONTAINER,
255 };
256 char ***k = NULL;
257
258 controller = set_steal_first(controllers);
259 if (!controller)
260 break;
261
262 if (join_controllers)
263 for (k = join_controllers; *k; k++)
264 if (strv_find(*k, controller))
265 break;
266
267 if (k && *k) {
268 char **i, **j;
269
270 for (i = *k, j = *k; *i; i++) {
271
272 if (!streq(*i, controller)) {
273 _cleanup_free_ char *t;
274
275 t = set_remove(controllers, *i);
276 if (!t) {
277 free(*i);
278 continue;
279 }
280 }
281
282 *(j++) = *i;
283 }
284
285 *j = NULL;
286
287 options = strv_join(*k, ",");
288 if (!options)
289 return log_oom();
290 } else {
291 options = controller;
292 controller = NULL;
293 }
294
295 where = strappend("/sys/fs/cgroup/", options);
296 if (!where)
297 return log_oom();
298
299 p.where = where;
300 p.options = options;
301
302 r = mount_one(&p, true);
303 if (r < 0)
304 return r;
305
306 if (r > 0 && k && *k) {
307 char **i;
308
309 for (i = *k; *i; i++) {
310 _cleanup_free_ char *t = NULL;
311
312 t = strappend("/sys/fs/cgroup/", *i);
313 if (!t)
314 return log_oom();
315
316 r = symlink(options, t);
317 if (r >= 0) {
318 #ifdef SMACK_RUN_LABEL
319 _cleanup_free_ char *src;
320 src = strappend("/sys/fs/cgroup/", options);
321 if (!src)
322 return log_oom();
323 r = mac_smack_copy(t, src);
324 if (r < 0 && r != -EOPNOTSUPP)
325 return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", src, t);
326 #endif
327 } else if (errno != EEXIST)
328 return log_error_errno(errno, "Failed to create symlink %s: %m", t);
329 }
330 }
331 }
332
333 /* Now that we mounted everything, let's make the tmpfs the
334 * cgroup file systems are mounted into read-only. */
335 (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
336
337 return 0;
338 }
339
#if HAVE_SELINUX || ENABLE_SMACK
/* nftw() callback used by mount_setup() to relabel the contents of a tree.
 *
 * fpath:  path of the entry currently visited.
 * sb:     stat data of the entry (unused).
 * tflag:  type of the entry, as reported by nftw().
 * ftwbuf: walk state; only the nesting level is looked at.
 *
 * Returns FTW_SKIP_SUBTREE for the /run/initramfs directory, FTW_CONTINUE for everything else.
 * Relabelling is best-effort: a failure of label_fix() does not stop the walk. */
static int nftw_cb(
                const char *fpath,
                const struct stat *sb,
                int tflag,
                struct FTW *ftwbuf) {

        /* No need to label the top-level directory of the walk (e.g. /dev) twice in a row... */
        if (_unlikely_(ftwbuf->level == 0))
                return FTW_CONTINUE;

        (void) label_fix(fpath, false, false);

        /* /run/initramfs is static data and big, no need to
         * dynamically relabel its contents at boot... */
        if (_unlikely_(ftwbuf->level == 1 &&
                       tflag == FTW_D &&
                       streq(fpath, "/run/initramfs")))
                return FTW_SKIP_SUBTREE;

        return FTW_CONTINUE;
}
#endif
363
364 int mount_setup(bool loaded_policy) {
365 int r = 0;
366
367 r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy);
368 if (r < 0)
369 return r;
370
371 #if HAVE_SELINUX || ENABLE_SMACK
372 /* Nodes in devtmpfs and /run need to be manually updated for
373 * the appropriate labels, after mounting. The other virtual
374 * API file systems like /sys and /proc do not need that, they
375 * use the same label for all their files. */
376 if (loaded_policy) {
377 usec_t before_relabel, after_relabel;
378 char timespan[FORMAT_TIMESPAN_MAX];
379
380 before_relabel = now(CLOCK_MONOTONIC);
381
382 nftw("/dev", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
383 nftw("/dev/shm", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
384 nftw("/run", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
385
386 after_relabel = now(CLOCK_MONOTONIC);
387
388 log_info("Relabelled /dev and /run in %s.",
389 format_timespan(timespan, sizeof(timespan), after_relabel - before_relabel, 0));
390 }
391 #endif
392
393 /* Create a few default symlinks, which are normally created
394 * by udevd, but some scripts might need them before we start
395 * udevd. */
396 dev_setup(NULL, UID_INVALID, GID_INVALID);
397
398 /* Mark the root directory as shared in regards to mount propagation. The kernel defaults to "private", but we
399 * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of
400 * the box. If specific setups need other settings they can reset the propagation mode to private if
401 * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a
402 * container manager we assume the container manager knows what it is doing (for example, because it set up
403 * some directories with different propagation modes). */
404 if (detect_container() <= 0)
405 if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
406 log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m");
407
408 /* Create a few directories we always want around, Note that sd_booted() checks for /run/systemd/system, so
409 * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will
410 * misdetect systemd. */
411 (void) mkdir_label("/run/systemd", 0755);
412 (void) mkdir_label("/run/systemd/system", 0755);
413
414 /* Set up inaccessible items */
415 (void) mkdir_label("/run/systemd/inaccessible", 0000);
416 (void) mknod("/run/systemd/inaccessible/reg", S_IFREG | 0000, 0);
417 (void) mkdir_label("/run/systemd/inaccessible/dir", 0000);
418 (void) mknod("/run/systemd/inaccessible/chr", S_IFCHR | 0000, makedev(0, 0));
419 (void) mknod("/run/systemd/inaccessible/blk", S_IFBLK | 0000, makedev(0, 0));
420 (void) mkfifo("/run/systemd/inaccessible/fifo", 0000);
421 (void) mknod("/run/systemd/inaccessible/sock", S_IFSOCK | 0000, 0);
422
423 return 0;
424 }