]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/mount-setup.c
systemd,nspawn: use extended attributes to store metadata
[thirdparty/systemd.git] / src / core / mount-setup.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mount.h>
23 #include <errno.h>
24 #include <sys/stat.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <libgen.h>
28 #include <assert.h>
29 #include <unistd.h>
30 #include <ftw.h>
31
32 #include "mount-setup.h"
33 #include "dev-setup.h"
34 #include "log.h"
35 #include "macro.h"
36 #include "util.h"
37 #include "label.h"
38 #include "set.h"
39 #include "strv.h"
40 #include "mkdir.h"
41 #include "path-util.h"
42 #include "missing.h"
43 #include "virt.h"
44 #include "efivars.h"
45
/* Fallback group id spliced into the "gid=" option of the devpts entry
 * in mount_table; only used when the build did not define TTY_GID. */
#if !defined(TTY_GID)
#  define TTY_GID 5
#endif
49
/* Per-entry behaviour flags for MountPoint.mode; may be OR-ed together. */
typedef enum MountMode {
        /* No special handling */
        MNT_NONE         = 0,

        /* A failing mount() is logged at error level and reported to
         * the caller; without it the failure is only logged at debug
         * level and ignored. */
        MNT_FATAL        = 1 << 0,

        /* Entry is also mounted when running inside a container;
         * entries lacking this flag are skipped there. */
        MNT_IN_CONTAINER = 1 << 1,
} MountMode;
55
56 typedef struct MountPoint {
57 const char *what;
58 const char *where;
59 const char *type;
60 const char *options;
61 unsigned long flags;
62 bool (*condition_fn)(void);
63 MountMode mode;
64 } MountPoint;
65
/* Number of leading mount_table entries that mount_setup_early()
 * mounts. The first three entries (/proc, /sys, /dev) we might need
 * before SELinux is up. The fourth (securityfs) is needed by IMA to
 * load a custom policy. The fifth (smackfs) is part of the early set
 * as well (presumably so that SMACK can be initialized early, too --
 * TODO confirm). The other ones we can delay until SELinux and IMA
 * are loaded. */
#define N_EARLY_MOUNT 5
70
/* Suffix appended to the option string of the name=systemd cgroup
 * mount: requests xattr support when built with HAVE_XATTR, empty
 * otherwise. */
#if defined(HAVE_XATTR)
#define FS_XATTR_OPT ",xattr"
#else
#define FS_XATTR_OPT ""
#endif
76
77 static const MountPoint mount_table[] = {
78 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
79 NULL, MNT_FATAL|MNT_IN_CONTAINER },
80 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
81 NULL, MNT_FATAL|MNT_IN_CONTAINER },
82 { "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
83 NULL, MNT_FATAL|MNT_IN_CONTAINER },
84 { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
85 NULL, MNT_NONE },
86 { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
87 NULL, MNT_NONE },
88 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
89 NULL, MNT_FATAL|MNT_IN_CONTAINER },
90 { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
91 NULL, MNT_IN_CONTAINER },
92 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
93 NULL, MNT_FATAL|MNT_IN_CONTAINER },
94 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
95 NULL, MNT_IN_CONTAINER },
96 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd" FS_XATTR_OPT, MS_NOSUID|MS_NOEXEC|MS_NODEV,
97 NULL, MNT_IN_CONTAINER },
98 { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
99 NULL, MNT_NONE },
100 #ifdef ENABLE_EFI
101 { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
102 is_efi_boot, MNT_NONE },
103 #endif
104 };
105
/* API file systems that other software might mount. We do not mount
 * them ourselves; they are listed only so that we know to ignore
 * them. Stored as a sequence of NUL-terminated strings, ended by an
 * empty string. */
static const char ignore_paths[] =
        /* SELinux */
        "/sys/fs/selinux\0"
        "/selinux\0"

        /* Legacy locations of the cgroup hierarchy */
        "/dev/cgroup\0"
        "/cgroup\0"

        /* Legacy kernel file system */
        "/proc/bus/usb\0"

        /* Bind mounts set up for containers */
        "/proc/sys\0"
        "/dev/console\0"
        "/proc/kmsg\0";
122
123 bool mount_point_is_api(const char *path) {
124 unsigned i;
125
126 /* Checks if this mount point is considered "API", and hence
127 * should be ignored */
128
129 for (i = 0; i < ELEMENTSOF(mount_table); i ++)
130 if (path_equal(path, mount_table[i].where))
131 return true;
132
133 return path_startswith(path, "/sys/fs/cgroup/");
134 }
135
136 bool mount_point_ignore(const char *path) {
137 const char *i;
138
139 NULSTR_FOREACH(i, ignore_paths)
140 if (path_equal(path, i))
141 return true;
142
143 return false;
144 }
145
146 static int mount_one(const MountPoint *p, bool relabel) {
147 int r;
148
149 assert(p);
150
151 if (p->condition_fn && !p->condition_fn())
152 return 0;
153
154 /* Relabel first, just in case */
155 if (relabel)
156 label_fix(p->where, true, true);
157
158 r = path_is_mount_point(p->where, true);
159 if (r < 0)
160 return r;
161
162 if (r > 0)
163 return 0;
164
165 /* Skip securityfs in a container */
166 if (!(p->mode & MNT_IN_CONTAINER) && detect_container(NULL) > 0)
167 return 0;
168
169 /* The access mode here doesn't really matter too much, since
170 * the mounted file system will take precedence anyway. */
171 mkdir_p_label(p->where, 0755);
172
173 log_debug("Mounting %s to %s of type %s with options %s.",
174 p->what,
175 p->where,
176 p->type,
177 strna(p->options));
178
179 if (mount(p->what,
180 p->where,
181 p->type,
182 p->flags,
183 p->options) < 0) {
184 log_full((p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG, "Failed to mount %s: %s", p->where, strerror(errno));
185 return (p->mode & MNT_FATAL) ? -errno : 0;
186 }
187
188 /* Relabel again, since we now mounted something fresh here */
189 if (relabel)
190 label_fix(p->where, false, false);
191
192 return 1;
193 }
194
195 int mount_setup_early(void) {
196 unsigned i;
197 int r = 0;
198
199 assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
200
201 /* Do a minimal mount of /proc and friends to enable the most
202 * basic stuff, such as SELinux */
203 for (i = 0; i < N_EARLY_MOUNT; i ++) {
204 int j;
205
206 j = mount_one(mount_table + i, false);
207 if (r == 0)
208 r = j;
209 }
210
211 return r;
212 }
213
214 int mount_cgroup_controllers(char ***join_controllers) {
215 int r;
216 FILE *f;
217 char buf[LINE_MAX];
218 Set *controllers;
219
220 /* Mount all available cgroup controllers that are built into the kernel. */
221
222 f = fopen("/proc/cgroups", "re");
223 if (!f) {
224 log_error("Failed to enumerate cgroup controllers: %m");
225 return 0;
226 }
227
228 controllers = set_new(string_hash_func, string_compare_func);
229 if (!controllers) {
230 r = log_oom();
231 goto finish;
232 }
233
234 /* Ignore the header line */
235 (void) fgets(buf, sizeof(buf), f);
236
237 for (;;) {
238 char *controller;
239 int enabled = 0;
240
241 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
242
243 if (feof(f))
244 break;
245
246 log_error("Failed to parse /proc/cgroups.");
247 r = -EIO;
248 goto finish;
249 }
250
251 if (!enabled) {
252 free(controller);
253 continue;
254 }
255
256 r = set_put(controllers, controller);
257 if (r < 0) {
258 log_error("Failed to add controller to set.");
259 free(controller);
260 goto finish;
261 }
262 }
263
264 for (;;) {
265 MountPoint p;
266 char *controller, *where, *options;
267 char ***k = NULL;
268
269 controller = set_steal_first(controllers);
270 if (!controller)
271 break;
272
273 if (join_controllers)
274 for (k = join_controllers; *k; k++)
275 if (strv_find(*k, controller))
276 break;
277
278 if (k && *k) {
279 char **i, **j;
280
281 for (i = *k, j = *k; *i; i++) {
282
283 if (!streq(*i, controller)) {
284 char *t;
285
286 t = set_remove(controllers, *i);
287 if (!t) {
288 free(*i);
289 continue;
290 }
291 free(t);
292 }
293
294 *(j++) = *i;
295 }
296
297 *j = NULL;
298
299 options = strv_join(*k, ",");
300 if (!options) {
301 free(controller);
302 r = log_oom();
303 goto finish;
304 }
305
306 } else {
307 options = controller;
308 controller = NULL;
309 }
310
311 where = strappend("/sys/fs/cgroup/", options);
312 if (!where) {
313 free(options);
314 r = log_oom();
315 goto finish;
316 }
317
318 zero(p);
319 p.what = "cgroup";
320 p.where = where;
321 p.type = "cgroup";
322 p.options = options;
323 p.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
324 p.mode = MNT_IN_CONTAINER;
325
326 r = mount_one(&p, true);
327 free(controller);
328 free(where);
329
330 if (r < 0) {
331 free(options);
332 goto finish;
333 }
334
335 if (r > 0 && k && *k) {
336 char **i;
337
338 for (i = *k; *i; i++) {
339 char *t;
340
341 t = strappend("/sys/fs/cgroup/", *i);
342 if (!t) {
343 r = log_oom();
344 free(options);
345 goto finish;
346 }
347
348 r = symlink(options, t);
349 free(t);
350
351 if (r < 0 && errno != EEXIST) {
352 log_error("Failed to create symlink: %m");
353 r = -errno;
354 free(options);
355 goto finish;
356 }
357 }
358 }
359
360 free(options);
361 }
362
363 r = 0;
364
365 finish:
366 set_free_free(controllers);
367
368 fclose(f);
369
370 return r;
371 }
372
373 static int nftw_cb(
374 const char *fpath,
375 const struct stat *sb,
376 int tflag,
377 struct FTW *ftwbuf) {
378
379 /* No need to label /dev twice in a row... */
380 if (_unlikely_(ftwbuf->level == 0))
381 return FTW_CONTINUE;
382
383 label_fix(fpath, false, false);
384
385 /* /run/initramfs is static data and big, no need to
386 * dynamically relabel its contents at boot... */
387 if (_unlikely_(ftwbuf->level == 1 &&
388 tflag == FTW_D &&
389 streq(fpath, "/run/initramfs")))
390 return FTW_SKIP_SUBTREE;
391
392 return FTW_CONTINUE;
393 };
394
395 int mount_setup(bool loaded_policy) {
396
397 static const char relabel[] =
398 "/run/initramfs/root-fsck\0"
399 "/run/initramfs/shutdown\0";
400
401 int r;
402 unsigned i;
403 const char *j;
404
405 for (i = 0; i < ELEMENTSOF(mount_table); i ++) {
406 r = mount_one(mount_table + i, true);
407
408 if (r < 0)
409 return r;
410 }
411
412 /* Nodes in devtmpfs and /run need to be manually updated for
413 * the appropriate labels, after mounting. The other virtual
414 * API file systems like /sys and /proc do not need that, they
415 * use the same label for all their files. */
416 if (loaded_policy) {
417 usec_t before_relabel, after_relabel;
418 char timespan[FORMAT_TIMESPAN_MAX];
419
420 before_relabel = now(CLOCK_MONOTONIC);
421
422 nftw("/dev", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
423 nftw("/run", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
424
425 /* Explicitly relabel these */
426 NULSTR_FOREACH(j, relabel)
427 label_fix(j, true, false);
428
429 after_relabel = now(CLOCK_MONOTONIC);
430
431 log_info("Relabelled /dev and /run in %s.",
432 format_timespan(timespan, sizeof(timespan), after_relabel - before_relabel, 0));
433 }
434
435 /* Create a few default symlinks, which are normally created
436 * by udevd, but some scripts might need them before we start
437 * udevd. */
438 dev_setup(NULL);
439
440 /* Mark the root directory as shared in regards to mount
441 * propagation. The kernel defaults to "private", but we think
442 * it makes more sense to have a default of "shared" so that
443 * nspawn and the container tools work out of the box. If
444 * specific setups need other settings they can reset the
445 * propagation mode to private if needed. */
446 if (detect_container(NULL) <= 0)
447 if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
448 log_warning("Failed to set up the root directory for shared mount propagation: %m");
449
450 /* Create a few directories we always want around, Note that
451 * sd_booted() checks for /run/systemd/system, so this mkdir
452 * really needs to stay for good, otherwise software that
453 * copied sd-daemon.c into their sources will misdetect
454 * systemd. */
455 mkdir_label("/run/systemd", 0755);
456 mkdir_label("/run/systemd/system", 0755);
457 mkdir_label("/run/systemd/inaccessible", 0000);
458
459 return 0;
460 }