]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/mount-setup.c
systemd: fall back to mounting /sys/fs/cgroup sans xattr
[thirdparty/systemd.git] / src / core / mount-setup.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mount.h>
23 #include <errno.h>
24 #include <sys/stat.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <libgen.h>
28 #include <assert.h>
29 #include <unistd.h>
30 #include <ftw.h>
31
32 #include "mount-setup.h"
33 #include "dev-setup.h"
34 #include "log.h"
35 #include "macro.h"
36 #include "util.h"
37 #include "label.h"
38 #include "set.h"
39 #include "strv.h"
40 #include "mkdir.h"
41 #include "path-util.h"
42 #include "missing.h"
43 #include "virt.h"
44 #include "efivars.h"
45
46 #ifndef TTY_GID
47 #define TTY_GID 5
48 #endif
49
/* Behaviour flags attached to each entry of the mount table. */
typedef enum MountMode {
        MNT_NONE         = 0,
        MNT_FATAL        = 1 << 0, /* a failed mount is logged as an error and reported to the caller */
        MNT_IN_CONTAINER = 1 << 1, /* mount this entry also when running inside a container */
} MountMode;
55
56 typedef struct MountPoint {
57 const char *what;
58 const char *where;
59 const char *type;
60 const char *options;
61 unsigned long flags;
62 bool (*condition_fn)(void);
63 MountMode mode;
64 } MountPoint;
65
/* The first three entries we might need before SELinux is up. The
 * fourth (securityfs) is needed by IMA to load a custom policy, and
 * the fifth (smackfs) is mounted early along with it. The other ones
 * we can delay until SELinux and IMA are loaded. */
69 #define N_EARLY_MOUNT 5
70
71 static const MountPoint mount_table[] = {
72 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
73 NULL, MNT_FATAL|MNT_IN_CONTAINER },
74 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
75 NULL, MNT_FATAL|MNT_IN_CONTAINER },
76 { "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
77 NULL, MNT_FATAL|MNT_IN_CONTAINER },
78 { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
79 NULL, MNT_NONE },
80 { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
81 NULL, MNT_NONE },
82 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
83 NULL, MNT_FATAL|MNT_IN_CONTAINER },
84 { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
85 NULL, MNT_IN_CONTAINER },
86 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
87 NULL, MNT_FATAL|MNT_IN_CONTAINER },
88 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
89 NULL, MNT_IN_CONTAINER },
90 #ifdef HAVE_XATTR
91 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
92 NULL, MNT_IN_CONTAINER },
93 #endif
94 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
95 NULL, MNT_IN_CONTAINER },
96 { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
97 NULL, MNT_NONE },
98 #ifdef ENABLE_EFI
99 { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
100 is_efi_boot, MNT_NONE },
101 #endif
102 };
103
/* API file systems and bind mounts that other software might set up.
 * We only list them here so that we know we should ignore them. The
 * entries are NUL-separated; the list ends at the first empty string. */

static const char ignore_paths[] =
        "/sys/fs/selinux\0"     /* SELinux file system */
        "/selinux\0"            /* SELinux file system */
        "/dev/cgroup\0"         /* legacy cgroup mount point */
        "/cgroup\0"             /* legacy cgroup mount point */
        "/proc/bus/usb\0"       /* legacy kernel file system */
        "/proc/sys\0"           /* container bind mount */
        "/dev/console\0"        /* container bind mount */
        "/proc/kmsg\0";         /* container bind mount */
120
121 bool mount_point_is_api(const char *path) {
122 unsigned i;
123
124 /* Checks if this mount point is considered "API", and hence
125 * should be ignored */
126
127 for (i = 0; i < ELEMENTSOF(mount_table); i ++)
128 if (path_equal(path, mount_table[i].where))
129 return true;
130
131 return path_startswith(path, "/sys/fs/cgroup/");
132 }
133
134 bool mount_point_ignore(const char *path) {
135 const char *i;
136
137 NULSTR_FOREACH(i, ignore_paths)
138 if (path_equal(path, i))
139 return true;
140
141 return false;
142 }
143
144 static int mount_one(const MountPoint *p, bool relabel) {
145 int r;
146
147 assert(p);
148
149 if (p->condition_fn && !p->condition_fn())
150 return 0;
151
152 /* Relabel first, just in case */
153 if (relabel)
154 label_fix(p->where, true, true);
155
156 r = path_is_mount_point(p->where, true);
157 if (r < 0)
158 return r;
159
160 if (r > 0)
161 return 0;
162
163 /* Skip securityfs in a container */
164 if (!(p->mode & MNT_IN_CONTAINER) && detect_container(NULL) > 0)
165 return 0;
166
167 /* The access mode here doesn't really matter too much, since
168 * the mounted file system will take precedence anyway. */
169 mkdir_p_label(p->where, 0755);
170
171 log_debug("Mounting %s to %s of type %s with options %s.",
172 p->what,
173 p->where,
174 p->type,
175 strna(p->options));
176
177 if (mount(p->what,
178 p->where,
179 p->type,
180 p->flags,
181 p->options) < 0) {
182 log_full((p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG, "Failed to mount %s: %s", p->where, strerror(errno));
183 return (p->mode & MNT_FATAL) ? -errno : 0;
184 }
185
186 /* Relabel again, since we now mounted something fresh here */
187 if (relabel)
188 label_fix(p->where, false, false);
189
190 return 1;
191 }
192
193 int mount_setup_early(void) {
194 unsigned i;
195 int r = 0;
196
197 assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
198
199 /* Do a minimal mount of /proc and friends to enable the most
200 * basic stuff, such as SELinux */
201 for (i = 0; i < N_EARLY_MOUNT; i ++) {
202 int j;
203
204 j = mount_one(mount_table + i, false);
205 if (r == 0)
206 r = j;
207 }
208
209 return r;
210 }
211
212 int mount_cgroup_controllers(char ***join_controllers) {
213 int r;
214 FILE *f;
215 char buf[LINE_MAX];
216 Set *controllers;
217
218 /* Mount all available cgroup controllers that are built into the kernel. */
219
220 f = fopen("/proc/cgroups", "re");
221 if (!f) {
222 log_error("Failed to enumerate cgroup controllers: %m");
223 return 0;
224 }
225
226 controllers = set_new(string_hash_func, string_compare_func);
227 if (!controllers) {
228 r = log_oom();
229 goto finish;
230 }
231
232 /* Ignore the header line */
233 (void) fgets(buf, sizeof(buf), f);
234
235 for (;;) {
236 char *controller;
237 int enabled = 0;
238
239 if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
240
241 if (feof(f))
242 break;
243
244 log_error("Failed to parse /proc/cgroups.");
245 r = -EIO;
246 goto finish;
247 }
248
249 if (!enabled) {
250 free(controller);
251 continue;
252 }
253
254 r = set_put(controllers, controller);
255 if (r < 0) {
256 log_error("Failed to add controller to set.");
257 free(controller);
258 goto finish;
259 }
260 }
261
262 for (;;) {
263 MountPoint p;
264 char *controller, *where, *options;
265 char ***k = NULL;
266
267 controller = set_steal_first(controllers);
268 if (!controller)
269 break;
270
271 if (join_controllers)
272 for (k = join_controllers; *k; k++)
273 if (strv_find(*k, controller))
274 break;
275
276 if (k && *k) {
277 char **i, **j;
278
279 for (i = *k, j = *k; *i; i++) {
280
281 if (!streq(*i, controller)) {
282 char *t;
283
284 t = set_remove(controllers, *i);
285 if (!t) {
286 free(*i);
287 continue;
288 }
289 free(t);
290 }
291
292 *(j++) = *i;
293 }
294
295 *j = NULL;
296
297 options = strv_join(*k, ",");
298 if (!options) {
299 free(controller);
300 r = log_oom();
301 goto finish;
302 }
303
304 } else {
305 options = controller;
306 controller = NULL;
307 }
308
309 where = strappend("/sys/fs/cgroup/", options);
310 if (!where) {
311 free(options);
312 r = log_oom();
313 goto finish;
314 }
315
316 zero(p);
317 p.what = "cgroup";
318 p.where = where;
319 p.type = "cgroup";
320 p.options = options;
321 p.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
322 p.mode = MNT_IN_CONTAINER;
323
324 r = mount_one(&p, true);
325 free(controller);
326 free(where);
327
328 if (r < 0) {
329 free(options);
330 goto finish;
331 }
332
333 if (r > 0 && k && *k) {
334 char **i;
335
336 for (i = *k; *i; i++) {
337 char *t;
338
339 t = strappend("/sys/fs/cgroup/", *i);
340 if (!t) {
341 r = log_oom();
342 free(options);
343 goto finish;
344 }
345
346 r = symlink(options, t);
347 free(t);
348
349 if (r < 0 && errno != EEXIST) {
350 log_error("Failed to create symlink: %m");
351 r = -errno;
352 free(options);
353 goto finish;
354 }
355 }
356 }
357
358 free(options);
359 }
360
361 r = 0;
362
363 finish:
364 set_free_free(controllers);
365
366 fclose(f);
367
368 return r;
369 }
370
371 static int nftw_cb(
372 const char *fpath,
373 const struct stat *sb,
374 int tflag,
375 struct FTW *ftwbuf) {
376
377 /* No need to label /dev twice in a row... */
378 if (_unlikely_(ftwbuf->level == 0))
379 return FTW_CONTINUE;
380
381 label_fix(fpath, false, false);
382
383 /* /run/initramfs is static data and big, no need to
384 * dynamically relabel its contents at boot... */
385 if (_unlikely_(ftwbuf->level == 1 &&
386 tflag == FTW_D &&
387 streq(fpath, "/run/initramfs")))
388 return FTW_SKIP_SUBTREE;
389
390 return FTW_CONTINUE;
391 };
392
393 int mount_setup(bool loaded_policy) {
394
395 static const char relabel[] =
396 "/run/initramfs/root-fsck\0"
397 "/run/initramfs/shutdown\0";
398
399 int r;
400 unsigned i;
401 const char *j;
402
403 for (i = 0; i < ELEMENTSOF(mount_table); i ++) {
404 r = mount_one(mount_table + i, true);
405
406 if (r < 0)
407 return r;
408 }
409
410 /* Nodes in devtmpfs and /run need to be manually updated for
411 * the appropriate labels, after mounting. The other virtual
412 * API file systems like /sys and /proc do not need that, they
413 * use the same label for all their files. */
414 if (loaded_policy) {
415 usec_t before_relabel, after_relabel;
416 char timespan[FORMAT_TIMESPAN_MAX];
417
418 before_relabel = now(CLOCK_MONOTONIC);
419
420 nftw("/dev", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
421 nftw("/run", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
422
423 /* Explicitly relabel these */
424 NULSTR_FOREACH(j, relabel)
425 label_fix(j, true, false);
426
427 after_relabel = now(CLOCK_MONOTONIC);
428
429 log_info("Relabelled /dev and /run in %s.",
430 format_timespan(timespan, sizeof(timespan), after_relabel - before_relabel, 0));
431 }
432
433 /* Create a few default symlinks, which are normally created
434 * by udevd, but some scripts might need them before we start
435 * udevd. */
436 dev_setup(NULL);
437
438 /* Mark the root directory as shared in regards to mount
439 * propagation. The kernel defaults to "private", but we think
440 * it makes more sense to have a default of "shared" so that
441 * nspawn and the container tools work out of the box. If
442 * specific setups need other settings they can reset the
443 * propagation mode to private if needed. */
444 if (detect_container(NULL) <= 0)
445 if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
446 log_warning("Failed to set up the root directory for shared mount propagation: %m");
447
448 /* Create a few directories we always want around, Note that
449 * sd_booted() checks for /run/systemd/system, so this mkdir
450 * really needs to stay for good, otherwise software that
451 * copied sd-daemon.c into their sources will misdetect
452 * systemd. */
453 mkdir_label("/run/systemd", 0755);
454 mkdir_label("/run/systemd/system", 0755);
455 mkdir_label("/run/systemd/inaccessible", 0000);
456
457 return 0;
458 }