]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/mount-setup.c
core: unified cgroup hierarchy support
[thirdparty/systemd.git] / src / core / mount-setup.c
CommitLineData
d6c9574f 1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
8e274523
LP
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
8e274523
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
8e274523 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
8e274523
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mount.h>
23#include <errno.h>
8e274523 24#include <stdlib.h>
5c0532d1 25#include <unistd.h>
1829dc9d 26#include <ftw.h>
8e274523
LP
27
28#include "mount-setup.h"
5ba2dc25 29#include "dev-setup.h"
64824462 30#include "bus-util.h"
8e274523 31#include "log.h"
c9af1080
LP
32#include "macro.h"
33#include "util.h"
5275d3c1 34#include "label.h"
0c85a4f3
LP
35#include "set.h"
36#include "strv.h"
49e942b2 37#include "mkdir.h"
9eb977db 38#include "path-util.h"
48ac500b 39#include "missing.h"
c481f78b 40#include "virt.h"
34e5a31e 41#include "efivars.h"
8552b176 42#include "smack-util.h"
b12afc8c 43#include "cgroup-util.h"
bef2733f 44
6aa220e0
KS
/* Per-entry behaviour bits, evaluated by mount_one():
 *
 *   MNT_FATAL        - a failed mount is logged at error level and the
 *                      error is returned to the caller; without it the
 *                      failure is only logged at debug level.
 *   MNT_IN_CONTAINER - the entry is also mounted when we run inside a
 *                      container; without it the entry is skipped there. */
typedef enum MountMode {
        MNT_NONE         = 0x0,
        MNT_FATAL        = 0x1,
        MNT_IN_CONTAINER = 0x2,
} MountMode;
50
ca714c0e
LP
51typedef struct MountPoint {
52 const char *what;
53 const char *where;
54 const char *type;
55 const char *options;
56 unsigned long flags;
6aa220e0
KS
57 bool (*condition_fn)(void);
58 MountMode mode;
ca714c0e
LP
59} MountPoint;
60
/* Number of leading mount_table[] entries that mount_setup_early()
 * mounts. The first three we might need before SELinux is up, the
 * fourth (securityfs) is needed by IMA to load a custom policy. When
 * SMACK is enabled we need smackfs, too, which makes it five. All
 * other entries can be delayed until SELinux and IMA are loaded. */
#ifndef HAVE_SMACK
#define N_EARLY_MOUNT 4
#else
#define N_EARLY_MOUNT 5
#endif
4ef31082 70
ca714c0e 71static const MountPoint mount_table[] = {
68d4c452
LP
72 { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
73 NULL, MNT_FATAL|MNT_IN_CONTAINER },
74 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
75 NULL, MNT_FATAL|MNT_IN_CONTAINER },
76 { "devtmpfs", "/dev", "devtmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
77 NULL, MNT_FATAL|MNT_IN_CONTAINER },
78 { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
79 NULL, MNT_NONE },
d407c940 80#ifdef HAVE_SMACK
68d4c452
LP
81 { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
82 mac_smack_use, MNT_FATAL },
83 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
84 mac_smack_use, MNT_FATAL },
d407c940 85#endif
68d4c452
LP
86 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
87 NULL, MNT_FATAL|MNT_IN_CONTAINER },
88 { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
89 NULL, MNT_IN_CONTAINER },
d407c940 90#ifdef HAVE_SMACK
68d4c452
LP
91 { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
92 mac_smack_use, MNT_FATAL },
d407c940 93#endif
68d4c452
LP
94 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
95 NULL, MNT_FATAL|MNT_IN_CONTAINER },
efdb0237
LP
96 { "cgroup", "/sys/fs/cgroup", "cgroup", "__DEVEL__sane_behavior", MS_NOSUID|MS_NOEXEC|MS_NODEV,
97 cg_is_unified_wanted, MNT_FATAL|MNT_IN_CONTAINER },
68d4c452 98 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
efdb0237 99 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
68d4c452 100 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
efdb0237 101 cg_is_legacy_wanted, MNT_IN_CONTAINER },
68d4c452 102 { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
efdb0237 103 cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
68d4c452
LP
104 { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
105 NULL, MNT_NONE },
c06bf414 106#ifdef ENABLE_EFI
68d4c452
LP
107 { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
108 is_efi_boot, MNT_NONE },
c06bf414 109#endif
63cc4c31 110 { "kdbusfs", "/sys/fs/kdbus", "kdbusfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
1f49dffc 111 is_kdbus_wanted, MNT_IN_CONTAINER },
63cc4c31
DM
112};
113
/* API file systems that might get mounted by other software. We only
 * list them here so that we know we should ignore them. The list is a
 * sequence of NUL-terminated paths and is iterated with
 * NULSTR_FOREACH() in mount_point_ignore(). */

static const char ignore_paths[] =
        /* SELinux file systems */
        "/sys/fs/selinux\0"

        /* Container bind mounts */
        "/proc/sys\0"
        "/dev/console\0"
        "/proc/kmsg\0";
949c6510 124
dad08730
LP
125bool mount_point_is_api(const char *path) {
126 unsigned i;
127
128 /* Checks if this mount point is considered "API", and hence
129 * should be ignored */
130
ca714c0e 131 for (i = 0; i < ELEMENTSOF(mount_table); i ++)
449ddb2d 132 if (path_equal(path, mount_table[i].where))
dad08730
LP
133 return true;
134
57f2a956
KS
135 return path_startswith(path, "/sys/fs/cgroup/");
136}
137
138bool mount_point_ignore(const char *path) {
eaeb18db 139 const char *i;
57f2a956 140
eaeb18db
LP
141 NULSTR_FOREACH(i, ignore_paths)
142 if (path_equal(path, i))
949c6510
LP
143 return true;
144
57f2a956 145 return false;
dad08730
LP
146}
147
4ef31082 148static int mount_one(const MountPoint *p, bool relabel) {
8e274523
LP
149 int r;
150
ca714c0e 151 assert(p);
8e274523 152
6aa220e0
KS
153 if (p->condition_fn && !p->condition_fn())
154 return 0;
155
51b4af2c 156 /* Relabel first, just in case */
4ef31082 157 if (relabel)
c9bc0764 158 label_fix(p->where, true, true);
51b4af2c 159
e26d6ce5 160 r = path_is_mount_point(p->where, AT_SYMLINK_FOLLOW);
64f75d7a 161 if (r < 0 && r != -ENOENT)
8e274523 162 return r;
8e274523 163 if (r > 0)
51b4af2c 164 return 0;
8e274523 165
c481f78b 166 /* Skip securityfs in a container */
6aa220e0 167 if (!(p->mode & MNT_IN_CONTAINER) && detect_container(NULL) > 0)
c481f78b
LP
168 return 0;
169
a04f58d6
LP
170 /* The access mode here doesn't really matter too much, since
171 * the mounted file system will take precedence anyway. */
c4bfd169
LP
172 if (relabel)
173 mkdir_p_label(p->where, 0755);
174 else
175 mkdir_p(p->where, 0755);
a04f58d6 176
8e274523 177 log_debug("Mounting %s to %s of type %s with options %s.",
ca714c0e
LP
178 p->what,
179 p->where,
180 p->type,
181 strna(p->options));
182
183 if (mount(p->what,
184 p->where,
185 p->type,
186 p->flags,
187 p->options) < 0) {
99a17ada 188 log_full((p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG, "Failed to mount %s at %s: %m", p->type, p->where);
6aa220e0 189 return (p->mode & MNT_FATAL) ? -errno : 0;
8e274523
LP
190 }
191
51b4af2c 192 /* Relabel again, since we now mounted something fresh here */
4ef31082 193 if (relabel)
c9bc0764 194 label_fix(p->where, false, false);
5275d3c1 195
0c85a4f3 196 return 1;
8e274523
LP
197}
198
4ef31082
LP
199int mount_setup_early(void) {
200 unsigned i;
201 int r = 0;
202
203 assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
204
205 /* Do a minimal mount of /proc and friends to enable the most
206 * basic stuff, such as SELinux */
207 for (i = 0; i < N_EARLY_MOUNT; i ++) {
208 int j;
209
210 j = mount_one(mount_table + i, false);
211 if (r == 0)
212 r = j;
213 }
214
215 return r;
216}
217
0c85a4f3 218int mount_cgroup_controllers(char ***join_controllers) {
a6b26d90 219 _cleanup_set_free_free_ Set *controllers = NULL;
a641dcd9 220 int r;
2076ca54 221
efdb0237
LP
222 if (!cg_is_legacy_wanted())
223 return 0;
224
670802d4 225 /* Mount all available cgroup controllers that are built into the kernel. */
2076ca54 226
d5099efc 227 controllers = set_new(&string_hash_ops);
a6b26d90
ZJS
228 if (!controllers)
229 return log_oom();
0c85a4f3 230
b12afc8c
LP
231 r = cg_kernel_controllers(controllers);
232 if (r < 0)
233 return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
0c85a4f3
LP
234
235 for (;;) {
a641dcd9 236 _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
a6b26d90
ZJS
237 MountPoint p = {
238 .what = "cgroup",
239 .type = "cgroup",
240 .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
241 .mode = MNT_IN_CONTAINER,
242 };
0c85a4f3
LP
243 char ***k = NULL;
244
245 controller = set_steal_first(controllers);
246 if (!controller)
247 break;
248
249 if (join_controllers)
250 for (k = join_controllers; *k; k++)
251 if (strv_find(*k, controller))
252 break;
253
254 if (k && *k) {
255 char **i, **j;
256
257 for (i = *k, j = *k; *i; i++) {
258
259 if (!streq(*i, controller)) {
a641dcd9 260 _cleanup_free_ char *t;
0c85a4f3
LP
261
262 t = set_remove(controllers, *i);
263 if (!t) {
264 free(*i);
265 continue;
266 }
0c85a4f3
LP
267 }
268
269 *(j++) = *i;
270 }
271
272 *j = NULL;
273
274 options = strv_join(*k, ",");
a6b26d90
ZJS
275 if (!options)
276 return log_oom();
0c85a4f3
LP
277 } else {
278 options = controller;
279 controller = NULL;
280 }
281
a641dcd9
LP
282 where = strappend("/sys/fs/cgroup/", options);
283 if (!where)
284 return log_oom();
285
286 p.where = where;
0c85a4f3 287 p.options = options;
2076ca54 288
4ef31082 289 r = mount_one(&p, true);
a6b26d90
ZJS
290 if (r < 0)
291 return r;
0c85a4f3
LP
292
293 if (r > 0 && k && *k) {
294 char **i;
295
296 for (i = *k; *i; i++) {
a641dcd9
LP
297 _cleanup_free_ char *t = NULL;
298
299 t = strappend("/sys/fs/cgroup/", *i);
300 if (!t)
301 return log_oom();
0c85a4f3
LP
302
303 r = symlink(options, t);
4a62c710
MS
304 if (r < 0 && errno != EEXIST)
305 return log_error_errno(errno, "Failed to create symlink %s: %m", t);
0c85a4f3
LP
306 }
307 }
2076ca54
LP
308 }
309
679142ce
LP
310 /* Now that we mounted everything, let's make the tmpfs the
311 * cgroup file systems are mounted into read-only. */
b12afc8c 312 (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
679142ce 313
a6b26d90 314 return 0;
2076ca54
LP
315}
316
0fff82e5 317#if defined(HAVE_SELINUX) || defined(HAVE_SMACK)
1829dc9d
LP
/* nftw() callback used by mount_setup() to relabel the contents of
 * /dev and /run. Calls label_fix() on every visited entry except the
 * root of the walk and does not descend into /run/initramfs. The sb
 * argument is unused. */
static int nftw_cb(
                const char *fpath,
                const struct stat *sb,
                int tflag,
                struct FTW *ftwbuf) {

        /* No need to label /dev twice in a row... */
        if (_unlikely_(ftwbuf->level == 0))
                return FTW_CONTINUE;

        label_fix(fpath, false, false);

        /* /run/initramfs is static data and big, no need to
         * dynamically relabel its contents at boot... */
        if (_unlikely_(ftwbuf->level == 1 &&
                       tflag == FTW_D &&
                       streq(fpath, "/run/initramfs")))
                return FTW_SKIP_SUBTREE;

        return FTW_CONTINUE;
}
0fff82e5 339#endif
1829dc9d 340
0b3325e7 341int mount_setup(bool loaded_policy) {
dad08730 342 unsigned i;
68d4c452 343 int r = 0;
8e274523 344
4ef31082 345 for (i = 0; i < ELEMENTSOF(mount_table); i ++) {
68d4c452 346 int j;
4ef31082 347
68d4c452
LP
348 j = mount_one(mount_table + i, loaded_policy);
349 if (r == 0)
350 r = j;
4ef31082 351 }
8e274523 352
68d4c452
LP
353 if (r < 0)
354 return r;
355
0fff82e5 356#if defined(HAVE_SELINUX) || defined(HAVE_SMACK)
f1d19aa4
LP
357 /* Nodes in devtmpfs and /run need to be manually updated for
358 * the appropriate labels, after mounting. The other virtual
359 * API file systems like /sys and /proc do not need that, they
360 * use the same label for all their files. */
0b3325e7
LP
361 if (loaded_policy) {
362 usec_t before_relabel, after_relabel;
363 char timespan[FORMAT_TIMESPAN_MAX];
364
365 before_relabel = now(CLOCK_MONOTONIC);
366
edb49778
LP
367 nftw("/dev", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
368 nftw("/run", nftw_cb, 64, FTW_MOUNT|FTW_PHYS|FTW_ACTIONRETVAL);
0b3325e7
LP
369
370 after_relabel = now(CLOCK_MONOTONIC);
371
372 log_info("Relabelled /dev and /run in %s.",
2fa4092c 373 format_timespan(timespan, sizeof(timespan), after_relabel - before_relabel, 0));
3bbecb2f 374 }
0fff82e5 375#endif
1829dc9d 376
5c0532d1 377 /* Create a few default symlinks, which are normally created
f1d19aa4 378 * by udevd, but some scripts might need them before we start
5c0532d1 379 * udevd. */
03cfe0d5 380 dev_setup(NULL, UID_INVALID, GID_INVALID);
5c0532d1 381
b3ac5f8c
LP
382 /* Mark the root directory as shared in regards to mount
383 * propagation. The kernel defaults to "private", but we think
384 * it makes more sense to have a default of "shared" so that
385 * nspawn and the container tools work out of the box. If
386 * specific setups need other settings they can reset the
387 * propagation mode to private if needed. */
c481f78b
LP
388 if (detect_container(NULL) <= 0)
389 if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
56f64d95 390 log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m");
b3ac5f8c 391
66e41181
LP
392 /* Create a few directories we always want around, Note that
393 * sd_booted() checks for /run/systemd/system, so this mkdir
394 * really needs to stay for good, otherwise software that
395 * copied sd-daemon.c into their sources will misdetect
396 * systemd. */
d2e54fae
KS
397 mkdir_label("/run/systemd", 0755);
398 mkdir_label("/run/systemd/system", 0755);
c17ec25e 399 mkdir_label("/run/systemd/inaccessible", 0000);
b925e726 400
0c85a4f3 401 return 0;
8e274523 402}