2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include <sys/mount.h>
29 #include "alloc-util.h"
30 #include "dev-setup.h"
32 #include "loopback-setup.h"
35 #include "mount-util.h"
36 #include "namespace.h"
37 #include "path-util.h"
38 #include "selinux-util.h"
39 #include "socket-util.h"
40 #include "string-table.h"
41 #include "string-util.h"
43 #include "umask-util.h"
44 #include "user-util.h"
47 typedef enum MountMode
{
48 /* This is ordered by priority! */
57 typedef struct BindMount
{
64 static int append_mounts(BindMount
**p
, char **strv
, MountMode mode
) {
69 STRV_FOREACH(i
, strv
) {
74 if ((mode
== INACCESSIBLE
|| mode
== READONLY
|| mode
== READWRITE
) && (*i
)[0] == '-') {
79 if (!path_is_absolute(*i
))
90 static int mount_path_compare(const void *a
, const void *b
) {
91 const BindMount
*p
= a
, *q
= b
;
94 d
= path_compare(p
->path
, q
->path
);
97 /* If the paths are equal, check the mode */
98 if (p
->mode
< q
->mode
)
101 if (p
->mode
> q
->mode
)
107 /* If the paths are not equal, then order prefixes first */
111 static void drop_duplicates(BindMount
*m
, unsigned *n
) {
112 BindMount
*f
, *t
, *previous
;
117 for (f
= m
, t
= m
, previous
= NULL
; f
< m
+*n
; f
++) {
119 /* The first one wins */
120 if (previous
&& path_equal(f
->path
, previous
->path
))
133 static int mount_dev(BindMount
*m
) {
134 static const char devnodes
[] =
142 char temporary_mount
[] = "/tmp/namespace-dev-XXXXXX";
143 const char *d
, *dev
= NULL
, *devpts
= NULL
, *devshm
= NULL
, *devhugepages
= NULL
, *devmqueue
= NULL
, *devlog
= NULL
, *devptmx
= NULL
;
144 _cleanup_umask_ mode_t u
;
151 if (!mkdtemp(temporary_mount
))
154 dev
= strjoina(temporary_mount
, "/dev");
155 (void) mkdir(dev
, 0755);
156 if (mount("tmpfs", dev
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=755") < 0) {
161 devpts
= strjoina(temporary_mount
, "/dev/pts");
162 (void) mkdir(devpts
, 0755);
163 if (mount("/dev/pts", devpts
, NULL
, MS_BIND
, NULL
) < 0) {
168 devptmx
= strjoina(temporary_mount
, "/dev/ptmx");
169 if (symlink("pts/ptmx", devptmx
) < 0) {
174 devshm
= strjoina(temporary_mount
, "/dev/shm");
175 (void) mkdir(devshm
, 01777);
176 r
= mount("/dev/shm", devshm
, NULL
, MS_BIND
, NULL
);
182 devmqueue
= strjoina(temporary_mount
, "/dev/mqueue");
183 (void) mkdir(devmqueue
, 0755);
184 (void) mount("/dev/mqueue", devmqueue
, NULL
, MS_BIND
, NULL
);
186 devhugepages
= strjoina(temporary_mount
, "/dev/hugepages");
187 (void) mkdir(devhugepages
, 0755);
188 (void) mount("/dev/hugepages", devhugepages
, NULL
, MS_BIND
, NULL
);
190 devlog
= strjoina(temporary_mount
, "/dev/log");
191 (void) symlink("/run/systemd/journal/dev-log", devlog
);
193 NULSTR_FOREACH(d
, devnodes
) {
194 _cleanup_free_
char *dn
= NULL
;
207 if (!S_ISBLK(st
.st_mode
) &&
208 !S_ISCHR(st
.st_mode
)) {
216 dn
= strappend(temporary_mount
, d
);
222 mac_selinux_create_file_prepare(d
, st
.st_mode
);
223 r
= mknod(dn
, st
.st_mode
, st
.st_rdev
);
224 mac_selinux_create_file_clear();
232 dev_setup(temporary_mount
, UID_INVALID
, GID_INVALID
);
234 /* Create the /dev directory if missing. It is more likely to be
235 * missing when the service is started with RootDirectory. This is
236 * consistent with mount units creating the mount points when missing.
238 (void) mkdir_p_label(m
->path
, 0755);
240 if (mount(dev
, m
->path
, NULL
, MS_MOVE
, NULL
) < 0) {
246 rmdir(temporary_mount
);
258 umount(devhugepages
);
265 rmdir(temporary_mount
);
270 static int apply_mount(
273 const char *var_tmp_dir
) {
284 /* First, get rid of everything that is below if there
285 * is anything... Then, overmount it with an
286 * inaccessible directory. */
287 umount_recursive(m
->path
, 0);
289 what
= "/run/systemd/inaccessible";
294 /* Nothing to mount here, we just later toggle the
295 * MS_RDONLY bit for the mount point */
302 case PRIVATE_VAR_TMP
:
310 assert_not_reached("Unknown mode");
315 r
= mount(what
, m
->path
, NULL
, MS_BIND
|MS_REC
, NULL
);
317 log_debug("Successfully mounted %s to %s", what
, m
->path
);
318 else if (m
->ignore
&& errno
== ENOENT
)
324 static int make_read_only(BindMount
*m
) {
329 if (IN_SET(m
->mode
, INACCESSIBLE
, READONLY
))
330 r
= bind_remount_recursive(m
->path
, true);
331 else if (IN_SET(m
->mode
, READWRITE
, PRIVATE_TMP
, PRIVATE_VAR_TMP
, PRIVATE_DEV
))
332 r
= bind_remount_recursive(m
->path
, false);
336 if (m
->ignore
&& r
== -ENOENT
)
343 const char* root_directory
,
344 char** read_write_dirs
,
345 char** read_only_dirs
,
346 char** inaccessible_dirs
,
348 const char* var_tmp_dir
,
350 ProtectHome protect_home
,
351 ProtectSystem protect_system
,
352 unsigned long mount_flags
) {
354 BindMount
*m
, *mounts
= NULL
;
358 if (mount_flags
== 0)
359 mount_flags
= MS_SHARED
;
361 if (unshare(CLONE_NEWNS
) < 0)
364 n
= !!tmp_dir
+ !!var_tmp_dir
+
365 strv_length(read_write_dirs
) +
366 strv_length(read_only_dirs
) +
367 strv_length(inaccessible_dirs
) +
369 (protect_home
!= PROTECT_HOME_NO
? 3 : 0) +
370 (protect_system
!= PROTECT_SYSTEM_NO
? 2 : 0) +
371 (protect_system
== PROTECT_SYSTEM_FULL
? 1 : 0);
374 m
= mounts
= (BindMount
*) alloca0(n
* sizeof(BindMount
));
375 r
= append_mounts(&m
, read_write_dirs
, READWRITE
);
379 r
= append_mounts(&m
, read_only_dirs
, READONLY
);
383 r
= append_mounts(&m
, inaccessible_dirs
, INACCESSIBLE
);
388 m
->path
= prefix_roota(root_directory
, "/tmp");
389 m
->mode
= PRIVATE_TMP
;
394 m
->path
= prefix_roota(root_directory
, "/var/tmp");
395 m
->mode
= PRIVATE_VAR_TMP
;
400 m
->path
= prefix_roota(root_directory
, "/dev");
401 m
->mode
= PRIVATE_DEV
;
405 if (protect_home
!= PROTECT_HOME_NO
) {
406 const char *home_dir
, *run_user_dir
, *root_dir
;
408 home_dir
= prefix_roota(root_directory
, "/home");
409 home_dir
= strjoina("-", home_dir
);
410 run_user_dir
= prefix_roota(root_directory
, "/run/user");
411 run_user_dir
= strjoina("-", run_user_dir
);
412 root_dir
= prefix_roota(root_directory
, "/root");
413 root_dir
= strjoina("-", root_dir
);
415 r
= append_mounts(&m
, STRV_MAKE(home_dir
, run_user_dir
, root_dir
),
416 protect_home
== PROTECT_HOME_READ_ONLY
? READONLY
: INACCESSIBLE
);
421 if (protect_system
!= PROTECT_SYSTEM_NO
) {
422 const char *usr_dir
, *boot_dir
, *etc_dir
;
424 usr_dir
= prefix_roota(root_directory
, "/usr");
425 boot_dir
= prefix_roota(root_directory
, "/boot");
426 boot_dir
= strjoina("-", boot_dir
);
427 etc_dir
= prefix_roota(root_directory
, "/etc");
429 r
= append_mounts(&m
, protect_system
== PROTECT_SYSTEM_FULL
430 ? STRV_MAKE(usr_dir
, boot_dir
, etc_dir
)
431 : STRV_MAKE(usr_dir
, boot_dir
), READONLY
);
436 assert(mounts
+ n
== m
);
438 qsort(mounts
, n
, sizeof(BindMount
), mount_path_compare
);
439 drop_duplicates(mounts
, &n
);
442 if (n
> 0 || root_directory
) {
443 /* Remount / as SLAVE so that nothing now mounted in the namespace
444 shows up in the parent */
445 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
449 if (root_directory
) {
450 /* Turn directory into bind mount */
451 if (mount(root_directory
, root_directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
456 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
457 r
= apply_mount(m
, tmp_dir
, var_tmp_dir
);
462 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
463 r
= make_read_only(m
);
469 if (root_directory
) {
470 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
471 r
= mount_move_root(root_directory
);
473 /* at this point, we cannot rollback */
478 /* Remount / as the desired mode. Note that this will not
479 * reestablish propagation from our side to the host, since
480 * what's disconnected is disconnected. */
481 if (mount(NULL
, "/", NULL
, mount_flags
| MS_REC
, NULL
) < 0)
482 /* at this point, we cannot rollback */
489 for (m
= mounts
; m
< mounts
+ n
; ++m
)
491 (void) umount2(m
->path
, MNT_DETACH
);
497 static int setup_one_tmp_dir(const char *id
, const char *prefix
, char **path
) {
498 _cleanup_free_
char *x
= NULL
;
499 char bid
[SD_ID128_STRING_MAX
];
507 /* We include the boot id in the directory so that after a
508 * reboot we can easily identify obsolete directories. */
510 r
= sd_id128_get_boot(&boot_id
);
514 x
= strjoin(prefix
, "/systemd-private-", sd_id128_to_string(boot_id
, bid
), "-", id
, "-XXXXXX", NULL
);
522 RUN_WITH_UMASK(0000) {
525 y
= strjoina(x
, "/tmp");
527 if (mkdir(y
, 0777 | S_ISVTX
) < 0)
537 int setup_tmp_dirs(const char *id
, char **tmp_dir
, char **var_tmp_dir
) {
545 r
= setup_one_tmp_dir(id
, "/tmp", &a
);
549 r
= setup_one_tmp_dir(id
, "/var/tmp", &b
);
553 t
= strjoina(a
, "/tmp");
567 int setup_netns(int netns_storage_socket
[2]) {
568 _cleanup_close_
int netns
= -1;
571 assert(netns_storage_socket
);
572 assert(netns_storage_socket
[0] >= 0);
573 assert(netns_storage_socket
[1] >= 0);
575 /* We use the passed socketpair as a storage buffer for our
576 * namespace reference fd. Whatever process runs this first
577 * shall create a new namespace, all others should just join
578 * it. To serialize that we use a file lock on the socket
581 * It's a bit crazy, but hey, works great! */
583 if (lockf(netns_storage_socket
[0], F_LOCK
, 0) < 0)
586 netns
= receive_one_fd(netns_storage_socket
[0], MSG_DONTWAIT
);
587 if (netns
== -EAGAIN
) {
588 /* Nothing stored yet, so let's create a new namespace */
590 if (unshare(CLONE_NEWNET
) < 0) {
597 netns
= open("/proc/self/ns/net", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
605 } else if (netns
< 0) {
610 /* Yay, found something, so let's join the namespace */
611 if (setns(netns
, CLONE_NEWNET
) < 0) {
619 q
= send_one_fd(netns_storage_socket
[1], netns
, MSG_DONTWAIT
);
626 lockf(netns_storage_socket
[0], F_ULOCK
, 0);
630 static const char *const protect_home_table
[_PROTECT_HOME_MAX
] = {
631 [PROTECT_HOME_NO
] = "no",
632 [PROTECT_HOME_YES
] = "yes",
633 [PROTECT_HOME_READ_ONLY
] = "read-only",
636 DEFINE_STRING_TABLE_LOOKUP(protect_home
, ProtectHome
);
638 static const char *const protect_system_table
[_PROTECT_SYSTEM_MAX
] = {
639 [PROTECT_SYSTEM_NO
] = "no",
640 [PROTECT_SYSTEM_YES
] = "yes",
641 [PROTECT_SYSTEM_FULL
] = "full",
644 DEFINE_STRING_TABLE_LOOKUP(protect_system
, ProtectSystem
);