2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include <sys/mount.h>
29 #include "alloc-util.h"
30 #include "dev-setup.h"
32 #include "loopback-setup.h"
35 #include "mount-util.h"
36 #include "namespace.h"
37 #include "path-util.h"
38 #include "selinux-util.h"
39 #include "socket-util.h"
40 #include "string-table.h"
41 #include "string-util.h"
43 #include "umask-util.h"
44 #include "user-util.h"
47 typedef enum MountMode
{
48 /* This is ordered by priority! */
57 typedef struct BindMount
{
64 static int append_mounts(BindMount
**p
, char **strv
, MountMode mode
) {
69 STRV_FOREACH(i
, strv
) {
74 if ((mode
== INACCESSIBLE
|| mode
== READONLY
|| mode
== READWRITE
) && (*i
)[0] == '-') {
79 if (!path_is_absolute(*i
))
90 static int mount_path_compare(const void *a
, const void *b
) {
91 const BindMount
*p
= a
, *q
= b
;
94 d
= path_compare(p
->path
, q
->path
);
97 /* If the paths are equal, check the mode */
98 if (p
->mode
< q
->mode
)
101 if (p
->mode
> q
->mode
)
107 /* If the paths are not equal, then order prefixes first */
111 static void drop_duplicates(BindMount
*m
, unsigned *n
) {
112 BindMount
*f
, *t
, *previous
;
117 for (f
= m
, t
= m
, previous
= NULL
; f
< m
+*n
; f
++) {
119 /* The first one wins */
120 if (previous
&& path_equal(f
->path
, previous
->path
))
133 static int mount_dev(BindMount
*m
) {
134 static const char devnodes
[] =
142 char temporary_mount
[] = "/tmp/namespace-dev-XXXXXX";
143 const char *d
, *dev
= NULL
, *devpts
= NULL
, *devshm
= NULL
, *devhugepages
= NULL
, *devmqueue
= NULL
, *devlog
= NULL
, *devptmx
= NULL
;
144 _cleanup_umask_ mode_t u
;
151 if (!mkdtemp(temporary_mount
))
154 dev
= strjoina(temporary_mount
, "/dev");
155 (void) mkdir(dev
, 0755);
156 if (mount("tmpfs", dev
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=755") < 0) {
161 devpts
= strjoina(temporary_mount
, "/dev/pts");
162 (void) mkdir(devpts
, 0755);
163 if (mount("/dev/pts", devpts
, NULL
, MS_BIND
, NULL
) < 0) {
168 devptmx
= strjoina(temporary_mount
, "/dev/ptmx");
169 if (symlink("pts/ptmx", devptmx
) < 0) {
174 devshm
= strjoina(temporary_mount
, "/dev/shm");
175 (void) mkdir(devshm
, 01777);
176 r
= mount("/dev/shm", devshm
, NULL
, MS_BIND
, NULL
);
182 devmqueue
= strjoina(temporary_mount
, "/dev/mqueue");
183 (void) mkdir(devmqueue
, 0755);
184 (void) mount("/dev/mqueue", devmqueue
, NULL
, MS_BIND
, NULL
);
186 devhugepages
= strjoina(temporary_mount
, "/dev/hugepages");
187 (void) mkdir(devhugepages
, 0755);
188 (void) mount("/dev/hugepages", devhugepages
, NULL
, MS_BIND
, NULL
);
190 devlog
= strjoina(temporary_mount
, "/dev/log");
191 (void) symlink("/run/systemd/journal/dev-log", devlog
);
193 NULSTR_FOREACH(d
, devnodes
) {
194 _cleanup_free_
char *dn
= NULL
;
207 if (!S_ISBLK(st
.st_mode
) &&
208 !S_ISCHR(st
.st_mode
)) {
216 dn
= strappend(temporary_mount
, d
);
222 mac_selinux_create_file_prepare(d
, st
.st_mode
);
223 r
= mknod(dn
, st
.st_mode
, st
.st_rdev
);
224 mac_selinux_create_file_clear();
232 dev_setup(temporary_mount
, UID_INVALID
, GID_INVALID
);
234 /* Create the /dev directory if missing. It is more likely to be
235 * missing when the service is started with RootDirectory. This is
236 * consistent with mount units creating the mount points when missing.
238 (void) mkdir_p_label(m
->path
, 0755);
240 /* Unmount everything in old /dev */
241 umount_recursive(m
->path
, 0);
242 if (mount(dev
, m
->path
, NULL
, MS_MOVE
, NULL
) < 0) {
248 rmdir(temporary_mount
);
260 umount(devhugepages
);
267 rmdir(temporary_mount
);
272 static int apply_mount(
275 const char *var_tmp_dir
) {
286 /* First, get rid of everything that is below if there
287 * is anything... Then, overmount it with an
288 * inaccessible directory. */
289 umount_recursive(m
->path
, 0);
291 what
= "/run/systemd/inaccessible";
296 /* Nothing to mount here, we just later toggle the
297 * MS_RDONLY bit for the mount point */
304 case PRIVATE_VAR_TMP
:
312 assert_not_reached("Unknown mode");
317 r
= mount(what
, m
->path
, NULL
, MS_BIND
|MS_REC
, NULL
);
319 log_debug("Successfully mounted %s to %s", what
, m
->path
);
320 else if (m
->ignore
&& errno
== ENOENT
)
326 static int make_read_only(BindMount
*m
) {
331 if (IN_SET(m
->mode
, INACCESSIBLE
, READONLY
))
332 r
= bind_remount_recursive(m
->path
, true);
333 else if (IN_SET(m
->mode
, READWRITE
, PRIVATE_TMP
, PRIVATE_VAR_TMP
, PRIVATE_DEV
))
334 r
= bind_remount_recursive(m
->path
, false);
338 if (m
->ignore
&& r
== -ENOENT
)
345 const char* root_directory
,
346 char** read_write_dirs
,
347 char** read_only_dirs
,
348 char** inaccessible_dirs
,
350 const char* var_tmp_dir
,
352 ProtectHome protect_home
,
353 ProtectSystem protect_system
,
354 unsigned long mount_flags
) {
356 BindMount
*m
, *mounts
= NULL
;
360 if (mount_flags
== 0)
361 mount_flags
= MS_SHARED
;
363 if (unshare(CLONE_NEWNS
) < 0)
366 n
= !!tmp_dir
+ !!var_tmp_dir
+
367 strv_length(read_write_dirs
) +
368 strv_length(read_only_dirs
) +
369 strv_length(inaccessible_dirs
) +
371 (protect_home
!= PROTECT_HOME_NO
? 3 : 0) +
372 (protect_system
!= PROTECT_SYSTEM_NO
? 2 : 0) +
373 (protect_system
== PROTECT_SYSTEM_FULL
? 1 : 0);
376 m
= mounts
= (BindMount
*) alloca0(n
* sizeof(BindMount
));
377 r
= append_mounts(&m
, read_write_dirs
, READWRITE
);
381 r
= append_mounts(&m
, read_only_dirs
, READONLY
);
385 r
= append_mounts(&m
, inaccessible_dirs
, INACCESSIBLE
);
390 m
->path
= prefix_roota(root_directory
, "/tmp");
391 m
->mode
= PRIVATE_TMP
;
396 m
->path
= prefix_roota(root_directory
, "/var/tmp");
397 m
->mode
= PRIVATE_VAR_TMP
;
402 m
->path
= prefix_roota(root_directory
, "/dev");
403 m
->mode
= PRIVATE_DEV
;
407 if (protect_home
!= PROTECT_HOME_NO
) {
408 const char *home_dir
, *run_user_dir
, *root_dir
;
410 home_dir
= prefix_roota(root_directory
, "/home");
411 home_dir
= strjoina("-", home_dir
);
412 run_user_dir
= prefix_roota(root_directory
, "/run/user");
413 run_user_dir
= strjoina("-", run_user_dir
);
414 root_dir
= prefix_roota(root_directory
, "/root");
415 root_dir
= strjoina("-", root_dir
);
417 r
= append_mounts(&m
, STRV_MAKE(home_dir
, run_user_dir
, root_dir
),
418 protect_home
== PROTECT_HOME_READ_ONLY
? READONLY
: INACCESSIBLE
);
423 if (protect_system
!= PROTECT_SYSTEM_NO
) {
424 const char *usr_dir
, *boot_dir
, *etc_dir
;
426 usr_dir
= prefix_roota(root_directory
, "/usr");
427 boot_dir
= prefix_roota(root_directory
, "/boot");
428 boot_dir
= strjoina("-", boot_dir
);
429 etc_dir
= prefix_roota(root_directory
, "/etc");
431 r
= append_mounts(&m
, protect_system
== PROTECT_SYSTEM_FULL
432 ? STRV_MAKE(usr_dir
, boot_dir
, etc_dir
)
433 : STRV_MAKE(usr_dir
, boot_dir
), READONLY
);
438 assert(mounts
+ n
== m
);
440 qsort(mounts
, n
, sizeof(BindMount
), mount_path_compare
);
441 drop_duplicates(mounts
, &n
);
444 if (n
> 0 || root_directory
) {
445 /* Remount / as SLAVE so that nothing now mounted in the namespace
446 shows up in the parent */
447 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
451 if (root_directory
) {
452 /* Turn directory into bind mount */
453 if (mount(root_directory
, root_directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
458 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
459 r
= apply_mount(m
, tmp_dir
, var_tmp_dir
);
464 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
465 r
= make_read_only(m
);
471 if (root_directory
) {
472 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
473 r
= mount_move_root(root_directory
);
475 /* at this point, we cannot rollback */
480 /* Remount / as the desired mode. Note that this will not
481 * reestablish propagation from our side to the host, since
482 * what's disconnected is disconnected. */
483 if (mount(NULL
, "/", NULL
, mount_flags
| MS_REC
, NULL
) < 0)
484 /* at this point, we cannot rollback */
491 for (m
= mounts
; m
< mounts
+ n
; ++m
)
493 (void) umount2(m
->path
, MNT_DETACH
);
499 static int setup_one_tmp_dir(const char *id
, const char *prefix
, char **path
) {
500 _cleanup_free_
char *x
= NULL
;
501 char bid
[SD_ID128_STRING_MAX
];
509 /* We include the boot id in the directory so that after a
510 * reboot we can easily identify obsolete directories. */
512 r
= sd_id128_get_boot(&boot_id
);
516 x
= strjoin(prefix
, "/systemd-private-", sd_id128_to_string(boot_id
, bid
), "-", id
, "-XXXXXX", NULL
);
524 RUN_WITH_UMASK(0000) {
527 y
= strjoina(x
, "/tmp");
529 if (mkdir(y
, 0777 | S_ISVTX
) < 0)
539 int setup_tmp_dirs(const char *id
, char **tmp_dir
, char **var_tmp_dir
) {
547 r
= setup_one_tmp_dir(id
, "/tmp", &a
);
551 r
= setup_one_tmp_dir(id
, "/var/tmp", &b
);
555 t
= strjoina(a
, "/tmp");
569 int setup_netns(int netns_storage_socket
[2]) {
570 _cleanup_close_
int netns
= -1;
573 assert(netns_storage_socket
);
574 assert(netns_storage_socket
[0] >= 0);
575 assert(netns_storage_socket
[1] >= 0);
577 /* We use the passed socketpair as a storage buffer for our
578 * namespace reference fd. Whatever process runs this first
579 * shall create a new namespace, all others should just join
580 * it. To serialize that we use a file lock on the socket
583 * It's a bit crazy, but hey, works great! */
585 if (lockf(netns_storage_socket
[0], F_LOCK
, 0) < 0)
588 netns
= receive_one_fd(netns_storage_socket
[0], MSG_DONTWAIT
);
589 if (netns
== -EAGAIN
) {
590 /* Nothing stored yet, so let's create a new namespace */
592 if (unshare(CLONE_NEWNET
) < 0) {
599 netns
= open("/proc/self/ns/net", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
607 } else if (netns
< 0) {
612 /* Yay, found something, so let's join the namespace */
613 if (setns(netns
, CLONE_NEWNET
) < 0) {
621 q
= send_one_fd(netns_storage_socket
[1], netns
, MSG_DONTWAIT
);
628 lockf(netns_storage_socket
[0], F_ULOCK
, 0);
632 static const char *const protect_home_table
[_PROTECT_HOME_MAX
] = {
633 [PROTECT_HOME_NO
] = "no",
634 [PROTECT_HOME_YES
] = "yes",
635 [PROTECT_HOME_READ_ONLY
] = "read-only",
638 DEFINE_STRING_TABLE_LOOKUP(protect_home
, ProtectHome
);
640 static const char *const protect_system_table
[_PROTECT_SYSTEM_MAX
] = {
641 [PROTECT_SYSTEM_NO
] = "no",
642 [PROTECT_SYSTEM_YES
] = "yes",
643 [PROTECT_SYSTEM_FULL
] = "full",
646 DEFINE_STRING_TABLE_LOOKUP(protect_system
, ProtectSystem
);