1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/mount.h>
31 #include "dev-setup.h"
33 #include "loopback-setup.h"
36 #include "mount-util.h"
37 #include "namespace.h"
38 #include "path-util.h"
39 #include "selinux-util.h"
40 #include "socket-util.h"
41 #include "string-table.h"
42 #include "string-util.h"
/* Kind of mount entry handled in this file. The enumerators themselves are
 * not part of this listing; the values used further down are INACCESSIBLE,
 * READONLY, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV and
 * PRIVATE_BUS_ENDPOINT. */
46 typedef enum MountMode
{
47 /* This is ordered by priority! mount_path_compare() compares the numeric
 * values with < and > when two entries have equal paths. */
/* One entry of the mount list built and processed below. The member
 * declarations are not part of this listing; the code in this file accesses
 * ->path, ->mode and ->ignore. */
57 typedef struct BindMount
{
/*
 * Walks the string list strv (STRV_FOREACH) for entries of kind `mode`,
 * working on the BindMount cursor *p.
 *
 * Visible behaviour: for INACCESSIBLE, READONLY and READWRITE a leading '-'
 * on an entry is recognised, and every entry is tested with
 * path_is_absolute().
 *
 * NOTE(review): the statements that react to those two tests, fill in the
 * entry and return are missing from this listing. Callers pass &m, store the
 * result in r and later assert that m reached the end of the array, so
 * presumably one BindMount is written per string and *p is advanced --
 * confirm against the complete source.
 */
64 static int append_mounts(BindMount
**p
, char **strv
, MountMode mode
) {
69 STRV_FOREACH(i
, strv
) {
74 if ((mode
== INACCESSIBLE
|| mode
== READONLY
|| mode
== READWRITE
) && (*i
)[0] == '-') {
79 if (!path_is_absolute(*i
))
/*
 * Comparison callback passed to qsort() for the BindMount array.
 *
 * a and b point to BindMount entries. Their ->path members are compared with
 * path_compare(); the existing comments state that equal paths are then told
 * apart by ->mode and that unequal paths are ordered prefixes first.
 *
 * NOTE(review): the declaration of d and every return statement are missing
 * from this listing, so the concrete return values cannot be read off here.
 */
90 static int mount_path_compare(const void *a
, const void *b
) {
91 const BindMount
*p
= a
, *q
= b
;
94 d
= path_compare(p
->path
, q
->path
);
97 /* If the paths are equal, check the mode */
98 if (p
->mode
< q
->mode
)
101 if (p
->mode
> q
->mode
)
107 /* If the paths are not equal, then order prefixes first */
/*
 * Scans the *n entries starting at m. f is the read cursor, t a second
 * cursor that also starts at m, and previous starts out as NULL. Each entry
 * is tested against previous with path_equal() on the ->path members; per
 * the existing comment the first entry of a run of equal paths wins.
 *
 * The caller invokes this right after sorting the array with
 * mount_path_compare().
 *
 * NOTE(review): the statements that skip a duplicate, copy kept entries,
 * update previous and write the new count back through n are missing from
 * this listing -- confirm against the complete source.
 */
111 static void drop_duplicates(BindMount
*m
, unsigned *n
) {
112 BindMount
*f
, *t
, *previous
;
117 for (f
= m
, t
= m
, previous
= NULL
; f
< m
+*n
; f
++) {
119 /* The first one wins */
120 if (previous
&& path_equal(f
->path
, previous
->path
))
/*
 * Assembles a fresh device directory in a temporary location and then moves
 * it onto m->path.
 *
 * Steps visible in this listing, in order:
 *  - mkdtemp() on the template "/tmp/namespace-dev-XXXXXX";
 *  - a tmpfs (MS_NOSUID|MS_STRICTATIME, "mode=755") mounted on <tmp>/dev;
 *  - /dev/pts bind-mounted to <tmp>/dev/pts, and <tmp>/dev/ptmx created as a
 *    symlink to "pts/ptmx"; both results are checked;
 *  - /dev/shm bind-mounted to <tmp>/dev/shm, result stored in r;
 *  - /dev/mqueue and /dev/hugepages bind-mounted with the result explicitly
 *    discarded, i.e. best effort;
 *  - <tmp>/dev/log symlinked to "/run/systemd/journal/dev-log", best effort;
 *  - for every name in devnodes: block/character device check on st.st_mode,
 *    then mknod() of temporary_mount + name with the same mode and rdev,
 *    bracketed by mac_selinux_create_file_prepare()/..._clear();
 *  - dev_setup(), mkdir_p_label(m->path, 0755), and finally
 *    mount(dev, m->path, MS_MOVE).
 *
 * NOTE(review): the contents of devnodes, the declarations of r and st, the
 * stat call, all error branches and the return statements are missing from
 * this listing. The two rmdir(temporary_mount) calls and the
 * umount(devhugepages) call presumably belong to the success path and the
 * failure cleanup respectively -- confirm. The comment beginning "Create the
 * /dev directory if missing" has also lost its closing delimiter here.
 */
133 static int mount_dev(BindMount
*m
) {
134 static const char devnodes
[] =
142 char temporary_mount
[] = "/tmp/namespace-dev-XXXXXX";
143 const char *d
, *dev
= NULL
, *devpts
= NULL
, *devshm
= NULL
, *devhugepages
= NULL
, *devmqueue
= NULL
, *devlog
= NULL
, *devptmx
= NULL
;
144 _cleanup_umask_ mode_t u
;
151 if (!mkdtemp(temporary_mount
))
154 dev
= strjoina(temporary_mount
, "/dev");
155 (void) mkdir(dev
, 0755);
156 if (mount("tmpfs", dev
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=755") < 0) {
161 devpts
= strjoina(temporary_mount
, "/dev/pts");
162 (void) mkdir(devpts
, 0755);
163 if (mount("/dev/pts", devpts
, NULL
, MS_BIND
, NULL
) < 0) {
168 devptmx
= strjoina(temporary_mount
, "/dev/ptmx");
169 if (symlink("pts/ptmx", devptmx
) < 0) {
174 devshm
= strjoina(temporary_mount
, "/dev/shm");
175 (void) mkdir(devshm
, 01777);
176 r
= mount("/dev/shm", devshm
, NULL
, MS_BIND
, NULL
);
182 devmqueue
= strjoina(temporary_mount
, "/dev/mqueue");
183 (void) mkdir(devmqueue
, 0755);
184 (void) mount("/dev/mqueue", devmqueue
, NULL
, MS_BIND
, NULL
);
186 devhugepages
= strjoina(temporary_mount
, "/dev/hugepages");
187 (void) mkdir(devhugepages
, 0755);
188 (void) mount("/dev/hugepages", devhugepages
, NULL
, MS_BIND
, NULL
);
190 devlog
= strjoina(temporary_mount
, "/dev/log");
191 (void) symlink("/run/systemd/journal/dev-log", devlog
);
193 NULSTR_FOREACH(d
, devnodes
) {
194 _cleanup_free_
char *dn
= NULL
;
207 if (!S_ISBLK(st
.st_mode
) &&
208 !S_ISCHR(st
.st_mode
)) {
216 dn
= strappend(temporary_mount
, d
);
222 mac_selinux_create_file_prepare(d
, st
.st_mode
);
223 r
= mknod(dn
, st
.st_mode
, st
.st_rdev
);
224 mac_selinux_create_file_clear();
232 dev_setup(temporary_mount
, UID_INVALID
, GID_INVALID
);
234 /* Create the /dev directory if missing. It is more likely to be
235 * missing when the service is started with RootDirectory. This is
236 * consistent with mount units creating the mount points when missing.
238 (void) mkdir_p_label(m
->path
, 0755);
240 if (mount(dev
, m
->path
, NULL
, MS_MOVE
, NULL
) < 0) {
246 rmdir(temporary_mount
);
258 umount(devhugepages
);
265 rmdir(temporary_mount
);
/*
 * Builds a private directory holding a single "bus" node that is backed by
 * m->path, then moves that directory over the parent directory of m->path.
 *
 * Steps visible in this listing, in order:
 *  - mkdtemp() on "/tmp/kdbus-dev-XXXXXX"; failure is logged and returned;
 *  - a tmpfs (MS_NOSUID|MS_STRICTATIME, "mode=777") mounted on <tmp>/kdbus;
 *  - stat() of /dev/null, then mknod() of <tmp>/kdbus/bus with the file type
 *    bits of /dev/null, permission bits replaced by 0600, and the same rdev;
 *  - m->path bind-mounted on top of that node;
 *  - basepath = dirname_malloc(m->path), and the tmpfs moved there with
 *    MS_MOVE;
 *  - rmdir(temporary_mount).
 *
 * NOTE(review): the declarations of r and st, the error branches, labels and
 * return statements are missing from this listing. The failure message
 * attached to the MS_MOVE call reads "bind mount of %s failed", the same text
 * as for the preceding bind mount, and the mkdtemp message reads "Failed
 * create temp dir" -- both look like wording slips worth confirming.
 */
270 static int mount_kdbus(BindMount
*m
) {
272 char temporary_mount
[] = "/tmp/kdbus-dev-XXXXXX";
273 _cleanup_free_
char *basepath
= NULL
;
274 _cleanup_umask_ mode_t u
;
275 char *busnode
= NULL
, *root
;
283 if (!mkdtemp(temporary_mount
))
284 return log_error_errno(errno
, "Failed create temp dir: %m");
286 root
= strjoina(temporary_mount
, "/kdbus");
287 (void) mkdir(root
, 0755);
288 if (mount("tmpfs", root
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=777") < 0) {
293 /* create a new /dev/null dev node copy so we have some fodder to
294 * bind-mount the custom endpoint over. */
295 if (stat("/dev/null", &st
) < 0) {
296 r
= log_error_errno(errno
, "Failed to stat /dev/null: %m");
300 busnode
= strjoina(root
, "/bus");
301 if (mknod(busnode
, (st
.st_mode
& ~07777) | 0600, st
.st_rdev
) < 0) {
302 r
= log_error_errno(errno
, "mknod() for %s failed: %m",
307 r
= mount(m
->path
, busnode
, NULL
, MS_BIND
, NULL
);
309 r
= log_error_errno(errno
, "bind mount of %s failed: %m",
314 basepath
= dirname_malloc(m
->path
);
320 if (mount(root
, basepath
, NULL
, MS_MOVE
, NULL
) < 0) {
321 r
= log_error_errno(errno
, "bind mount of %s failed: %m",
326 rmdir(temporary_mount
);
337 rmdir(temporary_mount
);
/*
 * Performs the mount for one BindMount entry m, selecting the source ("what")
 * from the entry's mode.
 *
 * Visible behaviour:
 *  - one branch first calls umount_recursive(m->path, 0) and then uses
 *    "/run/systemd/inaccessible" as the source;
 *  - one branch mounts nothing, because the MS_RDONLY bit is toggled later;
 *  - PRIVATE_BUS_ENDPOINT is delegated to mount_kdbus(m);
 *  - an unknown mode hits assert_not_reached();
 *  - otherwise `what` is bind-mounted recursively (MS_BIND|MS_REC) onto
 *    m->path, success is logged at debug level, and a failure with
 *    errno == ENOENT gets separate treatment when m->ignore is set.
 *
 * NOTE(review): the first two parameters (the caller passes m and tmp_dir
 * ahead of var_tmp_dir), most case labels, the handling of the ignore case
 * and the return statements are missing from this listing.
 */
342 static int apply_mount(
345 const char *var_tmp_dir
) {
356 /* First, get rid of everything that is below if there
357 * is anything... Then, overmount it with an
358 * inaccessible directory. */
359 umount_recursive(m
->path
, 0);
361 what
= "/run/systemd/inaccessible";
366 /* Nothing to mount here, we just later toggle the
367 * MS_RDONLY bit for the mount point */
374 case PRIVATE_VAR_TMP
:
381 case PRIVATE_BUS_ENDPOINT
:
382 return mount_kdbus(m
);
385 assert_not_reached("Unknown mode");
390 r
= mount(what
, m
->path
, NULL
, MS_BIND
|MS_REC
, NULL
);
392 log_debug("Successfully mounted %s to %s", what
, m
->path
);
393 else if (m
->ignore
&& errno
== ENOENT
)
/*
 * Sets the read-only state of the mount at m->path according to m->mode.
 *
 * INACCESSIBLE and READONLY entries are passed to bind_remount_recursive()
 * with `true`; READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP and PRIVATE_DEV
 * entries are passed with `false`. The result r is then checked for -ENOENT
 * in combination with m->ignore.
 *
 * NOTE(review): the declaration of r, the branch for any other mode, the
 * statement guarded by the ignore check and the return are missing from this
 * listing.
 */
399 static int make_read_only(BindMount
*m
) {
404 if (IN_SET(m
->mode
, INACCESSIBLE
, READONLY
))
405 r
= bind_remount_recursive(m
->path
, true);
406 else if (IN_SET(m
->mode
, READWRITE
, PRIVATE_TMP
, PRIVATE_VAR_TMP
, PRIVATE_DEV
))
407 r
= bind_remount_recursive(m
->path
, false);
411 if (m
->ignore
&& r
== -ENOENT
)
/*
 * Parameter list and body of the function that builds the mount namespace.
 * NOTE(review): the line carrying the function name is missing from this
 * listing (presumably setup_namespace(), given the namespace.h include --
 * confirm), as are the tmp_dir parameter and at least one further parameter
 * that the body relies on.
 *
 * Steps visible in this listing, in order:
 *  - mount_flags of 0 is replaced by MS_SHARED;
 *  - unshare(CLONE_NEWNS);
 *  - n is computed as the number of entries needed (one per non-NULL
 *    tmp_dir, var_tmp_dir and bus_endpoint_path, one per string in the three
 *    directory lists, 3 for protect_home, 2 or 3 for protect_system) and a
 *    zeroed BindMount array of that size is taken from the stack (alloca0);
 *  - the three directory lists are appended as READWRITE, READONLY and
 *    INACCESSIBLE entries;
 *  - entries for "/tmp", "/var/tmp", "/dev" and the bus endpoint are filled
 *    in, each path prefixed with root_directory;
 *  - protect_home adds "-/home", "-/run/user" and "-/root" (prefixed) as
 *    READONLY for PROTECT_HOME_READ_ONLY, otherwise INACCESSIBLE;
 *  - protect_system adds "/usr" and "-/boot" (prefixed), plus "/etc" for
 *    PROTECT_SYSTEM_FULL, all READONLY;
 *  - the cursor is asserted to sit exactly at mounts + n, the array is sorted
 *    with mount_path_compare() and passed to drop_duplicates();
 *  - if there is anything to mount or a root_directory: "/" is remounted
 *    MS_SLAVE|MS_REC, root_directory is bind-mounted onto itself, and every
 *    entry goes through apply_mount() and then make_read_only();
 *  - with a root_directory, mount_move_root() is called;
 *  - "/" is remounted with mount_flags|MS_REC;
 *  - a final loop lazily unmounts (MNT_DETACH) every entry, presumably as the
 *    failure path -- the label that leads there is not visible.
 *
 * NOTE(review): error checks after the calls, the conditions guarding the
 * /tmp, /var/tmp and /dev entries, the cursor increments and all return
 * statements are missing from this listing.
 */
418 const char* root_directory
,
419 char** read_write_dirs
,
420 char** read_only_dirs
,
421 char** inaccessible_dirs
,
423 const char* var_tmp_dir
,
424 const char* bus_endpoint_path
,
426 ProtectHome protect_home
,
427 ProtectSystem protect_system
,
428 unsigned long mount_flags
) {
430 BindMount
*m
, *mounts
= NULL
;
434 if (mount_flags
== 0)
435 mount_flags
= MS_SHARED
;
437 if (unshare(CLONE_NEWNS
) < 0)
440 n
= !!tmp_dir
+ !!var_tmp_dir
+ !!bus_endpoint_path
+
441 strv_length(read_write_dirs
) +
442 strv_length(read_only_dirs
) +
443 strv_length(inaccessible_dirs
) +
445 (protect_home
!= PROTECT_HOME_NO
? 3 : 0) +
446 (protect_system
!= PROTECT_SYSTEM_NO
? 2 : 0) +
447 (protect_system
== PROTECT_SYSTEM_FULL
? 1 : 0);
450 m
= mounts
= (BindMount
*) alloca0(n
* sizeof(BindMount
));
451 r
= append_mounts(&m
, read_write_dirs
, READWRITE
);
455 r
= append_mounts(&m
, read_only_dirs
, READONLY
);
459 r
= append_mounts(&m
, inaccessible_dirs
, INACCESSIBLE
);
464 m
->path
= prefix_roota(root_directory
, "/tmp");
465 m
->mode
= PRIVATE_TMP
;
470 m
->path
= prefix_roota(root_directory
, "/var/tmp");
471 m
->mode
= PRIVATE_VAR_TMP
;
476 m
->path
= prefix_roota(root_directory
, "/dev");
477 m
->mode
= PRIVATE_DEV
;
481 if (bus_endpoint_path
) {
482 m
->path
= prefix_roota(root_directory
, bus_endpoint_path
);
483 m
->mode
= PRIVATE_BUS_ENDPOINT
;
487 if (protect_home
!= PROTECT_HOME_NO
) {
488 const char *home_dir
, *run_user_dir
, *root_dir
;
490 home_dir
= prefix_roota(root_directory
, "/home");
491 home_dir
= strjoina("-", home_dir
);
492 run_user_dir
= prefix_roota(root_directory
, "/run/user");
493 run_user_dir
= strjoina("-", run_user_dir
);
494 root_dir
= prefix_roota(root_directory
, "/root");
495 root_dir
= strjoina("-", root_dir
);
497 r
= append_mounts(&m
, STRV_MAKE(home_dir
, run_user_dir
, root_dir
),
498 protect_home
== PROTECT_HOME_READ_ONLY
? READONLY
: INACCESSIBLE
);
503 if (protect_system
!= PROTECT_SYSTEM_NO
) {
504 const char *usr_dir
, *boot_dir
, *etc_dir
;
506 usr_dir
= prefix_roota(root_directory
, "/usr");
507 boot_dir
= prefix_roota(root_directory
, "/boot");
508 boot_dir
= strjoina("-", boot_dir
);
509 etc_dir
= prefix_roota(root_directory
, "/etc");
511 r
= append_mounts(&m
, protect_system
== PROTECT_SYSTEM_FULL
512 ? STRV_MAKE(usr_dir
, boot_dir
, etc_dir
)
513 : STRV_MAKE(usr_dir
, boot_dir
), READONLY
);
518 assert(mounts
+ n
== m
);
520 qsort(mounts
, n
, sizeof(BindMount
), mount_path_compare
);
521 drop_duplicates(mounts
, &n
);
524 if (n
> 0 || root_directory
) {
525 /* Remount / as SLAVE so that nothing now mounted in the namespace
526 shows up in the parent */
527 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
531 if (root_directory
) {
532 /* Turn directory into bind mount */
533 if (mount(root_directory
, root_directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
538 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
539 r
= apply_mount(m
, tmp_dir
, var_tmp_dir
);
544 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
545 r
= make_read_only(m
);
551 if (root_directory
) {
552 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
553 r
= mount_move_root(root_directory
);
555 /* at this point, we cannot rollback */
560 /* Remount / as the desired mode. Note that this will not
561 * reestablish propagation from our side to the host, since
562 * what's disconnected is disconnected. */
563 if (mount(NULL
, "/", NULL
, mount_flags
| MS_REC
, NULL
) < 0)
564 /* at this point, we cannot rollback */
571 for (m
= mounts
; m
< mounts
+ n
; ++m
)
573 (void) umount2(m
->path
, MNT_DETACH
);
/*
 * Prepares one private temporary directory below `prefix` for the unit
 * identified by `id`; `path` is the output parameter.
 *
 * Visible behaviour: the boot ID is fetched with sd_id128_get_boot() and a
 * template of the form
 *     <prefix>/systemd-private-<boot id>-<id>-XXXXXX
 * is built with strjoin(). Under a umask of 0000 the sub-directory
 * <template>/tmp is then created with mode 0777 plus the sticky bit.
 *
 * NOTE(review): the declarations of boot_id, y and r, the call that turns the
 * template into a real directory, the error handling, the assignment to *path
 * and the return are missing from this listing.
 */
579 static int setup_one_tmp_dir(const char *id
, const char *prefix
, char **path
) {
580 _cleanup_free_
char *x
= NULL
;
581 char bid
[SD_ID128_STRING_MAX
];
589 /* We include the boot id in the directory so that after a
590 * reboot we can easily identify obsolete directories. */
592 r
= sd_id128_get_boot(&boot_id
);
596 x
= strjoin(prefix
, "/systemd-private-", sd_id128_to_string(boot_id
, bid
), "-", id
, "-XXXXXX", NULL
);
604 RUN_WITH_UMASK(0000) {
607 y
= strjoina(x
, "/tmp");
609 if (mkdir(y
, 0777 | S_ISVTX
) < 0)
/*
 * Creates the pair of private temporary directories for `id`:
 * setup_one_tmp_dir() is called once with the prefix "/tmp" (result in a)
 * and once with "/var/tmp" (result in b). t is then set to a + "/tmp".
 *
 * NOTE(review): the declarations of a, b, t and r, the error handling
 * (including whatever is done with t, which is presumably cleanup of the
 * first directory when the second call fails), the hand-over to *tmp_dir and
 * *var_tmp_dir and the return are missing from this listing.
 */
619 int setup_tmp_dirs(const char *id
, char **tmp_dir
, char **var_tmp_dir
) {
627 r
= setup_one_tmp_dir(id
, "/tmp", &a
);
631 r
= setup_one_tmp_dir(id
, "/var/tmp", &b
);
635 t
= strjoina(a
, "/tmp");
/*
 * Creates or joins a network namespace whose reference fd is kept in the
 * socket pair netns_storage_socket. Both descriptors must be valid (>= 0).
 *
 * Visible sequence:
 *  - lockf(F_LOCK) on netns_storage_socket[0];
 *  - a non-blocking receive_one_fd() on netns_storage_socket[0];
 *  - -EAGAIN (nothing stored yet): unshare(CLONE_NEWNET), then open
 *    "/proc/self/ns/net" to obtain a reference to the new namespace;
 *  - any other negative value is handled as an error;
 *  - otherwise setns() joins the namespace behind the received fd;
 *  - send_one_fd() on netns_storage_socket[1] stores the fd again;
 *  - lockf(F_ULOCK) releases the lock.
 *
 * NOTE(review): the declarations of r and q, the error branches, labels and
 * the return are missing from this listing, as are two lines of the
 * explanatory comment below.
 */
649 int setup_netns(int netns_storage_socket
[2]) {
650 _cleanup_close_
int netns
= -1;
653 assert(netns_storage_socket
);
654 assert(netns_storage_socket
[0] >= 0);
655 assert(netns_storage_socket
[1] >= 0);
657 /* We use the passed socketpair as a storage buffer for our
658 * namespace reference fd. Whatever process runs this first
659 * shall create a new namespace, all others should just join
660 * it. To serialize that we use a file lock on the socket
663 * It's a bit crazy, but hey, works great! */
665 if (lockf(netns_storage_socket
[0], F_LOCK
, 0) < 0)
668 netns
= receive_one_fd(netns_storage_socket
[0], MSG_DONTWAIT
);
669 if (netns
== -EAGAIN
) {
670 /* Nothing stored yet, so let's create a new namespace */
672 if (unshare(CLONE_NEWNET
) < 0) {
679 netns
= open("/proc/self/ns/net", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
687 } else if (netns
< 0) {
692 /* Yay, found something, so let's join the namespace */
693 if (setns(netns
, CLONE_NEWNET
) < 0) {
701 q
= send_one_fd(netns_storage_socket
[1], netns
, MSG_DONTWAIT
);
708 lockf(netns_storage_socket
[0], F_ULOCK
, 0);
/* String names of the ProtectHome values: "no", "yes" and "read-only". The
 * table is handed to DEFINE_STRING_TABLE_LOOKUP, which presumably generates
 * the string conversion helpers for the type -- the macro is defined
 * elsewhere, confirm there.
 * NOTE(review): the line closing the initializer is missing from this
 * listing. */
712 static const char *const protect_home_table
[_PROTECT_HOME_MAX
] = {
713 [PROTECT_HOME_NO
] = "no",
714 [PROTECT_HOME_YES
] = "yes",
715 [PROTECT_HOME_READ_ONLY
] = "read-only",
718 DEFINE_STRING_TABLE_LOOKUP(protect_home
, ProtectHome
);
/* String names of the ProtectSystem values: "no", "yes" and "full". The
 * table is handed to DEFINE_STRING_TABLE_LOOKUP, which presumably generates
 * the string conversion helpers for the type -- the macro is defined
 * elsewhere, confirm there.
 * NOTE(review): the line closing the initializer is missing from this
 * listing. */
720 static const char *const protect_system_table
[_PROTECT_SYSTEM_MAX
] = {
721 [PROTECT_SYSTEM_NO
] = "no",
722 [PROTECT_SYSTEM_YES
] = "yes",
723 [PROTECT_SYSTEM_FULL
] = "full",
726 DEFINE_STRING_TABLE_LOOKUP(protect_system
, ProtectSystem
);