1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/mount.h>
31 #include "dev-setup.h"
33 #include "loopback-setup.h"
36 #include "mount-util.h"
37 #include "namespace.h"
38 #include "path-util.h"
39 #include "selinux-util.h"
40 #include "socket-util.h"
41 #include "string-table.h"
42 #include "string-util.h"
44 #include "umask-util.h"
/* Kind of a mount entry. Values used in this excerpt: INACCESSIBLE, READONLY,
 * READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV, PRIVATE_BUS_ENDPOINT.
 * NOTE(review): the enumerator list itself is missing from this excerpt. */
47 typedef enum MountMode
{
48 /* This is ordered by priority! mount_path_compare() compares these values with < and > when two paths are equal. */
/* One entry of the mount list built by the namespace setup code. The members
 * referenced in this excerpt are path, mode and ignore.
 * NOTE(review): the member declarations are missing from this excerpt. */
58 typedef struct BindMount
{
/* Walks the string vector strv and processes each entry for the given mode.
 * Callers pass the address of their write cursor as p and afterwards assert
 * that the cursor advanced by the expected number of entries, so this
 * presumably stores one BindMount per string through *p — TODO confirm.
 * NOTE(review): only part of the body is present in this excerpt. */
65 static int append_mounts(BindMount
**p
, char **strv
, MountMode mode
) {
70 STRV_FOREACH(i
, strv
) {
/* A leading '-' on the entry is only looked at for these three modes.
 * Presumably it marks the entry as ignorable (see the m->ignore checks
 * elsewhere in the file); the branch body is not shown. */
75 if ((mode
== INACCESSIBLE
|| mode
== READONLY
|| mode
== READWRITE
) && (*i
)[0] == '-') {
/* Entries are tested for being absolute paths; the handling of the
 * non-absolute case is not visible here. */
80 if (!path_is_absolute(*i
))
/* Comparator over BindMount entries; it is passed to qsort() on the mounts
 * array. Compares the paths with path_compare() and, for equal paths, the
 * modes. NOTE(review): the return statements are missing from this excerpt. */
91 static int mount_path_compare(const void *a
, const void *b
) {
92 const BindMount
*p
= a
, *q
= b
;
/* d holds the result of comparing the two paths. */
95 d
= path_compare(p
->path
, q
->path
);
98 /* If the paths are equal, check the mode */
99 if (p
->mode
< q
->mode
)
102 if (p
->mode
> q
->mode
)
108 /* If the paths are not equal, then order prefixes first */
/* Scans the *n entries starting at m and detects entries whose path equals
 * the path of the previously remembered entry. It is called right after the
 * array has been sorted with mount_path_compare().
 * NOTE(review): the loop body beyond the duplicate test is not shown; n is
 * passed by pointer, so the count is presumably updated — TODO confirm. */
112 static void drop_duplicates(BindMount
*m
, unsigned *n
) {
113 BindMount
*f
, *t
, *previous
;
/* f visits every entry; t starts at m as well and previous starts out NULL. */
118 for (f
= m
, t
= m
, previous
= NULL
; f
< m
+*n
; f
++) {
120 /* The first one wins */
121 if (previous
&& path_equal(f
->path
, previous
->path
))
/* Builds a private device directory in a temporary location and moves it onto
 * m->path. Steps visible in this excerpt:
 *   - create a temporary directory with mkdtemp() and mount a tmpfs on
 *     <tmp>/dev (MS_NOSUID|MS_STRICTATIME, mode=755);
 *   - bind-mount /dev/pts and create the ptmx symlink pointing at pts/ptmx;
 *   - bind-mount /dev/shm (result kept in r), and bind-mount /dev/mqueue and
 *     /dev/hugepages with the results explicitly discarded;
 *   - symlink <tmp>/dev/log to /run/systemd/journal/dev-log;
 *   - for each name in devnodes, mknod() a node below the temporary directory
 *     using st.st_mode and st.st_rdev, bracketed by the SELinux
 *     file-creation-context helpers;
 *   - call dev_setup() on the temporary directory, make sure m->path exists,
 *     and move the tmpfs there with MS_MOVE.
 * NOTE(review): error branches, the stat() call that fills st, the contents
 * of devnodes and most cleanup are missing from this excerpt. The comment
 * that starts on original line 235 has also lost its closing line. */
134 static int mount_dev(BindMount
*m
) {
135 static const char devnodes
[] =
143 char temporary_mount
[] = "/tmp/namespace-dev-XXXXXX";
144 const char *d
, *dev
= NULL
, *devpts
= NULL
, *devshm
= NULL
, *devhugepages
= NULL
, *devmqueue
= NULL
, *devlog
= NULL
, *devptmx
= NULL
;
145 _cleanup_umask_ mode_t u
;
/* mkdtemp() replaces the XXXXXX suffix of temporary_mount in place. */
152 if (!mkdtemp(temporary_mount
))
155 dev
= strjoina(temporary_mount
, "/dev");
156 (void) mkdir(dev
, 0755);
157 if (mount("tmpfs", dev
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=755") < 0) {
162 devpts
= strjoina(temporary_mount
, "/dev/pts");
163 (void) mkdir(devpts
, 0755);
164 if (mount("/dev/pts", devpts
, NULL
, MS_BIND
, NULL
) < 0) {
169 devptmx
= strjoina(temporary_mount
, "/dev/ptmx");
170 if (symlink("pts/ptmx", devptmx
) < 0) {
175 devshm
= strjoina(temporary_mount
, "/dev/shm");
176 (void) mkdir(devshm
, 01777);
177 r
= mount("/dev/shm", devshm
, NULL
, MS_BIND
, NULL
);
/* The next two bind mounts are best-effort: their results are cast to void. */
183 devmqueue
= strjoina(temporary_mount
, "/dev/mqueue");
184 (void) mkdir(devmqueue
, 0755);
185 (void) mount("/dev/mqueue", devmqueue
, NULL
, MS_BIND
, NULL
);
187 devhugepages
= strjoina(temporary_mount
, "/dev/hugepages");
188 (void) mkdir(devhugepages
, 0755);
189 (void) mount("/dev/hugepages", devhugepages
, NULL
, MS_BIND
, NULL
);
191 devlog
= strjoina(temporary_mount
, "/dev/log");
192 (void) symlink("/run/systemd/journal/dev-log", devlog
);
/* devnodes is iterated as a NUL-separated string list. */
194 NULSTR_FOREACH(d
, devnodes
) {
195 _cleanup_free_
char *dn
= NULL
;
/* Tests for "neither block nor character device"; the branch body is not shown. */
208 if (!S_ISBLK(st
.st_mode
) &&
209 !S_ISCHR(st
.st_mode
)) {
/* dn is the node path below the temporary directory; freed automatically via _cleanup_free_. */
217 dn
= strappend(temporary_mount
, d
);
223 mac_selinux_create_file_prepare(d
, st
.st_mode
);
224 r
= mknod(dn
, st
.st_mode
, st
.st_rdev
);
225 mac_selinux_create_file_clear();
233 dev_setup(temporary_mount
, UID_INVALID
, GID_INVALID
);
235 /* Create the /dev directory if missing. It is more likely to be
236 * missing when the service is started with RootDirectory. This is
237 * consistent with mount units creating the mount points when missing.
239 (void) mkdir_p_label(m
->path
, 0755);
241 if (mount(dev
, m
->path
, NULL
, MS_MOVE
, NULL
) < 0) {
247 rmdir(temporary_mount
);
259 umount(devhugepages
);
266 rmdir(temporary_mount
);
/* Sets up a private bus endpoint node. Steps visible in this excerpt:
 *   - create a temporary directory and mount a tmpfs on <tmp>/kdbus
 *     (MS_NOSUID|MS_STRICTATIME, mode=777);
 *   - stat /dev/null and mknod() <root>/bus with the same file type and
 *     device number but permission bits 0600;
 *   - bind-mount m->path over that node;
 *   - MS_MOVE the tmpfs onto the parent directory of m->path (basepath);
 *   - rmdir() the temporary directory.
 * NOTE(review): the declarations of r and st, the failure branches and the
 * cleanup labels are missing from this excerpt. */
271 static int mount_kdbus(BindMount
*m
) {
273 char temporary_mount
[] = "/tmp/kdbus-dev-XXXXXX";
274 _cleanup_free_
char *basepath
= NULL
;
275 _cleanup_umask_ mode_t u
;
276 char *busnode
= NULL
, *root
;
/* NOTE(review): the message below reads "Failed create" — probably meant
 * "Failed to create"; left unchanged because it is a runtime string. */
284 if (!mkdtemp(temporary_mount
))
285 return log_error_errno(errno
, "Failed create temp dir: %m");
287 root
= strjoina(temporary_mount
, "/kdbus");
288 (void) mkdir(root
, 0755);
289 if (mount("tmpfs", root
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=777") < 0) {
294 /* create a new /dev/null dev node copy so we have some fodder to
295 * bind-mount the custom endpoint over. */
296 if (stat("/dev/null", &st
) < 0) {
297 r
= log_error_errno(errno
, "Failed to stat /dev/null: %m");
301 busnode
= strjoina(root
, "/bus");
/* Keep the file-type bits of /dev/null, replace the permission bits with 0600. */
302 if (mknod(busnode
, (st
.st_mode
& ~07777) | 0600, st
.st_rdev
) < 0) {
303 r
= log_error_errno(errno
, "mknod() for %s failed: %m",
308 r
= mount(m
->path
, busnode
, NULL
, MS_BIND
, NULL
);
310 r
= log_error_errno(errno
, "bind mount of %s failed: %m",
/* basepath is the directory containing m->path; freed automatically via _cleanup_free_. */
315 basepath
= dirname_malloc(m
->path
);
/* NOTE(review): this call uses MS_MOVE, yet the message logged for its
 * failure also says "bind mount" — confirm whether that wording is intended. */
321 if (mount(root
, basepath
, NULL
, MS_MOVE
, NULL
) < 0) {
322 r
= log_error_errno(errno
, "bind mount of %s failed: %m",
327 rmdir(temporary_mount
);
338 rmdir(temporary_mount
);
/* Applies a single BindMount entry. From what is visible: the source of the
 * mount ("what") is selected per mode, PRIVATE_BUS_ENDPOINT is delegated to
 * mount_kdbus(), and the remaining cases end in a recursive bind mount of
 * what onto m->path. A failure with ENOENT is treated specially when
 * m->ignore is set (the branch body is not shown).
 * NOTE(review): the switch statement, most case labels and the first two
 * parameters (the caller passes m and tmp_dir) are missing from this excerpt. */
343 static int apply_mount(
346 const char *var_tmp_dir
) {
357 /* First, get rid of everything that is below if there
358 * is anything... Then, overmount it with an
359 * inaccessible directory. */
360 umount_recursive(m
->path
, 0);
362 what
= "/run/systemd/inaccessible";
367 /* Nothing to mount here, we just later toggle the
368 * MS_RDONLY bit for the mount point */
375 case PRIVATE_VAR_TMP
:
382 case PRIVATE_BUS_ENDPOINT
:
383 return mount_kdbus(m
);
386 assert_not_reached("Unknown mode");
/* Recursive bind mount of the selected source onto the target path. */
391 r
= mount(what
, m
->path
, NULL
, MS_BIND
|MS_REC
, NULL
);
393 log_debug("Successfully mounted %s to %s", what
, m
->path
);
394 else if (m
->ignore
&& errno
== ENOENT
)
/* Second pass over a BindMount entry: calls bind_remount_recursive() on
 * m->path with true for INACCESSIBLE/READONLY entries and with false for
 * READWRITE/PRIVATE_TMP/PRIVATE_VAR_TMP/PRIVATE_DEV entries. Presumably the
 * boolean selects read-only vs. writable — TODO confirm against
 * bind_remount_recursive(). A result of -ENOENT is treated specially when
 * m->ignore is set (the branch body is not shown). */
400 static int make_read_only(BindMount
*m
) {
405 if (IN_SET(m
->mode
, INACCESSIBLE
, READONLY
))
406 r
= bind_remount_recursive(m
->path
, true);
407 else if (IN_SET(m
->mode
, READWRITE
, PRIVATE_TMP
, PRIVATE_VAR_TMP
, PRIVATE_DEV
))
408 r
= bind_remount_recursive(m
->path
, false);
412 if (m
->ignore
&& r
== -ENOENT
)
/* Parameter list and body of the routine that builds the mount namespace.
 * NOTE(review): the line with the function name is missing from this excerpt;
 * presumably this is setup_namespace() from namespace.h — confirm. The
 * tmp_dir parameter is used below but its declaration line is missing too.
 *
 * Visible flow:
 *   1. default mount_flags to MS_SHARED when 0, then unshare(CLONE_NEWNS);
 *   2. compute the number of entries n and allocate the array with alloca0();
 *   3. fill the array from the three directory lists, the private tmp/dev/bus
 *      entries and the protect_home/protect_system settings;
 *   4. sort with mount_path_compare() and run drop_duplicates();
 *   5. remount / as MS_SLAVE|MS_REC, optionally bind-mount root_directory
 *      onto itself, then run apply_mount() and make_read_only() per entry;
 *   6. optionally mount_move_root(), then remount / with mount_flags|MS_REC;
 *   7. a loop that lazily unmounts every entry (presumably the failure path;
 *      its label is not shown). */
419 const char* root_directory
,
420 char** read_write_dirs
,
421 char** read_only_dirs
,
422 char** inaccessible_dirs
,
424 const char* var_tmp_dir
,
425 const char* bus_endpoint_path
,
427 ProtectHome protect_home
,
428 ProtectSystem protect_system
,
429 unsigned long mount_flags
) {
431 BindMount
*m
, *mounts
= NULL
;
435 if (mount_flags
== 0)
436 mount_flags
= MS_SHARED
;
438 if (unshare(CLONE_NEWNS
) < 0)
/* Entry count: one per non-NULL private path, one per list element, three
 * for protect_home (home, run/user, root), two for protect_system (usr, boot)
 * plus one more (etc) for PROTECT_SYSTEM_FULL. These counts must match the
 * STRV_MAKE() lists further down, see the assert before qsort().
 * NOTE(review): original line 445 is missing from this excerpt. */
441 n
= !!tmp_dir
+ !!var_tmp_dir
+ !!bus_endpoint_path
+
442 strv_length(read_write_dirs
) +
443 strv_length(read_only_dirs
) +
444 strv_length(inaccessible_dirs
) +
446 (protect_home
!= PROTECT_HOME_NO
? 3 : 0) +
447 (protect_system
!= PROTECT_SYSTEM_NO
? 2 : 0) +
448 (protect_system
== PROTECT_SYSTEM_FULL
? 1 : 0);
/* The array lives on the stack (alloca0); m is the write cursor, mounts the start. */
451 m
= mounts
= (BindMount
*) alloca0(n
* sizeof(BindMount
));
452 r
= append_mounts(&m
, read_write_dirs
, READWRITE
);
456 r
= append_mounts(&m
, read_only_dirs
, READONLY
);
460 r
= append_mounts(&m
, inaccessible_dirs
, INACCESSIBLE
);
/* Private entries; the surrounding conditions and cursor increments are
 * not shown for the first three. */
465 m
->path
= prefix_roota(root_directory
, "/tmp");
466 m
->mode
= PRIVATE_TMP
;
471 m
->path
= prefix_roota(root_directory
, "/var/tmp");
472 m
->mode
= PRIVATE_VAR_TMP
;
477 m
->path
= prefix_roota(root_directory
, "/dev");
478 m
->mode
= PRIVATE_DEV
;
482 if (bus_endpoint_path
) {
483 m
->path
= prefix_roota(root_directory
, bus_endpoint_path
);
484 m
->mode
= PRIVATE_BUS_ENDPOINT
;
488 if (protect_home
!= PROTECT_HOME_NO
) {
489 const char *home_dir
, *run_user_dir
, *root_dir
;
/* Each path is prefixed with root_directory and then with "-", the marker
 * that append_mounts() tests for. */
491 home_dir
= prefix_roota(root_directory
, "/home");
492 home_dir
= strjoina("-", home_dir
);
493 run_user_dir
= prefix_roota(root_directory
, "/run/user");
494 run_user_dir
= strjoina("-", run_user_dir
);
495 root_dir
= prefix_roota(root_directory
, "/root");
496 root_dir
= strjoina("-", root_dir
);
498 r
= append_mounts(&m
, STRV_MAKE(home_dir
, run_user_dir
, root_dir
),
499 protect_home
== PROTECT_HOME_READ_ONLY
? READONLY
: INACCESSIBLE
);
504 if (protect_system
!= PROTECT_SYSTEM_NO
) {
505 const char *usr_dir
, *boot_dir
, *etc_dir
;
/* Only the boot path gets the "-" prefix here; usr and etc do not. */
507 usr_dir
= prefix_roota(root_directory
, "/usr");
508 boot_dir
= prefix_roota(root_directory
, "/boot");
509 boot_dir
= strjoina("-", boot_dir
);
510 etc_dir
= prefix_roota(root_directory
, "/etc");
512 r
= append_mounts(&m
, protect_system
== PROTECT_SYSTEM_FULL
513 ? STRV_MAKE(usr_dir
, boot_dir
, etc_dir
)
514 : STRV_MAKE(usr_dir
, boot_dir
), READONLY
);
/* The cursor must have advanced by exactly the precomputed count. */
519 assert(mounts
+ n
== m
);
521 qsort(mounts
, n
, sizeof(BindMount
), mount_path_compare
);
522 drop_duplicates(mounts
, &n
);
525 if (n
> 0 || root_directory
) {
526 /* Remount / as SLAVE so that nothing now mounted in the namespace
527 shows up in the parent */
528 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
532 if (root_directory
) {
533 /* Turn directory into bind mount */
534 if (mount(root_directory
, root_directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
/* First pass: establish every mount. */
539 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
540 r
= apply_mount(m
, tmp_dir
, var_tmp_dir
);
/* Second pass: remount each entry via make_read_only(). */
545 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
546 r
= make_read_only(m
);
552 if (root_directory
) {
553 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
554 r
= mount_move_root(root_directory
);
556 /* at this point, we cannot rollback */
561 /* Remount / as the desired mode. Note that this will not
562 * reestablish propagation from our side to the host, since
563 * what's disconnected is disconnected. */
564 if (mount(NULL
, "/", NULL
, mount_flags
| MS_REC
, NULL
) < 0)
565 /* at this point, we cannot rollback */
/* Lazily detach every entry; the result of each umount2() is discarded. */
572 for (m
= mounts
; m
< mounts
+ n
; ++m
)
574 (void) umount2(m
->path
, MNT_DETACH
);
/* Prepares one private temporary directory below prefix. The directory name
 * template is "<prefix>/systemd-private-<boot id>-<id>-XXXXXX"; a "tmp"
 * subdirectory with mode 0777 plus the sticky bit is created inside it while
 * the umask is temporarily 0000.
 * NOTE(review): the call that instantiates the XXXXXX template, the
 * declarations of r, boot_id and y, and the hand-over of the result through
 * path are missing from this excerpt. */
580 static int setup_one_tmp_dir(const char *id
, const char *prefix
, char **path
) {
581 _cleanup_free_
char *x
= NULL
;
582 char bid
[SD_ID128_STRING_MAX
];
590 /* We include the boot id in the directory so that after a
591 * reboot we can easily identify obsolete directories. */
593 r
= sd_id128_get_boot(&boot_id
);
/* bid is the buffer the boot id is formatted into. */
597 x
= strjoin(prefix
, "/systemd-private-", sd_id128_to_string(boot_id
, bid
), "-", id
, "-XXXXXX", NULL
);
605 RUN_WITH_UMASK(0000) {
608 y
= strjoina(x
, "/tmp");
610 if (mkdir(y
, 0777 | S_ISVTX
) < 0)
/* Creates the pair of private temporary directories for id: one below /tmp
 * (result in a) and one below /var/tmp (result in b), both through
 * setup_one_tmp_dir().
 * NOTE(review): error handling, the use of t, and the assignment of the
 * results to *tmp_dir and *var_tmp_dir are missing from this excerpt. */
620 int setup_tmp_dirs(const char *id
, char **tmp_dir
, char **var_tmp_dir
) {
628 r
= setup_one_tmp_dir(id
, "/tmp", &a
);
632 r
= setup_one_tmp_dir(id
, "/var/tmp", &b
);
/* t names the "tmp" subdirectory of the first directory. */
636 t
= strjoina(a
, "/tmp");
/* Creates or joins a shared network namespace, using the passed socket pair
 * as storage for a namespace file descriptor. Under a lock on
 * netns_storage_socket[0] it tries to receive an fd without blocking:
 *   - -EAGAIN means nothing is stored yet, so it unshares a new network
 *     namespace and opens /proc/self/ns/net to obtain a reference;
 *   - any other negative value is an error (the branch body is not shown);
 *   - otherwise the received fd is passed to setns().
 * The fd is then sent back into the socket pair and the lock is released.
 * NOTE(review): return statements and failure branches are missing from this
 * excerpt. */
650 int setup_netns(int netns_storage_socket
[2]) {
/* netns is closed automatically on scope exit via _cleanup_close_. */
651 _cleanup_close_
int netns
= -1;
654 assert(netns_storage_socket
);
655 assert(netns_storage_socket
[0] >= 0);
656 assert(netns_storage_socket
[1] >= 0);
658 /* We use the passed socketpair as a storage buffer for our
659 * namespace reference fd. Whatever process runs this first
660 * shall create a new namespace, all others should just join
661 * it. To serialize that we use a file lock on the socket
664 * It's a bit crazy, but hey, works great! */
666 if (lockf(netns_storage_socket
[0], F_LOCK
, 0) < 0)
669 netns
= receive_one_fd(netns_storage_socket
[0], MSG_DONTWAIT
);
670 if (netns
== -EAGAIN
) {
671 /* Nothing stored yet, so let's create a new namespace */
673 if (unshare(CLONE_NEWNET
) < 0) {
680 netns
= open("/proc/self/ns/net", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
688 } else if (netns
< 0) {
693 /* Yay, found something, so let's join the namespace */
694 if (setns(netns
, CLONE_NEWNET
) < 0) {
/* Put the fd back so that the next caller finds it. */
702 q
= send_one_fd(netns_storage_socket
[1], netns
, MSG_DONTWAIT
);
/* The result of the unlock call is not checked. */
709 lockf(netns_storage_socket
[0], F_ULOCK
, 0);
/* String names for the ProtectHome values, indexed by enumerator.
 * NOTE(review): the closing line of the initializer is missing from this
 * excerpt. */
713 static const char *const protect_home_table
[_PROTECT_HOME_MAX
] = {
714 [PROTECT_HOME_NO
] = "no",
715 [PROTECT_HOME_YES
] = "yes",
716 [PROTECT_HOME_READ_ONLY
] = "read-only",
/* Presumably generates the string<->enum lookup helpers for ProtectHome from
 * the table above — see string-table.h to confirm. */
719 DEFINE_STRING_TABLE_LOOKUP(protect_home
, ProtectHome
);
/* String names for the ProtectSystem values, indexed by enumerator.
 * NOTE(review): the closing line of the initializer is missing from this
 * excerpt. */
721 static const char *const protect_system_table
[_PROTECT_SYSTEM_MAX
] = {
722 [PROTECT_SYSTEM_NO
] = "no",
723 [PROTECT_SYSTEM_YES
] = "yes",
724 [PROTECT_SYSTEM_FULL
] = "full",
/* Presumably generates the string<->enum lookup helpers for ProtectSystem
 * from the table above — see string-table.h to confirm. */
727 DEFINE_STRING_TABLE_LOOKUP(protect_system
, ProtectSystem
);