1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/mount.h>
31 #include "dev-setup.h"
32 #include "loopback-setup.h"
35 #include "path-util.h"
36 #include "selinux-util.h"
37 #include "string-util.h"
40 #include "namespace.h"
/* Mount modes used by the BindMount handling in this file. The enumerators
 * are not visible in this excerpt; code further down uses INACCESSIBLE,
 * READONLY, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV and
 * PRIVATE_BUS_ENDPOINT as mode values (presumably all declared here).
 * The declaration order matters because mount_path_compare() compares modes
 * with < and > when two paths are equal. */
42 typedef enum MountMode
{
43 /* This is ordered by priority! */
/* One entry of the mount list that is built, sorted and applied in this
 * file. The fields are not visible in this excerpt; code below reads and
 * writes ->path, ->mode and ->ignore through BindMount pointers. */
53 typedef struct BindMount
{
/* Walks the string vector strv on behalf of the given mount mode.
 *
 * p    - pointer to a BindMount cursor; callers pass &m and later assert that
 *        m has moved to the end of the array, so this function presumably
 *        fills one entry per string and advances *p (those statements are
 *        not visible in this excerpt).
 * strv - list of paths to process.
 * mode - mode associated with every path of this list.
 *
 * Returns an int; callers store it in r. The return statements are elided. */
60 static int append_mounts(BindMount
**p
, char **strv
, MountMode mode
) {
65 STRV_FOREACH(i
, strv
) {
/* For the three directory-list modes an entry whose first character is '-'
 * takes a separate branch. NOTE(review): presumably this sets the ignore
 * flag that apply_mount() and make_read_only() consult - confirm. */
70 if ((mode
== INACCESSIBLE
|| mode
== READONLY
|| mode
== READWRITE
) && (*i
)[0] == '-') {
/* Each entry is checked with path_is_absolute(); the statement guarded by
 * this check is not visible here. */
75 if (!path_is_absolute(*i
))
/* Comparator with the qsort() callback signature; it is the function handed
 * to qsort() when the BindMount array is sorted. Both arguments are treated
 * as pointers to BindMount. Paths are compared with path_compare() first and
 * the modes are compared only when the paths are equal. The return
 * statements are not visible in this excerpt. */
86 static int mount_path_compare(const void *a
, const void *b
) {
87 const BindMount
*p
= a
, *q
= b
;
/* d holds the path ordering and decides which of the two cases below applies. */
90 d
= path_compare(p
->path
, q
->path
);
93 /* If the paths are equal, check the mode */
94 if (p
->mode
< q
->mode
)
97 if (p
->mode
> q
->mode
)
103 /* If the paths are not equal, then order prefixes first */
/* Scans the array m of *n entries and detects entries whose path equals the
 * path of the previously remembered entry. It is called right after the
 * array has been sorted, so equal paths are adjacent.
 *
 * m - start of the BindMount array.
 * n - in/out element count. NOTE(review): presumably updated to the number
 *     of entries kept; the statements doing so are not visible here.
 *
 * f is the read cursor. t and previous start at m and NULL respectively;
 * presumably t is the write cursor for the compacted array - confirm. */
107 static void drop_duplicates(BindMount
*m
, unsigned *n
) {
108 BindMount
*f
, *t
, *previous
;
113 for (f
= m
, t
= m
, previous
= NULL
; f
< m
+*n
; f
++) {
115 /* The first one wins */
116 if (previous
&& path_equal(f
->path
, previous
->path
))
/* Assembles a private /dev tree for the entry m. A tmpfs is mounted at a
 * "dev" directory inside a mkdtemp() directory below /tmp, it is populated
 * with bind mounts, symlinks and device nodes, and the result is moved onto
 * m->path with MS_MOVE. The trailing rmdir()/umount() calls remove the
 * temporary pieces; presumably they belong to the success and failure paths.
 * Most error-handling statements between the visible calls are elided in
 * this excerpt. */
129 static int mount_dev(BindMount
*m
) {
/* List of device nodes, iterated with NULSTR_FOREACH further down; its
 * contents are not visible here. */
130 static const char devnodes
[] =
/* Template that mkdtemp() turns into the temporary directory name. */
138 char temporary_mount
[] = "/tmp/namespace-dev-XXXXXX";
139 const char *d
, *dev
= NULL
, *devpts
= NULL
, *devshm
= NULL
, *devhugepages
= NULL
, *devmqueue
= NULL
, *devlog
= NULL
, *devptmx
= NULL
;
140 _cleanup_umask_ mode_t u
;
147 if (!mkdtemp(temporary_mount
))
/* Fresh tmpfs that becomes the new /dev: nosuid, strictatime, mode 755. */
150 dev
= strjoina(temporary_mount
, "/dev");
151 (void) mkdir(dev
, 0755);
152 if (mount("tmpfs", dev
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=755") < 0) {
/* Bind mount the existing /dev/pts into the new tree; failure is checked. */
157 devpts
= strjoina(temporary_mount
, "/dev/pts");
158 (void) mkdir(devpts
, 0755);
159 if (mount("/dev/pts", devpts
, NULL
, MS_BIND
, NULL
) < 0) {
/* The ptmx entry of the new tree is a symlink to pts/ptmx. */
164 devptmx
= strjoina(temporary_mount
, "/dev/ptmx");
165 if (symlink("pts/ptmx", devptmx
) < 0) {
/* Bind mount /dev/shm; the result is kept in r (its check is elided). */
170 devshm
= strjoina(temporary_mount
, "/dev/shm");
171 (void) mkdir(devshm
, 01777);
172 r
= mount("/dev/shm", devshm
, NULL
, MS_BIND
, NULL
);
/* mqueue and hugepages are best effort: the results of mkdir() and mount()
 * are explicitly discarded with (void). */
178 devmqueue
= strjoina(temporary_mount
, "/dev/mqueue");
179 (void) mkdir(devmqueue
, 0755);
180 (void) mount("/dev/mqueue", devmqueue
, NULL
, MS_BIND
, NULL
);
182 devhugepages
= strjoina(temporary_mount
, "/dev/hugepages");
183 (void) mkdir(devhugepages
, 0755);
184 (void) mount("/dev/hugepages", devhugepages
, NULL
, MS_BIND
, NULL
);
/* The log entry is a symlink to /run/systemd/journal/dev-log; the result is
 * ignored. */
186 devlog
= strjoina(temporary_mount
, "/dev/log");
187 (void) symlink("/run/systemd/journal/dev-log", devlog
);
/* Recreate each listed device node below the temporary directory. The call
 * that fills st is not visible here. Entries that are neither block nor
 * character devices take a separate branch whose body is elided. */
189 NULSTR_FOREACH(d
, devnodes
) {
190 _cleanup_free_
char *dn
= NULL
;
203 if (!S_ISBLK(st
.st_mode
) &&
204 !S_ISCHR(st
.st_mode
)) {
/* dn is the node path inside the temporary directory. */
212 dn
= strappend(temporary_mount
, d
);
/* mknod() reuses the mode and device number found in st. The SELinux
 * helpers bracket the call; presumably they set and reset the file creation
 * context - confirm against selinux-util.h. */
218 mac_selinux_create_file_prepare(d
, st
.st_mode
);
219 r
= mknod(dn
, st
.st_mode
, st
.st_rdev
);
220 mac_selinux_create_file_clear();
/* NOTE(review): dev_setup() is declared in dev-setup.h; what it creates is
 * not visible from this file. It is called on the temporary root with the
 * UID_INVALID and GID_INVALID placeholders. */
228 dev_setup(temporary_mount
, UID_INVALID
, GID_INVALID
);
230 /* Create the /dev directory if missing. It is more likely to be
231 * missing when the service is started with RootDirectory. This is
232 * consistent with mount units creating the mount points when missing.
234 (void) mkdir_p_label(m
->path
, 0755);
236 if (mount(dev
, m
->path
, NULL
, MS_MOVE
, NULL
) < 0) {
242 rmdir(temporary_mount
);
254 umount(devhugepages
);
261 rmdir(temporary_mount
);
// Sets up the private bus endpoint for the entry m. A tmpfs is mounted at a
// "kdbus" directory inside a mkdtemp() directory below /tmp, a device node
// named "bus" is created in it, m->path is bind mounted over that node, and
// the tmpfs is finally moved (MS_MOVE) onto the directory part of m->path as
// computed by dirname_malloc(). The rmdir() calls remove the temporary
// directory. Several error-handling statements are elided in this excerpt.
266 static int mount_kdbus(BindMount
*m
) {
268 char temporary_mount
[] = "/tmp/kdbus-dev-XXXXXX";
269 _cleanup_free_
char *basepath
= NULL
;
270 _cleanup_umask_ mode_t u
;
271 char *busnode
= NULL
, *root
;
279 if (!mkdtemp(temporary_mount
))
280 return log_error_errno(errno
, "Failed create temp dir: %m");
// The tmpfs is mounted nosuid, strictatime, mode 777.
282 root
= strjoina(temporary_mount
, "/kdbus");
283 (void) mkdir(root
, 0755);
284 if (mount("tmpfs", root
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=777") < 0) {
289 /* create a new /dev/null dev node copy so we have some fodder to
290 * bind-mount the custom endpoint over. */
291 if (stat("/dev/null", &st
) < 0) {
292 r
= log_error_errno(errno
, "Failed to stat /dev/null: %m");
// The new node keeps the file type bits and device number of /dev/null, but
// its permission bits are replaced with 0600.
296 busnode
= strjoina(root
, "/bus");
297 if (mknod(busnode
, (st
.st_mode
& ~07777) | 0600, st
.st_rdev
) < 0) {
298 r
= log_error_errno(errno
, "mknod() for %s failed: %m",
// Bind mount the endpoint at m->path over the placeholder node.
303 r
= mount(m
->path
, busnode
, NULL
, MS_BIND
, NULL
);
305 r
= log_error_errno(errno
, "bind mount of %s failed: %m",
310 basepath
= dirname_malloc(m
->path
);
// NOTE(review): this call is an MS_MOVE, yet the message below says
// "bind mount" - confirm whether that wording is intended.
316 if (mount(root
, basepath
, NULL
, MS_MOVE
, NULL
) < 0) {
317 r
= log_error_errno(errno
, "bind mount of %s failed: %m",
322 rmdir(temporary_mount
);
333 rmdir(temporary_mount
);
/* Performs the mount step for one BindMount entry according to its mode.
 * Callers invoke it as apply_mount(m, tmp_dir, var_tmp_dir); only the last
 * parameter is visible in this excerpt. The switch statement and several of
 * its case labels are elided as well. Visible behavior:
 *  - one case unmounts everything below m->path and selects
 *    "/run/systemd/inaccessible" as the bind source;
 *  - one case mounts nothing and relies on the later read-only pass;
 *  - PRIVATE_BUS_ENDPOINT is delegated to mount_kdbus();
 *  - an unknown mode trips assert_not_reached().
 * Cases that select a source fall through to a recursive bind mount of
 * `what` onto m->path. */
338 static int apply_mount(
341 const char *var_tmp_dir
) {
352 /* First, get rid of everything that is below if there
353 * is anything... Then, overmount it with an
354 * inaccessible directory. */
355 umount_recursive(m
->path
, 0);
357 what
= "/run/systemd/inaccessible";
362 /* Nothing to mount here, we just later toggle the
363 * MS_RDONLY bit for the mount point */
370 case PRIVATE_VAR_TMP
:
377 case PRIVATE_BUS_ENDPOINT
:
378 return mount_kdbus(m
);
381 assert_not_reached("Unknown mode");
/* Common tail: recursive bind mount of the selected source. */
386 r
= mount(what
, m
->path
, NULL
, MS_BIND
|MS_REC
, NULL
);
388 log_debug("Successfully mounted %s to %s", what
, m
->path
);
/* A missing path (ENOENT) gets special treatment when the ignore flag of
 * the entry is set; the statement taken in that case is not visible here. */
389 else if (m
->ignore
&& errno
== ENOENT
)
/* Second pass over a BindMount entry, run after all mounts were applied.
 * INACCESSIBLE and READONLY entries are passed to bind_remount_recursive()
 * with true, while READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP and PRIVATE_DEV
 * entries are passed with false. NOTE(review): the boolean presumably
 * selects read-only versus writable - confirm against the helper. Other
 * modes take a branch that is not visible in this excerpt.
 *
 * A result of -ENOENT gets special treatment when the ignore flag of the
 * entry is set; the statement taken in that case is elided. */
395 static int make_read_only(BindMount
*m
) {
400 if (IN_SET(m
->mode
, INACCESSIBLE
, READONLY
))
401 r
= bind_remount_recursive(m
->path
, true);
402 else if (IN_SET(m
->mode
, READWRITE
, PRIVATE_TMP
, PRIVATE_VAR_TMP
, PRIVATE_DEV
))
403 r
= bind_remount_recursive(m
->path
, false);
407 if (m
->ignore
&& r
== -ENOENT
)
/* NOTE(review): the line carrying the name of this function is not visible
 * in this excerpt, and neither are some parameters (tmp_dir is used below
 * without a visible declaration).
 *
 * Builds a private mount namespace for the calling process:
 *  1. unshare(CLONE_NEWNS);
 *  2. collect one BindMount entry per requested directory or private mount;
 *  3. sort the entries and drop duplicates;
 *  4. remount / as MS_SLAVE|MS_REC, and bind mount root_directory onto itself
 *     when one is given;
 *  5. apply every mount, then run the read-only pass over every entry;
 *  6. when root_directory is set, call mount_move_root() on it;
 *  7. remount / with mount_flags | MS_REC.
 * Most error checks and return statements are elided in this excerpt. */
414 const char* root_directory
,
415 char** read_write_dirs
,
416 char** read_only_dirs
,
417 char** inaccessible_dirs
,
419 const char* var_tmp_dir
,
420 const char* bus_endpoint_path
,
422 ProtectHome protect_home
,
423 ProtectSystem protect_system
,
424 unsigned long mount_flags
) {
426 BindMount
*m
, *mounts
= NULL
;
/* A caller passing 0 gets MS_SHARED as the final propagation mode. */
430 if (mount_flags
== 0)
431 mount_flags
= MS_SHARED
;
433 if (unshare(CLONE_NEWNS
) < 0)
/* Number of entries to reserve: one per optional private mount that is
 * requested, one per listed directory, three for protect_home and two for
 * protect_system (three with PROTECT_SYSTEM_FULL). The assert further down
 * checks that exactly this many entries were filled in. */
436 n
= !!tmp_dir
+ !!var_tmp_dir
+ !!bus_endpoint_path
+
437 strv_length(read_write_dirs
) +
438 strv_length(read_only_dirs
) +
439 strv_length(inaccessible_dirs
) +
441 (protect_home
!= PROTECT_HOME_NO
? 3 : 0) +
442 (protect_system
!= PROTECT_SYSTEM_NO
? 2 : 0) +
443 (protect_system
== PROTECT_SYSTEM_FULL
? 1 : 0);
/* The array comes from alloca0(); m is the fill cursor and mounts keeps
 * pointing at the start. */
446 m
= mounts
= (BindMount
*) alloca0(n
* sizeof(BindMount
));
447 r
= append_mounts(&m
, read_write_dirs
, READWRITE
);
451 r
= append_mounts(&m
, read_only_dirs
, READONLY
);
455 r
= append_mounts(&m
, inaccessible_dirs
, INACCESSIBLE
);
/* Entries for the private mounts; every path is prefixed with
 * root_directory through prefix_roota(). The conditions guarding the first
 * three entries are not visible here. */
460 m
->path
= prefix_roota(root_directory
, "/tmp");
461 m
->mode
= PRIVATE_TMP
;
466 m
->path
= prefix_roota(root_directory
, "/var/tmp");
467 m
->mode
= PRIVATE_VAR_TMP
;
472 m
->path
= prefix_roota(root_directory
, "/dev");
473 m
->mode
= PRIVATE_DEV
;
477 if (bus_endpoint_path
) {
478 m
->path
= prefix_roota(root_directory
, bus_endpoint_path
);
479 m
->mode
= PRIVATE_BUS_ENDPOINT
;
/* protect_home: /home, /run/user and /root are each given a leading "-"
 * before being handed to append_mounts(). They are added as READONLY for
 * PROTECT_HOME_READ_ONLY and as INACCESSIBLE otherwise. */
483 if (protect_home
!= PROTECT_HOME_NO
) {
484 const char *home_dir
, *run_user_dir
, *root_dir
;
486 home_dir
= prefix_roota(root_directory
, "/home");
487 home_dir
= strjoina("-", home_dir
);
488 run_user_dir
= prefix_roota(root_directory
, "/run/user");
489 run_user_dir
= strjoina("-", run_user_dir
);
490 root_dir
= prefix_roota(root_directory
, "/root");
491 root_dir
= strjoina("-", root_dir
);
493 r
= append_mounts(&m
, STRV_MAKE(home_dir
, run_user_dir
, root_dir
),
494 protect_home
== PROTECT_HOME_READ_ONLY
? READONLY
: INACCESSIBLE
);
/* protect_system: /usr and /boot are added as READONLY, plus /etc for
 * PROTECT_SYSTEM_FULL. Only /boot gets the leading "-". */
499 if (protect_system
!= PROTECT_SYSTEM_NO
) {
500 const char *usr_dir
, *boot_dir
, *etc_dir
;
502 usr_dir
= prefix_roota(root_directory
, "/usr");
503 boot_dir
= prefix_roota(root_directory
, "/boot");
504 boot_dir
= strjoina("-", boot_dir
);
505 etc_dir
= prefix_roota(root_directory
, "/etc");
507 r
= append_mounts(&m
, protect_system
== PROTECT_SYSTEM_FULL
508 ? STRV_MAKE(usr_dir
, boot_dir
, etc_dir
)
509 : STRV_MAKE(usr_dir
, boot_dir
), READONLY
);
/* Every reserved slot must have been filled. */
514 assert(mounts
+ n
== m
);
/* Sort with mount_path_compare(), then drop duplicate paths; n may shrink. */
516 qsort(mounts
, n
, sizeof(BindMount
), mount_path_compare
);
517 drop_duplicates(mounts
, &n
);
520 if (n
> 0 || root_directory
) {
521 /* Remount / as SLAVE so that nothing now mounted in the namespace
522 shows up in the parent */
523 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
527 if (root_directory
) {
528 /* Turn directory into bind mount */
529 if (mount(root_directory
, root_directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
/* First pass: establish every mount. */
534 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
535 r
= apply_mount(m
, tmp_dir
, var_tmp_dir
);
/* Second pass: run make_read_only() on every entry. */
540 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
541 r
= make_read_only(m
);
547 if (root_directory
) {
548 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
549 r
= mount_move_root(root_directory
);
551 /* at this point, we cannot rollback */
556 /* Remount / as the desired mode. Note that this will not
557 * reestablish propagation from our side to the host, since
558 * what's disconnected is disconnected. */
559 if (mount(NULL
, "/", NULL
, mount_flags
| MS_REC
, NULL
) < 0)
560 /* at this point, we cannot rollback */
/* NOTE(review): presumably the failure path (its label is not visible here):
 * every entry is detached with MNT_DETACH and the result is ignored. */
567 for (m
= mounts
; m
< mounts
+ n
; ++m
)
569 (void) umount2(m
->path
, MNT_DETACH
);
/* Prepares one private temporary directory below prefix.
 *
 * id     - identifier embedded in the directory name.
 * prefix - parent directory; callers pass "/tmp" and "/var/tmp".
 * path   - output pointer. NOTE(review): presumably receives the created
 *          directory name; the assignment is not visible in this excerpt.
 *
 * The name template is <prefix>/systemd-private-<boot id>-<id>-XXXXXX. The
 * call that turns the template into a real directory is elided. Inside it a
 * "tmp" subdirectory is created with mode 0777 plus the sticky bit, under a
 * umask of 0000. */
575 static int setup_one_tmp_dir(const char *id
, const char *prefix
, char **path
) {
576 _cleanup_free_
char *x
= NULL
;
577 char bid
[SD_ID128_STRING_MAX
];
585 /* We include the boot id in the directory so that after a
586 * reboot we can easily identify obsolete directories. */
588 r
= sd_id128_get_boot(&boot_id
);
592 x
= strjoin(prefix
, "/systemd-private-", sd_id128_to_string(boot_id
, bid
), "-", id
, "-XXXXXX", NULL
);
/* With the umask cleared, the mode passed to mkdir() below is applied
 * unmodified. */
600 RUN_WITH_UMASK(0000) {
603 y
= strjoina(x
, "/tmp");
605 if (mkdir(y
, 0777 | S_ISVTX
) < 0)
/* Creates the pair of private temporary directories for id by calling
 * setup_one_tmp_dir() once for "/tmp" (result in a) and once for "/var/tmp"
 * (result in b).
 *
 * NOTE(review): tmp_dir and var_tmp_dir are presumably output parameters
 * receiving a and b; the assignments and the error handling are not visible
 * in this excerpt. */
615 int setup_tmp_dirs(const char *id
, char **tmp_dir
, char **var_tmp_dir
) {
623 r
= setup_one_tmp_dir(id
, "/tmp", &a
);
627 r
= setup_one_tmp_dir(id
, "/var/tmp", &b
);
/* t names the "tmp" subdirectory of the first result. NOTE(review): what is
 * done with t is elided; presumably cleanup when the second call fails. */
631 t
= strjoina(a
, "/tmp");
/* Creates or joins a network namespace shared through a socketpair.
 *
 * netns_storage_socket - socketpair whose two fds must both be valid. It
 *                        serves as storage for one namespace fd.
 *
 * Flow visible in this excerpt: take a lock on socket [0], try to receive a
 * stored fd without blocking, then either create a new namespace (nothing
 * stored) or join the stored one, send the fd back through socket [1], and
 * release the lock. Error handling and return statements are elided. */
645 int setup_netns(int netns_storage_socket
[2]) {
646 _cleanup_close_
int netns
= -1;
649 assert(netns_storage_socket
);
650 assert(netns_storage_socket
[0] >= 0);
651 assert(netns_storage_socket
[1] >= 0);
653 /* We use the passed socketpair as a storage buffer for our
654 * namespace reference fd. Whatever process runs this first
655 * shall create a new namespace, all others should just join
656 * it. To serialize that we use a file lock on the socket
659 * It's a bit crazy, but hey, works great! */
661 if (lockf(netns_storage_socket
[0], F_LOCK
, 0) < 0)
/* Non-blocking receive: -EAGAIN means no fd has been stored yet. */
664 netns
= receive_one_fd(netns_storage_socket
[0], MSG_DONTWAIT
);
665 if (netns
== -EAGAIN
) {
666 /* Nothing stored yet, so let's create a new namespace */
668 if (unshare(CLONE_NEWNET
) < 0) {
/* Obtain an fd referring to the namespace that was just created. */
675 netns
= open("/proc/self/ns/net", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
683 } else if (netns
< 0) {
688 /* Yay, found something, so let's join the namespace */
689 if (setns(netns
, CLONE_NEWNET
) < 0) {
/* Put the fd into the socketpair again so that it is available to the next
 * caller. */
697 q
= send_one_fd(netns_storage_socket
[1], netns
, MSG_DONTWAIT
);
/* Release the lock taken at the top; the result is not checked. */
704 lockf(netns_storage_socket
[0], F_ULOCK
, 0);
/* String form of each ProtectHome value, indexed by enumerator through
 * designated initializers and sized by _PROTECT_HOME_MAX. */
708 static const char *const protect_home_table
[_PROTECT_HOME_MAX
] = {
709 [PROTECT_HOME_NO
] = "no",
710 [PROTECT_HOME_YES
] = "yes",
711 [PROTECT_HOME_READ_ONLY
] = "read-only",
/* NOTE(review): the macro is defined elsewhere; presumably it generates the
 * string conversion helpers for ProtectHome from the table above - confirm. */
714 DEFINE_STRING_TABLE_LOOKUP(protect_home
, ProtectHome
);
/* String form of each ProtectSystem value, indexed by enumerator through
 * designated initializers and sized by _PROTECT_SYSTEM_MAX. */
716 static const char *const protect_system_table
[_PROTECT_SYSTEM_MAX
] = {
717 [PROTECT_SYSTEM_NO
] = "no",
718 [PROTECT_SYSTEM_YES
] = "yes",
719 [PROTECT_SYSTEM_FULL
] = "full",
/* NOTE(review): the macro is defined elsewhere; presumably it generates the
 * string conversion helpers for ProtectSystem from the table above - confirm. */
722 DEFINE_STRING_TABLE_LOOKUP(protect_system
, ProtectSystem
);