1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
33 #include "path-util.h"
35 #include "loopback-setup.h"
36 #include "dev-setup.h"
37 #include "selinux-util.h"
38 #include "namespace.h"
41 typedef enum MountMode
{
42 /* This is ordered by priority! */
52 typedef struct BindMount
{
59 static int append_mounts(BindMount
**p
, char **strv
, MountMode mode
) {
64 STRV_FOREACH(i
, strv
) {
69 if ((mode
== INACCESSIBLE
|| mode
== READONLY
|| mode
== READWRITE
) && (*i
)[0] == '-') {
74 if (!path_is_absolute(*i
))
85 static int mount_path_compare(const void *a
, const void *b
) {
86 const BindMount
*p
= a
, *q
= b
;
89 d
= path_compare(p
->path
, q
->path
);
92 /* If the paths are equal, check the mode */
93 if (p
->mode
< q
->mode
)
96 if (p
->mode
> q
->mode
)
102 /* If the paths are not equal, then order prefixes first */
106 static void drop_duplicates(BindMount
*m
, unsigned *n
) {
107 BindMount
*f
, *t
, *previous
;
112 for (f
= m
, t
= m
, previous
= NULL
; f
< m
+*n
; f
++) {
114 /* The first one wins */
115 if (previous
&& path_equal(f
->path
, previous
->path
))
128 static int mount_dev(BindMount
*m
) {
129 static const char devnodes
[] =
137 char temporary_mount
[] = "/tmp/namespace-dev-XXXXXX";
138 const char *d
, *dev
= NULL
, *devpts
= NULL
, *devshm
= NULL
, *devhugepages
= NULL
, *devmqueue
= NULL
, *devlog
= NULL
, *devptmx
= NULL
;
139 _cleanup_umask_ mode_t u
;
146 if (!mkdtemp(temporary_mount
))
149 dev
= strjoina(temporary_mount
, "/dev");
150 (void) mkdir(dev
, 0755);
151 if (mount("tmpfs", dev
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=755") < 0) {
156 devpts
= strjoina(temporary_mount
, "/dev/pts");
157 (void) mkdir(devpts
, 0755);
158 if (mount("/dev/pts", devpts
, NULL
, MS_BIND
, NULL
) < 0) {
163 devptmx
= strjoina(temporary_mount
, "/dev/ptmx");
164 if (symlink("pts/ptmx", devptmx
) < 0) {
169 devshm
= strjoina(temporary_mount
, "/dev/shm");
170 (void) mkdir(devshm
, 01777);
171 r
= mount("/dev/shm", devshm
, NULL
, MS_BIND
, NULL
);
177 devmqueue
= strjoina(temporary_mount
, "/dev/mqueue");
178 (void) mkdir(devmqueue
, 0755);
179 (void) mount("/dev/mqueue", devmqueue
, NULL
, MS_BIND
, NULL
);
181 devhugepages
= strjoina(temporary_mount
, "/dev/hugepages");
182 (void) mkdir(devhugepages
, 0755);
183 (void) mount("/dev/hugepages", devhugepages
, NULL
, MS_BIND
, NULL
);
185 devlog
= strjoina(temporary_mount
, "/dev/log");
186 (void) symlink("/run/systemd/journal/dev-log", devlog
);
188 NULSTR_FOREACH(d
, devnodes
) {
189 _cleanup_free_
char *dn
= NULL
;
202 if (!S_ISBLK(st
.st_mode
) &&
203 !S_ISCHR(st
.st_mode
)) {
211 dn
= strappend(temporary_mount
, d
);
217 mac_selinux_create_file_prepare(d
, st
.st_mode
);
218 r
= mknod(dn
, st
.st_mode
, st
.st_rdev
);
219 mac_selinux_create_file_clear();
227 dev_setup(temporary_mount
, UID_INVALID
, GID_INVALID
);
229 /* Create the /dev directory if missing. It is more likely to be
230 * missing when the service is started with RootDirectory. This is
231 * consistent with mount units creating the mount points when missing.
233 (void) mkdir_p_label(m
->path
, 0755);
235 if (mount(dev
, m
->path
, NULL
, MS_MOVE
, NULL
) < 0) {
241 rmdir(temporary_mount
);
253 umount(devhugepages
);
260 rmdir(temporary_mount
);
265 static int mount_kdbus(BindMount
*m
) {
267 char temporary_mount
[] = "/tmp/kdbus-dev-XXXXXX";
268 _cleanup_free_
char *basepath
= NULL
;
269 _cleanup_umask_ mode_t u
;
270 char *busnode
= NULL
, *root
;
278 if (!mkdtemp(temporary_mount
))
279 return log_error_errno(errno
, "Failed create temp dir: %m");
281 root
= strjoina(temporary_mount
, "/kdbus");
282 (void) mkdir(root
, 0755);
283 if (mount("tmpfs", root
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=777") < 0) {
288 /* create a new /dev/null dev node copy so we have some fodder to
289 * bind-mount the custom endpoint over. */
290 if (stat("/dev/null", &st
) < 0) {
291 log_error_errno(errno
, "Failed to stat /dev/null: %m");
296 busnode
= strjoina(root
, "/bus");
297 if (mknod(busnode
, (st
.st_mode
& ~07777) | 0600, st
.st_rdev
) < 0) {
298 log_error_errno(errno
, "mknod() for %s failed: %m", busnode
);
303 r
= mount(m
->path
, busnode
, NULL
, MS_BIND
, NULL
);
305 log_error_errno(errno
, "bind mount of %s failed: %m", m
->path
);
310 basepath
= dirname_malloc(m
->path
);
316 if (mount(root
, basepath
, NULL
, MS_MOVE
, NULL
) < 0) {
317 log_error_errno(errno
, "bind mount of %s failed: %m", basepath
);
322 rmdir(temporary_mount
);
333 rmdir(temporary_mount
);
338 static int apply_mount(
341 const char *var_tmp_dir
) {
352 /* First, get rid of everything that is below if there
353 * is anything... Then, overmount it with an
354 * inaccessible directory. */
355 umount_recursive(m
->path
, 0);
357 what
= "/run/systemd/inaccessible";
362 /* Nothing to mount here, we just later toggle the
363 * MS_RDONLY bit for the mount point */
370 case PRIVATE_VAR_TMP
:
377 case PRIVATE_BUS_ENDPOINT
:
378 return mount_kdbus(m
);
381 assert_not_reached("Unknown mode");
386 r
= mount(what
, m
->path
, NULL
, MS_BIND
|MS_REC
, NULL
);
388 log_debug("Successfully mounted %s to %s", what
, m
->path
);
389 else if (m
->ignore
&& errno
== ENOENT
)
395 static int make_read_only(BindMount
*m
) {
400 if (IN_SET(m
->mode
, INACCESSIBLE
, READONLY
))
401 r
= bind_remount_recursive(m
->path
, true);
402 else if (IN_SET(m
->mode
, READWRITE
, PRIVATE_TMP
, PRIVATE_VAR_TMP
, PRIVATE_DEV
))
403 r
= bind_remount_recursive(m
->path
, false);
407 if (m
->ignore
&& r
== -ENOENT
)
414 const char* root_directory
,
415 char** read_write_dirs
,
416 char** read_only_dirs
,
417 char** inaccessible_dirs
,
419 const char* var_tmp_dir
,
420 const char* bus_endpoint_path
,
422 ProtectHome protect_home
,
423 ProtectSystem protect_system
,
424 unsigned long mount_flags
) {
426 BindMount
*m
, *mounts
= NULL
;
430 if (mount_flags
== 0)
431 mount_flags
= MS_SHARED
;
433 if (unshare(CLONE_NEWNS
) < 0)
436 n
= !!tmp_dir
+ !!var_tmp_dir
+ !!bus_endpoint_path
+
437 strv_length(read_write_dirs
) +
438 strv_length(read_only_dirs
) +
439 strv_length(inaccessible_dirs
) +
441 (protect_home
!= PROTECT_HOME_NO
? 3 : 0) +
442 (protect_system
!= PROTECT_SYSTEM_NO
? 2 : 0) +
443 (protect_system
== PROTECT_SYSTEM_FULL
? 1 : 0);
446 m
= mounts
= (BindMount
*) alloca0(n
* sizeof(BindMount
));
447 r
= append_mounts(&m
, read_write_dirs
, READWRITE
);
451 r
= append_mounts(&m
, read_only_dirs
, READONLY
);
455 r
= append_mounts(&m
, inaccessible_dirs
, INACCESSIBLE
);
460 m
->path
= prefix_roota(root_directory
, "/tmp");
461 m
->mode
= PRIVATE_TMP
;
466 m
->path
= prefix_roota(root_directory
, "/var/tmp");
467 m
->mode
= PRIVATE_VAR_TMP
;
472 m
->path
= prefix_roota(root_directory
, "/dev");
473 m
->mode
= PRIVATE_DEV
;
477 if (bus_endpoint_path
) {
478 m
->path
= prefix_roota(root_directory
, bus_endpoint_path
);
479 m
->mode
= PRIVATE_BUS_ENDPOINT
;
483 if (protect_home
!= PROTECT_HOME_NO
) {
484 const char *home_dir
, *run_user_dir
, *root_dir
;
486 home_dir
= prefix_roota(root_directory
, "/home");
487 home_dir
= strjoina("-", home_dir
);
488 run_user_dir
= prefix_roota(root_directory
, "/run/user");
489 run_user_dir
= strjoina("-", run_user_dir
);
490 root_dir
= prefix_roota(root_directory
, "/root");
491 root_dir
= strjoina("-", root_dir
);
493 r
= append_mounts(&m
, STRV_MAKE(home_dir
, run_user_dir
, root_dir
),
494 protect_home
== PROTECT_HOME_READ_ONLY
? READONLY
: INACCESSIBLE
);
499 if (protect_system
!= PROTECT_SYSTEM_NO
) {
500 const char *usr_dir
, *boot_dir
, *etc_dir
;
502 usr_dir
= prefix_roota(root_directory
, "/usr");
503 boot_dir
= prefix_roota(root_directory
, "/boot");
504 boot_dir
= strjoina("-", boot_dir
);
505 etc_dir
= prefix_roota(root_directory
, "/etc");
507 r
= append_mounts(&m
, protect_system
== PROTECT_SYSTEM_FULL
508 ? STRV_MAKE(usr_dir
, boot_dir
, etc_dir
)
509 : STRV_MAKE(usr_dir
, boot_dir
), READONLY
);
514 assert(mounts
+ n
== m
);
516 qsort(mounts
, n
, sizeof(BindMount
), mount_path_compare
);
517 drop_duplicates(mounts
, &n
);
520 if (n
> 0 || root_directory
) {
521 /* Remount / as SLAVE so that nothing now mounted in the namespace
522 shows up in the parent */
523 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
527 if (root_directory
) {
528 /* Turn directory into bind mount */
529 if (mount(root_directory
, root_directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
534 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
535 r
= apply_mount(m
, tmp_dir
, var_tmp_dir
);
540 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
541 r
= make_read_only(m
);
547 if (root_directory
) {
548 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
549 r
= mount_move_root(root_directory
);
551 /* at this point, we cannot rollback */
556 /* Remount / as the desired mode. Note that this will not
557 * reestablish propagation from our side to the host, since
558 * what's disconnected is disconnected. */
559 if (mount(NULL
, "/", NULL
, mount_flags
| MS_REC
, NULL
) < 0) {
560 /* at this point, we cannot rollback */
568 for (m
= mounts
; m
< mounts
+ n
; ++m
)
570 (void) umount2(m
->path
, MNT_DETACH
);
576 static int setup_one_tmp_dir(const char *id
, const char *prefix
, char **path
) {
577 _cleanup_free_
char *x
= NULL
;
578 char bid
[SD_ID128_STRING_MAX
];
586 /* We include the boot id in the directory so that after a
587 * reboot we can easily identify obsolete directories. */
589 r
= sd_id128_get_boot(&boot_id
);
593 x
= strjoin(prefix
, "/systemd-private-", sd_id128_to_string(boot_id
, bid
), "-", id
, "-XXXXXX", NULL
);
601 RUN_WITH_UMASK(0000) {
604 y
= strjoina(x
, "/tmp");
606 if (mkdir(y
, 0777 | S_ISVTX
) < 0)
616 int setup_tmp_dirs(const char *id
, char **tmp_dir
, char **var_tmp_dir
) {
624 r
= setup_one_tmp_dir(id
, "/tmp", &a
);
628 r
= setup_one_tmp_dir(id
, "/var/tmp", &b
);
632 t
= strjoina(a
, "/tmp");
646 int setup_netns(int netns_storage_socket
[2]) {
647 _cleanup_close_
int netns
= -1;
649 struct cmsghdr cmsghdr
;
650 uint8_t buf
[CMSG_SPACE(sizeof(int))];
653 .msg_control
= &control
,
654 .msg_controllen
= sizeof(control
),
656 struct cmsghdr
*cmsg
;
659 assert(netns_storage_socket
);
660 assert(netns_storage_socket
[0] >= 0);
661 assert(netns_storage_socket
[1] >= 0);
663 /* We use the passed socketpair as a storage buffer for our
664 * namespace reference fd. Whatever process runs this first
665 * shall create a new namespace, all others should just join
666 * it. To serialize that we use a file lock on the socket
669 * It's a bit crazy, but hey, works great! */
671 if (lockf(netns_storage_socket
[0], F_LOCK
, 0) < 0)
674 if (recvmsg(netns_storage_socket
[0], &mh
, MSG_DONTWAIT
|MSG_CMSG_CLOEXEC
) < 0) {
675 if (errno
!= EAGAIN
) {
680 /* Nothing stored yet, so let's create a new namespace */
682 if (unshare(CLONE_NEWNET
) < 0) {
689 netns
= open("/proc/self/ns/net", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
697 /* Yay, found something, so let's join the namespace */
699 CMSG_FOREACH(cmsg
, &mh
)
700 if (cmsg
->cmsg_level
== SOL_SOCKET
&& cmsg
->cmsg_type
== SCM_RIGHTS
) {
701 assert(cmsg
->cmsg_len
== CMSG_LEN(sizeof(int)));
702 netns
= *(int*) CMSG_DATA(cmsg
);
705 if (setns(netns
, CLONE_NEWNET
) < 0) {
713 cmsg
= CMSG_FIRSTHDR(&mh
);
714 cmsg
->cmsg_level
= SOL_SOCKET
;
715 cmsg
->cmsg_type
= SCM_RIGHTS
;
716 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
717 memcpy(CMSG_DATA(cmsg
), &netns
, sizeof(int));
718 mh
.msg_controllen
= cmsg
->cmsg_len
;
720 if (sendmsg(netns_storage_socket
[1], &mh
, MSG_DONTWAIT
|MSG_NOSIGNAL
) < 0) {
726 lockf(netns_storage_socket
[0], F_ULOCK
, 0);
731 static const char *const protect_home_table
[_PROTECT_HOME_MAX
] = {
732 [PROTECT_HOME_NO
] = "no",
733 [PROTECT_HOME_YES
] = "yes",
734 [PROTECT_HOME_READ_ONLY
] = "read-only",
737 DEFINE_STRING_TABLE_LOOKUP(protect_home
, ProtectHome
);
739 static const char *const protect_system_table
[_PROTECT_SYSTEM_MAX
] = {
740 [PROTECT_SYSTEM_NO
] = "no",
741 [PROTECT_SYSTEM_YES
] = "yes",
742 [PROTECT_SYSTEM_FULL
] = "full",
745 DEFINE_STRING_TABLE_LOOKUP(protect_system
, ProtectSystem
);