1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
33 #include "path-util.h"
35 #include "loopback-setup.h"
36 #include "dev-setup.h"
37 #include "selinux-util.h"
38 #include "namespace.h"
41 typedef enum MountMode
{
42 /* This is ordered by priority! */
52 typedef struct BindMount
{
59 static int append_mounts(BindMount
**p
, char **strv
, MountMode mode
) {
64 STRV_FOREACH(i
, strv
) {
69 if ((mode
== INACCESSIBLE
|| mode
== READONLY
|| mode
== READWRITE
) && (*i
)[0] == '-') {
74 if (!path_is_absolute(*i
))
85 static int mount_path_compare(const void *a
, const void *b
) {
86 const BindMount
*p
= a
, *q
= b
;
89 d
= path_compare(p
->path
, q
->path
);
92 /* If the paths are equal, check the mode */
93 if (p
->mode
< q
->mode
)
96 if (p
->mode
> q
->mode
)
102 /* If the paths are not equal, then order prefixes first */
106 static void drop_duplicates(BindMount
*m
, unsigned *n
) {
107 BindMount
*f
, *t
, *previous
;
112 for (f
= m
, t
= m
, previous
= NULL
; f
< m
+*n
; f
++) {
114 /* The first one wins */
115 if (previous
&& path_equal(f
->path
, previous
->path
))
128 static int mount_dev(BindMount
*m
) {
129 static const char devnodes
[] =
137 char temporary_mount
[] = "/tmp/namespace-dev-XXXXXX";
138 const char *d
, *dev
= NULL
, *devpts
= NULL
, *devshm
= NULL
, *devhugepages
= NULL
, *devmqueue
= NULL
, *devlog
= NULL
, *devptmx
= NULL
;
139 _cleanup_umask_ mode_t u
;
146 if (!mkdtemp(temporary_mount
))
149 dev
= strjoina(temporary_mount
, "/dev");
150 (void) mkdir(dev
, 0755);
151 if (mount("tmpfs", dev
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=755") < 0) {
156 devpts
= strjoina(temporary_mount
, "/dev/pts");
157 (void) mkdir(devpts
, 0755);
158 if (mount("/dev/pts", devpts
, NULL
, MS_BIND
, NULL
) < 0) {
163 devptmx
= strjoina(temporary_mount
, "/dev/ptmx");
164 if (symlink("pts/ptmx", devptmx
) < 0) {
169 devshm
= strjoina(temporary_mount
, "/dev/shm");
170 (void) mkdir(devshm
, 01777);
171 r
= mount("/dev/shm", devshm
, NULL
, MS_BIND
, NULL
);
177 devmqueue
= strjoina(temporary_mount
, "/dev/mqueue");
178 (void) mkdir(devmqueue
, 0755);
179 (void) mount("/dev/mqueue", devmqueue
, NULL
, MS_BIND
, NULL
);
181 devhugepages
= strjoina(temporary_mount
, "/dev/hugepages");
182 (void) mkdir(devhugepages
, 0755);
183 (void) mount("/dev/hugepages", devhugepages
, NULL
, MS_BIND
, NULL
);
185 devlog
= strjoina(temporary_mount
, "/dev/log");
186 (void) symlink("/run/systemd/journal/dev-log", devlog
);
188 NULSTR_FOREACH(d
, devnodes
) {
189 _cleanup_free_
char *dn
= NULL
;
202 if (!S_ISBLK(st
.st_mode
) &&
203 !S_ISCHR(st
.st_mode
)) {
211 dn
= strappend(temporary_mount
, d
);
217 mac_selinux_create_file_prepare(d
, st
.st_mode
);
218 r
= mknod(dn
, st
.st_mode
, st
.st_rdev
);
219 mac_selinux_create_file_clear();
227 dev_setup(temporary_mount
, UID_INVALID
, GID_INVALID
);
229 /* Create the /dev directory if missing. It is more likely to be
230 * missing when the service is started with RootDirectory. This is
231 * consistent with mount units creating the mount points when missing. */
233 (void) mkdir_p_label(m
->path
, 0755);
235 if (mount(dev
, m
->path
, NULL
, MS_MOVE
, NULL
) < 0) {
241 rmdir(temporary_mount
);
253 umount(devhugepages
);
260 rmdir(temporary_mount
);
265 static int mount_kdbus(BindMount
*m
) {
267 char temporary_mount
[] = "/tmp/kdbus-dev-XXXXXX";
268 _cleanup_free_
char *basepath
= NULL
;
269 _cleanup_umask_ mode_t u
;
270 char *busnode
= NULL
, *root
;
278 if (!mkdtemp(temporary_mount
))
279 return log_error_errno(errno
, "Failed create temp dir: %m");
281 root
= strjoina(temporary_mount
, "/kdbus");
282 (void) mkdir(root
, 0755);
283 if (mount("tmpfs", root
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=777") < 0) {
288 /* create a new /dev/null dev node copy so we have some fodder to
289 * bind-mount the custom endpoint over. */
290 if (stat("/dev/null", &st
) < 0) {
291 r
= log_error_errno(errno
, "Failed to stat /dev/null: %m");
295 busnode
= strjoina(root
, "/bus");
296 if (mknod(busnode
, (st
.st_mode
& ~07777) | 0600, st
.st_rdev
) < 0) {
297 r
= log_error_errno(errno
, "mknod() for %s failed: %m",
302 r
= mount(m
->path
, busnode
, NULL
, MS_BIND
, NULL
);
304 r
= log_error_errno(errno
, "bind mount of %s failed: %m",
309 basepath
= dirname_malloc(m
->path
);
315 if (mount(root
, basepath
, NULL
, MS_MOVE
, NULL
) < 0) {
316 r
= log_error_errno(errno
, "bind mount of %s failed: %m",
321 rmdir(temporary_mount
);
332 rmdir(temporary_mount
);
337 static int apply_mount(
340 const char *var_tmp_dir
) {
351 /* First, get rid of everything that is below if there
352 * is anything... Then, overmount it with an
353 * inaccessible directory. */
354 umount_recursive(m
->path
, 0);
356 what
= "/run/systemd/inaccessible";
361 /* Nothing to mount here, we just later toggle the
362 * MS_RDONLY bit for the mount point */
369 case PRIVATE_VAR_TMP
:
376 case PRIVATE_BUS_ENDPOINT
:
377 return mount_kdbus(m
);
380 assert_not_reached("Unknown mode");
385 r
= mount(what
, m
->path
, NULL
, MS_BIND
|MS_REC
, NULL
);
387 log_debug("Successfully mounted %s to %s", what
, m
->path
);
388 else if (m
->ignore
&& errno
== ENOENT
)
394 static int make_read_only(BindMount
*m
) {
399 if (IN_SET(m
->mode
, INACCESSIBLE
, READONLY
))
400 r
= bind_remount_recursive(m
->path
, true);
401 else if (IN_SET(m
->mode
, READWRITE
, PRIVATE_TMP
, PRIVATE_VAR_TMP
, PRIVATE_DEV
))
402 r
= bind_remount_recursive(m
->path
, false);
406 if (m
->ignore
&& r
== -ENOENT
)
413 const char* root_directory
,
414 char** read_write_dirs
,
415 char** read_only_dirs
,
416 char** inaccessible_dirs
,
418 const char* var_tmp_dir
,
419 const char* bus_endpoint_path
,
421 ProtectHome protect_home
,
422 ProtectSystem protect_system
,
423 unsigned long mount_flags
) {
425 BindMount
*m
, *mounts
= NULL
;
429 if (mount_flags
== 0)
430 mount_flags
= MS_SHARED
;
432 if (unshare(CLONE_NEWNS
) < 0)
435 n
= !!tmp_dir
+ !!var_tmp_dir
+ !!bus_endpoint_path
+
436 strv_length(read_write_dirs
) +
437 strv_length(read_only_dirs
) +
438 strv_length(inaccessible_dirs
) +
440 (protect_home
!= PROTECT_HOME_NO
? 3 : 0) +
441 (protect_system
!= PROTECT_SYSTEM_NO
? 2 : 0) +
442 (protect_system
== PROTECT_SYSTEM_FULL
? 1 : 0);
445 m
= mounts
= (BindMount
*) alloca0(n
* sizeof(BindMount
));
446 r
= append_mounts(&m
, read_write_dirs
, READWRITE
);
450 r
= append_mounts(&m
, read_only_dirs
, READONLY
);
454 r
= append_mounts(&m
, inaccessible_dirs
, INACCESSIBLE
);
459 m
->path
= prefix_roota(root_directory
, "/tmp");
460 m
->mode
= PRIVATE_TMP
;
465 m
->path
= prefix_roota(root_directory
, "/var/tmp");
466 m
->mode
= PRIVATE_VAR_TMP
;
471 m
->path
= prefix_roota(root_directory
, "/dev");
472 m
->mode
= PRIVATE_DEV
;
476 if (bus_endpoint_path
) {
477 m
->path
= prefix_roota(root_directory
, bus_endpoint_path
);
478 m
->mode
= PRIVATE_BUS_ENDPOINT
;
482 if (protect_home
!= PROTECT_HOME_NO
) {
483 const char *home_dir
, *run_user_dir
, *root_dir
;
485 home_dir
= prefix_roota(root_directory
, "/home");
486 home_dir
= strjoina("-", home_dir
);
487 run_user_dir
= prefix_roota(root_directory
, "/run/user");
488 run_user_dir
= strjoina("-", run_user_dir
);
489 root_dir
= prefix_roota(root_directory
, "/root");
490 root_dir
= strjoina("-", root_dir
);
492 r
= append_mounts(&m
, STRV_MAKE(home_dir
, run_user_dir
, root_dir
),
493 protect_home
== PROTECT_HOME_READ_ONLY
? READONLY
: INACCESSIBLE
);
498 if (protect_system
!= PROTECT_SYSTEM_NO
) {
499 const char *usr_dir
, *boot_dir
, *etc_dir
;
501 usr_dir
= prefix_roota(root_directory
, "/usr");
502 boot_dir
= prefix_roota(root_directory
, "/boot");
503 boot_dir
= strjoina("-", boot_dir
);
504 etc_dir
= prefix_roota(root_directory
, "/etc");
506 r
= append_mounts(&m
, protect_system
== PROTECT_SYSTEM_FULL
507 ? STRV_MAKE(usr_dir
, boot_dir
, etc_dir
)
508 : STRV_MAKE(usr_dir
, boot_dir
), READONLY
);
513 assert(mounts
+ n
== m
);
515 qsort(mounts
, n
, sizeof(BindMount
), mount_path_compare
);
516 drop_duplicates(mounts
, &n
);
519 if (n
> 0 || root_directory
) {
520 /* Remount / as SLAVE so that nothing now mounted in the namespace
521 shows up in the parent */
522 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
526 if (root_directory
) {
527 /* Turn directory into bind mount */
528 if (mount(root_directory
, root_directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
533 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
534 r
= apply_mount(m
, tmp_dir
, var_tmp_dir
);
539 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
540 r
= make_read_only(m
);
546 if (root_directory
) {
547 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
548 r
= mount_move_root(root_directory
);
550 /* at this point, we cannot rollback */
555 /* Remount / as the desired mode. Note that this will not
556 * reestablish propagation from our side to the host, since
557 * what's disconnected is disconnected. */
558 if (mount(NULL
, "/", NULL
, mount_flags
| MS_REC
, NULL
) < 0) {
559 /* at this point, we cannot rollback */
567 for (m
= mounts
; m
< mounts
+ n
; ++m
)
569 (void) umount2(m
->path
, MNT_DETACH
);
575 static int setup_one_tmp_dir(const char *id
, const char *prefix
, char **path
) {
576 _cleanup_free_
char *x
= NULL
;
577 char bid
[SD_ID128_STRING_MAX
];
585 /* We include the boot id in the directory so that after a
586 * reboot we can easily identify obsolete directories. */
588 r
= sd_id128_get_boot(&boot_id
);
592 x
= strjoin(prefix
, "/systemd-private-", sd_id128_to_string(boot_id
, bid
), "-", id
, "-XXXXXX", NULL
);
600 RUN_WITH_UMASK(0000) {
603 y
= strjoina(x
, "/tmp");
605 if (mkdir(y
, 0777 | S_ISVTX
) < 0)
615 int setup_tmp_dirs(const char *id
, char **tmp_dir
, char **var_tmp_dir
) {
623 r
= setup_one_tmp_dir(id
, "/tmp", &a
);
627 r
= setup_one_tmp_dir(id
, "/var/tmp", &b
);
631 t
= strjoina(a
, "/tmp");
645 int setup_netns(int netns_storage_socket
[2]) {
646 _cleanup_close_
int netns
= -1;
648 struct cmsghdr cmsghdr
;
649 uint8_t buf
[CMSG_SPACE(sizeof(int))];
652 .msg_control
= &control
,
653 .msg_controllen
= sizeof(control
),
655 struct cmsghdr
*cmsg
;
658 assert(netns_storage_socket
);
659 assert(netns_storage_socket
[0] >= 0);
660 assert(netns_storage_socket
[1] >= 0);
662 /* We use the passed socketpair as a storage buffer for our
663 * namespace reference fd. Whatever process runs this first
664 * shall create a new namespace, all others should just join
665 * it. To serialize that we use a file lock on the socket pair.
668 * It's a bit crazy, but hey, works great! */
670 if (lockf(netns_storage_socket
[0], F_LOCK
, 0) < 0)
673 if (recvmsg(netns_storage_socket
[0], &mh
, MSG_DONTWAIT
|MSG_CMSG_CLOEXEC
) < 0) {
674 if (errno
!= EAGAIN
) {
679 /* Nothing stored yet, so let's create a new namespace */
681 if (unshare(CLONE_NEWNET
) < 0) {
688 netns
= open("/proc/self/ns/net", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
696 /* Yay, found something, so let's join the namespace */
698 CMSG_FOREACH(cmsg
, &mh
)
699 if (cmsg
->cmsg_level
== SOL_SOCKET
&& cmsg
->cmsg_type
== SCM_RIGHTS
) {
700 assert(cmsg
->cmsg_len
== CMSG_LEN(sizeof(int)));
701 netns
= *(int*) CMSG_DATA(cmsg
);
704 if (setns(netns
, CLONE_NEWNET
) < 0) {
712 cmsg
= CMSG_FIRSTHDR(&mh
);
713 cmsg
->cmsg_level
= SOL_SOCKET
;
714 cmsg
->cmsg_type
= SCM_RIGHTS
;
715 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
716 memcpy(CMSG_DATA(cmsg
), &netns
, sizeof(int));
717 mh
.msg_controllen
= cmsg
->cmsg_len
;
719 if (sendmsg(netns_storage_socket
[1], &mh
, MSG_DONTWAIT
|MSG_NOSIGNAL
) < 0) {
725 lockf(netns_storage_socket
[0], F_ULOCK
, 0);
730 static const char *const protect_home_table
[_PROTECT_HOME_MAX
] = {
731 [PROTECT_HOME_NO
] = "no",
732 [PROTECT_HOME_YES
] = "yes",
733 [PROTECT_HOME_READ_ONLY
] = "read-only",
736 DEFINE_STRING_TABLE_LOOKUP(protect_home
, ProtectHome
);
738 static const char *const protect_system_table
[_PROTECT_SYSTEM_MAX
] = {
739 [PROTECT_SYSTEM_NO
] = "no",
740 [PROTECT_SYSTEM_YES
] = "yes",
741 [PROTECT_SYSTEM_FULL
] = "full",
744 DEFINE_STRING_TABLE_LOOKUP(protect_system
, ProtectSystem
);