1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/signalfd.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
49 #include <selinux/selinux.h>
57 #include <blkid/blkid.h>
60 #include "sd-daemon.h"
70 #include "cgroup-util.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
79 #include "bus-error.h"
81 #include "bus-kernel.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
88 #include "siphash24.h"
90 #include "base-filesystem.h"
92 #include "event-util.h"
95 #include "seccomp-util.h"
98 typedef enum ContainerStatus
{
103 typedef enum LinkJournal
{
110 typedef enum Volatile
{
/* Command-line settings. The initializers below are the defaults;
 * parse_argv() overwrites them according to the options it sees. */
116 static char *arg_directory
= NULL
;
117 static char *arg_user
= NULL
;
118 static sd_id128_t arg_uuid
= {};
119 static char *arg_machine
= NULL
;
120 static const char *arg_selinux_context
= NULL
;
121 static const char *arg_selinux_apifs_context
= NULL
;
122 static const char *arg_slice
= NULL
;
123 static bool arg_private_network
= false;
124 static bool arg_read_only
= false;
125 static bool arg_boot
= false;
126 static LinkJournal arg_link_journal
= LINK_AUTO
;
127 static bool arg_link_journal_try
= false;
/* Bitmask of capabilities to keep: bit N stands for capability number N.
 * parse_argv() adds to and removes from this default set, and
 * drop_capabilities() hands the complement on for dropping. */
128 static uint64_t arg_retain
=
129 (1ULL << CAP_CHOWN
) |
130 (1ULL << CAP_DAC_OVERRIDE
) |
131 (1ULL << CAP_DAC_READ_SEARCH
) |
132 (1ULL << CAP_FOWNER
) |
133 (1ULL << CAP_FSETID
) |
134 (1ULL << CAP_IPC_OWNER
) |
136 (1ULL << CAP_LEASE
) |
137 (1ULL << CAP_LINUX_IMMUTABLE
) |
138 (1ULL << CAP_NET_BIND_SERVICE
) |
139 (1ULL << CAP_NET_BROADCAST
) |
140 (1ULL << CAP_NET_RAW
) |
141 (1ULL << CAP_SETGID
) |
142 (1ULL << CAP_SETFCAP
) |
143 (1ULL << CAP_SETPCAP
) |
144 (1ULL << CAP_SETUID
) |
145 (1ULL << CAP_SYS_ADMIN
) |
146 (1ULL << CAP_SYS_CHROOT
) |
147 (1ULL << CAP_SYS_NICE
) |
148 (1ULL << CAP_SYS_PTRACE
) |
149 (1ULL << CAP_SYS_TTY_CONFIG
) |
150 (1ULL << CAP_SYS_RESOURCE
) |
151 (1ULL << CAP_SYS_BOOT
) |
152 (1ULL << CAP_AUDIT_WRITE
) |
153 (1ULL << CAP_AUDIT_CONTROL
) |
/* arg_bind, arg_bind_ro and arg_tmpfs are string lists consumed in pairs
 * (see STRV_FOREACH_PAIR in mount_binds() and mount_tmpfs()). */
155 static char **arg_bind
= NULL
;
156 static char **arg_bind_ro
= NULL
;
157 static char **arg_tmpfs
= NULL
;
158 static char **arg_setenv
= NULL
;
159 static bool arg_quiet
= false;
160 static bool arg_share_system
= false;
161 static bool arg_register
= true;
162 static bool arg_keep_unit
= false;
163 static char **arg_network_interfaces
= NULL
;
164 static char **arg_network_macvlan
= NULL
;
165 static bool arg_network_veth
= false;
166 static const char *arg_network_bridge
= NULL
;
/* 0xffffffff is also the value parse_argv() rejects as an unknown or
 * unsupported personality. */
167 static unsigned long arg_personality
= 0xffffffffLU
;
168 static const char *arg_image
= NULL
;
169 static Volatile arg_volatile
= VOLATILE_NO
;
/* Print the usage summary to stdout. The program name shown in the first
 * line is program_invocation_short_name. Takes no arguments, returns
 * nothing. Descriptions that wrap onto a second line must carry their
 * continuation string, otherwise the sentence is cut off in the output. */
171 static void help(void) {
172 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
173 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
174 " -h --help Show this help\n"
175 " --version Print version string\n"
176 " -q --quiet Do not show status information\n"
177 " -D --directory=PATH Root directory for the container\n"
178 " -i --image=PATH File system device or image for the container\n"
179 " -b --boot Boot up full system (i.e. invoke init)\n"
180 " -u --user=USER Run the command under specified user or uid\n"
181 " -M --machine=NAME Set the machine name for the container\n"
182 " --uuid=UUID Set a specific machine UUID for the container\n"
183 " -S --slice=SLICE Place the container in the specified slice\n"
184 " --private-network Disable network in container\n"
185 " --network-interface=INTERFACE\n"
186 " Assign an existing network interface to the\n"
" container\n"
188 " --network-macvlan=INTERFACE\n"
189 " Create a macvlan network interface based on an\n"
190 " existing network interface to the container\n"
191 " --network-veth Add a virtual ethernet connection between host\n"
" and container\n"
193 " --network-bridge=INTERFACE\n"
194 " Add a virtual ethernet connection between host\n"
195 " and container and add it to an existing bridge on\n"
" the host\n"
197 " -Z --selinux-context=SECLABEL\n"
198 " Set the SELinux security context to be used by\n"
199 " processes in the container\n"
200 " -L --selinux-apifs-context=SECLABEL\n"
201 " Set the SELinux security context to be used by\n"
202 " API/tmpfs file systems in the container\n"
203 " --capability=CAP In addition to the default, retain specified\n"
" capability\n"
205 " --drop-capability=CAP Drop the specified capability from the default set\n"
206 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
207 " try-guest, try-host\n"
208 " -j Equivalent to --link-journal=try-guest\n"
209 " --read-only Mount the root directory read-only\n"
210 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
" the container\n"
212 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
213 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
214 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
215 " --share-system Share system namespaces with host\n"
216 " --register=BOOLEAN Register container as machine\n"
217 " --keep-unit Do not register a scope for the machine, reuse\n"
218 " the service unit nspawn is running in\n"
219 " --volatile[=MODE] Run the system in volatile mode\n",
220 program_invocation_short_name
);
/* Parse the command line with getopt_long() and store the results in the
 * arg_* globals. After the option loop, mutually exclusive combinations are
 * rejected with an error message and the final capability mask arg_retain
 * is computed. */
223 static int parse_argv(int argc
, char *argv
[]) {
240 ARG_NETWORK_INTERFACE
,
/* Long options. Entries whose last field is a character share their
 * handling with the short option of that letter. */
248 static const struct option options
[] = {
249 { "help", no_argument
, NULL
, 'h' },
250 { "version", no_argument
, NULL
, ARG_VERSION
},
251 { "directory", required_argument
, NULL
, 'D' },
252 { "user", required_argument
, NULL
, 'u' },
253 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
254 { "boot", no_argument
, NULL
, 'b' },
255 { "uuid", required_argument
, NULL
, ARG_UUID
},
256 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
257 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
258 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
259 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
260 { "bind", required_argument
, NULL
, ARG_BIND
},
261 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
262 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
263 { "machine", required_argument
, NULL
, 'M' },
264 { "slice", required_argument
, NULL
, 'S' },
265 { "setenv", required_argument
, NULL
, ARG_SETENV
},
266 { "selinux-context", required_argument
, NULL
, 'Z' },
267 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
268 { "quiet", no_argument
, NULL
, 'q' },
269 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
270 { "register", required_argument
, NULL
, ARG_REGISTER
},
271 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
272 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
273 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
274 { "network-veth", no_argument
, NULL
, ARG_NETWORK_VETH
},
275 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
276 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
277 { "image", required_argument
, NULL
, 'i' },
278 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
/* Capability bits collected from --capability (plus) and
 * --drop-capability (minus); applied to arg_retain at the very end. */
283 uint64_t plus
= 0, minus
= 0;
/* A leading '+' in the short-option string makes getopt_long() stop at the
 * first non-option argument (see getopt(3)). */
288 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:", options
, NULL
)) >= 0)
297 puts(PACKAGE_STRING
);
298 puts(SYSTEMD_FEATURES
);
303 arg_directory
= canonicalize_file_name(optarg
);
304 if (!arg_directory
) {
305 log_error("Invalid root directory: %m");
317 arg_user
= strdup(optarg
);
323 case ARG_NETWORK_BRIDGE
:
324 arg_network_bridge
= optarg
;
328 case ARG_NETWORK_VETH
:
329 arg_network_veth
= true;
330 arg_private_network
= true;
333 case ARG_NETWORK_INTERFACE
:
334 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
337 arg_private_network
= true;
340 case ARG_NETWORK_MACVLAN
:
341 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
346 case ARG_PRIVATE_NETWORK
:
347 arg_private_network
= true;
355 r
= sd_id128_from_string(optarg
, &arg_uuid
);
357 log_error("Invalid UUID: %s", optarg
);
367 if (isempty(optarg
)) {
372 if (!hostname_is_valid(optarg
)) {
373 log_error("Invalid machine name: %s", optarg
);
378 arg_machine
= strdup(optarg
);
386 arg_selinux_context
= optarg
;
390 arg_selinux_apifs_context
= optarg
;
394 arg_read_only
= true;
/* The argument is a comma-separated list of capability names. The code
 * below tests c against ARG_CAPABILITY to decide whether a name goes into
 * plus or minus; the word "all" selects every bit. */
398 case ARG_DROP_CAPABILITY
: {
399 const char *state
, *word
;
402 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
403 _cleanup_free_
char *t
;
406 t
= strndup(word
, length
);
410 if (streq(t
, "all")) {
411 if (c
== ARG_CAPABILITY
)
412 plus
= (uint64_t) -1;
414 minus
= (uint64_t) -1;
416 if (cap_from_name(t
, &cap
) < 0) {
417 log_error("Failed to parse capability %s.", t
);
421 if (c
== ARG_CAPABILITY
)
422 plus
|= 1ULL << (uint64_t) cap
;
424 minus
|= 1ULL << (uint64_t) cap
;
432 arg_link_journal
= LINK_GUEST
;
433 arg_link_journal_try
= true;
/* The "try-" variants select the same mode as their plain counterpart and
 * additionally set arg_link_journal_try. */
436 case ARG_LINK_JOURNAL
:
437 if (streq(optarg
, "auto"))
438 arg_link_journal
= LINK_AUTO
;
439 else if (streq(optarg
, "no"))
440 arg_link_journal
= LINK_NO
;
441 else if (streq(optarg
, "guest"))
442 arg_link_journal
= LINK_GUEST
;
443 else if (streq(optarg
, "host"))
444 arg_link_journal
= LINK_HOST
;
445 else if (streq(optarg
, "try-guest")) {
446 arg_link_journal
= LINK_GUEST
;
447 arg_link_journal_try
= true;
448 } else if (streq(optarg
, "try-host")) {
449 arg_link_journal
= LINK_HOST
;
450 arg_link_journal_try
= true;
452 log_error("Failed to parse link journal mode %s", optarg
);
/* Bind specifications: source and destination are appended to the selected
 * list as two consecutive entries; both must be absolute paths. */
460 _cleanup_free_
char *a
= NULL
, *b
= NULL
;
464 x
= c
== ARG_BIND
? &arg_bind
: &arg_bind_ro
;
466 e
= strchr(optarg
, ':');
468 a
= strndup(optarg
, e
- optarg
);
478 if (!path_is_absolute(a
) || !path_is_absolute(b
)) {
479 log_error("Invalid bind mount specification: %s", optarg
);
483 r
= strv_extend(x
, a
);
487 r
= strv_extend(x
, b
);
/* tmpfs specifications: path and mount options are pushed onto arg_tmpfs as
 * two consecutive entries; "mode=0755" is the fallback option string. */
495 _cleanup_free_
char *a
= NULL
, *b
= NULL
;
498 e
= strchr(optarg
, ':');
500 a
= strndup(optarg
, e
- optarg
);
504 b
= strdup("mode=0755");
510 if (!path_is_absolute(a
)) {
511 log_error("Invalid tmpfs specification: %s", optarg
);
515 r
= strv_push(&arg_tmpfs
, a
);
521 r
= strv_push(&arg_tmpfs
, b
);
533 if (!env_assignment_is_valid(optarg
)) {
534 log_error("Environment variable assignment '%s' is not valid.", optarg
);
538 n
= strv_env_set(arg_setenv
, optarg
);
542 strv_free(arg_setenv
);
551 case ARG_SHARE_SYSTEM
:
552 arg_share_system
= true;
556 r
= parse_boolean(optarg
);
558 log_error("Failed to parse --register= argument: %s", optarg
);
566 arg_keep_unit
= true;
569 case ARG_PERSONALITY
:
571 arg_personality
= personality_from_string(optarg
);
572 if (arg_personality
== 0xffffffffLU
) {
573 log_error("Unknown or unsupported personality '%s'.", optarg
);
582 arg_volatile
= VOLATILE_YES
;
584 r
= parse_boolean(optarg
);
586 if (streq(optarg
, "state"))
587 arg_volatile
= VOLATILE_STATE
;
589 log_error("Failed to parse --volatile= argument: %s", optarg
);
593 arg_volatile
= r
? VOLATILE_YES
: VOLATILE_NO
;
602 assert_not_reached("Unhandled option");
/* Post-processing and consistency checks. --share-system switches
 * registration off. */
605 if (arg_share_system
)
606 arg_register
= false;
608 if (arg_boot
&& arg_share_system
) {
609 log_error("--boot and --share-system may not be combined.");
613 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
614 log_error("--keep-unit may not be used when invoked from a user session.");
618 if (arg_directory
&& arg_image
) {
619 log_error("--directory= and --image= may not be combined.");
623 if (arg_volatile
!= VOLATILE_NO
&& arg_read_only
) {
624 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
/* Final set = (defaults | additions | CAP_NET_ADMIN when a private network
 * is used) with the dropped capabilities masked out last. */
628 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
/* Mount the file systems listed in mount_table below dest. For entries whose
 * fatal flag is set a failure is logged as an error, for the others only as
 * a warning. */
633 static int mount_all(const char *dest
) {
635 typedef struct MountPoint
{
/* Columns, judging by the member accesses below: what, where, (presumably
 * the file system type), options, flags, fatal. A NULL "what" marks a
 * remount of the preceding bind mount. */
644 static const MountPoint mount_table
[] = {
645 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true },
646 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, true }, /* Bind mount first */
647 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, true }, /* Then, make it r/o */
648 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true },
649 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, true },
650 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID
), MS_NOSUID
|MS_NOEXEC
, true },
651 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true },
652 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true },
654 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, false }, /* Bind mount first */
655 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, false }, /* Then, make it r/o */
662 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
663 _cleanup_free_
char *where
= NULL
;
665 _cleanup_free_
char *options
= NULL
;
/* Target path inside the container tree. */
670 where
= strjoin(dest
, "/", mount_table
[k
].where
, NULL
);
674 t
= path_is_mount_point(where
, true);
676 log_error_errno(-t
, "Failed to detect whether %s is a mount point: %m", where
);
684 /* Skip this entry if it is not a remount. */
685 if (mount_table
[k
].what
&& t
> 0)
688 t
= mkdir_p(where
, 0755);
690 if (mount_table
[k
].fatal
) {
691 log_error_errno(-t
, "Failed to create directory %s: %m", where
);
696 log_warning_errno(-t
, "Failed to create directory %s: %m", where
);
/* With an API-fs SELinux context configured, tmpfs and devpts mounts get a
 * context="..." option appended to their option string. */
702 if (arg_selinux_apifs_context
&&
703 (streq_ptr(mount_table
[k
].what
, "tmpfs") || streq_ptr(mount_table
[k
].what
, "devpts"))) {
704 options
= strjoin(mount_table
[k
].options
, ",context=\"", arg_selinux_apifs_context
, "\"", NULL
);
711 o
= mount_table
[k
].options
;
714 if (mount(mount_table
[k
].what
,
717 mount_table
[k
].flags
,
720 if (mount_table
[k
].fatal
) {
721 log_error("mount(%s) failed: %m", where
);
726 log_warning("mount(%s) failed: %m", where
);
/* Bind mount every (source, destination) pair of the list l into the tree at
 * dest. A missing destination is created first, with the same file type as
 * the source; an existing destination must match the source's file type.
 * bind_remount_recursive() performs the read-only remount that ro asks for. */
733 static int mount_binds(const char *dest
, char **l
, bool ro
) {
736 STRV_FOREACH_PAIR(x
, y
, l
) {
737 _cleanup_free_
char *where
= NULL
;
738 struct stat source_st
, dest_st
;
741 if (stat(*x
, &source_st
) < 0) {
742 log_error("Failed to stat %s: %m", *x
);
746 where
= strappend(dest
, *y
);
750 r
= stat(where
, &dest_st
);
752 if ((source_st
.st_mode
& S_IFMT
) != (dest_st
.st_mode
& S_IFMT
)) {
753 log_error("The file types of %s and %s do not match. Refusing bind mount", *x
, where
);
756 } else if (errno
== ENOENT
) {
757 r
= mkdir_parents_label(where
, 0755);
759 log_error_errno(-r
, "Failed to bind mount %s: %m", *x
);
763 log_error("Failed to bind mount %s: %m", *x
);
767 /* Create the mount point, but be conservative -- refuse to create block
768 * and char devices. */
769 if (S_ISDIR(source_st
.st_mode
)) {
770 r
= mkdir_label(where
, 0755);
/* mkdir_label() hands back a negative errno value (it is logged as -r just
 * below), so test r itself for "already exists" instead of relying on errno
 * still being intact. */
771 if (r
< 0 && r
!= -EEXIST
) {
772 log_error_errno(-r
, "Failed to create mount point %s: %m", where
);
776 } else if (S_ISFIFO(source_st
.st_mode
)) {
/* mkfifo() and mknod() are plain libc calls: they return -1 and set errno,
 * so checking errno is the right test for these two. */
777 r
= mkfifo(where
, 0644);
778 if (r
< 0 && errno
!= EEXIST
) {
779 log_error("Failed to create mount point %s: %m", where
);
783 } else if (S_ISSOCK(source_st
.st_mode
)) {
784 r
= mknod(where
, 0644 | S_IFSOCK
, 0);
785 if (r
< 0 && errno
!= EEXIST
) {
786 log_error("Failed to create mount point %s: %m", where
);
790 } else if (S_ISREG(source_st
.st_mode
)) {
793 log_error_errno(-r
, "Failed to create mount point %s: %m", where
);
798 log_error("Refusing to create mountpoint for file: %s", *x
);
802 if (mount(*x
, where
, "bind", MS_BIND
, NULL
) < 0) {
803 log_error("mount(%s) failed: %m", where
);
808 r
= bind_remount_recursive(where
, true);
810 log_error_errno(-r
, "Read-Only bind mount failed: %m");
/* Mount a tmpfs for every (path, options) pair in arg_tmpfs. The mount point
 * is dest followed by the path and is created first if necessary; the
 * options string is passed to mount() as the data argument. */
819 static int mount_tmpfs(const char *dest
) {
822 STRV_FOREACH_PAIR(i
, o
, arg_tmpfs
) {
823 _cleanup_free_
char *where
= NULL
;
826 where
= strappend(dest
, *i
);
830 r
= mkdir_label(where
, 0755);
/* mkdir_label() hands back a negative errno value (it is logged as -r just
 * below), so test r itself for "already exists" instead of relying on errno
 * still being intact. */
831 if (r
< 0 && r
!= -EEXIST
) {
832 log_error_errno(-r
, "creating mount point for tmpfs %s failed: %m", where
);
837 if (mount("tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, *o
) < 0) {
838 log_error("tmpfs mount to %s failed: %m", where
);
/* Make <dest>/etc/localtime point at the same time zone as the host's
 * /etc/localtime. Nothing is changed when the host file is not a symlink
 * into /usr/share/zoneinfo/, when the container's link already names the
 * same zone, or when the zone file does not exist inside the container;
 * those cases only produce a warning (or nothing at all). */
846 static int setup_timezone(const char *dest
) {
847 _cleanup_free_
char *where
= NULL
, *p
= NULL
, *q
= NULL
, *check
= NULL
, *what
= NULL
;
853 /* Fix the timezone, if possible */
854 r
= readlink_malloc("/etc/localtime", &p
);
856 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name: the part of the host link target after the zoneinfo
 * prefix, in either its relative or its absolute spelling. */
860 z
= path_startswith(p
, "../usr/share/zoneinfo/");
862 z
= path_startswith(p
, "/usr/share/zoneinfo/");
864 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
868 where
= strappend(dest
, "/etc/localtime");
/* y is the zone name the container's link currently refers to. */
872 r
= readlink_malloc(where
, &q
);
874 y
= path_startswith(q
, "../usr/share/zoneinfo/");
876 y
= path_startswith(q
, "/usr/share/zoneinfo/");
878 /* Already pointing to the right place? Then do nothing .. */
879 if (y
&& streq(y
, z
))
883 check
= strjoin(dest
, "/usr/share/zoneinfo/", z
, NULL
);
887 if (access(check
, F_OK
) < 0) {
888 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
/* The new link is always written in the relative form. */
892 what
= strappend("../usr/share/zoneinfo/", z
);
896 r
= mkdir_parents(where
, 0755);
898 log_error_errno(-r
, "Failed to create directory for timezone info %s in container: %m", where
);
904 if (r
< 0 && errno
!= ENOENT
) {
905 log_error("Failed to remove existing timezone info %s in container: %m", where
);
910 if (symlink(what
, where
) < 0) {
911 log_error("Failed to correct timezone of container: %m");
/* Copy the host's /etc/resolv.conf to <dest>/etc/resolv.conf. This is best
 * effort: failures to create the parent directory or to copy the file are
 * logged as warnings only. arg_private_network is consulted before anything
 * else is done. */
918 static int setup_resolv_conf(const char *dest
) {
919 _cleanup_free_
char *where
= NULL
;
924 if (arg_private_network
)
927 /* Fix resolv.conf, if possible */
928 where
= strappend(dest
, "/etc/resolv.conf");
932 /* We do not really care about the result of this. If it
933 * fails, it fails. */
934 r
= mkdir_parents(where
, 0755);
936 log_warning_errno(-r
, "Failed to create parent directory for resolv.conf %s: %m", where
);
/* O_NOFOLLOW: do not write through a symlink at the destination. */
941 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644);
943 log_warning_errno(-r
, "Failed to copy /etc/resolv.conf to %s: %m", where
);
/* Implement --volatile=state: remount the tree at directory read-only and
 * mount a fresh tmpfs over <directory>/var. Only acts when arg_volatile is
 * VOLATILE_STATE. */
951 static int setup_volatile_state(const char *directory
) {
957 if (arg_volatile
!= VOLATILE_STATE
)
960 /* --volatile=state means we simply overmount /var
961 with a tmpfs, and the rest read-only. */
963 r
= bind_remount_recursive(directory
, true);
965 log_error_errno(-r
, "Failed to remount %s read-only: %m", directory
);
969 p
= strappenda(directory
, "/var");
971 if (r
< 0 && errno
!= EEXIST
) {
/* Name the path that could not be created (<directory>/var), not the
 * container root. */
972 log_error("Failed to create %s: %m", p
);
976 if (mount("tmpfs", p
, "tmpfs", MS_STRICTATIME
, "mode=755") < 0) {
977 log_error("Failed to mount tmpfs to /var: %m");
/* Implement --volatile=yes: build a new root on a tmpfs in a temporary
 * directory, bind mount the original /usr into it read-only, then move the
 * result over directory. Only acts when arg_volatile is VOLATILE_YES. */
984 static int setup_volatile(const char *directory
) {
/* Record which mounts have been established so far. */
985 bool tmpfs_mounted
= false, bind_mounted
= false;
986 char template[] = "/tmp/nspawn-volatile-XXXXXX";
992 if (arg_volatile
!= VOLATILE_YES
)
995 /* --volatile=yes means we mount a tmpfs to the root dir, and
996 the original /usr to use inside it, and that read-only. */
998 if (!mkdtemp(template)) {
999 log_error("Failed to create temporary directory: %m");
1003 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME
, "mode=755") < 0) {
1004 log_error("Failed to mount tmpfs for root directory: %m");
1009 tmpfs_mounted
= true;
/* f: the original /usr; t: where it will appear in the new root. */
1011 f
= strappenda(directory
, "/usr");
1012 t
= strappenda(template, "/usr");
1015 if (r
< 0 && errno
!= EEXIST
) {
1016 log_error("Failed to create %s: %m", t
);
1021 if (mount(f
, t
, "bind", MS_BIND
|MS_REC
, NULL
) < 0) {
1022 log_error("Failed to create /usr bind mount: %m");
1027 bind_mounted
= true;
1029 r
= bind_remount_recursive(t
, true);
1031 log_error_errno(-r
, "Failed to remount %s read-only: %m", t
);
/* Finally put the prepared tree in place of the original root. */
1035 if (mount(template, directory
, NULL
, MS_MOVE
, NULL
) < 0) {
1036 log_error("Failed to move root mount: %m");
/* Format a 128-bit ID in the hyphenated 8-4-4-4-12 lower-case hex layout.
 * s is the caller's buffer, declared as char[37]: the 36 characters the
 * format produces plus a terminator. Returns a char pointer, presumably s
 * itself -- TODO confirm. */
1054 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1057 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1058 SD_ID128_FORMAT_VAL(id
));
/* Give the container a boot ID of its own: a random 128-bit ID is written in
 * UUID form to a file below <dest>/dev, and that file is bind mounted over
 * <dest>/proc/sys/kernel/random/boot_id and then remounted read-only.
 * Failing to make it read-only is only a warning. arg_share_system is
 * consulted before anything else is done. */
1063 static int setup_boot_id(const char *dest
) {
1064 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1065 sd_id128_t rnd
= {};
1071 if (arg_share_system
)
1074 /* Generate a new randomized boot ID, so that each boot-up of
1075 * the container gets a new one */
1077 from
= strappend(dest
, "/dev/proc-sys-kernel-random-boot-id");
1078 to
= strappend(dest
, "/proc/sys/kernel/random/boot_id");
1082 r
= sd_id128_randomize(&rnd
);
1084 log_error_errno(-r
, "Failed to generate random boot id: %m");
1088 id128_format_as_uuid(rnd
, as_uuid
);
1090 r
= write_string_file(from
, as_uuid
);
1092 log_error_errno(-r
, "Failed to write boot id: %m");
/* Bind first, then remount the bind read-only in a second step. */
1096 if (mount(from
, to
, "bind", MS_BIND
, NULL
) < 0) {
1097 log_error("Failed to bind mount boot id: %m");
1099 } else if (mount(from
, to
, "bind", MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
))
1100 log_warning("Failed to make boot id read-only: %m");
/* Recreate a fixed list of host device nodes below <dest>/dev. Each node is
 * looked up under /dev on the host and created in the container with the
 * same mode and device number. Entries that are neither character nor block
 * devices are refused with an error. */
1106 static int copy_devnodes(const char *dest
) {
1108 static const char devnodes
[] =
1119 _cleanup_umask_ mode_t u
;
1125 NULSTR_FOREACH(d
, devnodes
) {
1126 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1129 from
= strappend("/dev/", d
);
1130 to
= strjoin(dest
, "/dev/", d
, NULL
);
1134 if (stat(from
, &st
) < 0) {
/* Only errors other than "does not exist" are reported here. */
1136 if (errno
!= ENOENT
) {
1137 log_error("Failed to stat %s: %m", from
);
1141 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1143 log_error("%s is not a char or block device, cannot copy", from
);
1147 r
= mkdir_parents(to
, 0775);
1149 log_error_errno(-r
, "Failed to create parent directory of %s: %m", to
);
1153 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
/* Report the node that could not be created, not the container root. */
1154 log_error("mknod(%s) failed: %m", to
);
/* Create <dest>/dev/ptmx as a relative symlink to pts/ptmx, i.e. to the ptmx
 * node of the file system at <dest>/dev/pts. An error is logged if
 * symlink() fails. */
1163 static int setup_ptmx(const char *dest
) {
1164 _cleanup_free_
char *p
= NULL
;
1166 p
= strappend(dest
, "/dev/ptmx");
1170 if (symlink("pts/ptmx", p
) < 0) {
1171 log_error("Failed to create /dev/ptmx symlink: %m");
/* Make the TTY named by console available as <dest>/dev/console. The TTY's
 * mode is set to 0600 and its ownership to uid 0 / gid 0, a placeholder
 * device node is created at <dest>/dev/console, and the TTY is bind mounted
 * over it. */
1178 static int setup_dev_console(const char *dest
, const char *console
) {
1179 _cleanup_umask_ mode_t u
;
/* /dev/null supplies the file type and device number for the placeholder
 * node created below. */
1189 if (stat("/dev/null", &st
) < 0) {
1190 log_error("Failed to stat /dev/null: %m");
1194 r
= chmod_and_chown(console
, 0600, 0, 0);
1196 log_error_errno(-r
, "Failed to correct access mode for TTY: %m");
1200 /* We need to bind mount the right tty to /dev/console since
1201 * ptys can only exist on pts file systems. To have something
1202 * to bind mount things on we create a device node first, and
1203 * use /dev/null for that since the cgroups device policy
1204 * allows us to create that freely, while we cannot create
1205 * /dev/console. (Note that the major minor doesn't actually
1206 * matter here, since we mount it over anyway). */
1208 to
= strappenda(dest
, "/dev/console");
/* Keep the file type bits of /dev/null but force the permissions to 0600. */
1209 if (mknod(to
, (st
.st_mode
& ~07777) | 0600, st
.st_rdev
) < 0) {
1210 log_error("mknod() for /dev/console failed: %m");
1214 if (mount(console
, to
, "bind", MS_BIND
, NULL
) < 0) {
1215 log_error("Bind mount for /dev/console failed: %m");
/* Provide /proc/kmsg inside the container: a FIFO is created at
 * <dest>/dev/kmsg, bind mounted onto <dest>/proc/kmsg, opened, and the open
 * descriptor is sent over kmsg_socket as SCM_RIGHTS ancillary data so that
 * it stays referenced. kmsg_socket must be a valid (non-negative)
 * descriptor. */
1222 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1223 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1225 _cleanup_umask_ mode_t u
;
/* Control-message buffer sized for exactly one file descriptor. */
1227 struct cmsghdr cmsghdr
;
1228 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1230 struct msghdr mh
= {
1231 .msg_control
= &control
,
1232 .msg_controllen
= sizeof(control
),
1234 struct cmsghdr
*cmsg
;
1237 assert(kmsg_socket
>= 0);
1241 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1242 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1243 * on the reading side behave very similar to /proc/kmsg,
1244 * their writing side behaves differently from /dev/kmsg in
1245 * that writing blocks when nothing is reading. In order to
1246 * avoid any problems with containers deadlocking due to this
1247 * we simply make /dev/kmsg unavailable to the container. */
1248 if (asprintf(&from
, "%s/dev/kmsg", dest
) < 0 ||
1249 asprintf(&to
, "%s/proc/kmsg", dest
) < 0)
1252 if (mkfifo(from
, 0600) < 0) {
1253 log_error("mkfifo() for /dev/kmsg failed: %m");
1257 r
= chmod_and_chown(from
, 0600, 0, 0);
1259 log_error_errno(-r
, "Failed to correct access mode for /dev/kmsg: %m");
1263 if (mount(from
, to
, "bind", MS_BIND
, NULL
) < 0) {
1264 log_error("Bind mount for /proc/kmsg failed: %m");
1268 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1270 log_error("Failed to open fifo: %m");
/* Fill in a single SCM_RIGHTS control message carrying fd. */
1274 cmsg
= CMSG_FIRSTHDR(&mh
);
1275 cmsg
->cmsg_level
= SOL_SOCKET
;
1276 cmsg
->cmsg_type
= SCM_RIGHTS
;
1277 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1278 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1280 mh
.msg_controllen
= cmsg
->cmsg_len
;
1282 /* Store away the fd in the socket, so that it stays open as
1283 * long as we run the child */
1284 k
= sendmsg(kmsg_socket
, &mh
, MSG_DONTWAIT
|MSG_NOSIGNAL
);
1288 log_error("Failed to send FIFO fd: %m");
1292 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Set the host name to arg_machine using sethostname_idempotent().
 * arg_share_system is consulted before the call is made. */
1297 static int setup_hostname(void) {
1299 if (arg_share_system
)
1302 if (sethostname_idempotent(arg_machine
) < 0)
/* Link the journal directory of the container with the host according to
 * arg_link_journal. The machine ID is read from <directory>/etc/machine-id;
 * p is /var/log/journal/<id> on the host and q is the same path inside the
 * container tree. LINK_GUEST makes p a symlink to q, LINK_HOST bind mounts p
 * onto q. With arg_link_journal_try set, failures to create the link or the
 * host directory are logged at debug level and the setup is skipped. */
1308 static int setup_journal(const char *directory
) {
1309 sd_id128_t machine_id
, this_id
;
1310 _cleanup_free_
char *p
= NULL
, *b
= NULL
, *q
= NULL
, *d
= NULL
;
1314 p
= strappend(directory
, "/etc/machine-id");
1318 r
= read_one_line_file(p
, &b
);
1319 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1322 log_error_errno(-r
, "Failed to read machine ID from %s: %m", p
);
1327 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1330 /* Verify validity */
1331 r
= sd_id128_from_string(id
, &machine_id
);
1333 log_error_errno(-r
, "Failed to parse machine ID from %s: %m", p
);
1337 r
= sd_id128_get_machine(&this_id
);
1339 log_error_errno(-r
, "Failed to retrieve machine ID: %m");
/* Host and container sharing one machine ID would end up with the same
 * journal directory; this is a warning in auto mode and an error otherwise. */
1343 if (sd_id128_equal(machine_id
, this_id
)) {
1344 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1345 "Host and machine ids are equal (%s): refusing to link journals", id
);
1346 if (arg_link_journal
== LINK_AUTO
)
1352 if (arg_link_journal
== LINK_NO
)
1356 p
= strappend("/var/log/journal/", id
);
1357 q
= strjoin(directory
, "/var/log/journal/", id
, NULL
);
1361 if (path_is_mount_point(p
, false) > 0) {
1362 if (arg_link_journal
!= LINK_AUTO
) {
1363 log_error("%s: already a mount point, refusing to use for journal", p
);
1370 if (path_is_mount_point(q
, false) > 0) {
1371 if (arg_link_journal
!= LINK_AUTO
) {
1372 log_error("%s: already a mount point, refusing to use for journal", q
);
/* Inspect what is currently at p on the host. */
1379 r
= readlink_and_make_absolute(p
, &d
);
1381 if ((arg_link_journal
== LINK_GUEST
||
1382 arg_link_journal
== LINK_AUTO
) &&
1385 r
= mkdir_p(q
, 0755);
/* mkdir_p() reports its error through r as a negative errno value, so log
 * that value rather than whatever errno currently holds. */
1387 log_warning_errno(-r, "Failed to create directory %s: %m", q
);
1391 if (unlink(p
) < 0) {
1392 log_error("Failed to remove symlink %s: %m", p
);
1395 } else if (r
== -EINVAL
) {
1397 if (arg_link_journal
== LINK_GUEST
&&
1400 if (errno
== ENOTDIR
) {
1401 log_error("%s already exists and is neither a symlink nor a directory", p
);
1404 log_error("Failed to remove %s: %m", p
);
1408 } else if (r
!= -ENOENT
) {
/* r still holds the negative errno from readlink_and_make_absolute(). */
1409 log_error_errno(-r, "readlink(%s) failed: %m", p
);
1413 if (arg_link_journal
== LINK_GUEST
) {
1415 if (symlink(q
, p
) < 0) {
1416 if (arg_link_journal_try
) {
1417 log_debug("Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1420 log_error("Failed to symlink %s to %s: %m", q
, p
);
1425 r
= mkdir_p(q
, 0755);
1427 log_warning_errno(-r, "Failed to create directory %s: %m", q
);
1431 if (arg_link_journal
== LINK_HOST
) {
1432 /* don't create parents here -- if the host doesn't have
1433 * permanent journal set up, don't force it here */
1436 if (arg_link_journal_try
) {
1437 log_debug("Failed to create %s, skipping journal setup: %m", p
);
1440 log_error("Failed to create %s: %m", p
);
1445 } else if (access(p
, F_OK
) < 0)
1448 if (dir_is_empty(q
) == 0)
1449 log_warning("%s is not empty, proceeding anyway.", q
);
1451 r
= mkdir_p(q
, 0755);
1453 log_error_errno(-r, "Failed to create %s: %m", q
);
1457 if (mount(p
, q
, "bind", MS_BIND
, NULL
) < 0) {
1458 log_error("Failed to bind mount journal from host into guest: %m");
/* Hand the complement of arg_retain to capability_bounding_set_drop(), so
 * that every capability not listed in arg_retain is the one being dropped.
 * The helper's return value is passed through unchanged. */
1465 static int drop_capabilities(void) {
1466 return capability_bounding_set_drop(~arg_retain
, false);
/* Register the container with the org.freedesktop.machine1 service on the
 * system bus. With arg_keep_unit set, RegisterMachineWithNetwork is called
 * directly; otherwise a CreateMachineWithNetwork call is assembled by hand
 * so that unit properties (slice, device policy, device whitelist) can be
 * attached. A network interface index is only passed when local_ifindex is
 * positive. */
1469 static int register_machine(pid_t pid
, int local_ifindex
) {
1470 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
1471 _cleanup_bus_close_unref_ sd_bus
*bus
= NULL
;
1477 r
= sd_bus_default_system(&bus
);
1479 log_error_errno(-r
, "Failed to open system bus: %m");
1483 if (arg_keep_unit
) {
1484 r
= sd_bus_call_method(
1486 "org.freedesktop.machine1",
1487 "/org/freedesktop/machine1",
1488 "org.freedesktop.machine1.Manager",
1489 "RegisterMachineWithNetwork",
1494 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
1498 strempty(arg_directory
),
/* Array length (0 or 1) followed by the interface index. */
1499 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
1501 _cleanup_bus_message_unref_ sd_bus_message
*m
= NULL
;
1503 r
= sd_bus_message_new_method_call(
1506 "org.freedesktop.machine1",
1507 "/org/freedesktop/machine1",
1508 "org.freedesktop.machine1.Manager",
1509 "CreateMachineWithNetwork");
1511 log_error_errno(-r
, "Failed to create message: %m");
1515 r
= sd_bus_message_append(
1519 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
1523 strempty(arg_directory
),
1524 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
1526 log_error_errno(-r
, "Failed to append message arguments: %m");
/* Properties are sent as an array of (name, variant) pairs. */
1530 r
= sd_bus_message_open_container(m
, 'a', "(sv)");
1532 log_error_errno(-r
, "Failed to open container: %m");
1536 if (!isempty(arg_slice
)) {
1537 r
= sd_bus_message_append(m
, "(sv)", "Slice", "s", arg_slice
);
1539 log_error_errno(-r
, "Failed to append slice: %m");
1544 r
= sd_bus_message_append(m
, "(sv)", "DevicePolicy", "s", "strict");
1546 log_error_errno(-r
, "Failed to add device policy: %m");
1550 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 9,
1551 /* Allow the container to
1552 * access and create the API
1553 * device nodes, so that
1554 * PrivateDevices= in the
1555 * container can work
1560 "/dev/random", "rwm",
1561 "/dev/urandom", "rwm",
1563 "/dev/net/tun", "rwm",
1564 /* Allow the container
1565 * access to ptys. However,
1567 * container to ever create
1568 * these device nodes. */
1569 "/dev/pts/ptmx", "rw",
1572 log_error_errno(-r
, "Failed to add device whitelist: %m");
1576 r
= sd_bus_message_close_container(m
);
1578 log_error_errno(-r
, "Failed to close container: %m");
1582 r
= sd_bus_call(bus
, m
, 0, &error
, NULL
);
1586 log_error("Failed to register machine: %s", bus_error_message(&error
, r
));
/* Ask the org.freedesktop.machine1 service to terminate a machine. A first
 * call on the Manager interface yields an object path, which is read from
 * the reply; a second call is then made on the Machine interface. Failures
 * of either call are logged at debug level only. */
1593 static int terminate_machine(pid_t pid
) {
1594 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
1595 _cleanup_bus_message_unref_ sd_bus_message
*reply
= NULL
;
1596 _cleanup_bus_close_unref_ sd_bus
*bus
= NULL
;
1603 r
= sd_bus_default_system(&bus
);
1605 log_error_errno(-r
, "Failed to open system bus: %m");
1609 r
= sd_bus_call_method(
1611 "org.freedesktop.machine1",
1612 "/org/freedesktop/machine1",
1613 "org.freedesktop.machine1.Manager",
1620 /* Note that the machine might already have been
1621 * cleaned up automatically, hence don't consider it a
1622 * failure if we cannot get the machine object. */
1623 log_debug("Failed to get machine: %s", bus_error_message(&error
, r
));
/* "o" is the D-Bus type code for an object path. */
1627 r
= sd_bus_message_read(reply
, "o", &path
);
1629 return bus_log_parse_error(r
);
1631 r
= sd_bus_call_method(
1633 "org.freedesktop.machine1",
1635 "org.freedesktop.machine1.Machine",
1641 log_debug("Failed to terminate machine: %s", bus_error_message(&error
, r
));
/* Reset the audit login UID of the calling process by writing 4294967295 to
 * /proc/self/loginuid, unless the file already holds that value. If the
 * write fails, a multi-line explanation is logged. arg_share_system is
 * consulted before anything else is done. */
1648 static int reset_audit_loginuid(void) {
1649 _cleanup_free_
char *p
= NULL
;
1652 if (arg_share_system
)
1655 r
= read_one_line_file("/proc/self/loginuid", &p
);
1659 log_error_errno(-r
, "Failed to read /proc/self/loginuid: %m");
1663 /* Already reset? */
1664 if (streq(p
, "4294967295"))
1667 r
= write_string_file("/proc/self/loginuid", "4294967295");
/* User-facing advice; keep the wording grammatical ("turn off auditing"). */
1669 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1670 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1671 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1672 "your kernel or to turn off auditing with 'audit=0' on the kernel command line before\n"
1673 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r
));
1681 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1682 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1684 static int generate_mac(struct ether_addr
*mac
, sd_id128_t hash_key
) {
1691 l
= strlen(arg_machine
);
1692 sz
= sizeof(sd_id128_t
) + l
;
1695 /* fetch some persistent data unique to the host */
1696 r
= sd_id128_get_machine((sd_id128_t
*) v
);
1700 /* combine with some data unique (on this host) to this
1701 * container instance */
1702 memcpy(v
+ sizeof(sd_id128_t
), arg_machine
, l
);
1704 /* Let's hash the host machine ID plus the container name. We
1705 * use a fixed, but originally randomly created hash key here. */
1706 siphash24(result
, v
, sz
, hash_key
.bytes
);
1708 assert_cc(ETH_ALEN
<= sizeof(result
));
1709 memcpy(mac
->ether_addr_octet
, result
, ETH_ALEN
);
1711 /* see eth_random_addr in the kernel */
1712 mac
->ether_addr_octet
[0] &= 0xfe; /* clear multicast bit */
1713 mac
->ether_addr_octet
[0] |= 0x02; /* set local assignment bit (IEEE802) */
1718 static int setup_veth(pid_t pid
, char iface_name
[IFNAMSIZ
], int *ifi
) {
1719 _cleanup_rtnl_message_unref_ sd_rtnl_message
*m
= NULL
;
1720 _cleanup_rtnl_unref_ sd_rtnl
*rtnl
= NULL
;
1721 struct ether_addr mac_host
, mac_container
;
1724 if (!arg_private_network
)
1727 if (!arg_network_veth
)
1730 /* Use two different interface name prefixes depending whether
1731 * we are in bridge mode or not. */
1732 snprintf(iface_name
, IFNAMSIZ
- 1, "%s-%s",
1733 arg_network_bridge
? "vb" : "ve", arg_machine
);
1735 r
= generate_mac(&mac_container
, CONTAINER_HASH_KEY
);
1737 log_error("Failed to generate predictable MAC address for container side");
1741 r
= generate_mac(&mac_host
, HOST_HASH_KEY
);
1743 log_error("Failed to generate predictable MAC address for host side");
1747 r
= sd_rtnl_open(&rtnl
, 0);
1749 log_error_errno(-r
, "Failed to connect to netlink: %m");
1753 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
1755 log_error_errno(-r
, "Failed to allocate netlink message: %m");
1759 r
= sd_rtnl_message_append_string(m
, IFLA_IFNAME
, iface_name
);
1761 log_error_errno(-r
, "Failed to add netlink interface name: %m");
1765 r
= sd_rtnl_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_host
);
1767 log_error_errno(-r
, "Failed to add netlink MAC address: %m");
1771 r
= sd_rtnl_message_open_container(m
, IFLA_LINKINFO
);
1773 log_error_errno(-r
, "Failed to open netlink container: %m");
1777 r
= sd_rtnl_message_open_container_union(m
, IFLA_INFO_DATA
, "veth");
1779 log_error_errno(-r
, "Failed to open netlink container: %m");
1783 r
= sd_rtnl_message_open_container(m
, VETH_INFO_PEER
);
1785 log_error_errno(-r
, "Failed to open netlink container: %m");
1789 r
= sd_rtnl_message_append_string(m
, IFLA_IFNAME
, "host0");
1791 log_error_errno(-r
, "Failed to add netlink interface name: %m");
1795 r
= sd_rtnl_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_container
);
1797 log_error_errno(-r
, "Failed to add netlink MAC address: %m");
1801 r
= sd_rtnl_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
1803 log_error_errno(-r
, "Failed to add netlink namespace field: %m");
1807 r
= sd_rtnl_message_close_container(m
);
1809 log_error_errno(-r
, "Failed to close netlink container: %m");
1813 r
= sd_rtnl_message_close_container(m
);
1815 log_error_errno(-r
, "Failed to close netlink container: %m");
1819 r
= sd_rtnl_message_close_container(m
);
1821 log_error_errno(-r
, "Failed to close netlink container: %m");
1825 r
= sd_rtnl_call(rtnl
, m
, 0, NULL
);
1827 log_error_errno(-r
, "Failed to add new veth interfaces: %m");
1831 i
= (int) if_nametoindex(iface_name
);
1833 log_error("Failed to resolve interface %s: %m", iface_name
);
1842 static int setup_bridge(const char veth_name
[], int *ifi
) {
1843 _cleanup_rtnl_message_unref_ sd_rtnl_message
*m
= NULL
;
1844 _cleanup_rtnl_unref_ sd_rtnl
*rtnl
= NULL
;
1847 if (!arg_private_network
)
1850 if (!arg_network_veth
)
1853 if (!arg_network_bridge
)
1856 bridge
= (int) if_nametoindex(arg_network_bridge
);
1858 log_error("Failed to resolve interface %s: %m", arg_network_bridge
);
1864 r
= sd_rtnl_open(&rtnl
, 0);
1866 log_error_errno(-r
, "Failed to connect to netlink: %m");
1870 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, 0);
1872 log_error_errno(-r
, "Failed to allocate netlink message: %m");
1876 r
= sd_rtnl_message_link_set_flags(m
, IFF_UP
, IFF_UP
);
1878 log_error_errno(-r
, "Failed to set IFF_UP flag: %m");
1882 r
= sd_rtnl_message_append_string(m
, IFLA_IFNAME
, veth_name
);
1884 log_error_errno(-r
, "Failed to add netlink interface name field: %m");
1888 r
= sd_rtnl_message_append_u32(m
, IFLA_MASTER
, bridge
);
1890 log_error_errno(-r
, "Failed to add netlink master field: %m");
1894 r
= sd_rtnl_call(rtnl
, m
, 0, NULL
);
1896 log_error_errno(-r
, "Failed to add veth interface to bridge: %m");
1903 static int parse_interface(struct udev
*udev
, const char *name
) {
1904 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1905 char ifi_str
[2 + DECIMAL_STR_MAX(int)];
1908 ifi
= (int) if_nametoindex(name
);
1910 log_error("Failed to resolve interface %s: %m", name
);
1914 sprintf(ifi_str
, "n%i", ifi
);
1915 d
= udev_device_new_from_device_id(udev
, ifi_str
);
1917 log_error("Failed to get udev device for interface %s: %m", name
);
1921 if (udev_device_get_is_initialized(d
) <= 0) {
1922 log_error("Network interface %s is not initialized yet.", name
);
1929 static int move_network_interfaces(pid_t pid
) {
1930 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1931 _cleanup_rtnl_unref_ sd_rtnl
*rtnl
= NULL
;
1935 if (!arg_private_network
)
1938 if (strv_isempty(arg_network_interfaces
))
1941 r
= sd_rtnl_open(&rtnl
, 0);
1943 log_error_errno(-r
, "Failed to connect to netlink: %m");
1949 log_error("Failed to connect to udev.");
1953 STRV_FOREACH(i
, arg_network_interfaces
) {
1954 _cleanup_rtnl_message_unref_ sd_rtnl_message
*m
= NULL
;
1957 ifi
= parse_interface(udev
, *i
);
1961 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, ifi
);
1963 log_error_errno(-r
, "Failed to allocate netlink message: %m");
1967 r
= sd_rtnl_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
1969 log_error_errno(-r
, "Failed to append namespace PID to netlink message: %m");
1973 r
= sd_rtnl_call(rtnl
, m
, 0, NULL
);
1975 log_error_errno(-r
, "Failed to move interface %s to namespace: %m", *i
);
1983 static int setup_macvlan(pid_t pid
) {
1984 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1985 _cleanup_rtnl_unref_ sd_rtnl
*rtnl
= NULL
;
1989 if (!arg_private_network
)
1992 if (strv_isempty(arg_network_macvlan
))
1995 r
= sd_rtnl_open(&rtnl
, 0);
1997 log_error_errno(-r
, "Failed to connect to netlink: %m");
2003 log_error("Failed to connect to udev.");
2007 STRV_FOREACH(i
, arg_network_macvlan
) {
2008 _cleanup_rtnl_message_unref_ sd_rtnl_message
*m
= NULL
;
2009 _cleanup_free_
char *n
= NULL
;
2012 ifi
= parse_interface(udev
, *i
);
2016 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2018 log_error_errno(-r
, "Failed to allocate netlink message: %m");
2022 r
= sd_rtnl_message_append_u32(m
, IFLA_LINK
, ifi
);
2024 log_error_errno(-r
, "Failed to add netlink interface index: %m");
2028 n
= strappend("mv-", *i
);
2032 strshorten(n
, IFNAMSIZ
-1);
2034 r
= sd_rtnl_message_append_string(m
, IFLA_IFNAME
, n
);
2036 log_error_errno(-r
, "Failed to add netlink interface name: %m");
2040 r
= sd_rtnl_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2042 log_error_errno(-r
, "Failed to add netlink namespace field: %m");
2046 r
= sd_rtnl_message_open_container(m
, IFLA_LINKINFO
);
2048 log_error_errno(-r
, "Failed to open netlink container: %m");
2052 r
= sd_rtnl_message_open_container_union(m
, IFLA_INFO_DATA
, "macvlan");
2054 log_error_errno(-r
, "Failed to open netlink container: %m");
2058 r
= sd_rtnl_message_append_u32(m
, IFLA_MACVLAN_MODE
, MACVLAN_MODE_BRIDGE
);
2060 log_error_errno(-r
, "Failed to append macvlan mode: %m");
2064 r
= sd_rtnl_message_close_container(m
);
2066 log_error_errno(-r
, "Failed to close netlink container: %m");
2070 r
= sd_rtnl_message_close_container(m
);
2072 log_error_errno(-r
, "Failed to close netlink container: %m");
2076 r
= sd_rtnl_call(rtnl
, m
, 0, NULL
);
2078 log_error_errno(-r
, "Failed to add new macvlan interfaces: %m");
2086 static int setup_seccomp(void) {
2089 static const int blacklist
[] = {
2090 SCMP_SYS(kexec_load
),
2091 SCMP_SYS(open_by_handle_at
),
2092 SCMP_SYS(init_module
),
2093 SCMP_SYS(finit_module
),
2094 SCMP_SYS(delete_module
),
2101 scmp_filter_ctx seccomp
;
2105 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
2109 r
= seccomp_add_secondary_archs(seccomp
);
2111 log_error_errno(-r
, "Failed to add secondary archs to seccomp filter: %m");
2115 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
2116 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
], 0);
2118 continue; /* unknown syscall */
2120 log_error_errno(-r
, "Failed to block syscall: %m");
2126 Audit is broken in containers, much of the userspace audit
2127 hookup will fail if running inside a container. We don't
2128 care and just turn off creation of audit sockets.
2130 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2131 with EAFNOSUPPORT which audit userspace uses as indication
2132 that audit is disabled in the kernel.
2135 r
= seccomp_rule_add(
2137 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
2140 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
2141 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
2143 log_error_errno(-r
, "Failed to add audit seccomp rule: %m");
2147 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
2149 log_error_errno(-r
, "Failed to unset NO_NEW_PRIVS: %m");
2153 r
= seccomp_load(seccomp
);
2155 log_error_errno(-r
, "Failed to install seccomp audit filter: %m");
2158 seccomp_release(seccomp
);
2166 static int setup_image(char **device_path
, int *loop_nr
) {
2167 struct loop_info64 info
= {
2168 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
2170 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
2171 _cleanup_free_
char* loopdev
= NULL
;
2175 assert(device_path
);
2178 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
2180 log_error("Failed to open %s: %m", arg_image
);
2184 if (fstat(fd
, &st
) < 0) {
2185 log_error("Failed to stat %s: %m", arg_image
);
2189 if (S_ISBLK(st
.st_mode
)) {
2192 p
= strdup(arg_image
);
2206 if (!S_ISREG(st
.st_mode
)) {
2207 log_error("%s is not a regular file or block device: %m", arg_image
);
2211 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2213 log_error("Failed to open /dev/loop-control: %m");
2217 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
2219 log_error("Failed to allocate loop device: %m");
2223 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
2226 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
2228 log_error("Failed to open loop device %s: %m", loopdev
);
2232 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0) {
2233 log_error("Failed to set loopback file descriptor on %s: %m", loopdev
);
2238 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
2240 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0) {
2241 log_error("Failed to set loopback settings on %s: %m", loopdev
);
2245 *device_path
= loopdev
;
2256 static int dissect_image(
2258 char **root_device
, bool *root_device_rw
,
2259 char **home_device
, bool *home_device_rw
,
2260 char **srv_device
, bool *srv_device_rw
,
2264 int home_nr
= -1, root_nr
= -1, secondary_root_nr
= -1, srv_nr
= -1;
2265 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
;
2266 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
2267 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
2268 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
2269 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2270 struct udev_list_entry
*first
, *item
;
2271 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true;
2272 const char *pttype
= NULL
;
2278 assert(root_device
);
2279 assert(home_device
);
2283 b
= blkid_new_probe();
2288 r
= blkid_probe_set_device(b
, fd
, 0, 0);
2293 log_error("Failed to set device on blkid probe: %m");
2297 blkid_probe_enable_partitions(b
, 1);
2298 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
2301 r
= blkid_do_safeprobe(b
);
2302 if (r
== -2 || r
== 1) {
2303 log_error("Failed to identify any partition table on %s.\n"
2304 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image
);
2306 } else if (r
!= 0) {
2309 log_error("Failed to probe: %m");
2313 blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
2314 if (!streq_ptr(pttype
, "gpt")) {
2315 log_error("Image %s does not carry a GUID Partition Table.\n"
2316 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image
);
2321 pl
= blkid_probe_get_partitions(b
);
2326 log_error("Failed to list partitions of %s", arg_image
);
2334 if (fstat(fd
, &st
) < 0) {
2335 log_error("Failed to stat block device: %m");
2339 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
2343 e
= udev_enumerate_new(udev
);
2347 r
= udev_enumerate_add_match_parent(e
, d
);
2351 r
= udev_enumerate_scan_devices(e
);
2353 log_error_errno(-r
, "Failed to scan for partition devices of %s: %m", arg_image
);
2357 first
= udev_enumerate_get_list_entry(e
);
2358 udev_list_entry_foreach(item
, first
) {
2359 _cleanup_udev_device_unref_
struct udev_device
*q
;
2360 const char *stype
, *node
;
2361 unsigned long long flags
;
2368 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
2373 log_error("Failed to get partition device of %s: %m", arg_image
);
2377 qn
= udev_device_get_devnum(q
);
2381 if (st
.st_rdev
== qn
)
2384 node
= udev_device_get_devnode(q
);
2388 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
2392 flags
= blkid_partition_get_flags(pp
);
2393 if (flags
& GPT_FLAG_NO_AUTO
)
2396 nr
= blkid_partition_get_partno(pp
);
2400 stype
= blkid_partition_get_type_string(pp
);
2404 if (sd_id128_from_string(stype
, &type_id
) < 0)
2407 if (sd_id128_equal(type_id
, GPT_HOME
)) {
2409 if (home
&& nr
>= home_nr
)
2413 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2416 home
= strdup(node
);
2419 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
2421 if (srv
&& nr
>= srv_nr
)
2425 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2432 #ifdef GPT_ROOT_NATIVE
2433 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
2435 if (root
&& nr
>= root_nr
)
2439 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2442 root
= strdup(node
);
2447 #ifdef GPT_ROOT_SECONDARY
2448 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
2450 if (secondary_root
&& nr
>= secondary_root_nr
)
2453 secondary_root_nr
= nr
;
2454 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2457 free(secondary_root
);
2458 secondary_root
= strdup(node
);
2459 if (!secondary_root
)
2465 if (!root
&& !secondary_root
) {
2466 log_error("Failed to identify root partition in disk image %s.\n"
2467 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image
);
2472 *root_device
= root
;
2475 *root_device_rw
= root_rw
;
2477 } else if (secondary_root
) {
2478 *root_device
= secondary_root
;
2479 secondary_root
= NULL
;
2481 *root_device_rw
= secondary_root_rw
;
2486 *home_device
= home
;
2489 *home_device_rw
= home_rw
;
2496 *srv_device_rw
= srv_rw
;
2501 log_error("--image= is not supported, compiled without blkid support.");
2506 static int mount_device(const char *what
, const char *where
, const char *directory
, bool rw
) {
2508 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
2509 const char *fstype
, *p
;
2519 p
= strappenda(where
, directory
);
2524 b
= blkid_new_probe_from_filename(what
);
2528 log_error("Failed to allocate prober for %s: %m", what
);
2532 blkid_probe_enable_superblocks(b
, 1);
2533 blkid_probe_set_superblocks_flags(b
, BLKID_SUBLKS_TYPE
);
2536 r
= blkid_do_safeprobe(b
);
2537 if (r
== -1 || r
== 1) {
2538 log_error("Cannot determine file system type of %s", what
);
2540 } else if (r
!= 0) {
2543 log_error("Failed to probe %s: %m", what
);
2548 if (blkid_probe_lookup_value(b
, "TYPE", &fstype
, NULL
) < 0) {
2551 log_error("Failed to determine file system type of %s", what
);
2555 if (streq(fstype
, "crypto_LUKS")) {
2556 log_error("nspawn currently does not support LUKS disk images.");
2560 if (mount(what
, p
, fstype
, MS_NODEV
|(rw
? 0 : MS_RDONLY
), NULL
) < 0) {
2561 log_error("Failed to mount %s: %m", what
);
2567 log_error("--image= is not supported, compiled without blkid support.");
2572 static int mount_devices(
2574 const char *root_device
, bool root_device_rw
,
2575 const char *home_device
, bool home_device_rw
,
2576 const char *srv_device
, bool srv_device_rw
) {
2582 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2584 log_error_errno(-r
, "Failed to mount root directory: %m");
2590 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2592 log_error_errno(-r
, "Failed to mount home directory: %m");
2598 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2600 log_error_errno(-r
, "Failed to mount server data directory: %m");
2608 static void loop_remove(int nr
, int *image_fd
) {
2609 _cleanup_close_
int control
= -1;
2615 if (image_fd
&& *image_fd
>= 0) {
2616 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2618 log_warning("Failed to close loop image: %m");
2619 *image_fd
= safe_close(*image_fd
);
2622 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2624 log_warning("Failed to open /dev/loop-control: %m");
2628 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2630 log_warning("Failed to remove loop %d: %m", nr
);
2633 static int spawn_getent(const char *database
, const char *key
, pid_t
*rpid
) {
2641 if (pipe2(pipe_fds
, O_CLOEXEC
) < 0) {
2642 log_error("Failed to allocate pipe: %m");
2648 log_error("Failed to fork getent child: %m");
2650 } else if (pid
== 0) {
2652 char *empty_env
= NULL
;
2654 if (dup3(pipe_fds
[1], STDOUT_FILENO
, 0) < 0)
2655 _exit(EXIT_FAILURE
);
2657 if (pipe_fds
[0] > 2)
2658 safe_close(pipe_fds
[0]);
2659 if (pipe_fds
[1] > 2)
2660 safe_close(pipe_fds
[1]);
2662 nullfd
= open("/dev/null", O_RDWR
);
2664 _exit(EXIT_FAILURE
);
2666 if (dup3(nullfd
, STDIN_FILENO
, 0) < 0)
2667 _exit(EXIT_FAILURE
);
2669 if (dup3(nullfd
, STDERR_FILENO
, 0) < 0)
2670 _exit(EXIT_FAILURE
);
2675 reset_all_signal_handlers();
2676 close_all_fds(NULL
, 0);
2678 execle("/usr/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
2679 execle("/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
2680 _exit(EXIT_FAILURE
);
2683 pipe_fds
[1] = safe_close(pipe_fds
[1]);
2690 static int change_uid_gid(char **_home
) {
2691 char line
[LINE_MAX
], *x
, *u
, *g
, *h
;
2692 const char *word
, *state
;
2693 _cleanup_free_ uid_t
*uids
= NULL
;
2694 _cleanup_free_
char *home
= NULL
;
2695 _cleanup_fclose_
FILE *f
= NULL
;
2696 _cleanup_close_
int fd
= -1;
2697 unsigned n_uids
= 0;
2706 if (!arg_user
|| streq(arg_user
, "root") || streq(arg_user
, "0")) {
2707 /* Reset everything fully to 0, just in case */
2709 if (setgroups(0, NULL
) < 0) {
2710 log_error("setgroups() failed: %m");
2714 if (setresgid(0, 0, 0) < 0) {
2715 log_error("setregid() failed: %m");
2719 if (setresuid(0, 0, 0) < 0) {
2720 log_error("setreuid() failed: %m");
2728 /* First, get user credentials */
2729 fd
= spawn_getent("passwd", arg_user
, &pid
);
2733 f
= fdopen(fd
, "r");
2738 if (!fgets(line
, sizeof(line
), f
)) {
2741 log_error("Failed to resolve user %s.", arg_user
);
2745 log_error("Failed to read from getent: %m");
2751 wait_for_terminate_and_warn("getent passwd", pid
);
2753 x
= strchr(line
, ':');
2755 log_error("/etc/passwd entry has invalid user field.");
2759 u
= strchr(x
+1, ':');
2761 log_error("/etc/passwd entry has invalid password field.");
2768 log_error("/etc/passwd entry has invalid UID field.");
2776 log_error("/etc/passwd entry has invalid GID field.");
2781 h
= strchr(x
+1, ':');
2783 log_error("/etc/passwd entry has invalid GECOS field.");
2790 log_error("/etc/passwd entry has invalid home directory field.");
2796 r
= parse_uid(u
, &uid
);
2798 log_error("Failed to parse UID of user.");
2802 r
= parse_gid(g
, &gid
);
2804 log_error("Failed to parse GID of user.");
2812 /* Second, get group memberships */
2813 fd
= spawn_getent("initgroups", arg_user
, &pid
);
2818 f
= fdopen(fd
, "r");
2823 if (!fgets(line
, sizeof(line
), f
)) {
2825 log_error("Failed to resolve user %s.", arg_user
);
2829 log_error("Failed to read from getent: %m");
2835 wait_for_terminate_and_warn("getent initgroups", pid
);
2837 /* Skip over the username and subsequent separator whitespace */
2839 x
+= strcspn(x
, WHITESPACE
);
2840 x
+= strspn(x
, WHITESPACE
);
2842 FOREACH_WORD(word
, l
, x
, state
) {
2848 if (!GREEDY_REALLOC(uids
, sz
, n_uids
+1))
2851 r
= parse_uid(c
, &uids
[n_uids
++]);
2853 log_error("Failed to parse group data from getent.");
2858 r
= mkdir_parents(home
, 0775);
2860 log_error_errno(-r
, "Failed to make home root directory: %m");
2864 r
= mkdir_safe(home
, 0755, uid
, gid
);
2865 if (r
< 0 && r
!= -EEXIST
) {
2866 log_error_errno(-r
, "Failed to make home directory: %m");
2870 fchown(STDIN_FILENO
, uid
, gid
);
2871 fchown(STDOUT_FILENO
, uid
, gid
);
2872 fchown(STDERR_FILENO
, uid
, gid
);
2874 if (setgroups(n_uids
, uids
) < 0) {
2875 log_error("Failed to set auxiliary groups: %m");
2879 if (setresgid(gid
, gid
, gid
) < 0) {
2880 log_error("setregid() failed: %m");
2884 if (setresuid(uid
, uid
, uid
) < 0) {
2885 log_error("setreuid() failed: %m");
2899 * < 0 : wait_for_terminate() failed to get the state of the
2900 * container, the container was terminated by a signal, or
2901 * failed for an unknown reason. No change is made to the
2902 * container argument.
2903 * > 0 : The program executed in the container terminated with an
2904 * error. The exit code of the program executed in the
2905 * container is returned. The container argument has been set
2906 * to CONTAINER_TERMINATED.
2907 * 0 : The container is being rebooted, has been shut down or exited
2908 * successfully. The container argument has been set to either
2909 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2911 * That is, success is indicated by a return value of zero, and an
2912 * error is indicated by a non-zero value.
2914 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2918 r
= wait_for_terminate(pid
, &status
);
2920 log_warning_errno(-r
, "Failed to wait for container: %m");
2924 switch (status
.si_code
) {
2927 if (status
.si_status
== 0) {
2928 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2931 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2933 *container
= CONTAINER_TERMINATED
;
2934 return status
.si_status
;
2937 if (status
.si_status
== SIGINT
) {
2939 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2940 *container
= CONTAINER_TERMINATED
;
2943 } else if (status
.si_status
== SIGHUP
) {
2945 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2946 *container
= CONTAINER_REBOOTED
;
2950 /* CLD_KILLED fallthrough */
2953 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2957 log_error("Container %s failed due to unknown reason.", arg_machine
);
2964 static void nop_handler(int sig
) {}
2966 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2969 pid
= PTR_TO_UINT32(userdata
);
2971 if (kill(pid
, SIGRTMIN
+3) >= 0) {
2972 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2973 sd_event_source_set_userdata(s
, NULL
);
2978 sd_event_exit(sd_event_source_get_event(s
), 0);
2982 int main(int argc
, char *argv
[]) {
2984 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
;
2985 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
2986 _cleanup_close_
int master
= -1, image_fd
= -1;
2987 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 };
2988 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
2989 int r
= EXIT_FAILURE
, k
, n_fd_passed
, loop_nr
= -1;
2990 const char *console
= NULL
;
2991 char veth_name
[IFNAMSIZ
];
2992 bool secondary
= false;
2993 sigset_t mask
, mask_chld
;
2996 log_parse_environment();
2999 k
= parse_argv(argc
, argv
);
3008 if (arg_directory
) {
3011 p
= path_make_absolute_cwd(arg_directory
);
3012 free(arg_directory
);
3015 arg_directory
= get_current_dir_name();
3017 if (!arg_directory
) {
3018 log_error("Failed to determine path, please use -D.");
3021 path_kill_slashes(arg_directory
);
3025 arg_machine
= strdup(basename(arg_image
? arg_image
: arg_directory
));
3031 hostname_cleanup(arg_machine
, false);
3032 if (isempty(arg_machine
)) {
3033 log_error("Failed to determine machine name automatically, please use -M.");
3038 if (geteuid() != 0) {
3039 log_error("Need to be root.");
3043 if (sd_booted() <= 0) {
3044 log_error("Not running on a systemd system.");
3049 n_fd_passed
= sd_listen_fds(false);
3050 if (n_fd_passed
> 0) {
3051 k
= fdset_new_listen_fds(&fds
, false);
3053 log_error_errno(-k
, "Failed to collect file descriptors: %m");
3057 fdset_close_others(fds
);
3060 if (arg_directory
) {
3061 if (path_equal(arg_directory
, "/")) {
3062 log_error("Spawning container on root directory not supported.");
3067 if (path_is_os_tree(arg_directory
) <= 0) {
3068 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3074 p
= strappenda(arg_directory
,
3075 argc
> optind
&& path_is_absolute(argv
[optind
]) ? argv
[optind
] : "/usr/bin/");
3076 if (access(p
, F_OK
) < 0) {
3077 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory
);
3083 char template[] = "/tmp/nspawn-root-XXXXXX";
3085 if (!mkdtemp(template)) {
3086 log_error("Failed to create temporary directory: %m");
3091 arg_directory
= strdup(template);
3092 if (!arg_directory
) {
3097 image_fd
= setup_image(&device_path
, &loop_nr
);
3103 r
= dissect_image(image_fd
,
3104 &root_device
, &root_device_rw
,
3105 &home_device
, &home_device_rw
,
3106 &srv_device
, &srv_device_rw
,
3112 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3114 log_error("Failed to acquire pseudo tty: %m");
3118 console
= ptsname(master
);
3120 log_error("Failed to determine tty name: %m");
3125 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3126 arg_machine
, arg_image
? arg_image
: arg_directory
);
3128 if (unlockpt(master
) < 0) {
3129 log_error("Failed to unlock tty: %m");
3133 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_NONBLOCK
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3134 log_error("Failed to create kmsg socket pair: %m");
3140 "STATUS=Container running.");
3142 assert_se(sigemptyset(&mask
) == 0);
3143 sigset_add_many(&mask
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1);
3144 assert_se(sigprocmask(SIG_BLOCK
, &mask
, NULL
) == 0);
3146 assert_se(sigemptyset(&mask_chld
) == 0);
3147 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3150 ContainerStatus container_status
;
3151 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3152 struct sigaction sa
= {
3153 .sa_handler
= nop_handler
,
3154 .sa_flags
= SA_NOCLDSTOP
,
3157 r
= barrier_create(&barrier
);
3159 log_error_errno(-r
, "Cannot initialize IPC barrier: %m");
3163 /* Child can be killed before execv(), so handle SIGCHLD
3164 * in order to interrupt parent's blocking calls and
3165 * give it a chance to call wait() and terminate. */
3166 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3168 log_error("Failed to change the signal mask: %m");
3172 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3174 log_error("Failed to install SIGCHLD handler: %m");
3178 pid
= syscall(__NR_clone
, SIGCHLD
|CLONE_NEWNS
|
3179 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
)|
3180 (arg_private_network
? CLONE_NEWNET
: 0), NULL
);
3182 if (errno
== EINVAL
)
3183 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3185 log_error("clone() failed: %m");
3193 _cleanup_free_
char *home
= NULL
;
3195 const char *envp
[] = {
3196 "PATH=" DEFAULT_PATH_SPLIT_USR
,
3197 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3202 NULL
, /* container_uuid */
3203 NULL
, /* LISTEN_FDS */
3204 NULL
, /* LISTEN_PID */
3209 barrier_set_role(&barrier
, BARRIER_CHILD
);
3211 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
3215 master
= safe_close(master
);
3217 close_nointr(STDIN_FILENO
);
3218 close_nointr(STDOUT_FILENO
);
3219 close_nointr(STDERR_FILENO
);
3221 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3223 reset_all_signal_handlers();
3224 reset_signal_mask();
3226 k
= open_terminal(console
, O_RDWR
);
3227 if (k
!= STDIN_FILENO
) {
3233 log_error_errno(-k
, "Failed to open console: %m");
3234 _exit(EXIT_FAILURE
);
3237 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
3238 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
) {
3239 log_error("Failed to duplicate console: %m");
3240 _exit(EXIT_FAILURE
);
3244 log_error("setsid() failed: %m");
3245 _exit(EXIT_FAILURE
);
3248 if (reset_audit_loginuid() < 0)
3249 _exit(EXIT_FAILURE
);
3251 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0) {
3252 log_error("PR_SET_PDEATHSIG failed: %m");
3253 _exit(EXIT_FAILURE
);
3256 /* Mark everything as slave, so that we still
3257 * receive mounts from the real root, but don't
3258 * propagate mounts to the real root. */
3259 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0) {
3260 log_error("MS_SLAVE|MS_REC failed: %m");
3261 _exit(EXIT_FAILURE
);
3264 if (mount_devices(arg_directory
,
3265 root_device
, root_device_rw
,
3266 home_device
, home_device_rw
,
3267 srv_device
, srv_device_rw
) < 0)
3268 _exit(EXIT_FAILURE
);
3270 /* Turn directory into bind mount */
3271 if (mount(arg_directory
, arg_directory
, "bind", MS_BIND
|MS_REC
, NULL
) < 0) {
3272 log_error("Failed to make bind mount: %m");
3273 _exit(EXIT_FAILURE
);
3276 r
= setup_volatile(arg_directory
);
3278 _exit(EXIT_FAILURE
);
3280 if (setup_volatile_state(arg_directory
) < 0)
3281 _exit(EXIT_FAILURE
);
3283 r
= base_filesystem_create(arg_directory
);
3285 _exit(EXIT_FAILURE
);
3287 if (arg_read_only
) {
3288 k
= bind_remount_recursive(arg_directory
, true);
3290 log_error_errno(-k
, "Failed to make tree read-only: %m");
3291 _exit(EXIT_FAILURE
);
3295 if (mount_all(arg_directory
) < 0)
3296 _exit(EXIT_FAILURE
);
3298 if (copy_devnodes(arg_directory
) < 0)
3299 _exit(EXIT_FAILURE
);
3301 if (setup_ptmx(arg_directory
) < 0)
3302 _exit(EXIT_FAILURE
);
3304 dev_setup(arg_directory
);
3306 if (setup_seccomp() < 0)
3307 _exit(EXIT_FAILURE
);
3309 if (setup_dev_console(arg_directory
, console
) < 0)
3310 _exit(EXIT_FAILURE
);
3312 if (setup_kmsg(arg_directory
, kmsg_socket_pair
[1]) < 0)
3313 _exit(EXIT_FAILURE
);
3315 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3317 if (setup_boot_id(arg_directory
) < 0)
3318 _exit(EXIT_FAILURE
);
3320 if (setup_timezone(arg_directory
) < 0)
3321 _exit(EXIT_FAILURE
);
3323 if (setup_resolv_conf(arg_directory
) < 0)
3324 _exit(EXIT_FAILURE
);
3326 if (setup_journal(arg_directory
) < 0)
3327 _exit(EXIT_FAILURE
);
3329 if (mount_binds(arg_directory
, arg_bind
, false) < 0)
3330 _exit(EXIT_FAILURE
);
3332 if (mount_binds(arg_directory
, arg_bind_ro
, true) < 0)
3333 _exit(EXIT_FAILURE
);
3335 if (mount_tmpfs(arg_directory
) < 0)
3336 _exit(EXIT_FAILURE
);
3338 /* Tell the parent that we are ready, and that
3339 * it can cgroupify us so that we lack access
3340 * to certain devices and resources. */
3341 (void)barrier_place(&barrier
);
3343 if (chdir(arg_directory
) < 0) {
3344 log_error("chdir(%s) failed: %m", arg_directory
);
3345 _exit(EXIT_FAILURE
);
3348 if (mount(arg_directory
, "/", NULL
, MS_MOVE
, NULL
) < 0) {
3349 log_error("mount(MS_MOVE) failed: %m");
3350 _exit(EXIT_FAILURE
);
3353 if (chroot(".") < 0) {
3354 log_error("chroot() failed: %m");
3355 _exit(EXIT_FAILURE
);
3358 if (chdir("/") < 0) {
3359 log_error("chdir() failed: %m");
3360 _exit(EXIT_FAILURE
);
3365 if (arg_private_network
)
3368 if (drop_capabilities() < 0) {
3369 log_error("drop_capabilities() failed: %m");
3370 _exit(EXIT_FAILURE
);
3373 r
= change_uid_gid(&home
);
3375 _exit(EXIT_FAILURE
);
3377 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
3378 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
3379 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0)) {
3381 _exit(EXIT_FAILURE
);
3384 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
3387 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0) {
3389 _exit(EXIT_FAILURE
);
3393 if (fdset_size(fds
) > 0) {
3394 k
= fdset_cloexec(fds
, false);
3396 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3397 _exit(EXIT_FAILURE
);
3400 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", n_fd_passed
) < 0) ||
3401 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0)) {
3403 _exit(EXIT_FAILURE
);
3409 if (arg_personality
!= 0xffffffffLU
) {
3410 if (personality(arg_personality
) < 0) {
3411 log_error("personality() failed: %m");
3412 _exit(EXIT_FAILURE
);
3414 } else if (secondary
) {
3415 if (personality(PER_LINUX32
) < 0) {
3416 log_error("personality() failed: %m");
3417 _exit(EXIT_FAILURE
);
3422 if (arg_selinux_context
)
3423 if (setexeccon((security_context_t
) arg_selinux_context
) < 0) {
3424 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context
);
3425 _exit(EXIT_FAILURE
);
3429 if (!strv_isempty(arg_setenv
)) {
3432 n
= strv_env_merge(2, envp
, arg_setenv
);
3435 _exit(EXIT_FAILURE
);
3440 env_use
= (char**) envp
;
3442 /* Wait until the parent is ready with the setup, too... */
3443 if (!barrier_place_and_sync(&barrier
))
3444 _exit(EXIT_FAILURE
);
3450 /* Automatically search for the init system */
3452 l
= 1 + argc
- optind
;
3453 a
= newa(char*, l
+ 1);
3454 memcpy(a
+ 1, argv
+ optind
, l
* sizeof(char*));
3456 a
[0] = (char*) "/usr/lib/systemd/systemd";
3457 execve(a
[0], a
, env_use
);
3459 a
[0] = (char*) "/lib/systemd/systemd";
3460 execve(a
[0], a
, env_use
);
3462 a
[0] = (char*) "/sbin/init";
3463 execve(a
[0], a
, env_use
);
3464 } else if (argc
> optind
)
3465 execvpe(argv
[optind
], argv
+ optind
, env_use
);
3467 chdir(home
? home
: "/root");
3468 execle("/bin/bash", "-bash", NULL
, env_use
);
3469 execle("/bin/sh", "-sh", NULL
, env_use
);
3472 log_error("execv() failed: %m");
3473 _exit(EXIT_FAILURE
);
3476 barrier_set_role(&barrier
, BARRIER_PARENT
);
3480 /* wait for child-setup to be done */
3481 if (barrier_place_and_sync(&barrier
)) {
3482 _cleanup_event_unref_ sd_event
*event
= NULL
;
3483 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3486 r
= move_network_interfaces(pid
);
3490 r
= setup_veth(pid
, veth_name
, &ifi
);
3494 r
= setup_bridge(veth_name
, &ifi
);
3498 r
= setup_macvlan(pid
);
3502 r
= register_machine(pid
, ifi
);
3506 /* Block SIGCHLD here, before notifying child.
3507 * process_pty() will handle it with the other signals. */
3508 r
= sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
);
3512 /* Reset signal to default */
3513 r
= default_signals(SIGCHLD
, -1);
3517 /* Notify the child that the parent is ready with all
3518 * its setup, and that the child can now hand over
3519 * control to the code to run inside the container. */
3520 (void)barrier_place(&barrier
);
3522 r
= sd_event_new(&event
);
3524 log_error_errno(-r
, "Failed to get default event source: %m");
3529 /* Try to kill the init system on SIGINT or SIGTERM */
3530 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3531 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3533 /* Immediately exit */
3534 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3535 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3538 /* simply exit on sigchld */
3539 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3541 r
= pty_forward_new(event
, master
, &forward
);
3543 log_error_errno(-r
, "Failed to create PTY forwarder: %m");
3547 r
= sd_event_loop(event
);
3549 log_error_errno(-r
, "Failed to run event loop: %m");
3553 forward
= pty_forward_free(forward
);
3558 /* Kill if it is not dead yet anyway */
3559 terminate_machine(pid
);
3562 /* Normally redundant, but better safe than sorry */
3565 r
= wait_for_container(pid
, &container_status
);
3569 /* We failed to wait for the container, or the
3570 * container exited abnormally */
3573 } else if (r
> 0 || container_status
== CONTAINER_TERMINATED
)
3574 /* The container exited with a non-zero
3575 * status, or with zero status and no reboot
3579 /* CONTAINER_REBOOTED, loop again */
3581 if (arg_keep_unit
) {
3582 /* Special handling if we are running as a
3583 * service: instead of simply restarting the
3584 * machine we want to restart the entire
3585 * service, so let's inform systemd about this
3586 * with the special exit code 133. The service
3587 * file uses RestartForceExitStatus=133 so
3588 * that this results in a full nspawn
3589 * restart. This is necessary since we might
3590 * have cgroup parameters set we want to have
3600 "STATUS=Terminating...");
3602 loop_remove(loop_nr
, &image_fd
);
3607 free(arg_directory
);
3610 strv_free(arg_setenv
);
3611 strv_free(arg_network_interfaces
);
3612 strv_free(arg_network_macvlan
);
3613 strv_free(arg_bind
);
3614 strv_free(arg_bind_ro
);
3615 strv_free(arg_tmpfs
);