1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <blkid/blkid.h>
27 #include <linux/loop.h>
33 #include <selinux/selinux.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
46 #include "sd-daemon.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
54 #include "capability.h"
55 #include "cgroup-util.h"
57 #include "dev-setup.h"
59 #include "event-util.h"
63 #include "formats-util.h"
65 #include "hostname-util.h"
67 #include "loopback-setup.h"
68 #include "machine-image.h"
72 #include "netlink-util.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-expose-ports.h"
75 #include "nspawn-mount.h"
76 #include "nspawn-network.h"
77 #include "nspawn-register.h"
78 #include "nspawn-settings.h"
79 #include "nspawn-setuid.h"
80 #include "path-util.h"
81 #include "process-util.h"
83 #include "random-util.h"
86 #include "seccomp-util.h"
88 #include "signal-util.h"
89 #include "string-util.h"
91 #include "terminal-util.h"
92 #include "udev-util.h"
93 #include "user-util.h"
96 typedef enum ContainerStatus
{
101 typedef enum LinkJournal
{
108 static char *arg_directory
= NULL
;
109 static char *arg_template
= NULL
;
110 static char *arg_user
= NULL
;
111 static sd_id128_t arg_uuid
= {};
112 static char *arg_machine
= NULL
;
113 static const char *arg_selinux_context
= NULL
;
114 static const char *arg_selinux_apifs_context
= NULL
;
115 static const char *arg_slice
= NULL
;
116 static bool arg_private_network
= false;
117 static bool arg_read_only
= false;
118 static bool arg_boot
= false;
119 static bool arg_ephemeral
= false;
120 static LinkJournal arg_link_journal
= LINK_AUTO
;
121 static bool arg_link_journal_try
= false;
122 static uint64_t arg_retain
=
123 (1ULL << CAP_CHOWN
) |
124 (1ULL << CAP_DAC_OVERRIDE
) |
125 (1ULL << CAP_DAC_READ_SEARCH
) |
126 (1ULL << CAP_FOWNER
) |
127 (1ULL << CAP_FSETID
) |
128 (1ULL << CAP_IPC_OWNER
) |
130 (1ULL << CAP_LEASE
) |
131 (1ULL << CAP_LINUX_IMMUTABLE
) |
132 (1ULL << CAP_NET_BIND_SERVICE
) |
133 (1ULL << CAP_NET_BROADCAST
) |
134 (1ULL << CAP_NET_RAW
) |
135 (1ULL << CAP_SETGID
) |
136 (1ULL << CAP_SETFCAP
) |
137 (1ULL << CAP_SETPCAP
) |
138 (1ULL << CAP_SETUID
) |
139 (1ULL << CAP_SYS_ADMIN
) |
140 (1ULL << CAP_SYS_CHROOT
) |
141 (1ULL << CAP_SYS_NICE
) |
142 (1ULL << CAP_SYS_PTRACE
) |
143 (1ULL << CAP_SYS_TTY_CONFIG
) |
144 (1ULL << CAP_SYS_RESOURCE
) |
145 (1ULL << CAP_SYS_BOOT
) |
146 (1ULL << CAP_AUDIT_WRITE
) |
147 (1ULL << CAP_AUDIT_CONTROL
) |
149 static CustomMount
*arg_custom_mounts
= NULL
;
150 static unsigned arg_n_custom_mounts
= 0;
151 static char **arg_setenv
= NULL
;
152 static bool arg_quiet
= false;
153 static bool arg_share_system
= false;
154 static bool arg_register
= true;
155 static bool arg_keep_unit
= false;
156 static char **arg_network_interfaces
= NULL
;
157 static char **arg_network_macvlan
= NULL
;
158 static char **arg_network_ipvlan
= NULL
;
159 static bool arg_network_veth
= false;
160 static char *arg_network_bridge
= NULL
;
161 static unsigned long arg_personality
= PERSONALITY_INVALID
;
162 static char *arg_image
= NULL
;
163 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
164 static ExposePort
*arg_expose_ports
= NULL
;
165 static char **arg_property
= NULL
;
166 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
167 static bool arg_userns
= false;
168 static int arg_kill_signal
= 0;
169 static bool arg_unified_cgroup_hierarchy
= false;
170 static SettingsMask arg_settings_mask
= 0;
171 static int arg_settings_trusted
= -1;
172 static char **arg_parameters
= NULL
;
174 static void help(void) {
175 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
176 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
177 " -h --help Show this help\n"
178 " --version Print version string\n"
179 " -q --quiet Do not show status information\n"
180 " -D --directory=PATH Root directory for the container\n"
181 " --template=PATH Initialize root directory from template directory,\n"
183 " -x --ephemeral Run container with snapshot of root directory, and\n"
184 " remove it after exit\n"
185 " -i --image=PATH File system device or disk image for the container\n"
186 " -b --boot Boot up full system (i.e. invoke init)\n"
187 " -u --user=USER Run the command under specified user or uid\n"
188 " -M --machine=NAME Set the machine name for the container\n"
189 " --uuid=UUID Set a specific machine UUID for the container\n"
190 " -S --slice=SLICE Place the container in the specified slice\n"
191 " --property=NAME=VALUE Set scope unit property\n"
192 " --private-users[=UIDBASE[:NUIDS]]\n"
193 " Run within user namespace\n"
194 " --private-network Disable network in container\n"
195 " --network-interface=INTERFACE\n"
196 " Assign an existing network interface to the\n"
198 " --network-macvlan=INTERFACE\n"
199 " Create a macvlan network interface based on an\n"
200 " existing network interface to the container\n"
201 " --network-ipvlan=INTERFACE\n"
202 " Create a ipvlan network interface based on an\n"
203 " existing network interface to the container\n"
204 " -n --network-veth Add a virtual ethernet connection between host\n"
206 " --network-bridge=INTERFACE\n"
207 " Add a virtual ethernet connection between host\n"
208 " and container and add it to an existing bridge on\n"
210 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
211 " Expose a container IP port on the host\n"
212 " -Z --selinux-context=SECLABEL\n"
213 " Set the SELinux security context to be used by\n"
214 " processes in the container\n"
215 " -L --selinux-apifs-context=SECLABEL\n"
216 " Set the SELinux security context to be used by\n"
217 " API/tmpfs file systems in the container\n"
218 " --capability=CAP In addition to the default, retain specified\n"
220 " --drop-capability=CAP Drop the specified capability from the default set\n"
221 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
222 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
223 " try-guest, try-host\n"
224 " -j Equivalent to --link-journal=try-guest\n"
225 " --read-only Mount the root directory read-only\n"
226 " --bind=PATH[:PATH[:OPTIONS]]\n"
227 " Bind mount a file or directory from the host into\n"
229 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
230 " Similar, but creates a read-only bind mount\n"
231 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
232 " --overlay=PATH[:PATH...]:PATH\n"
233 " Create an overlay mount from the host to \n"
235 " --overlay-ro=PATH[:PATH...]:PATH\n"
236 " Similar, but creates a read-only overlay mount\n"
237 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
238 " --share-system Share system namespaces with host\n"
239 " --register=BOOLEAN Register container as machine\n"
240 " --keep-unit Do not register a scope for the machine, reuse\n"
241 " the service unit nspawn is running in\n"
242 " --volatile[=MODE] Run the system in volatile mode\n"
243 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
244 , program_invocation_short_name
);
248 static int custom_mounts_prepare(void) {
252 /* Ensure the mounts are applied prefix first. */
253 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
255 /* Allocate working directories for the overlay file systems that need it */
256 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
257 CustomMount
*m
= &arg_custom_mounts
[i
];
259 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
260 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
264 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
273 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
275 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
281 static int detect_unified_cgroup_hierarchy(void) {
285 /* Allow the user to control whether the unified hierarchy is used */
286 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
288 r
= parse_boolean(e
);
290 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
292 arg_unified_cgroup_hierarchy
= r
;
296 /* Otherwise inherit the default from the host system */
299 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
301 arg_unified_cgroup_hierarchy
= r
;
305 static int parse_argv(int argc
, char *argv
[]) {
324 ARG_NETWORK_INTERFACE
,
337 static const struct option options
[] = {
338 { "help", no_argument
, NULL
, 'h' },
339 { "version", no_argument
, NULL
, ARG_VERSION
},
340 { "directory", required_argument
, NULL
, 'D' },
341 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
342 { "ephemeral", no_argument
, NULL
, 'x' },
343 { "user", required_argument
, NULL
, 'u' },
344 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
345 { "boot", no_argument
, NULL
, 'b' },
346 { "uuid", required_argument
, NULL
, ARG_UUID
},
347 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
348 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
349 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
350 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
351 { "bind", required_argument
, NULL
, ARG_BIND
},
352 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
353 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
354 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
355 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
356 { "machine", required_argument
, NULL
, 'M' },
357 { "slice", required_argument
, NULL
, 'S' },
358 { "setenv", required_argument
, NULL
, ARG_SETENV
},
359 { "selinux-context", required_argument
, NULL
, 'Z' },
360 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
361 { "quiet", no_argument
, NULL
, 'q' },
362 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
363 { "register", required_argument
, NULL
, ARG_REGISTER
},
364 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
365 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
366 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
367 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
368 { "network-veth", no_argument
, NULL
, 'n' },
369 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
370 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
371 { "image", required_argument
, NULL
, 'i' },
372 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
373 { "port", required_argument
, NULL
, 'p' },
374 { "property", required_argument
, NULL
, ARG_PROPERTY
},
375 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
376 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
377 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
382 uint64_t plus
= 0, minus
= 0;
383 bool mask_all_settings
= false, mask_no_settings
= false;
388 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
400 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
406 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
412 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
418 arg_ephemeral
= true;
422 r
= free_and_strdup(&arg_user
, optarg
);
426 arg_settings_mask
|= SETTING_USER
;
429 case ARG_NETWORK_BRIDGE
:
430 r
= free_and_strdup(&arg_network_bridge
, optarg
);
437 arg_network_veth
= true;
438 arg_private_network
= true;
439 arg_settings_mask
|= SETTING_NETWORK
;
442 case ARG_NETWORK_INTERFACE
:
443 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
446 arg_private_network
= true;
447 arg_settings_mask
|= SETTING_NETWORK
;
450 case ARG_NETWORK_MACVLAN
:
451 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
454 arg_private_network
= true;
455 arg_settings_mask
|= SETTING_NETWORK
;
458 case ARG_NETWORK_IPVLAN
:
459 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
464 case ARG_PRIVATE_NETWORK
:
465 arg_private_network
= true;
466 arg_settings_mask
|= SETTING_NETWORK
;
471 arg_settings_mask
|= SETTING_BOOT
;
475 r
= sd_id128_from_string(optarg
, &arg_uuid
);
477 log_error("Invalid UUID: %s", optarg
);
481 arg_settings_mask
|= SETTING_MACHINE_ID
;
490 arg_machine
= mfree(arg_machine
);
492 if (!machine_name_is_valid(optarg
)) {
493 log_error("Invalid machine name: %s", optarg
);
497 r
= free_and_strdup(&arg_machine
, optarg
);
505 arg_selinux_context
= optarg
;
509 arg_selinux_apifs_context
= optarg
;
513 arg_read_only
= true;
514 arg_settings_mask
|= SETTING_READ_ONLY
;
518 case ARG_DROP_CAPABILITY
: {
519 const char *state
, *word
;
522 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
523 _cleanup_free_
char *t
;
525 t
= strndup(word
, length
);
529 if (streq(t
, "all")) {
530 if (c
== ARG_CAPABILITY
)
531 plus
= (uint64_t) -1;
533 minus
= (uint64_t) -1;
537 cap
= capability_from_name(t
);
539 log_error("Failed to parse capability %s.", t
);
543 if (c
== ARG_CAPABILITY
)
544 plus
|= 1ULL << (uint64_t) cap
;
546 minus
|= 1ULL << (uint64_t) cap
;
550 arg_settings_mask
|= SETTING_CAPABILITY
;
555 arg_link_journal
= LINK_GUEST
;
556 arg_link_journal_try
= true;
559 case ARG_LINK_JOURNAL
:
560 if (streq(optarg
, "auto")) {
561 arg_link_journal
= LINK_AUTO
;
562 arg_link_journal_try
= false;
563 } else if (streq(optarg
, "no")) {
564 arg_link_journal
= LINK_NO
;
565 arg_link_journal_try
= false;
566 } else if (streq(optarg
, "guest")) {
567 arg_link_journal
= LINK_GUEST
;
568 arg_link_journal_try
= false;
569 } else if (streq(optarg
, "host")) {
570 arg_link_journal
= LINK_HOST
;
571 arg_link_journal_try
= false;
572 } else if (streq(optarg
, "try-guest")) {
573 arg_link_journal
= LINK_GUEST
;
574 arg_link_journal_try
= true;
575 } else if (streq(optarg
, "try-host")) {
576 arg_link_journal
= LINK_HOST
;
577 arg_link_journal_try
= true;
579 log_error("Failed to parse link journal mode %s", optarg
);
587 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
589 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
591 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
595 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
597 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
599 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
603 case ARG_OVERLAY_RO
: {
604 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
605 _cleanup_strv_free_
char **lower
= NULL
;
610 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
614 log_error("Invalid overlay specification: %s", optarg
);
618 STRV_FOREACH(i
, lower
) {
619 if (!path_is_absolute(*i
)) {
620 log_error("Overlay path %s is not absolute.", *i
);
628 log_error("--overlay= needs at least two colon-separated directories specified.");
633 /* If two parameters are specified,
634 * the first one is the lower, the
635 * second one the upper directory. And
636 * we'll also define the destination
637 * mount point the same as the upper. */
641 destination
= strdup(upper
);
646 upper
= lower
[n
- 2];
647 destination
= lower
[n
- 1];
651 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
655 m
->destination
= destination
;
658 m
->read_only
= c
== ARG_OVERLAY_RO
;
660 upper
= destination
= NULL
;
663 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
670 if (!env_assignment_is_valid(optarg
)) {
671 log_error("Environment variable assignment '%s' is not valid.", optarg
);
675 n
= strv_env_set(arg_setenv
, optarg
);
679 strv_free(arg_setenv
);
682 arg_settings_mask
|= SETTING_ENVIRONMENT
;
690 case ARG_SHARE_SYSTEM
:
691 arg_share_system
= true;
695 r
= parse_boolean(optarg
);
697 log_error("Failed to parse --register= argument: %s", optarg
);
705 arg_keep_unit
= true;
708 case ARG_PERSONALITY
:
710 arg_personality
= personality_from_string(optarg
);
711 if (arg_personality
== PERSONALITY_INVALID
) {
712 log_error("Unknown or unsupported personality '%s'.", optarg
);
716 arg_settings_mask
|= SETTING_PERSONALITY
;
722 arg_volatile_mode
= VOLATILE_YES
;
726 m
= volatile_mode_from_string(optarg
);
728 log_error("Failed to parse --volatile= argument: %s", optarg
);
731 arg_volatile_mode
= m
;
734 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
738 r
= expose_port_parse(&arg_expose_ports
, optarg
);
740 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
742 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
744 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
748 if (strv_extend(&arg_property
, optarg
) < 0)
753 case ARG_PRIVATE_USERS
:
755 _cleanup_free_
char *buffer
= NULL
;
756 const char *range
, *shift
;
758 range
= strchr(optarg
, ':');
760 buffer
= strndup(optarg
, range
- optarg
);
766 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
767 log_error("Failed to parse UID range: %s", range
);
773 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
774 log_error("Failed to parse UID: %s", optarg
);
782 case ARG_KILL_SIGNAL
:
783 arg_kill_signal
= signal_from_string_try_harder(optarg
);
784 if (arg_kill_signal
< 0) {
785 log_error("Cannot parse signal: %s", optarg
);
789 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
794 /* no → do not read files
795 * yes → read files, do not override cmdline, trust only subset
796 * override → read files, override cmdline, trust only subset
797 * trusted → read files, do not override cmdline, trust all
800 r
= parse_boolean(optarg
);
802 if (streq(optarg
, "trusted")) {
803 mask_all_settings
= false;
804 mask_no_settings
= false;
805 arg_settings_trusted
= true;
807 } else if (streq(optarg
, "override")) {
808 mask_all_settings
= false;
809 mask_no_settings
= true;
810 arg_settings_trusted
= -1;
812 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
815 mask_all_settings
= false;
816 mask_no_settings
= false;
817 arg_settings_trusted
= -1;
820 mask_all_settings
= true;
821 mask_no_settings
= false;
822 arg_settings_trusted
= false;
831 assert_not_reached("Unhandled option");
834 if (arg_share_system
)
835 arg_register
= false;
837 if (arg_boot
&& arg_share_system
) {
838 log_error("--boot and --share-system may not be combined.");
842 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
843 log_error("--keep-unit may not be used when invoked from a user session.");
847 if (arg_directory
&& arg_image
) {
848 log_error("--directory= and --image= may not be combined.");
852 if (arg_template
&& arg_image
) {
853 log_error("--template= and --image= may not be combined.");
857 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
858 log_error("--template= needs --directory= or --machine=.");
862 if (arg_ephemeral
&& arg_template
) {
863 log_error("--ephemeral and --template= may not be combined.");
867 if (arg_ephemeral
&& arg_image
) {
868 log_error("--ephemeral and --image= may not be combined.");
872 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
873 log_error("--ephemeral and --link-journal= may not be combined.");
877 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
878 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
881 arg_parameters
= strv_copy(argv
+ optind
);
885 arg_settings_mask
|= SETTING_BOOT
;
888 /* Load all settings from .nspawn files */
889 if (mask_no_settings
)
890 arg_settings_mask
= 0;
892 /* Don't load any settings from .nspawn files */
893 if (mask_all_settings
)
894 arg_settings_mask
= _SETTINGS_MASK_ALL
;
896 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
898 r
= detect_unified_cgroup_hierarchy();
905 static int verify_arguments(void) {
907 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
908 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
912 if (arg_expose_ports
&& !arg_private_network
) {
913 log_error("Cannot use --port= without private networking.");
917 if (arg_boot
&& arg_kill_signal
<= 0)
918 arg_kill_signal
= SIGRTMIN
+3;
923 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
929 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
932 if (uid
!= UID_INVALID
) {
933 uid
+= arg_uid_shift
;
935 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
939 if (gid
!= GID_INVALID
) {
940 gid
+= (gid_t
) arg_uid_shift
;
942 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
946 if (lchown(p
, uid
, gid
) < 0)
952 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
955 q
= prefix_roota(root
, path
);
956 if (mkdir(q
, mode
) < 0) {
962 return userns_lchown(q
, uid
, gid
);
965 static int setup_timezone(const char *dest
) {
966 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
967 const char *where
, *check
, *what
;
973 /* Fix the timezone, if possible */
974 r
= readlink_malloc("/etc/localtime", &p
);
976 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
980 z
= path_startswith(p
, "../usr/share/zoneinfo/");
982 z
= path_startswith(p
, "/usr/share/zoneinfo/");
984 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
988 where
= prefix_roota(dest
, "/etc/localtime");
989 r
= readlink_malloc(where
, &q
);
991 y
= path_startswith(q
, "../usr/share/zoneinfo/");
993 y
= path_startswith(q
, "/usr/share/zoneinfo/");
995 /* Already pointing to the right place? Then do nothing. */
996 if (y
&& streq(y
, z
))
1000 check
= strjoina("/usr/share/zoneinfo/", z
);
1001 check
= prefix_root(dest
, check
);
1002 if (laccess(check
, F_OK
) < 0) {
1003 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1008 if (r
< 0 && errno
!= ENOENT
) {
1009 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1013 what
= strjoina("../usr/share/zoneinfo/", z
);
1014 if (symlink(what
, where
) < 0) {
1015 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1019 r
= userns_lchown(where
, 0, 0);
1021 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1026 static int setup_resolv_conf(const char *dest
) {
1027 const char *where
= NULL
;
1032 if (arg_private_network
)
1035 /* Fix resolv.conf, if possible */
1036 where
= prefix_roota(dest
, "/etc/resolv.conf");
1038 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1040 /* If the file already exists as symlink, let's
1041 * suppress the warning, under the assumption that
1042 * resolved or something similar runs inside and the
1043 * symlink points there.
1045 * If the disk image is read-only, there's also no
1046 * point in complaining.
1048 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1049 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1053 r
= userns_lchown(where
, 0, 0);
1055 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1060 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1064 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1065 SD_ID128_FORMAT_VAL(id
));
1070 static int setup_boot_id(const char *dest
) {
1071 const char *from
, *to
;
1072 sd_id128_t rnd
= {};
1076 if (arg_share_system
)
1079 /* Generate a new randomized boot ID, so that each boot-up of
1080 * the container gets a new one */
1082 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1083 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1085 r
= sd_id128_randomize(&rnd
);
1087 return log_error_errno(r
, "Failed to generate random boot id: %m");
1089 id128_format_as_uuid(rnd
, as_uuid
);
1091 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1093 return log_error_errno(r
, "Failed to write boot id: %m");
1095 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1096 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1097 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1098 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1104 static int copy_devnodes(const char *dest
) {
1106 static const char devnodes
[] =
1117 _cleanup_umask_ mode_t u
;
1123 /* Create /dev/net, so that we can create /dev/net/tun in it */
1124 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1125 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1127 NULSTR_FOREACH(d
, devnodes
) {
1128 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1131 from
= strappend("/dev/", d
);
1132 to
= prefix_root(dest
, from
);
1134 if (stat(from
, &st
) < 0) {
1136 if (errno
!= ENOENT
)
1137 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1139 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1141 log_error("%s is not a char or block device, cannot copy.", from
);
1145 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1147 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1149 /* Some systems abusively restrict mknod but
1150 * allow bind mounts. */
1153 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1154 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1155 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1158 r
= userns_lchown(to
, 0, 0);
1160 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1167 static int setup_pts(const char *dest
) {
1168 _cleanup_free_
char *options
= NULL
;
1172 if (arg_selinux_apifs_context
)
1173 (void) asprintf(&options
,
1174 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1175 arg_uid_shift
+ TTY_GID
,
1176 arg_selinux_apifs_context
);
1179 (void) asprintf(&options
,
1180 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1181 arg_uid_shift
+ TTY_GID
);
1186 /* Mount /dev/pts itself */
1187 p
= prefix_roota(dest
, "/dev/pts");
1188 if (mkdir(p
, 0755) < 0)
1189 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1190 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1191 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1192 if (userns_lchown(p
, 0, 0) < 0)
1193 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1195 /* Create /dev/ptmx symlink */
1196 p
= prefix_roota(dest
, "/dev/ptmx");
1197 if (symlink("pts/ptmx", p
) < 0)
1198 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1199 if (userns_lchown(p
, 0, 0) < 0)
1200 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1202 /* And fix /dev/pts/ptmx ownership */
1203 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1204 if (userns_lchown(p
, 0, 0) < 0)
1205 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
1210 static int setup_dev_console(const char *dest
, const char *console
) {
1211 _cleanup_umask_ mode_t u
;
1220 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1222 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1224 /* We need to bind mount the right tty to /dev/console since
1225 * ptys can only exist on pts file systems. To have something
1226 * to bind mount things on we create an empty regular file. */
1228 to
= prefix_roota(dest
, "/dev/console");
1231 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1233 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1234 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
1239 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1240 const char *from
, *to
;
1241 _cleanup_umask_ mode_t u
;
1244 assert(kmsg_socket
>= 0);
1248 /* We create the kmsg FIFO as /run/kmsg, but immediately
1249 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1250 * on the reading side behave very similarly to /proc/kmsg,
1251 * their writing side behaves differently from /dev/kmsg in
1252 * that writing blocks when nothing is reading. In order to
1253 * avoid any problems with containers deadlocking due to this
1254 * we simply make /dev/kmsg unavailable to the container. */
1255 from
= prefix_roota(dest
, "/run/kmsg");
1256 to
= prefix_roota(dest
, "/proc/kmsg");
1258 if (mkfifo(from
, 0600) < 0)
1259 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1260 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1261 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1263 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1265 return log_error_errno(errno
, "Failed to open fifo: %m");
1267 /* Store away the fd in the socket, so that it stays open as
1268 * long as we run the child */
1269 r
= send_one_fd(kmsg_socket
, fd
, 0);
1273 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1275 /* And now make the FIFO unavailable as /run/kmsg... */
1276 (void) unlink(from
);
1281 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1282 union in_addr_union
*exposed
= userdata
;
1288 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
1292 static int setup_hostname(void) {
1294 if (arg_share_system
)
1297 if (sethostname_idempotent(arg_machine
) < 0)
/* Connects the journal directory /var/log/journal/<machine-id> of the
 * container rooted at `directory` with the host's, as selected by
 * arg_link_journal (LINK_NO, LINK_AUTO, LINK_GUEST, LINK_HOST):
 *
 *  - the machine ID is read from <directory>/etc/machine-id, validated,
 *    and compared with the host's own ID (equal IDs are refused);
 *  - /var, /var/log and /var/log/journal are created in the container
 *    with userns_mkdir();
 *  - for LINK_GUEST a symlink is created on the host pointing into the
 *    guest, otherwise the host directory is bind mounted into the guest.
 *
 * userns_mkdir() and readlink_and_make_absolute() report failures through
 * their return value (stored in r), so that value, not errno, is what
 * gets logged for them. */
1303 static int setup_journal(const char *directory
) {
1304 sd_id128_t machine_id
, this_id
;
1305 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1306 const char *etc_machine_id
, *p
, *q
;
1310 /* Don't link journals in ephemeral mode */
1314 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1316 r
= read_one_line_file(etc_machine_id
, &b
);
1317 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1320 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1323 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1326 /* Verify validity */
1327 r
= sd_id128_from_string(id
, &machine_id
);
1329 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1331 r
= sd_id128_get_machine(&this_id
);
1333 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1335 if (sd_id128_equal(machine_id
, this_id
)) {
1336 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1337 "Host and machine ids are equal (%s): refusing to link journals", id
);
1338 if (arg_link_journal
== LINK_AUTO
)
1343 if (arg_link_journal
== LINK_NO
)
1346 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1348 return log_error_errno(r
, "Failed to create /var: %m");
1350 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1352 return log_error_errno(r
, "Failed to create /var/log: %m");
1354 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1356 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
/* p is the journal path on the host, q the same path below the
 * container root. */
1358 p
= strjoina("/var/log/journal/", id
);
1359 q
= prefix_roota(directory
, p
);
1361 if (path_is_mount_point(p
, 0) > 0) {
1362 if (arg_link_journal
!= LINK_AUTO
) {
1363 log_error("%s: already a mount point, refusing to use for journal", p
);
1370 if (path_is_mount_point(q
, 0) > 0) {
1371 if (arg_link_journal
!= LINK_AUTO
) {
1372 log_error("%s: already a mount point, refusing to use for journal", q
);
1379 r
= readlink_and_make_absolute(p
, &d
);
1381 if ((arg_link_journal
== LINK_GUEST
||
1382 arg_link_journal
== LINK_AUTO
) &&
1385 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
/* Log the code returned by userns_mkdir(), not errno. */
1387 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1392 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1393 } else if (r
== -EINVAL
) {
1395 if (arg_link_journal
== LINK_GUEST
&&
1398 if (errno
== ENOTDIR
) {
1399 log_error("%s already exists and is neither a symlink nor a directory", p
);
1402 log_error_errno(errno
, "Failed to remove %s: %m", p
);
1406 } else if (r
!= -ENOENT
) {
/* r holds the readlink_and_make_absolute() failure. */
1407 log_error_errno(r
, "readlink(%s) failed: %m", p
);
1411 if (arg_link_journal
== LINK_GUEST
) {
1413 if (symlink(q
, p
) < 0) {
1414 if (arg_link_journal_try
) {
1415 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1418 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1423 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
/* Log the code returned by userns_mkdir(), not errno. */
1425 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1429 if (arg_link_journal
== LINK_HOST
) {
1430 /* don't create parents here -- if the host doesn't have
1431 * permanent journal set up, don't force it here */
1434 if (arg_link_journal_try
) {
1435 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1438 log_error_errno(errno
, "Failed to create %s: %m", p
);
1443 } else if (access(p
, F_OK
) < 0)
1446 if (dir_is_empty(q
) == 0)
1447 log_warning("%s is not empty, proceeding anyway.", q
);
1449 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
/* Log the code returned by userns_mkdir(), not errno. */
1451 log_error_errno(r
, "Failed to create %s: %m", q
);
1455 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1456 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
/* Calls capability_bounding_set_drop() with the bitwise complement of
 * arg_retain (and `false` as second argument) and returns its result. */
1461 static int drop_capabilities(void) {
1462 return capability_bounding_set_drop(~arg_retain
, false);
/* Resets the audit login UID of the calling process: reads
 * /proc/self/loginuid and, unless it already contains "4294967295",
 * writes that value back. arg_share_system is checked first (its branch
 * body is not visible here). When the write fails a multi-line
 * explanation is logged; its wording ("turn off auditing") is corrected
 * here. */
1465 static int reset_audit_loginuid(void) {
1466 _cleanup_free_
char *p
= NULL
;
1469 if (arg_share_system
)
1472 r
= read_one_line_file("/proc/self/loginuid", &p
);
1476 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1478 /* Already reset? */
1479 if (streq(p
, "4294967295"))
1482 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1485 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1486 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1487 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1488 "your kernel or to turn off auditing with 'audit=0' on the kernel command line before\n"
1489 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Builds and loads a seccomp filter whose default action is
 * SCMP_ACT_ALLOW:
 *  - each blacklist entry pairs a capability with a syscall; the entry's
 *    capability bit is tested against arg_retain (the skipped branch is
 *    not visible here) and the syscall is made to fail with EPERM;
 *  - socket creation for AF_NETLINK/NETLINK_AUDIT is made to fail with
 *    EAFNOSUPPORT;
 *  - the SCMP_FLTATR_CTL_NNP attribute is set to 0 before loading.
 * The filter context is released with seccomp_release() at the end. */
1497 static int setup_seccomp(void) {
1500 static const struct {
1501 uint64_t capability
;
1504 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1505 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1506 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1507 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1508 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1509 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1510 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1511 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1512 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1513 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1516 scmp_filter_ctx seccomp
;
1520 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1524 r
= seccomp_add_secondary_archs(seccomp
);
1526 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
1530 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1531 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1534 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1536 continue; /* unknown syscall */
1538 log_error_errno(r
, "Failed to block syscall: %m");
1545 Audit is broken in containers, much of the userspace audit
1546 hookup will fail if running inside a container. We don't
1547 care and just turn off creation of audit sockets.
1549 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1550 with EAFNOSUPPORT which audit userspace uses as indication
1551 that audit is disabled in the kernel.
1554 r
= seccomp_rule_add(
1556 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1559 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1560 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1562 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1566 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1568 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1572 r
= seccomp_load(seccomp
);
1574 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1579 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1584 seccomp_release(seccomp
);
/* Sets up the mount propagation directory for the container rooted at
 * `root`:
 *  - on the host, /run/systemd/nspawn/propagate/<arg_machine> is created
 *    (best effort, results ignored);
 *  - inside the container tree, /run/systemd, /run/systemd/nspawn and
 *    /run/systemd/nspawn/incoming are created with userns_mkdir();
 *  - the host directory is bind mounted onto the container's "incoming"
 *    directory and that mount is then remounted read-only.
 *
 * userns_mkdir() reports failure through its return value, so that value
 * is logged rather than errno. The mount() error messages carry %m so the
 * errno passed to log_error_errno() actually shows up in the log. */
1592 static int setup_propagate(const char *root
) {
int r;
1595 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1596 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1597 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1598 (void) mkdir_p(p
, 0600);
1600 r
= userns_mkdir(root
, "/run/systemd", 0755, 0, 0);
if (r < 0)
1601 return log_error_errno(r
, "Failed to create /run/systemd: %m");
1603 r
= userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0);
if (r < 0)
1604 return log_error_errno(r
, "Failed to create /run/systemd/nspawn: %m");
1606 r
= userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0);
if (r < 0)
1607 return log_error_errno(r
, "Failed to create /run/systemd/nspawn/incoming: %m");
1609 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1610 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1611 return log_error_errno(errno
, "Failed to install propagation bind mount: %m");
1613 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1614 return log_error_errno(errno
, "Failed to make propagation mount read-only: %m");
/* Opens arg_image (read-only or read-write depending on arg_read_only)
 * and turns it into a block device path returned through *device_path:
 *  - if the image already is a block device, its path is duplicated;
 *  - if it is a regular file, a free loop device number is requested
 *    from /dev/loop-control (LOOP_CTL_GET_FREE), /dev/loop<nr> is opened,
 *    the image is attached with LOOP_SET_FD and configured with
 *    LOOP_SET_STATUS64 (LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN, plus
 *    LO_FLAGS_READ_ONLY in a branch whose condition is not visible here);
 *  - anything else is rejected.
 *
 * The "neither regular file nor block device" case is not the result of a
 * failed system call, so it is logged without %m/errno. */
1619 static int setup_image(char **device_path
, int *loop_nr
) {
1620 struct loop_info64 info
= {
1621 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1623 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1624 _cleanup_free_
char* loopdev
= NULL
;
1628 assert(device_path
);
1632 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1634 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1636 if (fstat(fd
, &st
) < 0)
1637 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1639 if (S_ISBLK(st
.st_mode
)) {
1642 p
= strdup(arg_image
);
1656 if (!S_ISREG(st
.st_mode
)) {
/* No call failed here: errno is stale, so do not print it. */
1657 log_error("%s is not a regular file or block device.", arg_image
);
1661 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1663 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1665 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1667 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1669 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1672 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1674 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1676 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1677 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1680 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1682 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1683 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1685 *device_path
= loopdev
;
/* Explanatory text appended to the partition-table related error messages
 * of dissect_image(); describes which disk image layouts are accepted. */
1696 #define PARTITION_TABLE_BLURB \
1697 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1698 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1699 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1700 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1701 "to be bootable with systemd-nspawn."
/* Examines the partition table of the block device behind fd and picks
 * the partitions to use as root, /home and /srv:
 *  - libblkid probes the device; only "gpt" and "dos" partition tables
 *    are accepted;
 *  - the partitions known to udev are enumerated and compared with the
 *    ones blkid reports, re-reading the partition table (BLKRRPART) when
 *    needed;
 *  - on GPT, partitions are classified by type UUID (GPT_HOME, GPT_SRV,
 *    GPT_ROOT_NATIVE, GPT_ROOT_SECONDARY, GPT_LINUX_GENERIC); on MBR,
 *    flags 0x80 and type 0x83 are checked;
 *  - the chosen device nodes and their read-write flags are returned via
 *    the root_device/home_device/srv_device output parameters, root being
 *    taken from root, else secondary_root, else a single generic
 *    partition. */
1703 static int dissect_image(
1705 char **root_device
, bool *root_device_rw
,
1706 char **home_device
, bool *home_device_rw
,
1707 char **srv_device
, bool *srv_device_rw
,
1711 int home_nr
= -1, srv_nr
= -1;
1712 #ifdef GPT_ROOT_NATIVE
1715 #ifdef GPT_ROOT_SECONDARY
1716 int secondary_root_nr
= -1;
1718 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1719 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1720 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1721 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1722 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1723 struct udev_list_entry
*first
, *item
;
1724 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1725 bool is_gpt
, is_mbr
, multiple_generic
= false;
1726 const char *pttype
= NULL
;
1733 assert(root_device
);
1734 assert(home_device
);
1739 b
= blkid_new_probe();
1744 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1749 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1753 blkid_probe_enable_partitions(b
, 1);
1754 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
/* -2 and 1 are reported as "no partition table identified", any other
 * non-zero result as a probe failure. */
1757 r
= blkid_do_safeprobe(b
);
1758 if (r
== -2 || r
== 1) {
1759 log_error("Failed to identify any partition table on\n"
1761 PARTITION_TABLE_BLURB
, arg_image
);
1763 } else if (r
!= 0) {
1766 log_error_errno(errno
, "Failed to probe: %m");
1770 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1772 is_gpt
= streq_ptr(pttype
, "gpt");
1773 is_mbr
= streq_ptr(pttype
, "dos");
1775 if (!is_gpt
&& !is_mbr
) {
1776 log_error("No GPT or MBR partition table discovered on\n"
1778 PARTITION_TABLE_BLURB
, arg_image
);
1783 pl
= blkid_probe_get_partitions(b
);
1788 log_error("Failed to list partitions of %s", arg_image
);
1796 if (fstat(fd
, &st
) < 0)
1797 return log_error_errno(errno
, "Failed to stat block device: %m");
1799 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1807 log_error("Kernel partitions never appeared.");
1811 e
= udev_enumerate_new(udev
);
1815 r
= udev_enumerate_add_match_parent(e
, d
);
1819 r
= udev_enumerate_scan_devices(e
);
1821 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1823 /* Count the partitions enumerated by the kernel */
1825 first
= udev_enumerate_get_list_entry(e
);
1826 udev_list_entry_foreach(item
, first
)
1829 /* Count the partitions enumerated by blkid */
1830 m
= blkid_partlist_numof_partitions(pl
);
1834 log_error("blkid and kernel partition list do not match.");
1840 /* The kernel has probed fewer partitions than
1841 * blkid? Maybe the kernel prober is still
1842 * running or it got EBUSY because udev
1843 * already opened the device. Let's reprobe
1844 * the device, which is a synchronous call
1845 * that waits until probing is complete. */
1847 for (j
= 0; j
< 20; j
++) {
1849 r
= ioctl(fd
, BLKRRPART
, 0);
1852 if (r
>= 0 || r
!= -EBUSY
)
1855 /* If something else has the device
1856 * open, such as an udev rule, the
1857 * ioctl will return EBUSY. Since
1858 * there's no way to wait until it
1859 * isn't busy anymore, let's just wait
1860 * a bit, and try again.
1862 * This is really something they
1863 * should fix in the kernel! */
1865 usleep(50 * USEC_PER_MSEC
);
1869 return log_error_errno(r
, "Failed to reread partition table: %m");
1872 e
= udev_enumerate_unref(e
);
/* Walk the enumerated udev devices and classify each partition. */
1875 first
= udev_enumerate_get_list_entry(e
);
1876 udev_list_entry_foreach(item
, first
) {
1877 _cleanup_udev_device_unref_
struct udev_device
*q
;
1879 unsigned long long flags
;
1885 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1890 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1894 qn
= udev_device_get_devnum(q
);
1898 if (st
.st_rdev
== qn
)
1901 node
= udev_device_get_devnode(q
);
1905 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1909 flags
= blkid_partition_get_flags(pp
);
1911 nr
= blkid_partition_get_partno(pp
);
1919 if (flags
& GPT_FLAG_NO_AUTO
)
1922 stype
= blkid_partition_get_type_string(pp
);
1926 if (sd_id128_from_string(stype
, &type_id
) < 0)
1929 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1931 if (home
&& nr
>= home_nr
)
1935 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1937 r
= free_and_strdup(&home
, node
);
1941 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
1943 if (srv
&& nr
>= srv_nr
)
1947 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1949 r
= free_and_strdup(&srv
, node
);
1953 #ifdef GPT_ROOT_NATIVE
1954 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
1956 if (root
&& nr
>= root_nr
)
1960 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1962 r
= free_and_strdup(&root
, node
);
1967 #ifdef GPT_ROOT_SECONDARY
1968 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
1970 if (secondary_root
&& nr
>= secondary_root_nr
)
1973 secondary_root_nr
= nr
;
1974 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1976 r
= free_and_strdup(&secondary_root
, node
);
1981 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
1984 multiple_generic
= true;
1986 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1988 r
= free_and_strdup(&generic
, node
);
1994 } else if (is_mbr
) {
1997 if (flags
!= 0x80) /* Bootable flag */
2000 type
= blkid_partition_get_type(pp
);
2001 if (type
!= 0x83) /* Linux partition */
2005 multiple_generic
= true;
2009 r
= free_and_strdup(&root
, node
);
/* Hand the selected device nodes over to the caller. */
2017 *root_device
= root
;
2020 *root_device_rw
= root_rw
;
2022 } else if (secondary_root
) {
2023 *root_device
= secondary_root
;
2024 secondary_root
= NULL
;
2026 *root_device_rw
= secondary_root_rw
;
2028 } else if (generic
) {
2030 /* There were no partitions with precise meanings
2031 * around, but we found generic partitions. In this
2032 * case, if there's only one, we can go ahead and boot
2033 * it, otherwise we bail out, because we really cannot
2034 * make any sense of it. */
2036 if (multiple_generic
) {
2037 log_error("Identified multiple bootable Linux partitions on\n"
2039 PARTITION_TABLE_BLURB
, arg_image
);
2043 *root_device
= generic
;
2046 *root_device_rw
= generic_rw
;
2049 log_error("Failed to identify root partition in disk image\n"
2051 PARTITION_TABLE_BLURB
, arg_image
);
2056 *home_device
= home
;
2059 *home_device_rw
= home_rw
;
2066 *srv_device_rw
= srv_rw
;
2071 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device `what` below `where` (with `directory`
 * appended to form the mount point). The file system type is detected
 * with a libblkid superblock probe; "crypto_LUKS" is rejected. The mount
 * uses MS_NODEV and, when rw is false, MS_RDONLY.
 *
 * blkid_do_safeprobe() returns 0 on success, 1 when nothing was detected,
 * -2 on an ambivalent result and -1 on error. "Cannot determine file
 * system type" therefore covers -2 and 1 (the same test dissect_image()
 * uses), while -1 falls through to the generic probe-failure branch. */
2076 static int mount_device(const char *what
, const char *where
, const char *directory
, bool rw
) {
2078 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
2079 const char *fstype
, *p
;
2089 p
= strjoina(where
, directory
);
2094 b
= blkid_new_probe_from_filename(what
);
2098 log_error_errno(errno
, "Failed to allocate prober for %s: %m", what
);
2102 blkid_probe_enable_superblocks(b
, 1);
2103 blkid_probe_set_superblocks_flags(b
, BLKID_SUBLKS_TYPE
);
2106 r
= blkid_do_safeprobe(b
);
2107 if (r
== -2 || r
== 1) {
2108 log_error("Cannot determine file system type of %s", what
);
2110 } else if (r
!= 0) {
2113 log_error_errno(errno
, "Failed to probe %s: %m", what
);
2118 if (blkid_probe_lookup_value(b
, "TYPE", &fstype
, NULL
) < 0) {
2121 log_error("Failed to determine file system type of %s", what
);
2125 if (streq(fstype
, "crypto_LUKS")) {
2126 log_error("nspawn currently does not support LUKS disk images.");
2130 if (mount(what
, p
, fstype
, MS_NODEV
|(rw
? 0 : MS_RDONLY
), NULL
) < 0)
2131 return log_error_errno(errno
, "Failed to mount %s: %m", what
);
2135 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the dissected partitions below arg_directory using
 * mount_device(): the root device at the top, the home device at /home
 * and the srv device at /srv, each with its own read-write flag. The
 * conditions guarding the individual calls are not visible here. */
2140 static int mount_devices(
2142 const char *root_device
, bool root_device_rw
,
2143 const char *home_device
, bool home_device_rw
,
2144 const char *srv_device
, bool srv_device_rw
) {
2150 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2152 return log_error_errno(r
, "Failed to mount root directory: %m");
2156 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2158 return log_error_errno(r
, "Failed to mount home directory: %m");
2162 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2164 return log_error_errno(r
, "Failed to mount server data directory: %m");
/* Tears down loop device number nr: if *image_fd refers to an open fd,
 * LOOP_CLR_FD is issued on it and the fd is closed (and *image_fd
 * updated with safe_close()'s result); then /dev/loop-control is opened
 * and LOOP_CTL_REMOVE is issued for nr. Failures are only logged. */
2170 static void loop_remove(int nr
, int *image_fd
) {
2171 _cleanup_close_
int control
= -1;
2177 if (image_fd
&& *image_fd
>= 0) {
2178 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2180 log_debug_errno(errno
, "Failed to close loop image: %m");
2181 *image_fd
= safe_close(*image_fd
);
2184 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2186 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2190 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2192 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2197 * < 0 : wait_for_terminate() failed to get the state of the
2198 * container, the container was terminated by a signal, or
2199 * failed for an unknown reason. No change is made to the
2200 * container argument.
2201 * > 0 : The program executed in the container terminated with an
2202 * error. The exit code of the program executed in the
2203 * container is returned. The container argument has been set
2204 * to CONTAINER_TERMINATED.
2205 * 0 : The container is being rebooted, has been shut down or exited
2206 * successfully. The container argument has been set to either
2207 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2209 * That is, success is indicated by a return value of zero, and an
2210 * error is indicated by a non-zero value.
/* Waits for the container process pid with wait_for_terminate() and
 * translates the resulting siginfo into a log message, a value for
 * *container (CONTAINER_TERMINATED or CONTAINER_REBOOTED) and a return
 * value. On normal exit the exit status itself is returned; SIGINT is
 * logged as "shut down" and SIGHUP as "being rebooted". */
2212 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2216 r
= wait_for_terminate(pid
, &status
);
2218 return log_warning_errno(r
, "Failed to wait for container: %m");
2220 switch (status
.si_code
) {
2223 if (status
.si_status
== 0) {
2224 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2227 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2229 *container
= CONTAINER_TERMINATED
;
2230 return status
.si_status
;
2233 if (status
.si_status
== SIGINT
) {
2235 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2236 *container
= CONTAINER_TERMINATED
;
2239 } else if (status
.si_status
== SIGHUP
) {
2241 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2242 *container
= CONTAINER_REBOOTED
;
2246 /* CLD_KILLED fallthrough */
2249 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2253 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Signal event handler: userdata carries the container PID (decoded with
 * PTR_TO_UINT32). The PID is sent arg_kill_signal; if that succeeds, a
 * hint is logged and the event source's userdata is cleared. The visible
 * code also calls sd_event_exit() on the source's event loop with exit
 * code 0 (the control flow between the two is not visible here). */
2260 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2263 pid
= PTR_TO_UINT32(userdata
);
2265 if (kill(pid
, arg_kill_signal
) >= 0) {
2266 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2267 sd_event_source_set_userdata(s
, NULL
);
2272 sd_event_exit(sd_event_source_get_event(s
), 0);
/* Fills in whichever of arg_directory, arg_image and arg_machine were not
 * given explicitly:
 *  - with --template= and a machine name but no directory, the directory
 *    becomes /var/lib/machines/<arg_machine>;
 *  - with neither image nor directory, the machine is looked up with
 *    image_find() and its path is used as image (IMAGE_RAW) or directory,
 *    inheriting the image's read-only flag; alternatively the current
 *    working directory is used;
 *  - a missing machine name is derived from the host name (directory "/")
 *    or from the basename of the image/directory, cleaned up and
 *    validated; ephemeral machines get a random hexadecimal suffix.
 *
 * The "No image for machine" message is logged without %m: the lookup did
 * not fail there, so errno holds nothing meaningful. */
2276 static int determine_names(void) {
2279 if (arg_template
&& !arg_directory
&& arg_machine
) {
2281 /* If --template= was specified then we should not
2282 * search for a machine, but instead create a new one
2283 * in /var/lib/machines. */
2285 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2290 if (!arg_image
&& !arg_directory
) {
2292 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2294 r
= image_find(arg_machine
, &i
);
2296 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2298 log_error("No image for machine '%s'.", arg_machine
);
2302 if (i
->type
== IMAGE_RAW
)
2303 r
= free_and_strdup(&arg_image
, i
->path
);
2305 r
= free_and_strdup(&arg_directory
, i
->path
);
2307 return log_error_errno(r
, "Invalid image directory: %m");
2310 arg_read_only
= arg_read_only
|| i
->read_only
;
2312 arg_directory
= get_current_dir_name();
2314 if (!arg_directory
&& !arg_machine
) {
2315 log_error("Failed to determine path, please use -D or -i.");
2321 if (arg_directory
&& path_equal(arg_directory
, "/"))
2322 arg_machine
= gethostname_malloc();
2324 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2329 hostname_cleanup(arg_machine
);
2330 if (!machine_name_is_valid(arg_machine
)) {
2331 log_error("Failed to determine machine name automatically, please use -M.");
2335 if (arg_ephemeral
) {
2338 /* Add a random suffix when this is an
2339 * ephemeral machine, so that we can run many
2340 * instances at once without manually having
2341 * to specify -M each time. */
2343 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
/* Determines the UID/GID base used for the user namespace. When
 * arg_uid_shift is still UID_INVALID it is derived from the owner of
 * `directory`: the upper 16 bits of st_uid (st_uid & 0xffff0000), which
 * must match the upper 16 bits of st_gid; arg_uid_range is then set to
 * 0x10000. A base that would overflow uid_t together with the range is
 * rejected. The result is logged. */
2354 static int determine_uid_shift(const char *directory
) {
2362 if (arg_uid_shift
== UID_INVALID
) {
2365 r
= stat(directory
, &st
);
2367 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2369 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2371 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2372 log_error("UID and GID base of %s don't match.", directory
);
2376 arg_uid_range
= UINT32_C(0x10000);
2379 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2380 log_error("UID base too high for UID range.");
2384 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
/* Second-stage child, already running inside the container's namespaces.
 * In the order visible here it: synchronizes with the parent over
 * `barrier` (UID map, cgroup setup), mounts the API file systems,
 * resets UID/GID, sets up the boot ID and kmsg FIFO, optionally sends
 * the rtnl socket for port exposure, drops capabilities, applies the
 * requested personality and SELinux exec context, switches to arg_user,
 * assembles the environment (HOME, USER, LOGNAME, container_uuid,
 * LISTEN_FDS, LISTEN_PID merged with arg_setenv) and finally executes
 * either an init system, arg_parameters, or a shell. */
2388 static int inner_child(
2390 const char *directory
,
2396 _cleanup_free_
char *home
= NULL
;
2398 const char *envp
[] = {
2399 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2400 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2405 NULL
, /* container_uuid */
2406 NULL
, /* LISTEN_FDS */
2407 NULL
, /* LISTEN_PID */
2411 _cleanup_strv_free_
char **env_use
= NULL
;
2416 assert(kmsg_socket
>= 0);
2421 /* Tell the parent, that it now can write the UID map. */
2422 (void) barrier_place(barrier
); /* #1 */
2424 /* Wait until the parent wrote the UID map */
2425 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2426 log_error("Parent died too early");
/* NOTE(review): this call passes arg_uid_shift before
 * arg_private_network, whereas the mount_all() call in outer_child()
 * passes arg_private_network before arg_uid_shift. One of the two orders
 * is presumably wrong - verify against mount_all()'s prototype. */
2431 r
= mount_all(NULL
, arg_userns
, true, arg_uid_shift
, arg_private_network
, arg_uid_range
, arg_selinux_apifs_context
);
2435 r
= mount_sysfs(NULL
);
2439 /* Wait until we are cgroup-ified, so that we
2440 * can mount the right cgroup path writable */
2441 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2442 log_error("Parent died too early");
2446 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2450 r
= reset_uid_gid();
2452 return log_error_errno(r
, "Couldn't become new root: %m");
2454 r
= setup_boot_id(NULL
);
2458 r
= setup_kmsg(NULL
, kmsg_socket
);
2461 kmsg_socket
= safe_close(kmsg_socket
);
2466 return log_error_errno(errno
, "setsid() failed: %m");
2468 if (arg_private_network
)
2471 if (arg_expose_ports
) {
2472 r
= expose_port_send_rtnl(rtnl_socket
);
2475 rtnl_socket
= safe_close(rtnl_socket
);
/* NOTE(review): the failure is reported with errno, but
 * drop_capabilities() simply forwards the return value of
 * capability_bounding_set_drop(); confirm errno is meaningful here. */
2478 if (drop_capabilities() < 0)
2479 return log_error_errno(errno
, "drop_capabilities() failed: %m");
2483 if (arg_personality
!= PERSONALITY_INVALID
) {
2484 if (personality(arg_personality
) < 0)
2485 return log_error_errno(errno
, "personality() failed: %m");
2486 } else if (secondary
) {
2487 if (personality(PER_LINUX32
) < 0)
2488 return log_error_errno(errno
, "personality() failed: %m");
2492 if (arg_selinux_context
)
2493 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2494 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2497 r
= change_uid_gid(arg_user
, &home
);
2501 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
/* Each asprintf() below fills the next free envp[] slot. */
2505 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2506 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2507 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2510 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2513 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2517 if (fdset_size(fds
) > 0) {
2518 r
= fdset_cloexec(fds
, false);
2520 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2522 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2523 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2527 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2531 /* Let the parent know that we are ready and
2532 * wait until the parent is ready with the
2534 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2535 log_error("Parent died too early");
2539 /* Now, explicitly close the log, so that we
2540 * then can close all remaining fds. Closing
2541 * the log explicitly first has the benefit
2542 * that the logging subsystem knows about it,
2543 * and is thus ready to be reopened should we
2544 * need it again. Note that the other fds
2545 * closed here are at least the locking and
2548 (void) fdset_close_others(fds
);
2554 /* Automatically search for the init system */
2556 m
= 1 + strv_length(arg_parameters
);
2557 a
= newa(char*, m
+ 1);
2558 if (strv_isempty(arg_parameters
))
2561 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
/* Try the known init binaries in turn; execve() only returns on
 * failure, in which case the next candidate is attempted. */
2563 a
[0] = (char*) "/usr/lib/systemd/systemd";
2564 execve(a
[0], a
, env_use
);
2566 a
[0] = (char*) "/lib/systemd/systemd";
2567 execve(a
[0], a
, env_use
);
2569 a
[0] = (char*) "/sbin/init";
2570 execve(a
[0], a
, env_use
);
2571 } else if (!strv_isempty(arg_parameters
))
2572 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
/* NOTE(review): the result of chdir() is not checked. */
2574 chdir(home
?: "/root");
2575 execle("/bin/bash", "-bash", NULL
, env_use
);
2576 execle("/bin/sh", "-sh", NULL
, env_use
);
2580 return log_error_errno(errno
, "execv() failed: %m");
/* First-stage child. In the order visible here it: arranges to be killed
 * with SIGKILL when the parent dies, re-opens stdin/stdout/stderr on
 * `console`, resets the audit login UID, marks all mounts as slave,
 * mounts the image devices, determines and sends the UID shift over
 * uid_shift_socket, turns `directory` into a bind mount and populates it
 * (volatile setup, base file system, API mounts, device nodes, pts,
 * propagation, /dev/console, seccomp, timezone, resolv.conf, journal,
 * custom mounts, cgroups), moves the root there, and finally clones the
 * inner child with the requested namespaces, sending its PID over
 * pid_socket. */
2583 static int outer_child(
2585 const char *directory
,
2586 const char *console
,
2587 const char *root_device
, bool root_device_rw
,
2588 const char *home_device
, bool home_device_rw
,
2589 const char *srv_device
, bool srv_device_rw
,
2595 int uid_shift_socket
,
2605 assert(pid_socket
>= 0);
2606 assert(kmsg_socket
>= 0);
2610 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2611 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
/* With fds 0-2 closed, the console is compared against STDIN_FILENO
 * right after opening; the handling of a mismatch is not visible
 * here. */
2614 close_nointr(STDIN_FILENO
);
2615 close_nointr(STDOUT_FILENO
);
2616 close_nointr(STDERR_FILENO
);
2618 r
= open_terminal(console
, O_RDWR
);
2619 if (r
!= STDIN_FILENO
) {
2625 return log_error_errno(r
, "Failed to open console: %m");
2628 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2629 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2630 return log_error_errno(errno
, "Failed to duplicate console: %m");
2633 r
= reset_audit_loginuid();
2637 /* Mark everything as slave, so that we still
2638 * receive mounts from the real root, but don't
2639 * propagate mounts to the real root. */
2640 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2641 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2643 r
= mount_devices(directory
,
2644 root_device
, root_device_rw
,
2645 home_device
, home_device_rw
,
2646 srv_device
, srv_device_rw
);
2650 r
= determine_uid_shift(directory
);
2655 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2657 return log_error_errno(errno
, "Failed to send UID shift: %m");
2658 if (l
!= sizeof(arg_uid_shift
)) {
2659 log_error("Short write while sending UID shift.");
2664 /* Turn directory into bind mount */
2665 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2666 return log_error_errno(errno
, "Failed to make bind mount: %m");
2668 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2672 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2676 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2680 if (arg_read_only
) {
2681 r
= bind_remount_recursive(directory
, true);
2683 return log_error_errno(r
, "Failed to make tree read-only: %m");
/* NOTE(review): this call passes arg_private_network before
 * arg_uid_shift, whereas the mount_all() call in inner_child() passes
 * them in the opposite order. One of the two is presumably wrong -
 * verify against mount_all()'s prototype. */
2686 r
= mount_all(directory
, arg_userns
, false, arg_private_network
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2690 r
= copy_devnodes(directory
);
2694 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2696 r
= setup_pts(directory
);
2700 r
= setup_propagate(directory
);
2704 r
= setup_dev_console(directory
, console
);
2708 r
= setup_seccomp();
2712 r
= setup_timezone(directory
);
2716 r
= setup_resolv_conf(directory
);
2720 r
= setup_journal(directory
);
2724 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2728 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2732 r
= mount_move_root(directory
);
2734 return log_error_errno(r
, "Failed to move root directory: %m");
/* The namespace flags depend on arg_share_system, arg_private_network
 * and arg_userns; CLONE_NEWNS is always requested. */
2736 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2737 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2738 (arg_private_network
? CLONE_NEWNET
: 0) |
2739 (arg_userns
? CLONE_NEWUSER
: 0),
2742 return log_error_errno(errno
, "Failed to fork inner child: %m");
2744 pid_socket
= safe_close(pid_socket
);
2745 uid_shift_socket
= safe_close(uid_shift_socket
);
2747 /* The inner child has all namespaces that are
2748 * requested, so that we all are owned by the user if
2749 * user namespaces are turned on. */
2751 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2753 _exit(EXIT_FAILURE
);
2755 _exit(EXIT_SUCCESS
);
2758 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2760 return log_error_errno(errno
, "Failed to send PID: %m");
2761 if (l
!= sizeof(pid
)) {
2762 log_error("Short write while sending PID.");
2766 pid_socket
= safe_close(pid_socket
);
2767 kmsg_socket
= safe_close(kmsg_socket
);
2768 rtnl_socket
= safe_close(rtnl_socket
);
/* Writes the user namespace ID mappings of process pid: the single line
 * "0 <arg_uid_shift> <arg_uid_range>\n" is written first to
 * /proc/<pid>/uid_map and then, unchanged, to /proc/<pid>/gid_map.
 * Write failures are logged and returned via log_error_errno(). */
2773 static int setup_uid_map(pid_t pid
) {
2774 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2779 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2780 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2781 r
= write_string_file(uid_map
, line
, 0);
2783 return log_error_errno(r
, "Failed to write UID map: %m");
2785 /* We always assign the same UID and GID ranges */
/* The uid_map buffer is reused for the gid_map path. */
2786 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2787 r
= write_string_file(uid_map
, line
, 0);
2789 return log_error_errno(r
, "Failed to write GID map: %m");
/* load_settings(): looks for a settings file named "<arg_machine>.nspawn"
 * and copies the values it carries into the corresponding arg_* globals.
 * A group of settings is only considered when its SETTING_* bit is clear
 * in arg_settings_mask.
 *
 * NOTE(review): this listing is missing lines (the declarations of fn, i,
 * r and plus, the fopen() calls, and several guards, else branches and
 * returns are not visible). The comments below describe only statements
 * that can be seen here; anything depending on a missing line is marked
 * as an assumption. */
2794 static int load_settings(void) {
2795 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2796 _cleanup_fclose_
FILE *f
= NULL
;
2797 _cleanup_free_
char *p
= NULL
;
2801 /* If all settings are masked, there's no point in looking for
2802 * the settings file */
2803 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* The file name is the machine name with ".nspawn" appended. */
2806 fn
= strjoina(arg_machine
, ".nspawn");
2808 /* We first look in the admin's directories in /etc and /run */
2809 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2810 _cleanup_free_
char *j
= NULL
;
/* Candidate path: <directory>/<machine>.nspawn */
2812 j
= strjoin(i
, "/", fn
, NULL
);
2821 /* By default we trust configuration from /etc and /run */
2822 if (arg_settings_trusted
< 0)
2823 arg_settings_trusted
= true;
/* Only an errno other than ENOENT is reported as a failure to open j. */
2828 if (errno
!= ENOENT
)
2829 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2833 /* After that, let's look for a file next to the
2834 * actual image we shall boot. */
/* NOTE(review): presumably guarded by a check of arg_image, given the
 * "else if (arg_directory)" that follows — the guard is not visible. */
2837 p
= file_in_same_dir(arg_image
, fn
);
2840 } else if (arg_directory
) {
2841 p
= file_in_same_dir(arg_directory
, fn
);
/* As above, ENOENT is not reported; any other errno with f unset is. */
2848 if (!f
&& errno
!= ENOENT
)
2849 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2851 /* By default we do not trust configuration from /var/lib/machines */
2852 if (arg_settings_trusted
< 0)
2853 arg_settings_trusted
= false;
2860 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
/* Parse the opened file f (path p) into a Settings object. */
2862 r
= settings_load(f
, p
, &settings
);
2866 /* Copy over bits from the settings, unless they have been
2867 * explicitly masked by command line switches. */
2869 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
2870 settings
->boot
>= 0) {
2871 arg_boot
= settings
->boot
;
/* Replace our parameter vector with the one from the settings: free
 * ours, take theirs, and clear the source pointer (presumably so that
 * the cleanup of `settings` does not free it as well — TODO confirm). */
2873 strv_free(arg_parameters
);
2874 arg_parameters
= settings
->parameters
;
2875 settings
->parameters
= NULL
;
2878 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2879 settings
->environment
) {
2880 strv_free(arg_setenv
);
2881 arg_setenv
= settings
->environment
;
2882 settings
->environment
= NULL
;
2885 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2888 arg_user
= settings
->user
;
2889 settings
->user
= NULL
;
2892 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
/* Start from the capability mask given in the file; when the settings
 * ask for a private network, CAP_NET_ADMIN is added to it. */
2895 plus
= settings
->capability
;
2896 if (settings_private_network(settings
))
2897 plus
|= (1ULL << CAP_NET_ADMIN
);
/* For an untrusted file, warn only if the file itself listed
 * capabilities, i.e. not when `plus` is non-zero merely because of the
 * CAP_NET_ADMIN bit added above. */
2899 if (!arg_settings_trusted
&& plus
!= 0) {
2900 if (settings
->capability
!= 0)
2901 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
/* Clear the bits listed in drop_capability from the retained set. */
2905 arg_retain
&= ~settings
->drop_capability
;
2908 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2909 settings
->kill_signal
> 0)
2910 arg_kill_signal
= settings
->kill_signal
;
2912 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2913 settings
->personality
!= PERSONALITY_INVALID
)
2914 arg_personality
= settings
->personality
;
2916 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2917 !sd_id128_is_null(settings
->machine_id
)) {
/* NOTE(review): the assignment after the warning is presumably the
 * else branch for trusted files; the "else" line is not visible here. */
2919 if (!arg_settings_trusted
)
2920 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
2922 arg_uuid
= settings
->machine_id
;
2925 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
2926 settings
->read_only
>= 0)
2927 arg_read_only
= settings
->read_only
;
2929 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
2930 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
2931 arg_volatile_mode
= settings
->volatile_mode
;
2933 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
2934 settings
->n_custom_mounts
> 0) {
2936 if (!arg_settings_trusted
)
2937 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
/* Replace our custom mount list (array and count) with the one from
 * the settings, then reset the source fields to NULL / 0. */
2939 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
2940 arg_custom_mounts
= settings
->custom_mounts
;
2941 arg_n_custom_mounts
= settings
->n_custom_mounts
;
2943 settings
->custom_mounts
= NULL
;
2944 settings
->n_custom_mounts
= 0;
/* The network group is considered when any one of the network-related
 * fields of the settings is set. */
2948 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
2949 (settings
->private_network
>= 0 ||
2950 settings
->network_veth
>= 0 ||
2951 settings
->network_bridge
||
2952 settings
->network_interfaces
||
2953 settings
->network_macvlan
||
2954 settings
->network_ipvlan
)) {
2956 if (!arg_settings_trusted
)
2957 log_warning("Ignoring network settings, file %s is not trusted.", p
);
/* Both flags are set from the result of settings_private_network(). */
2959 arg_network_veth
= settings_private_network(settings
);
2960 arg_private_network
= settings_private_network(settings
);
/* Each list below is replaced the same way: free ours, take the one
 * from the settings, clear the source pointer. */
2962 strv_free(arg_network_interfaces
);
2963 arg_network_interfaces
= settings
->network_interfaces
;
2964 settings
->network_interfaces
= NULL
;
2966 strv_free(arg_network_macvlan
);
2967 arg_network_macvlan
= settings
->network_macvlan
;
2968 settings
->network_macvlan
= NULL
;
2970 strv_free(arg_network_ipvlan
);
2971 arg_network_ipvlan
= settings
->network_ipvlan
;
2972 settings
->network_ipvlan
= NULL
;
2974 free(arg_network_bridge
);
2975 arg_network_bridge
= settings
->network_bridge
;
2976 settings
->network_bridge
= NULL
;
2980 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
2981 settings
->expose_ports
) {
2983 if (!arg_settings_trusted
)
2984 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
2986 expose_port_free_all(arg_expose_ports
);
2987 arg_expose_ports
= settings
->expose_ports
;
2988 settings
->expose_ports
= NULL
;
2995 int main(int argc
, char *argv
[]) {
2997 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
2998 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
2999 _cleanup_close_
int master
= -1, image_fd
= -1;
3000 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3001 int r
, n_fd_passed
, loop_nr
= -1;
3002 char veth_name
[IFNAMSIZ
];
3003 bool secondary
= false, remove_subvol
= false;
3006 int ret
= EXIT_SUCCESS
;
3007 union in_addr_union exposed
= {};
3008 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3011 log_parse_environment();
3014 r
= parse_argv(argc
, argv
);
3018 if (geteuid() != 0) {
3019 log_error("Need to be root.");
3023 r
= determine_names();
3027 r
= load_settings();
3031 r
= verify_arguments();
3035 n_fd_passed
= sd_listen_fds(false);
3036 if (n_fd_passed
> 0) {
3037 r
= fdset_new_listen_fds(&fds
, false);
3039 log_error_errno(r
, "Failed to collect file descriptors: %m");
3044 if (arg_directory
) {
3047 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3048 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3053 if (arg_ephemeral
) {
3054 _cleanup_free_
char *np
= NULL
;
3056 /* If the specified path is a mount point we
3057 * generate the new snapshot immediately
3058 * inside it under a random name. However if
3059 * the specified path is not a mount point we
3060 * create the new snapshot in the parent
3061 * directory, just next to it. */
3062 r
= path_is_mount_point(arg_directory
, 0);
3064 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3068 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3070 r
= tempfn_random(arg_directory
, "machine.", &np
);
3072 log_error_errno(r
, "Failed to generate name for snapshot: %m");
3076 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3078 log_error_errno(r
, "Failed to lock %s: %m", np
);
3082 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3084 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3088 free(arg_directory
);
3092 remove_subvol
= true;
3095 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3097 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3101 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
3106 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3109 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3111 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3115 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
3121 if (path_is_os_tree(arg_directory
) <= 0) {
3122 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3129 p
= strjoina(arg_directory
, "/usr/");
3130 if (laccess(p
, F_OK
) < 0) {
3131 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
3138 char template[] = "/tmp/nspawn-root-XXXXXX";
3141 assert(!arg_template
);
3143 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3145 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3149 r
= log_error_errno(r
, "Failed to create image lock: %m");
3153 if (!mkdtemp(template)) {
3154 log_error_errno(errno
, "Failed to create temporary directory: %m");
3159 arg_directory
= strdup(template);
3160 if (!arg_directory
) {
3165 image_fd
= setup_image(&device_path
, &loop_nr
);
3171 r
= dissect_image(image_fd
,
3172 &root_device
, &root_device_rw
,
3173 &home_device
, &home_device_rw
,
3174 &srv_device
, &srv_device_rw
,
3180 r
= custom_mounts_prepare();
3185 isatty(STDIN_FILENO
) > 0 &&
3186 isatty(STDOUT_FILENO
) > 0;
3188 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3190 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3194 r
= ptsname_malloc(master
, &console
);
3196 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3200 if (unlockpt(master
) < 0) {
3201 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3206 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3207 arg_machine
, arg_image
?: arg_directory
);
3209 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3211 assert_se(sigemptyset(&mask_chld
) == 0);
3212 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3214 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3215 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
3220 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
3221 uid_shift_socket_pair
[2] = { -1, -1 };
3222 ContainerStatus container_status
;
3223 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3224 static const struct sigaction sa
= {
3225 .sa_handler
= nop_signal_handler
,
3226 .sa_flags
= SA_NOCLDSTOP
,
3230 _cleanup_event_unref_ sd_event
*event
= NULL
;
3231 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3232 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3235 r
= barrier_create(&barrier
);
3237 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3241 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3242 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3246 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3247 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3251 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3252 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3257 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3258 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3262 /* Child can be killed before execv(), so handle SIGCHLD
3263 * in order to interrupt parent's blocking calls and
3264 * give it a chance to call wait() and terminate. */
3265 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3267 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3271 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3273 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
3277 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3279 if (errno
== EINVAL
)
3280 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3282 r
= log_error_errno(errno
, "clone() failed: %m");
3288 /* The outer child only has a file system namespace. */
3289 barrier_set_role(&barrier
, BARRIER_CHILD
);
3291 master
= safe_close(master
);
3293 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3294 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3295 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3296 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3298 (void) reset_all_signal_handlers();
3299 (void) reset_signal_mask();
3301 r
= outer_child(&barrier
,
3304 root_device
, root_device_rw
,
3305 home_device
, home_device_rw
,
3306 srv_device
, srv_device_rw
,
3310 kmsg_socket_pair
[1],
3311 rtnl_socket_pair
[1],
3312 uid_shift_socket_pair
[1],
3315 _exit(EXIT_FAILURE
);
3317 _exit(EXIT_SUCCESS
);
3320 barrier_set_role(&barrier
, BARRIER_PARENT
);
3322 fds
= fdset_free(fds
);
3324 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3325 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3326 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3327 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3329 /* Wait for the outer child. */
3330 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3339 /* And now retrieve the PID of the inner child. */
3340 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3342 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3345 if (l
!= sizeof(pid
)) {
3346 log_error("Short read while reading inner child PID.");
3351 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3354 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3355 log_error("Child died too early.");
3360 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3362 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3365 if (l
!= sizeof(arg_uid_shift
)) {
3366 log_error("Short read while reading UID shift.");
3371 r
= setup_uid_map(pid
);
3375 (void) barrier_place(&barrier
); /* #2 */
3378 if (arg_private_network
) {
3380 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3384 if (arg_network_veth
) {
3385 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3391 if (arg_network_bridge
) {
3392 r
= setup_bridge(veth_name
, arg_network_bridge
);
3400 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3404 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
3410 r
= register_machine(
3417 arg_custom_mounts
, arg_n_custom_mounts
,
3425 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3429 if (arg_keep_unit
) {
3430 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3435 r
= chown_cgroup(pid
, arg_uid_shift
);
3439 /* Notify the child that the parent is ready with all
3440 * its setup (including cgroup-ification), and that
3441 * the child can now hand over control to the code to
3442 * run inside the container. */
3443 (void) barrier_place(&barrier
); /* #3 */
3445 /* Block SIGCHLD here, before notifying child.
3446 * process_pty() will handle it with the other signals. */
3447 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3449 /* Reset signal to default */
3450 r
= default_signals(SIGCHLD
, -1);
3452 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3456 /* Let the child know that we are ready, and wait until the child is completely ready too. */
3457 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3458 log_error("Child died too early.");
3465 "STATUS=Container running.\n"
3466 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
3468 r
= sd_event_new(&event
);
3470 log_error_errno(r
, "Failed to get default event source: %m");
3474 if (arg_kill_signal
> 0) {
3475 /* Try to kill the init system on SIGINT or SIGTERM */
3476 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3477 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3479 /* Immediately exit */
3480 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3481 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3484 /* simply exit on sigchld */
3485 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3487 if (arg_expose_ports
) {
3488 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3492 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3495 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3497 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
3499 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3503 r
= sd_event_loop(event
);
3505 log_error_errno(r
, "Failed to run event loop: %m");
3509 pty_forward_get_last_char(forward
, &last_char
);
3511 forward
= pty_forward_free(forward
);
3513 if (!arg_quiet
&& last_char
!= '\n')
3516 /* Kill if it is not dead yet anyway */
3517 if (arg_register
&& !arg_keep_unit
)
3518 terminate_machine(pid
);
3520 /* Normally redundant, but better safe than sorry */
3523 r
= wait_for_container(pid
, &container_status
);
3527 /* We failed to wait for the container, or the
3528 * container exited abnormally */
3530 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3531 /* The container exited with a non-zero
3532 * status, or with zero status and no reboot
3538 /* CONTAINER_REBOOTED, loop again */
3540 if (arg_keep_unit
) {
3541 /* Special handling if we are running as a
3542 * service: instead of simply restarting the
3543 * machine we want to restart the entire
3544 * service, so let's inform systemd about this
3545 * with the special exit code 133. The service
3546 * file uses RestartForceExitStatus=133 so
3547 * that this results in a full nspawn
3548 * restart. This is necessary since we might
3549 * have cgroup parameters set we want to have
3556 expose_port_flush(arg_expose_ports
, &exposed
);
3562 "STATUS=Terminating...");
3567 /* Try to flush whatever is still queued in the pty */
3569 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
3571 loop_remove(loop_nr
, &image_fd
);
3573 if (remove_subvol
&& arg_directory
) {
3576 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
3578 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
3584 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3585 (void) rm_rf(p
, REMOVE_ROOT
);
3588 expose_port_flush(arg_expose_ports
, &exposed
);
3590 free(arg_directory
);
3595 strv_free(arg_setenv
);
3596 free(arg_network_bridge
);
3597 strv_free(arg_network_interfaces
);
3598 strv_free(arg_network_macvlan
);
3599 strv_free(arg_network_ipvlan
);
3600 strv_free(arg_parameters
);
3601 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3602 expose_port_free_all(arg_expose_ports
);
3604 return r
< 0 ? EXIT_FAILURE
: ret
;