1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <blkid/blkid.h>
27 #include <linux/loop.h>
33 #include <selinux/selinux.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
46 #include "sd-daemon.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
54 #include "capability.h"
55 #include "cgroup-util.h"
57 #include "dev-setup.h"
59 #include "event-util.h"
62 #include "formats-util.h"
64 #include "hostname-util.h"
66 #include "loopback-setup.h"
67 #include "machine-image.h"
71 #include "netlink-util.h"
72 #include "nspawn-cgroup.h"
73 #include "nspawn-expose-ports.h"
74 #include "nspawn-mount.h"
75 #include "nspawn-network.h"
76 #include "nspawn-register.h"
77 #include "nspawn-settings.h"
78 #include "nspawn-setuid.h"
79 #include "path-util.h"
80 #include "process-util.h"
82 #include "random-util.h"
85 #include "seccomp-util.h"
87 #include "signal-util.h"
88 #include "string-util.h"
90 #include "terminal-util.h"
91 #include "udev-util.h"
94 typedef enum ContainerStatus
{
99 typedef enum LinkJournal
{
/* File-scope option state. Each variable is static (private to this file) and
 * starts at the default written here; parse_argv() overwrites the ones that
 * have a matching command-line option. */
106 static char *arg_directory
= NULL
;
107 static char *arg_template
= NULL
;
108 static char *arg_user
= NULL
;
109 static sd_id128_t arg_uuid
= {};
110 static char *arg_machine
= NULL
;
/* The two SELinux contexts are const pointers: parse_argv() stores optarg
 * itself in them rather than a copy. */
111 static const char *arg_selinux_context
= NULL
;
112 static const char *arg_selinux_apifs_context
= NULL
;
113 static const char *arg_slice
= NULL
;
114 static bool arg_private_network
= false;
115 static bool arg_read_only
= false;
116 static bool arg_boot
= false;
117 static bool arg_ephemeral
= false;
118 static LinkJournal arg_link_journal
= LINK_AUTO
;
/* Set together with arg_link_journal by the "try-guest"/"try-host" modes. */
119 static bool arg_link_journal_try
= false;
/* Default capability set, stored as a bitmask with bit N set for capability
 * number N. parse_argv() later ORs in the bits collected from --capability=
 * (plus CAP_NET_ADMIN when private networking is enabled) and clears the bits
 * collected from --drop-capability=.
 * NOTE(review): the list ends on a trailing '|' in this excerpt; the final
 * operand is not visible here. */
120 static uint64_t arg_retain
=
121 (1ULL << CAP_CHOWN
) |
122 (1ULL << CAP_DAC_OVERRIDE
) |
123 (1ULL << CAP_DAC_READ_SEARCH
) |
124 (1ULL << CAP_FOWNER
) |
125 (1ULL << CAP_FSETID
) |
126 (1ULL << CAP_IPC_OWNER
) |
128 (1ULL << CAP_LEASE
) |
129 (1ULL << CAP_LINUX_IMMUTABLE
) |
130 (1ULL << CAP_NET_BIND_SERVICE
) |
131 (1ULL << CAP_NET_BROADCAST
) |
132 (1ULL << CAP_NET_RAW
) |
133 (1ULL << CAP_SETGID
) |
134 (1ULL << CAP_SETFCAP
) |
135 (1ULL << CAP_SETPCAP
) |
136 (1ULL << CAP_SETUID
) |
137 (1ULL << CAP_SYS_ADMIN
) |
138 (1ULL << CAP_SYS_CHROOT
) |
139 (1ULL << CAP_SYS_NICE
) |
140 (1ULL << CAP_SYS_PTRACE
) |
141 (1ULL << CAP_SYS_TTY_CONFIG
) |
142 (1ULL << CAP_SYS_RESOURCE
) |
143 (1ULL << CAP_SYS_BOOT
) |
144 (1ULL << CAP_AUDIT_WRITE
) |
145 (1ULL << CAP_AUDIT_CONTROL
) |
/* Array of custom mounts and its element count; filled by the --bind=,
 * --bind-ro=, --tmpfs= and --overlay(-ro)= handlers in parse_argv(). */
147 static CustomMount
*arg_custom_mounts
= NULL
;
148 static unsigned arg_n_custom_mounts
= 0;
149 static char **arg_setenv
= NULL
;
150 static bool arg_quiet
= false;
151 static bool arg_share_system
= false;
/* Registration defaults to on; parse_argv() turns it off when
 * arg_share_system is set. */
152 static bool arg_register
= true;
153 static bool arg_keep_unit
= false;
154 static char **arg_network_interfaces
= NULL
;
155 static char **arg_network_macvlan
= NULL
;
156 static char **arg_network_ipvlan
= NULL
;
157 static bool arg_network_veth
= false;
158 static char *arg_network_bridge
= NULL
;
159 static unsigned long arg_personality
= PERSONALITY_INVALID
;
160 static char *arg_image
= NULL
;
161 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
162 static ExposePort
*arg_expose_ports
= NULL
;
163 static char **arg_property
= NULL
;
/* UID mapping for --private-users: arg_uid_shift is the base added to UIDs
 * and GIDs by userns_lchown(), arg_uid_range the number of IDs accepted above
 * that base. UID_INVALID means no shift was given on the command line. */
164 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
165 static bool arg_userns
= false;
166 static int arg_kill_signal
= 0;
167 static bool arg_unified_cgroup_hierarchy
= false;
/* Bitmask of SETTING_* flags; option handlers in parse_argv() OR in the bit
 * for each setting given on the command line. */
168 static SettingsMask arg_settings_mask
= 0;
/* Assigned true, false or -1 by the --settings= handler.
 * NOTE(review): -1 presumably means "not decided yet" - confirm. */
169 static int arg_settings_trusted
= -1;
170 static char **arg_parameters
= NULL
;
/* Print the usage summary with printf(). The single %s at the start of the
 * format string is filled with program_invocation_short_name; the rest is one
 * concatenated string literal listing the supported options. */
172 static void help(void) {
173 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
175 " -h --help Show this help\n"
176 " --version Print version string\n"
177 " -q --quiet Do not show status information\n"
178 " -D --directory=PATH Root directory for the container\n"
179 " --template=PATH Initialize root directory from template directory,\n"
181 " -x --ephemeral Run container with snapshot of root directory, and\n"
182 " remove it after exit\n"
183 " -i --image=PATH File system device or disk image for the container\n"
184 " -b --boot Boot up full system (i.e. invoke init)\n"
185 " -u --user=USER Run the command under specified user or uid\n"
186 " -M --machine=NAME Set the machine name for the container\n"
187 " --uuid=UUID Set a specific machine UUID for the container\n"
188 " -S --slice=SLICE Place the container in the specified slice\n"
189 " --property=NAME=VALUE Set scope unit property\n"
190 " --private-users[=UIDBASE[:NUIDS]]\n"
191 " Run within user namespace\n"
192 " --private-network Disable network in container\n"
193 " --network-interface=INTERFACE\n"
194 " Assign an existing network interface to the\n"
196 " --network-macvlan=INTERFACE\n"
197 " Create a macvlan network interface based on an\n"
198 " existing network interface to the container\n"
199 " --network-ipvlan=INTERFACE\n"
/* NOTE(review): "a ipvlan" in the next line reads like a typo for "an ipvlan". */
200 " Create a ipvlan network interface based on an\n"
201 " existing network interface to the container\n"
202 " -n --network-veth Add a virtual ethernet connection between host\n"
204 " --network-bridge=INTERFACE\n"
205 " Add a virtual ethernet connection between host\n"
206 " and container and add it to an existing bridge on\n"
208 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
209 " Expose a container IP port on the host\n"
210 " -Z --selinux-context=SECLABEL\n"
211 " Set the SELinux security context to be used by\n"
212 " processes in the container\n"
213 " -L --selinux-apifs-context=SECLABEL\n"
214 " Set the SELinux security context to be used by\n"
215 " API/tmpfs file systems in the container\n"
216 " --capability=CAP In addition to the default, retain specified\n"
218 " --drop-capability=CAP Drop the specified capability from the default set\n"
219 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
220 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
221 " try-guest, try-host\n"
222 " -j Equivalent to --link-journal=try-guest\n"
223 " --read-only Mount the root directory read-only\n"
224 " --bind=PATH[:PATH[:OPTIONS]]\n"
225 " Bind mount a file or directory from the host into\n"
/* NOTE(review): the --bind-ro= synopsis below opens two '[' but closes only
 * one, unlike the --bind= synopsis above; a trailing ']' looks missing. */
227 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
228 " Similar, but creates a read-only bind mount\n"
229 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
230 " --overlay=PATH[:PATH...]:PATH\n"
231 " Create an overlay mount from the host to \n"
233 " --overlay-ro=PATH[:PATH...]:PATH\n"
234 " Similar, but creates a read-only overlay mount\n"
235 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
236 " --share-system Share system namespaces with host\n"
237 " --register=BOOLEAN Register container as machine\n"
238 " --keep-unit Do not register a scope for the machine, reuse\n"
239 " the service unit nspawn is running in\n"
240 " --volatile[=MODE] Run the system in volatile mode\n"
241 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
242 , program_invocation_short_name
);
/* Prepare arg_custom_mounts for use: sort the array with custom_mount_compare
 * and walk every entry. A mount whose destination is "/" is rejected when
 * user namespacing is on but no explicit UID shift was given. For overlay
 * mounts a work directory name is derived from the source path with
 * tempfn_random() and stored in m->work_dir.
 * Returns the log_error_errno() result when tempfn_random() fails; the other
 * return paths are not visible in this excerpt. */
246 static int custom_mounts_prepare(void) {
250 /* Ensure the mounts are applied prefix first. */
251 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
253 /* Allocate working directories for the overlay file systems that need it */
254 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
255 CustomMount
*m
= &arg_custom_mounts
[i
];
/* arg_uid_shift == UID_INVALID means the shift has not been chosen yet. */
257 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
258 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
/* Only overlay mounts need a work directory.
 * NOTE(review): the statement guarded by this test is not visible here. */
262 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
271 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
273 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Decide the value of arg_unified_cgroup_hierarchy. The environment variable
 * UNIFIED_CGROUP_HIERARCHY, parsed with parse_boolean(), is consulted first;
 * otherwise the value is taken from the host system. Both failure paths go
 * through log_error_errno().
 * NOTE(review): the call that queries the host is not visible in this
 * excerpt, only the handling of its result. */
279 static int detect_unified_cgroup_hierarchy(void) {
283 /* Allow the user to control whether the unified hierarchy is used */
284 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
286 r
= parse_boolean(e
);
288 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
290 arg_unified_cgroup_hierarchy
= r
;
294 /* Otherwise inherit the default from the host system */
297 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
299 arg_unified_cgroup_hierarchy
= r
;
/* Parse the command line into the file-scope arg_* variables.
 *
 * Options are read in a getopt_long() loop using the `options` table below.
 * Most handlers store the parsed value and OR the matching SETTING_* bit into
 * arg_settings_mask. After the loop, conflicting option combinations are
 * reported with log_error(), the remaining argv entries are copied into
 * arg_parameters, the --settings= decision is applied to arg_settings_mask,
 * arg_retain is recomputed, and detect_unified_cgroup_hierarchy() is called.
 *
 * argc, argv: the program's argument vector as passed to main().
 * Returns: several failure paths return the result of log_error_errno(); the
 * remaining return statements are not visible in this excerpt. */
303 static int parse_argv(int argc
, char *argv
[]) {
322 ARG_NETWORK_INTERFACE
,
/* Long-option table: options with a short form map to its letter, the rest to
 * an ARG_* constant. */
335 static const struct option options
[] = {
336 { "help", no_argument
, NULL
, 'h' },
337 { "version", no_argument
, NULL
, ARG_VERSION
},
338 { "directory", required_argument
, NULL
, 'D' },
339 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
340 { "ephemeral", no_argument
, NULL
, 'x' },
341 { "user", required_argument
, NULL
, 'u' },
342 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
343 { "boot", no_argument
, NULL
, 'b' },
344 { "uuid", required_argument
, NULL
, ARG_UUID
},
345 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
346 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
347 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
348 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
349 { "bind", required_argument
, NULL
, ARG_BIND
},
350 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
351 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
352 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
353 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
354 { "machine", required_argument
, NULL
, 'M' },
355 { "slice", required_argument
, NULL
, 'S' },
356 { "setenv", required_argument
, NULL
, ARG_SETENV
},
357 { "selinux-context", required_argument
, NULL
, 'Z' },
358 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
359 { "quiet", no_argument
, NULL
, 'q' },
360 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
361 { "register", required_argument
, NULL
, ARG_REGISTER
},
362 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
363 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
364 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
365 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
366 { "network-veth", no_argument
, NULL
, 'n' },
367 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
368 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
369 { "image", required_argument
, NULL
, 'i' },
370 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
371 { "port", required_argument
, NULL
, 'p' },
372 { "property", required_argument
, NULL
, ARG_PROPERTY
},
373 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
374 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
375 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
/* Capability bits gathered from --capability= (plus) and --drop-capability=
 * (minus); they are folded into arg_retain only after the option loop. */
380 uint64_t plus
= 0, minus
= 0;
/* Outcome of --settings=; applied to arg_settings_mask after the loop. */
381 bool mask_all_settings
= false, mask_no_settings
= false;
386 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
398 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
404 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
410 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
416 arg_ephemeral
= true;
420 r
= free_and_strdup(&arg_user
, optarg
);
424 arg_settings_mask
|= SETTING_USER
;
/* The network options below all switch arg_private_network on and mark
 * SETTING_NETWORK as given on the command line.
 * NOTE(review): the case labels and break/fall-through statements between
 * these handlers are only partly visible in this excerpt. */
427 case ARG_NETWORK_BRIDGE
:
428 r
= free_and_strdup(&arg_network_bridge
, optarg
);
435 arg_network_veth
= true;
436 arg_private_network
= true;
437 arg_settings_mask
|= SETTING_NETWORK
;
440 case ARG_NETWORK_INTERFACE
:
441 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
444 arg_private_network
= true;
445 arg_settings_mask
|= SETTING_NETWORK
;
448 case ARG_NETWORK_MACVLAN
:
449 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
452 arg_private_network
= true;
453 arg_settings_mask
|= SETTING_NETWORK
;
456 case ARG_NETWORK_IPVLAN
:
457 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
462 case ARG_PRIVATE_NETWORK
:
463 arg_private_network
= true;
464 arg_settings_mask
|= SETTING_NETWORK
;
469 arg_settings_mask
|= SETTING_BOOT
;
473 r
= sd_id128_from_string(optarg
, &arg_uuid
);
475 log_error("Invalid UUID: %s", optarg
);
479 arg_settings_mask
|= SETTING_MACHINE_ID
;
488 arg_machine
= mfree(arg_machine
);
490 if (!machine_name_is_valid(optarg
)) {
491 log_error("Invalid machine name: %s", optarg
);
495 r
= free_and_strdup(&arg_machine
, optarg
);
/* optarg is stored directly, not copied, for both SELinux contexts. */
503 arg_selinux_context
= optarg
;
507 arg_selinux_apifs_context
= optarg
;
511 arg_read_only
= true;
512 arg_settings_mask
|= SETTING_READ_ONLY
;
/* --capability= and --drop-capability= share this handler: the argument is a
 * comma-separated list, and `c` decides whether each entry lands in `plus` or
 * `minus`. The word "all" sets every bit of the chosen mask. */
516 case ARG_DROP_CAPABILITY
: {
517 const char *state
, *word
;
520 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
/* NOTE(review): `t` carries _cleanup_free_ but has no initializer; it is
 * assigned by strndup() in the next visible statement. */
521 _cleanup_free_
char *t
;
523 t
= strndup(word
, length
);
527 if (streq(t
, "all")) {
528 if (c
== ARG_CAPABILITY
)
529 plus
= (uint64_t) -1;
531 minus
= (uint64_t) -1;
535 cap
= capability_from_name(t
);
537 log_error("Failed to parse capability %s.", t
);
541 if (c
== ARG_CAPABILITY
)
542 plus
|= 1ULL << (uint64_t) cap
;
544 minus
|= 1ULL << (uint64_t) cap
;
548 arg_settings_mask
|= SETTING_CAPABILITY
;
/* Same pair of values as the "try-guest" mode of --link-journal= below. */
553 arg_link_journal
= LINK_GUEST
;
554 arg_link_journal_try
= true;
/* Each accepted mode string sets both arg_link_journal and the "try" flag. */
557 case ARG_LINK_JOURNAL
:
558 if (streq(optarg
, "auto")) {
559 arg_link_journal
= LINK_AUTO
;
560 arg_link_journal_try
= false;
561 } else if (streq(optarg
, "no")) {
562 arg_link_journal
= LINK_NO
;
563 arg_link_journal_try
= false;
564 } else if (streq(optarg
, "guest")) {
565 arg_link_journal
= LINK_GUEST
;
566 arg_link_journal_try
= false;
567 } else if (streq(optarg
, "host")) {
568 arg_link_journal
= LINK_HOST
;
569 arg_link_journal_try
= false;
570 } else if (streq(optarg
, "try-guest")) {
571 arg_link_journal
= LINK_GUEST
;
572 arg_link_journal_try
= true;
573 } else if (streq(optarg
, "try-host")) {
574 arg_link_journal
= LINK_HOST
;
575 arg_link_journal_try
= true;
577 log_error("Failed to parse link journal mode %s", optarg
);
/* --bind= and --bind-ro= share one parser; the last argument selects the
 * read-only variant. */
585 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
587 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
589 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
593 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
595 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
597 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
/* --overlay= / --overlay-ro=: the argument is split on ':' into `lower`; every
 * entry must be an absolute path. */
601 case ARG_OVERLAY_RO
: {
602 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
603 _cleanup_strv_free_
char **lower
= NULL
;
608 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
612 log_error("Invalid overlay specification: %s", optarg
);
616 STRV_FOREACH(i
, lower
) {
617 if (!path_is_absolute(*i
)) {
618 log_error("Overlay path %s is not absolute.", *i
);
626 log_error("--overlay= needs at least two colon-separated directories specified.");
631 /* If two parameters are specified,
632 * the first one is the lower, the
633 * second one the upper directory. And
634 * we'll also define the destination
635 * mount point the same as the upper. */
639 destination
= strdup(upper
);
/* NOTE(review): here `upper` and `destination` alias elements of `lower`
 * while all three have cleanup attributes; the lines that would detach these
 * elements from `lower` are not visible - confirm there is no double free. */
644 upper
= lower
[n
- 2];
645 destination
= lower
[n
- 1];
649 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
653 m
->destination
= destination
;
656 m
->read_only
= c
== ARG_OVERLAY_RO
;
/* The locals are cleared so their cleanup handlers do not free them. */
658 upper
= destination
= NULL
;
661 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
668 if (!env_assignment_is_valid(optarg
)) {
669 log_error("Environment variable assignment '%s' is not valid.", optarg
);
/* NOTE(review): the result of strv_env_set() goes into `n` and arg_setenv is
 * then freed; the line storing `n` back into arg_setenv is not visible. */
673 n
= strv_env_set(arg_setenv
, optarg
);
677 strv_free(arg_setenv
);
680 arg_settings_mask
|= SETTING_ENVIRONMENT
;
688 case ARG_SHARE_SYSTEM
:
689 arg_share_system
= true;
693 r
= parse_boolean(optarg
);
695 log_error("Failed to parse --register= argument: %s", optarg
);
703 arg_keep_unit
= true;
706 case ARG_PERSONALITY
:
708 arg_personality
= personality_from_string(optarg
);
709 if (arg_personality
== PERSONALITY_INVALID
) {
710 log_error("Unknown or unsupported personality '%s'.", optarg
);
714 arg_settings_mask
|= SETTING_PERSONALITY
;
/* --volatile has an optional argument: VOLATILE_YES is one outcome, otherwise
 * the string is mapped with volatile_mode_from_string().
 * NOTE(review): the condition choosing between the two is not visible. */
720 arg_volatile_mode
= VOLATILE_YES
;
724 m
= volatile_mode_from_string(optarg
);
726 log_error("Failed to parse --volatile= argument: %s", optarg
);
729 arg_volatile_mode
= m
;
732 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
736 r
= expose_port_parse(&arg_expose_ports
, optarg
);
738 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
740 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
742 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
746 if (strv_extend(&arg_property
, optarg
) < 0)
/* --private-users[=UIDBASE[:NUIDS]]: the argument is split at the first ':'
 * into a shift part and a range part. */
751 case ARG_PRIVATE_USERS
:
753 _cleanup_free_
char *buffer
= NULL
;
754 const char *range
, *shift
;
756 range
= strchr(optarg
, ':');
758 buffer
= strndup(optarg
, range
- optarg
);
/* NOTE(review): arg_uid_range is declared uid_t; if that type is unsigned,
 * the "<= 0" test below can only ever match 0. */
764 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
765 log_error("Failed to parse UID range: %s", range
);
771 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
772 log_error("Failed to parse UID: %s", optarg
);
780 case ARG_KILL_SIGNAL
:
781 arg_kill_signal
= signal_from_string_try_harder(optarg
);
782 if (arg_kill_signal
< 0) {
783 log_error("Cannot parse signal: %s", optarg
);
787 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
792 /* no → do not read files
793 * yes → read files, do not override cmdline, trust only subset
794 * override → read files, override cmdline, trust only subset
795 * trusted → read files, do not override cmdline, trust all
798 r
= parse_boolean(optarg
);
800 if (streq(optarg
, "trusted")) {
801 mask_all_settings
= false;
802 mask_no_settings
= false;
803 arg_settings_trusted
= true;
805 } else if (streq(optarg
, "override")) {
806 mask_all_settings
= false;
807 mask_no_settings
= true;
808 arg_settings_trusted
= -1;
810 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
813 mask_all_settings
= false;
814 mask_no_settings
= false;
815 arg_settings_trusted
= -1;
818 mask_all_settings
= true;
819 mask_no_settings
= false;
820 arg_settings_trusted
= false;
829 assert_not_reached("Unhandled option");
832 if (arg_share_system
)
833 arg_register
= false;
835 if (arg_boot
&& arg_share_system
) {
836 log_error("--boot and --share-system may not be combined.");
840 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
841 log_error("--keep-unit may not be used when invoked from a user session.");
845 if (arg_directory
&& arg_image
) {
846 log_error("--directory= and --image= may not be combined.");
850 if (arg_template
&& arg_image
) {
851 log_error("--template= and --image= may not be combined.");
855 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
856 log_error("--template= needs --directory= or --machine=.");
860 if (arg_ephemeral
&& arg_template
) {
861 log_error("--ephemeral and --template= may not be combined.");
865 if (arg_ephemeral
&& arg_image
) {
866 log_error("--ephemeral and --image= may not be combined.");
870 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
871 log_error("--ephemeral and --link-journal= may not be combined.");
875 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
876 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
879 arg_parameters
= strv_copy(argv
+ optind
);
883 arg_settings_mask
|= SETTING_BOOT
;
886 /* Load all settings from .nspawn files */
887 if (mask_no_settings
)
888 arg_settings_mask
= 0;
890 /* Don't load any settings from .nspawn files */
891 if (mask_all_settings
)
892 arg_settings_mask
= _SETTINGS_MASK_ALL
;
/* Final capability set: the defaults plus --capability= bits, plus
 * CAP_NET_ADMIN when private networking is on, minus --drop-capability= bits.
 * The drop mask is applied last, so it wins over the additions. */
894 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
896 r
= detect_unified_cgroup_hierarchy();
/* Cross-check settings that parse_argv() cannot validate one option at a
 * time. Two combinations are reported with log_error(): --read-only together
 * with a volatile mode, and exposed ports without private networking. When
 * booting with no kill signal configured, the kill signal defaults to
 * SIGRTMIN+3.
 * NOTE(review): the return statements are not visible in this excerpt. */
903 static int verify_arguments(void) {
905 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
906 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
910 if (arg_expose_ports
&& !arg_private_network
) {
911 log_error("Cannot use --port= without private networking.");
/* arg_kill_signal starts at 0, so "<= 0" covers the not-set case. */
915 if (arg_boot
&& arg_kill_signal
<= 0)
916 arg_kill_signal
= SIGRTMIN
+3;
/* lchown() path `p` after translating the given IDs into the container's ID
 * range: arg_uid_shift is added to `uid` and to `gid`, and each shifted value
 * is checked against [arg_uid_shift, arg_uid_shift + arg_uid_range).
 *
 * p:   path to change ownership of.
 * uid: container-relative UID, or UID_INVALID to skip the UID translation.
 * gid: container-relative GID, or GID_INVALID to skip the GID translation.
 *
 * NOTE(review): the statements executed when a check fails and the return
 * values are not visible in this excerpt. */
921 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
/* Both IDs unset: nothing to translate. */
927 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
930 if (uid
!= UID_INVALID
) {
931 uid
+= arg_uid_shift
;
/* "uid < arg_uid_shift" can only hold if the addition above wrapped around;
 * the second test enforces the upper end of the range. */
933 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
/* GIDs use the same shift and range as UIDs. */
937 if (gid
!= GID_INVALID
) {
938 gid
+= (gid_t
) arg_uid_shift
;
940 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
944 if (lchown(p
, uid
, gid
) < 0)
/* Create directory `path` below `root` with the given mode, then hand its
 * ownership to userns_lchown() so uid/gid are translated the same way as for
 * every other file created for the container.
 *
 * root: prefix prepended to `path` via prefix_roota().
 * path: directory to create, relative to `root`.
 * mode: mode passed to mkdir().
 * uid, gid: container-relative owner, passed through to userns_lchown().
 *
 * Returns the result of userns_lchown() on the visible path.
 * NOTE(review): the handling of a failed mkdir() is not visible here. */
950 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
953 q
= prefix_roota(root
, path
);
954 if (mkdir(q
, mode
) < 0) {
960 return userns_lchown(q
, uid
, gid
);
/* Make /etc/localtime inside the container tree `dest` point at the same
 * zone as the host's /etc/localtime.
 *
 * The host symlink is read and the zone name taken as whatever follows
 * "../usr/share/zoneinfo/" or "/usr/share/zoneinfo/". If the container's link
 * already names that zone nothing is done. Otherwise the zone file must exist
 * inside the container, and a new relative symlink is created and chowned via
 * userns_lchown(where, 0, 0). Most problems are only logged as warnings.
 *
 * NOTE(review): several statements (branch conditions, returns and the call
 * that assigns `r` before the "remove existing" check) are not visible in
 * this excerpt. */
963 static int setup_timezone(const char *dest
) {
964 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
965 const char *where
, *check
, *what
;
971 /* Fix the timezone, if possible */
972 r
= readlink_malloc("/etc/localtime", &p
);
974 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name of the host, taken from the host symlink target. */
978 z
= path_startswith(p
, "../usr/share/zoneinfo/");
980 z
= path_startswith(p
, "/usr/share/zoneinfo/");
982 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
/* y: zone name currently configured inside the container, if any. */
986 where
= prefix_roota(dest
, "/etc/localtime");
987 r
= readlink_malloc(where
, &q
);
989 y
= path_startswith(q
, "../usr/share/zoneinfo/");
991 y
= path_startswith(q
, "/usr/share/zoneinfo/");
993 /* Already pointing to the right place? Then do nothing .. */
994 if (y
&& streq(y
, z
))
/* NOTE(review): every other path here is built with prefix_roota(), but the
 * second assignment to `check` uses prefix_root() and stores the result in a
 * plain const char* without a cleanup attribute. If prefix_root() returns
 * heap memory, it is never freed - confirm and consider prefix_roota(). */
998 check
= strjoina("/usr/share/zoneinfo/", z
);
999 check
= prefix_root(dest
, check
);
1000 if (laccess(check
, F_OK
) < 0) {
1001 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
/* NOTE(review): the test mixes `r` with errno; which call set `r` is not
 * visible here. */
1006 if (r
< 0 && errno
!= ENOENT
) {
1007 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
/* The new link is relative so it resolves inside the container root. */
1011 what
= strjoina("../usr/share/zoneinfo/", z
);
1012 if (symlink(what
, where
) < 0) {
1013 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1017 r
= userns_lchown(where
, 0, 0);
1019 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
/* Copy the host's /etc/resolv.conf into the container tree `dest`.
 *
 * The function checks arg_private_network first (the guarded statement is not
 * visible in this excerpt). The copy is made with copy_file() using
 * O_TRUNC|O_NOFOLLOW and mode 0644. A failed copy is logged at LOG_DEBUG when
 * the error is -ELOOP or -EROFS and at LOG_WARNING otherwise. The copied file
 * is then chowned with userns_lchown(where, 0, 0); a failure there is logged
 * as a warning. */
1024 static int setup_resolv_conf(const char *dest
) {
1025 const char *where
= NULL
;
1030 if (arg_private_network
)
1033 /* Fix resolv.conf, if possible */
1034 where
= prefix_roota(dest
, "/etc/resolv.conf");
1036 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1038 /* If the file already exists as symlink, let's
1039 * suppress the warning, under the assumption that
1040 * resolved or something similar runs inside and the
1041 * symlink points there.
1043 * If the disk image is read-only, there's also no
1044 * point in complaining.
1046 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1047 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1051 r
= userns_lchown(where
, 0, 0);
1053 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
/* Render the 128-bit ID `id` as text in the hyphenated 8-4-4-4-12 hex layout
 * given by the format string below (16 bytes, two lowercase hex digits each).
 *
 * id: the ID to format; its bytes are expanded by SD_ID128_FORMAT_VAL().
 * s:  caller-provided buffer, declared as 37 chars: 36 characters of output
 *     plus the terminating NUL.
 *
 * NOTE(review): the formatting call and the return statement are not visible
 * in this excerpt; presumably `s` is returned - confirm. */
1058 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1062 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1063 SD_ID128_FORMAT_VAL(id
));
/* Give the container its own boot ID: a random 128-bit ID is generated,
 * written in UUID form to a file under the container's /run, and that file is
 * bind-mounted over the container's /proc/sys/kernel/random/boot_id.
 *
 * dest: root of the container tree.
 *
 * The function checks arg_share_system first (the guarded statement is not
 * visible in this excerpt). Failing to generate or write the ID returns the
 * log_error_errno() result. */
1068 static int setup_boot_id(const char *dest
) {
1069 const char *from
, *to
;
1070 sd_id128_t rnd
= {};
1074 if (arg_share_system
)
1077 /* Generate a new randomized boot ID, so that each boot-up of
1078 * the container gets a new one */
1080 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1081 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1083 r
= sd_id128_randomize(&rnd
);
1085 return log_error_errno(r
, "Failed to generate random boot id: %m");
1087 id128_format_as_uuid(rnd
, as_uuid
);
1089 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1091 return log_error_errno(r
, "Failed to write boot id: %m");
/* A failed bind mount is recorded in `r` as an error. A failure of the
 * follow-up read-only remount is only logged as a warning and leaves `r`
 * untouched. */
1093 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1094 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1095 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1096 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
/* Populate /dev inside the container tree `dest` with the device nodes named
 * in `devnodes`.
 *
 * /dev/net is created first. Then, for every name in the list: the host node
 * /dev/<name> is stat()ed, a node with the same st_mode and st_rdev is
 * created in the container with mknod(), and ownership is fixed with
 * userns_lchown(to, 0, 0). If mknod() is refused, a bind mount of the host
 * node is tried instead. A host node missing with ENOENT is not treated as an
 * error by the visible check; anything that is neither a character nor a
 * block device is rejected.
 *
 * NOTE(review): the contents of `devnodes` and several branch conditions are
 * not visible in this excerpt. */
1102 static int copy_devnodes(const char *dest
) {
1104 static const char devnodes
[] =
1115 _cleanup_umask_ mode_t u
;
1121 /* Create /dev/net, so that we can create /dev/net/tun in it */
/* NOTE(review): the result of userns_mkdir() is tested directly and never
 * stored, yet `r` is what log_error_errno() receives. Unless `r` is assigned
 * on a line not shown, the logged and returned code is not the mkdir error.
 * Storing the result in `r` first would match the rest of this file. */
1122 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1123 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1125 NULSTR_FOREACH(d
, devnodes
) {
1126 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
/* from: node on the host; to: same path below the container root. */
1129 from
= strappend("/dev/", d
);
1130 to
= prefix_root(dest
, from
);
1132 if (stat(from
, &st
) < 0) {
1134 if (errno
!= ENOENT
)
1135 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1137 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1139 log_error("%s is not a char or block device, cannot copy.", from
);
1143 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1145 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1147 /* Some systems abusively restrict mknod but
1148 * allow bind mounts. */
1151 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1152 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1153 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1156 r
= userns_lchown(to
, 0, 0);
1158 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
/* Set up pseudo-terminal support inside the container tree `dest`: mount a
 * new devpts instance on /dev/pts, create the /dev/ptmx -> pts/ptmx symlink,
 * and chown /dev/pts, /dev/ptmx and /dev/pts/ptmx via userns_lchown(p, 0, 0).
 *
 * The mount options are built with asprintf(): a fixed
 * "newinstance,ptmxmode=0666,mode=620" part, gid= set to
 * arg_uid_shift + TTY_GID, and a context="..." clause when
 * arg_selinux_apifs_context is set.
 *
 * Every visible failure returns the result of log_error_errno(). */
1165 static int setup_pts(const char *dest
) {
1166 _cleanup_free_
char *options
= NULL
;
/* NOTE(review): both asprintf() results are discarded with a (void) cast;
 * `options` starts as NULL, and any check of it after these calls is not
 * visible in this excerpt. */
1170 if (arg_selinux_apifs_context
)
1171 (void) asprintf(&options
,
1172 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1173 arg_uid_shift
+ TTY_GID
,
1174 arg_selinux_apifs_context
);
1177 (void) asprintf(&options
,
1178 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1179 arg_uid_shift
+ TTY_GID
);
1184 /* Mount /dev/pts itself */
1185 p
= prefix_roota(dest
, "/dev/pts");
1186 if (mkdir(p
, 0755) < 0)
1187 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1188 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1189 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
/* NOTE(review): the three userns_lchown() checks in this function log errno,
 * whereas setup_timezone(), setup_resolv_conf() and copy_devnodes() log the
 * value userns_lchown() returned. Confirm which one carries the error; errno
 * may be stale if userns_lchown() fails before calling lchown(). */
1190 if (userns_lchown(p
, 0, 0) < 0)
1191 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1193 /* Create /dev/ptmx symlink */
1194 p
= prefix_roota(dest
, "/dev/ptmx");
1195 if (symlink("pts/ptmx", p
) < 0)
1196 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1197 if (userns_lchown(p
, 0, 0) < 0)
1198 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1200 /* And fix /dev/pts/ptmx ownership */
1201 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1202 if (userns_lchown(p
, 0, 0) < 0)
1203 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
/* Make the TTY `console` available as /dev/console inside the container tree
 * `dest`.
 *
 * The TTY is first set to mode 0600 and to owner and group arg_uid_shift with
 * chmod_and_chown(). A placeholder is then created at <dest>/dev/console and
 * the TTY is bind-mounted onto it.
 *
 * dest:    root of the container tree.
 * console: path of the TTY to expose.
 *
 * Every visible failure returns the result of log_error_errno().
 * NOTE(review): the call that creates the placeholder file is not visible in
 * this excerpt, only its error handling. */
1208 static int setup_dev_console(const char *dest
, const char *console
) {
1209 _cleanup_umask_ mode_t u
;
1218 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1220 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1222 /* We need to bind mount the right tty to /dev/console since
1223 * ptys can only exist on pts file systems. To have something
1224 * to bind mount things on we create an empty regular file. */
1226 to
= prefix_roota(dest
, "/dev/console");
1229 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1231 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1232 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
/* Provide a /proc/kmsg stand-in inside the container tree `dest`.
 *
 * A FIFO is created at <dest>/run/kmsg with mode 0600 and bind-mounted over
 * <dest>/proc/kmsg. The FIFO is then opened read-write and non-blocking, the
 * descriptor is passed over `kmsg_socket` with send_one_fd(), and finally the
 * /run/kmsg name is unlinked so only the bind mount remains.
 *
 * dest:        root of the container tree.
 * kmsg_socket: socket the FIFO descriptor is sent over; asserted to be >= 0.
 *
 * Every visible failure returns the result of log_error_errno(). */
1237 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1238 const char *from
, *to
;
1239 _cleanup_umask_ mode_t u
;
1242 assert(kmsg_socket
>= 0);
1246 /* We create the kmsg FIFO as /run/kmsg, but immediately
1247 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1248 * on the reading side behave very similar to /proc/kmsg,
1249 * their writing side behaves differently from /dev/kmsg in
1250 * that writing blocks when nothing is reading. In order to
1251 * avoid any problems with containers deadlocking due to this
1252 * we simply make /dev/kmsg unavailable to the container. */
1253 from
= prefix_roota(dest
, "/run/kmsg");
1254 to
= prefix_roota(dest
, "/proc/kmsg");
1256 if (mkfifo(from
, 0600) < 0)
1257 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1258 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1259 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
/* O_NDELAY keeps this open() from blocking on the FIFO. */
1261 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1263 return log_error_errno(errno
, "Failed to open fifo: %m");
1265 /* Store away the fd in the socket, so that it stays open as
1266 * long as we run the child */
1267 r
= send_one_fd(kmsg_socket
, fd
, 0);
1271 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1273 /* And now make the FIFO unavailable as /run/kmsg... */
/* The unlink result is deliberately ignored (void cast). */
1274 (void) unlink(from
);
/* Netlink callback: re-run expose_port_execute() for arg_expose_ports.
 *
 * rtnl:     netlink connection, passed on to expose_port_execute().
 * m:        the netlink message; not used by the visible code.
 * userdata: pointer to the union in_addr_union holding the exposed address,
 *           passed on as `exposed`.
 *
 * NOTE(review): the return statement is not visible in this excerpt. */
1279 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1280 union in_addr_union
*exposed
= userdata
;
1286 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
/* Set the hostname to arg_machine using sethostname_idempotent(). The
 * function checks arg_share_system first.
 * NOTE(review): the statements guarded by the two conditions and the return
 * values are not visible in this excerpt. */
1290 static int setup_hostname(void) {
1292 if (arg_share_system
)
1295 if (sethostname_idempotent(arg_machine
) < 0)
1301 static int setup_journal(const char *directory
) {
1302 sd_id128_t machine_id
, this_id
;
1303 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1304 const char *etc_machine_id
, *p
, *q
;
1308 /* Don't link journals in ephemeral mode */
1312 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1314 r
= read_one_line_file(etc_machine_id
, &b
);
1315 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1318 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1321 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1324 /* Verify validity */
1325 r
= sd_id128_from_string(id
, &machine_id
);
1327 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1329 r
= sd_id128_get_machine(&this_id
);
1331 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1333 if (sd_id128_equal(machine_id
, this_id
)) {
1334 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1335 "Host and machine ids are equal (%s): refusing to link journals", id
);
1336 if (arg_link_journal
== LINK_AUTO
)
1341 if (arg_link_journal
== LINK_NO
)
1344 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1346 return log_error_errno(r
, "Failed to create /var: %m");
1348 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1350 return log_error_errno(r
, "Failed to create /var/log: %m");
1352 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1354 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1356 p
= strjoina("/var/log/journal/", id
);
1357 q
= prefix_roota(directory
, p
);
1359 if (path_is_mount_point(p
, 0) > 0) {
1360 if (arg_link_journal
!= LINK_AUTO
) {
1361 log_error("%s: already a mount point, refusing to use for journal", p
);
1368 if (path_is_mount_point(q
, 0) > 0) {
1369 if (arg_link_journal
!= LINK_AUTO
) {
1370 log_error("%s: already a mount point, refusing to use for journal", q
);
1377 r
= readlink_and_make_absolute(p
, &d
);
1379 if ((arg_link_journal
== LINK_GUEST
||
1380 arg_link_journal
== LINK_AUTO
) &&
1383 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1385 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1390 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1391 } else if (r
== -EINVAL
) {
1393 if (arg_link_journal
== LINK_GUEST
&&
1396 if (errno
== ENOTDIR
) {
1397 log_error("%s already exists and is neither a symlink nor a directory", p
);
1400 log_error_errno(errno
, "Failed to remove %s: %m", p
);
1404 } else if (r
!= -ENOENT
) {
1405 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
1409 if (arg_link_journal
== LINK_GUEST
) {
1411 if (symlink(q
, p
) < 0) {
1412 if (arg_link_journal_try
) {
1413 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1416 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1421 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1423 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1427 if (arg_link_journal
== LINK_HOST
) {
1428 /* don't create parents here -- if the host doesn't have
1429 * permanent journal set up, don't force it here */
1432 if (arg_link_journal_try
) {
1433 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1436 log_error_errno(errno
, "Failed to create %s: %m", p
);
1441 } else if (access(p
, F_OK
) < 0)
1444 if (dir_is_empty(q
) == 0)
1445 log_warning("%s is not empty, proceeding anyway.", q
);
1447 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1449 log_error_errno(errno
, "Failed to create %s: %m", q
);
1453 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1454 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1459 static int drop_capabilities(void) {
1460 return capability_bounding_set_drop(~arg_retain
, false);
1463 static int reset_audit_loginuid(void) {
1464 _cleanup_free_
char *p
= NULL
;
1467 if (arg_share_system
)
1470 r
= read_one_line_file("/proc/self/loginuid", &p
);
1474 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1476 /* Already reset? */
1477 if (streq(p
, "4294967295"))
1480 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1483 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1484 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1485 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1486 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1487 "using systemd-nspawn. Sleeping for 5s... (%m)");
1495 static int setup_seccomp(void) {
1498 static const struct {
1499 uint64_t capability
;
1502 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1503 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1504 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1505 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1506 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1507 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1508 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1509 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1510 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1511 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1514 scmp_filter_ctx seccomp
;
1518 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1522 r
= seccomp_add_secondary_archs(seccomp
);
1524 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
1528 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1529 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1532 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1534 continue; /* unknown syscall */
1536 log_error_errno(r
, "Failed to block syscall: %m");
1543 Audit is broken in containers, much of the userspace audit
1544 hookup will fail if running inside a container. We don't
1545 care and just turn off creation of audit sockets.
1547 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1548 with EAFNOSUPPORT which audit userspace uses as indication
1549 that audit is disabled in the kernel.
1552 r
= seccomp_rule_add(
1554 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1557 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1558 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1560 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1564 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1566 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1570 r
= seccomp_load(seccomp
);
1572 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1577 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1582 seccomp_release(seccomp
);
1590 static int setup_propagate(const char *root
) {
1593 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1594 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1595 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1596 (void) mkdir_p(p
, 0600);
1598 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
1599 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
1601 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1602 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
1604 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1605 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
1607 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1608 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1609 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1611 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1612 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1617 static int setup_image(char **device_path
, int *loop_nr
) {
1618 struct loop_info64 info
= {
1619 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1621 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1622 _cleanup_free_
char* loopdev
= NULL
;
1626 assert(device_path
);
1630 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1632 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1634 if (fstat(fd
, &st
) < 0)
1635 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1637 if (S_ISBLK(st
.st_mode
)) {
1640 p
= strdup(arg_image
);
1654 if (!S_ISREG(st
.st_mode
)) {
1655 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
1659 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1661 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1663 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1665 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1667 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1670 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1672 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1674 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1675 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1678 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1680 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1681 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1683 *device_path
= loopdev
;
1694 #define PARTITION_TABLE_BLURB \
1695 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1696 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1697 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1698 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1699 "to be bootable with systemd-nspawn."
1701 static int dissect_image(
1703 char **root_device
, bool *root_device_rw
,
1704 char **home_device
, bool *home_device_rw
,
1705 char **srv_device
, bool *srv_device_rw
,
1709 int home_nr
= -1, srv_nr
= -1;
1710 #ifdef GPT_ROOT_NATIVE
1713 #ifdef GPT_ROOT_SECONDARY
1714 int secondary_root_nr
= -1;
1716 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1717 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1718 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1719 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1720 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1721 struct udev_list_entry
*first
, *item
;
1722 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1723 bool is_gpt
, is_mbr
, multiple_generic
= false;
1724 const char *pttype
= NULL
;
1731 assert(root_device
);
1732 assert(home_device
);
1737 b
= blkid_new_probe();
1742 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1747 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1751 blkid_probe_enable_partitions(b
, 1);
1752 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1755 r
= blkid_do_safeprobe(b
);
1756 if (r
== -2 || r
== 1) {
1757 log_error("Failed to identify any partition table on\n"
1759 PARTITION_TABLE_BLURB
, arg_image
);
1761 } else if (r
!= 0) {
1764 log_error_errno(errno
, "Failed to probe: %m");
1768 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1770 is_gpt
= streq_ptr(pttype
, "gpt");
1771 is_mbr
= streq_ptr(pttype
, "dos");
1773 if (!is_gpt
&& !is_mbr
) {
1774 log_error("No GPT or MBR partition table discovered on\n"
1776 PARTITION_TABLE_BLURB
, arg_image
);
1781 pl
= blkid_probe_get_partitions(b
);
1786 log_error("Failed to list partitions of %s", arg_image
);
1794 if (fstat(fd
, &st
) < 0)
1795 return log_error_errno(errno
, "Failed to stat block device: %m");
1797 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1805 log_error("Kernel partitions never appeared.");
1809 e
= udev_enumerate_new(udev
);
1813 r
= udev_enumerate_add_match_parent(e
, d
);
1817 r
= udev_enumerate_scan_devices(e
);
1819 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1821 /* Count the partitions enumerated by the kernel */
1823 first
= udev_enumerate_get_list_entry(e
);
1824 udev_list_entry_foreach(item
, first
)
1827 /* Count the partitions enumerated by blkid */
1828 m
= blkid_partlist_numof_partitions(pl
);
1832 log_error("blkid and kernel partition list do not match.");
1838 /* The kernel has probed fewer partitions than
1839 * blkid? Maybe the kernel prober is still
1840 * running or it got EBUSY because udev
1841 * already opened the device. Let's reprobe
1842 * the device, which is a synchronous call
1843 * that waits until probing is complete. */
1845 for (j
= 0; j
< 20; j
++) {
1847 r
= ioctl(fd
, BLKRRPART
, 0);
1850 if (r
>= 0 || r
!= -EBUSY
)
1853 /* If something else has the device
1854 * open, such as an udev rule, the
1855 * ioctl will return EBUSY. Since
1856 * there's no way to wait until it
1857 * isn't busy anymore, let's just wait
1858 * a bit, and try again.
1860 * This is really something they
1861 * should fix in the kernel! */
1863 usleep(50 * USEC_PER_MSEC
);
1867 return log_error_errno(r
, "Failed to reread partition table: %m");
1870 e
= udev_enumerate_unref(e
);
1873 first
= udev_enumerate_get_list_entry(e
);
1874 udev_list_entry_foreach(item
, first
) {
1875 _cleanup_udev_device_unref_
struct udev_device
*q
;
1877 unsigned long long flags
;
1883 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1888 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1892 qn
= udev_device_get_devnum(q
);
1896 if (st
.st_rdev
== qn
)
1899 node
= udev_device_get_devnode(q
);
1903 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1907 flags
= blkid_partition_get_flags(pp
);
1909 nr
= blkid_partition_get_partno(pp
);
1917 if (flags
& GPT_FLAG_NO_AUTO
)
1920 stype
= blkid_partition_get_type_string(pp
);
1924 if (sd_id128_from_string(stype
, &type_id
) < 0)
1927 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1929 if (home
&& nr
>= home_nr
)
1933 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1935 r
= free_and_strdup(&home
, node
);
1939 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
1941 if (srv
&& nr
>= srv_nr
)
1945 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1947 r
= free_and_strdup(&srv
, node
);
1951 #ifdef GPT_ROOT_NATIVE
1952 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
1954 if (root
&& nr
>= root_nr
)
1958 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1960 r
= free_and_strdup(&root
, node
);
1965 #ifdef GPT_ROOT_SECONDARY
1966 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
1968 if (secondary_root
&& nr
>= secondary_root_nr
)
1971 secondary_root_nr
= nr
;
1972 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1974 r
= free_and_strdup(&secondary_root
, node
);
1979 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
1982 multiple_generic
= true;
1984 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1986 r
= free_and_strdup(&generic
, node
);
1992 } else if (is_mbr
) {
1995 if (flags
!= 0x80) /* Bootable flag */
1998 type
= blkid_partition_get_type(pp
);
1999 if (type
!= 0x83) /* Linux partition */
2003 multiple_generic
= true;
2007 r
= free_and_strdup(&root
, node
);
2015 *root_device
= root
;
2018 *root_device_rw
= root_rw
;
2020 } else if (secondary_root
) {
2021 *root_device
= secondary_root
;
2022 secondary_root
= NULL
;
2024 *root_device_rw
= secondary_root_rw
;
2026 } else if (generic
) {
2028 /* There were no partitions with precise meanings
2029 * around, but we found generic partitions. In this
2030 * case, if there's only one, we can go ahead and boot
2031 * it, otherwise we bail out, because we really cannot
2032 * make any sense of it. */
2034 if (multiple_generic
) {
2035 log_error("Identified multiple bootable Linux partitions on\n"
2037 PARTITION_TABLE_BLURB
, arg_image
);
2041 *root_device
= generic
;
2044 *root_device_rw
= generic_rw
;
2047 log_error("Failed to identify root partition in disk image\n"
2049 PARTITION_TABLE_BLURB
, arg_image
);
2054 *home_device
= home
;
2057 *home_device_rw
= home_rw
;
2064 *srv_device_rw
= srv_rw
;
2069 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts block device 'what' at 'where' + 'directory' ('directory' may be
 * NULL to mount at 'where' itself). The file system type is detected with
 * libblkid. The mount is read-only unless 'rw' is true and arg_read_only
 * is unset. LUKS containers are refused.
 *
 * Returns 0 on success, a negative errno-style value on failure (logged). */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Same
                 * classification as used for the partition table probe in
                 * dissect_image(); a plain error (-1) is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Capture errno before logging may overwrite it. */
                r = errno != 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2138 static int mount_devices(
2140 const char *root_device
, bool root_device_rw
,
2141 const char *home_device
, bool home_device_rw
,
2142 const char *srv_device
, bool srv_device_rw
) {
2148 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2150 return log_error_errno(r
, "Failed to mount root directory: %m");
2154 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2156 return log_error_errno(r
, "Failed to mount home directory: %m");
2160 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2162 return log_error_errno(r
, "Failed to mount server data directory: %m");
2168 static void loop_remove(int nr
, int *image_fd
) {
2169 _cleanup_close_
int control
= -1;
2175 if (image_fd
&& *image_fd
>= 0) {
2176 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2178 log_debug_errno(errno
, "Failed to close loop image: %m");
2179 *image_fd
= safe_close(*image_fd
);
2182 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2184 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2188 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2190 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2195 * < 0 : wait_for_terminate() failed to get the state of the
2196 * container, the container was terminated by a signal, or
2197 * failed for an unknown reason. No change is made to the
2198 * container argument.
2199 * > 0 : The program executed in the container terminated with an
2200 * error. The exit code of the program executed in the
2201 * container is returned. The container argument has been set
2202 * to CONTAINER_TERMINATED.
2203 * 0 : The container is being rebooted, has been shut down or exited
2204 * successfully. The container argument has been set to either
2205 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2207 * That is, success is indicated by a return value of zero, and an
2208 * error is indicated by a non-zero value.
2210 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2214 r
= wait_for_terminate(pid
, &status
);
2216 return log_warning_errno(r
, "Failed to wait for container: %m");
2218 switch (status
.si_code
) {
2221 if (status
.si_status
== 0) {
2222 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2225 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2227 *container
= CONTAINER_TERMINATED
;
2228 return status
.si_status
;
2231 if (status
.si_status
== SIGINT
) {
2233 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2234 *container
= CONTAINER_TERMINATED
;
2237 } else if (status
.si_status
== SIGHUP
) {
2239 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2240 *container
= CONTAINER_REBOOTED
;
2244 /* CLD_KILLED fallthrough */
2247 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2251 log_error("Container %s failed due to unknown reason.", arg_machine
);
2258 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2261 pid
= PTR_TO_UINT32(userdata
);
2263 if (kill(pid
, arg_kill_signal
) >= 0) {
2264 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2265 sd_event_source_set_userdata(s
, NULL
);
2270 sd_event_exit(sd_event_source_get_event(s
), 0);
2274 static int determine_names(void) {
2277 if (arg_template
&& !arg_directory
&& arg_machine
) {
2279 /* If --template= was specified then we should not
2280 * search for a machine, but instead create a new one
2281 * in /var/lib/machine. */
2283 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2288 if (!arg_image
&& !arg_directory
) {
2290 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2292 r
= image_find(arg_machine
, &i
);
2294 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2296 log_error("No image for machine '%s': %m", arg_machine
);
2300 if (i
->type
== IMAGE_RAW
)
2301 r
= free_and_strdup(&arg_image
, i
->path
);
2303 r
= free_and_strdup(&arg_directory
, i
->path
);
2305 return log_error_errno(r
, "Invalid image directory: %m");
2308 arg_read_only
= arg_read_only
|| i
->read_only
;
2310 arg_directory
= get_current_dir_name();
2312 if (!arg_directory
&& !arg_machine
) {
2313 log_error("Failed to determine path, please use -D or -i.");
2319 if (arg_directory
&& path_equal(arg_directory
, "/"))
2320 arg_machine
= gethostname_malloc();
2322 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2327 hostname_cleanup(arg_machine
);
2328 if (!machine_name_is_valid(arg_machine
)) {
2329 log_error("Failed to determine machine name automatically, please use -M.");
2333 if (arg_ephemeral
) {
2336 /* Add a random suffix when this is an
2337 * ephemeral machine, so that we can run many
2338 * instances at once without manually having
2339 * to specify -M each time. */
2341 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
2352 static int determine_uid_shift(const char *directory
) {
2360 if (arg_uid_shift
== UID_INVALID
) {
2363 r
= stat(directory
, &st
);
2365 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2367 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2369 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2370 log_error("UID and GID base of %s don't match.", directory
);
2374 arg_uid_range
= UINT32_C(0x10000);
2377 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2378 log_error("UID base too high for UID range.");
2382 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
2386 static int inner_child(
2388 const char *directory
,
2394 _cleanup_free_
char *home
= NULL
;
2396 const char *envp
[] = {
2397 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2398 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2403 NULL
, /* container_uuid */
2404 NULL
, /* LISTEN_FDS */
2405 NULL
, /* LISTEN_PID */
2409 _cleanup_strv_free_
char **env_use
= NULL
;
2414 assert(kmsg_socket
>= 0);
2419 /* Tell the parent, that it now can write the UID map. */
2420 (void) barrier_place(barrier
); /* #1 */
2422 /* Wait until the parent wrote the UID map */
2423 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2424 log_error("Parent died too early");
2429 r
= mount_all(NULL
, arg_userns
, true, arg_uid_shift
, arg_private_network
, arg_uid_range
, arg_selinux_apifs_context
);
2433 r
= mount_sysfs(NULL
);
2437 /* Wait until we are cgroup-ified, so that we
2438 * can mount the right cgroup path writable */
2439 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2440 log_error("Parent died too early");
2444 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2448 r
= reset_uid_gid();
2450 return log_error_errno(r
, "Couldn't become new root: %m");
2452 r
= setup_boot_id(NULL
);
2456 r
= setup_kmsg(NULL
, kmsg_socket
);
2459 kmsg_socket
= safe_close(kmsg_socket
);
2464 return log_error_errno(errno
, "setsid() failed: %m");
2466 if (arg_private_network
)
2469 if (arg_expose_ports
) {
2470 r
= expose_port_send_rtnl(rtnl_socket
);
2473 rtnl_socket
= safe_close(rtnl_socket
);
2476 if (drop_capabilities() < 0)
2477 return log_error_errno(errno
, "drop_capabilities() failed: %m");
2481 if (arg_personality
!= PERSONALITY_INVALID
) {
2482 if (personality(arg_personality
) < 0)
2483 return log_error_errno(errno
, "personality() failed: %m");
2484 } else if (secondary
) {
2485 if (personality(PER_LINUX32
) < 0)
2486 return log_error_errno(errno
, "personality() failed: %m");
2490 if (arg_selinux_context
)
2491 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2492 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2495 r
= change_uid_gid(arg_user
, &home
);
2499 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2503 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2504 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2505 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2508 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2511 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2515 if (fdset_size(fds
) > 0) {
2516 r
= fdset_cloexec(fds
, false);
2518 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2520 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2521 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2525 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2529 /* Let the parent know that we are ready and
2530 * wait until the parent is ready with the
2532 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2533 log_error("Parent died too early");
2537 /* Now, explicitly close the log, so that we
2538 * then can close all remaining fds. Closing
2539 * the log explicitly first has the benefit
2540 * that the logging subsystem knows about it,
2541 * and is thus ready to be reopened should we
2542 * need it again. Note that the other fds
2543 * closed here are at least the locking and
2546 (void) fdset_close_others(fds
);
2552 /* Automatically search for the init system */
2554 m
= 1 + strv_length(arg_parameters
);
2555 a
= newa(char*, m
+ 1);
2556 if (strv_isempty(arg_parameters
))
2559 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
2561 a
[0] = (char*) "/usr/lib/systemd/systemd";
2562 execve(a
[0], a
, env_use
);
2564 a
[0] = (char*) "/lib/systemd/systemd";
2565 execve(a
[0], a
, env_use
);
2567 a
[0] = (char*) "/sbin/init";
2568 execve(a
[0], a
, env_use
);
2569 } else if (!strv_isempty(arg_parameters
))
2570 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2572 chdir(home
?: "/root");
2573 execle("/bin/bash", "-bash", NULL
, env_use
);
2574 execle("/bin/sh", "-sh", NULL
, env_use
);
2578 return log_error_errno(errno
, "execv() failed: %m");
2581 static int outer_child(
2583 const char *directory
,
2584 const char *console
,
2585 const char *root_device
, bool root_device_rw
,
2586 const char *home_device
, bool home_device_rw
,
2587 const char *srv_device
, bool srv_device_rw
,
2593 int uid_shift_socket
,
2603 assert(pid_socket
>= 0);
2604 assert(kmsg_socket
>= 0);
2608 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2609 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
2612 close_nointr(STDIN_FILENO
);
2613 close_nointr(STDOUT_FILENO
);
2614 close_nointr(STDERR_FILENO
);
2616 r
= open_terminal(console
, O_RDWR
);
2617 if (r
!= STDIN_FILENO
) {
2623 return log_error_errno(r
, "Failed to open console: %m");
2626 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2627 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2628 return log_error_errno(errno
, "Failed to duplicate console: %m");
2631 r
= reset_audit_loginuid();
2635 /* Mark everything as slave, so that we still
2636 * receive mounts from the real root, but don't
2637 * propagate mounts to the real root. */
2638 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2639 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2641 r
= mount_devices(directory
,
2642 root_device
, root_device_rw
,
2643 home_device
, home_device_rw
,
2644 srv_device
, srv_device_rw
);
2648 r
= determine_uid_shift(directory
);
2653 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2655 return log_error_errno(errno
, "Failed to send UID shift: %m");
2656 if (l
!= sizeof(arg_uid_shift
)) {
2657 log_error("Short write while sending UID shift.");
2662 /* Turn directory into bind mount */
2663 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2664 return log_error_errno(errno
, "Failed to make bind mount: %m");
2666 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2670 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2674 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2678 if (arg_read_only
) {
2679 r
= bind_remount_recursive(directory
, true);
2681 return log_error_errno(r
, "Failed to make tree read-only: %m");
2684 r
= mount_all(directory
, arg_userns
, false, arg_private_network
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2688 r
= copy_devnodes(directory
);
2692 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2694 r
= setup_pts(directory
);
2698 r
= setup_propagate(directory
);
2702 r
= setup_dev_console(directory
, console
);
2706 r
= setup_seccomp();
2710 r
= setup_timezone(directory
);
2714 r
= setup_resolv_conf(directory
);
2718 r
= setup_journal(directory
);
2722 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2726 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2730 r
= mount_move_root(directory
);
2732 return log_error_errno(r
, "Failed to move root directory: %m");
2734 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2735 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2736 (arg_private_network
? CLONE_NEWNET
: 0) |
2737 (arg_userns
? CLONE_NEWUSER
: 0),
2740 return log_error_errno(errno
, "Failed to fork inner child: %m");
2742 pid_socket
= safe_close(pid_socket
);
2743 uid_shift_socket
= safe_close(uid_shift_socket
);
2745 /* The inner child has all namespaces that are
2746 * requested, so that we all are owned by the user if
2747 * user namespaces are turned on. */
2749 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2751 _exit(EXIT_FAILURE
);
2753 _exit(EXIT_SUCCESS
);
2756 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2758 return log_error_errno(errno
, "Failed to send PID: %m");
2759 if (l
!= sizeof(pid
)) {
2760 log_error("Short write while sending PID.");
2764 pid_socket
= safe_close(pid_socket
);
2765 kmsg_socket
= safe_close(kmsg_socket
);
2766 rtnl_socket
= safe_close(rtnl_socket
);
2771 static int setup_uid_map(pid_t pid
) {
2772 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2777 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2778 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2779 r
= write_string_file(uid_map
, line
, 0);
2781 return log_error_errno(r
, "Failed to write UID map: %m");
2783 /* We always assign the same UID and GID ranges */
2784 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2785 r
= write_string_file(uid_map
, line
, 0);
2787 return log_error_errno(r
, "Failed to write GID map: %m");
/* Looks for the settings file "<arg_machine>.nspawn", loads it with
 * settings_load() and copies its values into the arg_ globals.
 *
 * Search order: /etc/systemd/nspawn, then /run/systemd/nspawn, then the
 * directory that contains arg_image or arg_directory. If arg_settings_trusted
 * is still negative (undecided), it becomes true for a file from /etc or /run
 * and false for a file found next to the image or directory.
 *
 * Each group of settings is copied only when its SETTING_ bit is NOT set in
 * arg_settings_mask and the file actually provides a value. Several groups
 * (machine ID, custom mounts, network, ports) only log a warning when the
 * file is not trusted. */
2792 static int load_settings(void) {
2793 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2794 _cleanup_fclose_
FILE *f
= NULL
;
2795 _cleanup_free_
char *p
= NULL
;
2799 /* If all settings are masked, there's no point in looking for
2800 * the settings file */
2801 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* The settings file is named after the machine. */
2804 fn
= strjoina(arg_machine
, ".nspawn");
2806 /* We first look in the admin's directories in /etc and /run */
2807 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2808 _cleanup_free_
char *j
= NULL
;
2810 j
= strjoin(i
, "/", fn
, NULL
);
2819 /* By default we trust configuration from /etc and /run */
2820 if (arg_settings_trusted
< 0)
2821 arg_settings_trusted
= true;
/* A missing file (ENOENT) is not an error; any other errno is reported. */
2826 if (errno
!= ENOENT
)
2827 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2831 /* After that, let's look for a file next to the
2832 * actual image we shall boot. */
2835 p
= file_in_same_dir(arg_image
, fn
);
2838 } else if (arg_directory
) {
2839 p
= file_in_same_dir(arg_directory
, fn
);
/* As above, only failures other than ENOENT are reported. */
2846 if (!f
&& errno
!= ENOENT
)
2847 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2849 /* By default we do not trust configuration from /var/lib/machines */
2850 if (arg_settings_trusted
< 0)
2851 arg_settings_trusted
= false;
2858 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
/* Parse the opened file into `settings`; p is passed along as its path. */
2860 r
= settings_load(f
, p
, &settings
);
2864 /* Copy over bits from the settings, unless they have been
2865 * explicitly masked by command line switches. */
2867 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
2868 settings
->boot
>= 0) {
2869 arg_boot
= settings
->boot
;
/* Ownership of the parameter list moves to arg_parameters. The source
 * pointer is cleared, presumably so that the settings_freep cleanup does not
 * free the list a second time. The same pattern repeats for the other
 * pointer-valued settings below. */
2871 strv_free(arg_parameters
);
2872 arg_parameters
= settings
->parameters
;
2873 settings
->parameters
= NULL
;
2876 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2877 settings
->environment
) {
2878 strv_free(arg_setenv
);
2879 arg_setenv
= settings
->environment
;
2880 settings
->environment
= NULL
;
2883 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2886 arg_user
= settings
->user
;
2887 settings
->user
= NULL
;
2890 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
/* `plus` collects the capabilities the file asks to add. */
2893 plus
= settings
->capability
;
/* A private network adds CAP_NET_ADMIN to that set. */
2894 if (settings_private_network(settings
))
2895 plus
|= (1ULL << CAP_NET_ADMIN
);
/* The warning is limited to an explicit Capability= value, so the implicit
 * CAP_NET_ADMIN bit alone does not trigger it. */
2897 if (!arg_settings_trusted
&& plus
!= 0) {
2898 if (settings
->capability
!= 0)
2899 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
/* NOTE(review): dropping capabilities only narrows arg_retain and appears to
 * apply regardless of trust; confirm against the full source. */
2903 arg_retain
&= ~settings
->drop_capability
;
2906 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2907 settings
->kill_signal
> 0)
2908 arg_kill_signal
= settings
->kill_signal
;
2910 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2911 settings
->personality
!= PERSONALITY_INVALID
)
2912 arg_personality
= settings
->personality
;
2914 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2915 !sd_id128_is_null(settings
->machine_id
)) {
2917 if (!arg_settings_trusted
)
2918 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
2920 arg_uuid
= settings
->machine_id
;
2923 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
2924 settings
->read_only
>= 0)
2925 arg_read_only
= settings
->read_only
;
2927 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
2928 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
2929 arg_volatile_mode
= settings
->volatile_mode
;
2931 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
2932 settings
->n_custom_mounts
> 0) {
2934 if (!arg_settings_trusted
)
2935 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
/* Replace the current mount list with the one from the file. Array and
 * count are moved together and then reset on the source. */
2937 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
2938 arg_custom_mounts
= settings
->custom_mounts
;
2939 arg_n_custom_mounts
= settings
->n_custom_mounts
;
2941 settings
->custom_mounts
= NULL
;
2942 settings
->n_custom_mounts
= 0;
/* The network group is applied as soon as any one network-related field is
 * set in the file. */
2946 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
2947 (settings
->private_network
>= 0 ||
2948 settings
->network_veth
>= 0 ||
2949 settings
->network_bridge
||
2950 settings
->network_interfaces
||
2951 settings
->network_macvlan
||
2952 settings
->network_ipvlan
)) {
2954 if (!arg_settings_trusted
)
2955 log_warning("Ignoring network settings, file %s is not trusted.", p
);
/* NOTE(review): both flags are set from settings_private_network(); confirm
 * that arg_network_veth is not meant to follow settings->network_veth
 * specifically. */
2957 arg_network_veth
= settings_private_network(settings
);
2958 arg_private_network
= settings_private_network(settings
);
2960 strv_free(arg_network_interfaces
);
2961 arg_network_interfaces
= settings
->network_interfaces
;
2962 settings
->network_interfaces
= NULL
;
2964 strv_free(arg_network_macvlan
);
2965 arg_network_macvlan
= settings
->network_macvlan
;
2966 settings
->network_macvlan
= NULL
;
2968 strv_free(arg_network_ipvlan
);
2969 arg_network_ipvlan
= settings
->network_ipvlan
;
2970 settings
->network_ipvlan
= NULL
;
2972 free(arg_network_bridge
);
2973 arg_network_bridge
= settings
->network_bridge
;
2974 settings
->network_bridge
= NULL
;
2978 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
2979 settings
->expose_ports
) {
2981 if (!arg_settings_trusted
)
2982 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
2984 expose_port_free_all(arg_expose_ports
);
2985 arg_expose_ports
= settings
->expose_ports
;
2986 settings
->expose_ports
= NULL
;
/* Program entry point.
 *
 * Parses the command line, requires an effective UID of 0, then calls
 * determine_names(), load_settings() and verify_arguments(). After that it
 * prepares the container root: either a directory (optionally an ephemeral
 * snapshot of it, or a tree populated from arg_template) or a disk image, for
 * which a temporary directory becomes arg_directory.
 *
 * It then allocates a pseudo TTY, clones an outer child in a new mount
 * namespace, receives the inner child's PID and the UID shift over socket
 * pairs, and performs UID map, network, registration and cgroup setup in
 * step with the child through a barrier. An event loop forwards the PTY until
 * a termination signal arrives.
 *
 * On the way out it cleans up the loop device, an ephemeral subvolume, the
 * propagation directory and the arg_ allocations. Returns EXIT_FAILURE when
 * r is negative at the end, otherwise ret. */
2993 int main(int argc
, char *argv
[]) {
2995 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
2996 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
2997 _cleanup_close_
int master
= -1, image_fd
= -1;
2998 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
2999 int r
, n_fd_passed
, loop_nr
= -1;
3000 char veth_name
[IFNAMSIZ
];
3001 bool secondary
= false, remove_subvol
= false;
3004 int ret
= EXIT_SUCCESS
;
3005 union in_addr_union exposed
= {};
3006 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3009 log_parse_environment();
3012 r
= parse_argv(argc
, argv
);
/* Everything below needs an effective UID of 0. */
3016 if (geteuid() != 0) {
3017 log_error("Need to be root.");
/* load_settings() reads arg_machine and arg_image/arg_directory, which are
 * presumably filled in by determine_names(); verify_arguments() then runs on
 * the combined result. */
3021 r
= determine_names();
3025 r
= load_settings();
3029 r
= verify_arguments();
/* File descriptors passed in from the caller's environment are collected
 * into `fds`; the parent drops the set again after cloning (fdset_free()
 * further down). */
3033 n_fd_passed
= sd_listen_fds(false);
3034 if (n_fd_passed
> 0) {
3035 r
= fdset_new_listen_fds(&fds
, false);
3037 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory mode: the container root is a directory tree. */
3042 if (arg_directory
) {
3045 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3046 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3051 if (arg_ephemeral
) {
3052 _cleanup_free_
char *np
= NULL
;
3054 /* If the specified path is a mount point we
3055 * generate the new snapshot immediately
3056 * inside it under a random name. However if
3057 * the specified path is not a mount point we
3058 * create the new snapshot in the parent
3059 * directory, just next to it. */
3060 r
= path_is_mount_point(arg_directory
, 0);
3062 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3066 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3068 r
= tempfn_random(arg_directory
, "machine.", &np
);
3070 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* Lock the snapshot path before creating it: shared when read-only,
 * exclusive otherwise, and never blocking (LOCK_NB). */
3074 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3076 log_error_errno(r
, "Failed to lock %s: %m", np
);
3080 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3082 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3086 free(arg_directory
);
/* remove_subvol makes the cleanup code at the end of main() remove the
 * snapshot again. */
3090 remove_subvol
= true;
/* Non-ephemeral directory: the lock is taken on the tree itself, with the
 * same shared/exclusive and non-blocking rules as above. */
3093 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3095 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3099 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate arg_directory from arg_template with the same snapshot flags as
 * the ephemeral case. */
3104 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3107 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3109 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3113 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity checks on the tree before using it as a container root. */
3119 if (path_is_os_tree(arg_directory
) <= 0) {
3120 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3127 p
= strjoina(arg_directory
, "/usr/");
3128 if (laccess(p
, F_OK
) < 0) {
3129 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
/* Image mode: a freshly created temporary directory becomes arg_directory,
 * and the image is set up and dissected into root, home and srv devices. */
3136 char template[] = "/tmp/nspawn-root-XXXXXX";
3139 assert(!arg_template
);
3141 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3143 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3147 r
= log_error_errno(r
, "Failed to create image lock: %m");
3151 if (!mkdtemp(template)) {
3152 log_error_errno(errno
, "Failed to create temporary directory: %m");
3157 arg_directory
= strdup(template);
3158 if (!arg_directory
) {
3163 image_fd
= setup_image(&device_path
, &loop_nr
);
3169 r
= dissect_image(image_fd
,
3170 &root_device
, &root_device_rw
,
3171 &home_device
, &home_device_rw
,
3172 &srv_device
, &srv_device_rw
,
3178 r
= custom_mounts_prepare();
/* Both stdin and stdout must be terminals for this condition to hold. */
3183 isatty(STDIN_FILENO
) > 0 &&
3184 isatty(STDOUT_FILENO
) > 0;
/* Allocate the pseudo TTY master that pty_forward_new() forwards further
 * down; `console` receives the name of its slave side. */
3186 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3188 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3192 r
= ptsname_malloc(master
, &console
);
3194 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3198 if (unlockpt(master
) < 0) {
3199 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3204 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3205 arg_machine
, arg_image
?: arg_directory
);
/* Block SIGCHLD, SIGWINCH, SIGTERM and SIGINT. SIGINT, SIGTERM and SIGCHLD
 * are later picked up through sd_event_add_signal(). */
3207 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
/* mask_chld contains only SIGCHLD; it is used to unblock and re-block that
 * one signal around the child setup below. */
3209 assert_se(sigemptyset(&mask_chld
) == 0);
3210 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
/* Become a child subreaper, presumably so that the inner child is reparented
 * to this process once the outer child exits and can be waited for here. */
3212 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3213 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* State for one container run. The "loop again" comment further down
 * indicates that this part is repeated when the container reboots. */
3218 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
3219 uid_shift_socket_pair
[2] = { -1, -1 };
3220 ContainerStatus container_status
;
3221 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3222 static const struct sigaction sa
= {
3223 .sa_handler
= nop_signal_handler
,
3224 .sa_flags
= SA_NOCLDSTOP
,
3228 _cleanup_event_unref_ sd_event
*event
= NULL
;
3229 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3230 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3233 r
= barrier_create(&barrier
);
3235 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
/* One SOCK_SEQPACKET pair per channel between parent and children: kmsg,
 * rtnl, inner child PID and UID shift. Index 0 stays with the parent, index
 * 1 goes to the child (see the close calls on each side below). */
3239 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3240 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3244 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3245 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3249 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3250 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3255 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3256 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3260 /* Child can be killed before execv(), so handle SIGCHLD
3261 * in order to interrupt parent's blocking calls and
3262 * give it a chance to call wait() and terminate. */
3263 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3265 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3269 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3271 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* First clone: only a new mount namespace is requested here. */
3275 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3277 if (errno
== EINVAL
)
3278 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3280 r
= log_error_errno(errno
, "clone() failed: %m");
3286 /* The outer child only has a file system namespace. */
3287 barrier_set_role(&barrier
, BARRIER_CHILD
);
/* Child side: close the PTY master and the parent's ends ([0]) of all
 * socket pairs. */
3289 master
= safe_close(master
);
3291 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3292 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3293 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3294 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
/* Undo the signal setup inherited from the parent. */
3296 (void) reset_all_signal_handlers();
3297 (void) reset_signal_mask();
3299 r
= outer_child(&barrier
,
3302 root_device
, root_device_rw
,
3303 home_device
, home_device_rw
,
3304 srv_device
, srv_device_rw
,
3308 kmsg_socket_pair
[1],
3309 rtnl_socket_pair
[1],
3310 uid_shift_socket_pair
[1],
3313 _exit(EXIT_FAILURE
);
3315 _exit(EXIT_SUCCESS
);
3318 barrier_set_role(&barrier
, BARRIER_PARENT
);
/* Parent side: drop the passed fd set and the child's ends ([1]) of all
 * socket pairs. */
3320 fds
= fdset_free(fds
);
3322 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3323 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3324 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3325 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3327 /* Wait for the outer child. */
3328 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3337 /* And now retrieve the PID of the inner child. */
3338 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3340 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3343 if (l
!= sizeof(pid
)) {
3344 log_error("Short read while reading inner child PID.");
/* From here on `pid` refers to the inner child. */
3349 log_debug("Init process invoked as PID " PID_FMT
, pid
);
/* First synchronization point with the child; a false result is treated as
 * the child having died. */
3352 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3353 log_error("Child died too early.");
/* The UID shift arrives over its own socket pair and is stored in
 * arg_uid_shift, which setup_uid_map() and chown_cgroup() use below. */
3358 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3360 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3363 if (l
!= sizeof(arg_uid_shift
)) {
3364 log_error("Short read while reading UID shift.");
3369 r
= setup_uid_map(pid
);
3373 (void) barrier_place(&barrier
); /* #2 */
/* Network setup for a private network; every helper is addressed through
 * the inner child's PID. */
3376 if (arg_private_network
) {
3378 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3382 if (arg_network_veth
) {
3383 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3389 if (arg_network_bridge
) {
3390 r
= setup_bridge(veth_name
, arg_network_bridge
);
3398 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3402 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
3408 r
= register_machine(
3415 arg_custom_mounts
, arg_n_custom_mounts
,
3423 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3427 if (arg_keep_unit
) {
3428 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3433 r
= chown_cgroup(pid
, arg_uid_shift
);
3437 /* Notify the child that the parent is ready with all
3438 * its setup (including cgroup-ification), and that
3439 * the child can now hand over control to the code to
3440 * run inside the container. */
3441 (void) barrier_place(&barrier
); /* #3 */
3443 /* Block SIGCHLD here, before notifying child.
3444 * process_pty() will handle it with the other signals. */
3445 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3447 /* Reset signal to default */
3448 r
= default_signals(SIGCHLD
, -1);
3450 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3454 /* Let the child know that we are ready and wait until the child is completely ready now. */
3455 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3456 log_error("Child died too early.");
3463 "STATUS=Container running.\n"
3464 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
/* Event loop: it carries the signal sources registered below, the optional
 * port-exposure watch, and the PTY forwarder. */
3466 r
= sd_event_new(&event
);
3468 log_error_errno(r
, "Failed to get default event source: %m");
3472 if (arg_kill_signal
> 0) {
3473 /* Try to kill the init system on SIGINT or SIGTERM */
3474 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3475 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3477 /* Immediately exit */
3478 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3479 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3482 /* simply exit on sigchld */
3483 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3485 if (arg_expose_ports
) {
3486 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3490 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3493 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* The forwarder is made read-only when `interactive` is false. */
3495 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
3497 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3501 r
= sd_event_loop(event
);
3503 log_error_errno(r
, "Failed to run event loop: %m");
/* Remember the last forwarded character before freeing the forwarder; it is
 * compared against a newline just below. */
3507 pty_forward_get_last_char(forward
, &last_char
);
3509 forward
= pty_forward_free(forward
);
3511 if (!arg_quiet
&& last_char
!= '\n')
3514 /* Kill if it is not dead yet anyway */
3515 if (arg_register
&& !arg_keep_unit
)
3516 terminate_machine(pid
);
3518 /* Normally redundant, but better safe than sorry */
/* Collect the inner child's final state into container_status. */
3521 r
= wait_for_container(pid
, &container_status
);
3525 /* We failed to wait for the container, or the
3526 * container exited abnormally */
3528 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3529 /* The container exited with a non-zero
3530 * status, or with zero status and no reboot
3536 /* CONTAINER_REBOOTED, loop again */
3538 if (arg_keep_unit
) {
3539 /* Special handling if we are running as a
3540 * service: instead of simply restarting the
3541 * machine we want to restart the entire
3542 * service, so let's inform systemd about this
3543 * with the special exit code 133. The service
3544 * file uses RestartForceExitStatus=133 so
3545 * that this results in a full nspawn
3546 * restart. This is necessary since we might
3547 * have cgroup parameters set we want to have
3554 expose_port_flush(arg_expose_ports
, &exposed
);
3560 "STATUS=Terminating...");
3565 /* Try to flush whatever is still queued in the pty */
3567 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
/* Final cleanup of what was set up above: loop device, ephemeral subvolume,
 * propagation directory, exposed ports and the arg_ allocations. */
3569 loop_remove(loop_nr
, &image_fd
);
3571 if (remove_subvol
&& arg_directory
) {
3574 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
3576 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
/* Remove this machine's propagation directory below /run. */
3582 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3583 (void) rm_rf(p
, REMOVE_ROOT
);
3586 expose_port_flush(arg_expose_ports
, &exposed
);
3588 free(arg_directory
);
3593 strv_free(arg_setenv
);
3594 free(arg_network_bridge
);
3595 strv_free(arg_network_interfaces
);
3596 strv_free(arg_network_macvlan
);
3597 strv_free(arg_network_ipvlan
);
3598 strv_free(arg_parameters
);
3599 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3600 expose_port_free_all(arg_expose_ports
);
/* A negative r always means failure; otherwise the status recorded in ret is
 * returned. */
3602 return r
< 0 ? EXIT_FAILURE
: ret
;