1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <blkid/blkid.h>
27 #include <linux/loop.h>
33 #include <selinux/selinux.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
46 #include "sd-daemon.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
54 #include "capability.h"
55 #include "cgroup-util.h"
57 #include "dev-setup.h"
59 #include "event-util.h"
63 #include "formats-util.h"
65 #include "hostname-util.h"
67 #include "loopback-setup.h"
68 #include "machine-image.h"
72 #include "netlink-util.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-expose-ports.h"
75 #include "nspawn-mount.h"
76 #include "nspawn-network.h"
77 #include "nspawn-register.h"
78 #include "nspawn-settings.h"
79 #include "nspawn-setuid.h"
80 #include "path-util.h"
81 #include "process-util.h"
83 #include "random-util.h"
86 #include "seccomp-util.h"
88 #include "signal-util.h"
89 #include "socket-util.h"
90 #include "string-util.h"
92 #include "terminal-util.h"
93 #include "udev-util.h"
94 #include "user-util.h"
97 typedef enum ContainerStatus
{
102 typedef enum LinkJournal
{
109 static char *arg_directory
= NULL
;
110 static char *arg_template
= NULL
;
111 static char *arg_user
= NULL
;
112 static sd_id128_t arg_uuid
= {};
113 static char *arg_machine
= NULL
;
114 static const char *arg_selinux_context
= NULL
;
115 static const char *arg_selinux_apifs_context
= NULL
;
116 static const char *arg_slice
= NULL
;
117 static bool arg_private_network
= false;
118 static bool arg_read_only
= false;
119 static bool arg_boot
= false;
120 static bool arg_ephemeral
= false;
121 static LinkJournal arg_link_journal
= LINK_AUTO
;
122 static bool arg_link_journal_try
= false;
123 static uint64_t arg_retain
=
124 (1ULL << CAP_CHOWN
) |
125 (1ULL << CAP_DAC_OVERRIDE
) |
126 (1ULL << CAP_DAC_READ_SEARCH
) |
127 (1ULL << CAP_FOWNER
) |
128 (1ULL << CAP_FSETID
) |
129 (1ULL << CAP_IPC_OWNER
) |
131 (1ULL << CAP_LEASE
) |
132 (1ULL << CAP_LINUX_IMMUTABLE
) |
133 (1ULL << CAP_NET_BIND_SERVICE
) |
134 (1ULL << CAP_NET_BROADCAST
) |
135 (1ULL << CAP_NET_RAW
) |
136 (1ULL << CAP_SETGID
) |
137 (1ULL << CAP_SETFCAP
) |
138 (1ULL << CAP_SETPCAP
) |
139 (1ULL << CAP_SETUID
) |
140 (1ULL << CAP_SYS_ADMIN
) |
141 (1ULL << CAP_SYS_CHROOT
) |
142 (1ULL << CAP_SYS_NICE
) |
143 (1ULL << CAP_SYS_PTRACE
) |
144 (1ULL << CAP_SYS_TTY_CONFIG
) |
145 (1ULL << CAP_SYS_RESOURCE
) |
146 (1ULL << CAP_SYS_BOOT
) |
147 (1ULL << CAP_AUDIT_WRITE
) |
148 (1ULL << CAP_AUDIT_CONTROL
) |
150 static CustomMount
*arg_custom_mounts
= NULL
;
151 static unsigned arg_n_custom_mounts
= 0;
152 static char **arg_setenv
= NULL
;
153 static bool arg_quiet
= false;
154 static bool arg_share_system
= false;
155 static bool arg_register
= true;
156 static bool arg_keep_unit
= false;
157 static char **arg_network_interfaces
= NULL
;
158 static char **arg_network_macvlan
= NULL
;
159 static char **arg_network_ipvlan
= NULL
;
160 static bool arg_network_veth
= false;
161 static char *arg_network_bridge
= NULL
;
162 static unsigned long arg_personality
= PERSONALITY_INVALID
;
163 static char *arg_image
= NULL
;
164 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
165 static ExposePort
*arg_expose_ports
= NULL
;
166 static char **arg_property
= NULL
;
167 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
168 static bool arg_userns
= false;
169 static int arg_kill_signal
= 0;
170 static bool arg_unified_cgroup_hierarchy
= false;
171 static SettingsMask arg_settings_mask
= 0;
172 static int arg_settings_trusted
= -1;
173 static char **arg_parameters
= NULL
;
175 static void help(void) {
176 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
177 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
178 " -h --help Show this help\n"
179 " --version Print version string\n"
180 " -q --quiet Do not show status information\n"
181 " -D --directory=PATH Root directory for the container\n"
182 " --template=PATH Initialize root directory from template directory,\n"
184 " -x --ephemeral Run container with snapshot of root directory, and\n"
185 " remove it after exit\n"
186 " -i --image=PATH File system device or disk image for the container\n"
187 " -b --boot Boot up full system (i.e. invoke init)\n"
188 " -u --user=USER Run the command under specified user or uid\n"
189 " -M --machine=NAME Set the machine name for the container\n"
190 " --uuid=UUID Set a specific machine UUID for the container\n"
191 " -S --slice=SLICE Place the container in the specified slice\n"
192 " --property=NAME=VALUE Set scope unit property\n"
193 " --private-users[=UIDBASE[:NUIDS]]\n"
194 " Run within user namespace\n"
195 " --private-network Disable network in container\n"
196 " --network-interface=INTERFACE\n"
197 " Assign an existing network interface to the\n"
199 " --network-macvlan=INTERFACE\n"
200 " Create a macvlan network interface based on an\n"
201 " existing network interface to the container\n"
202 " --network-ipvlan=INTERFACE\n"
203 " Create a ipvlan network interface based on an\n"
204 " existing network interface to the container\n"
205 " -n --network-veth Add a virtual ethernet connection between host\n"
207 " --network-bridge=INTERFACE\n"
208 " Add a virtual ethernet connection between host\n"
209 " and container and add it to an existing bridge on\n"
211 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
212 " Expose a container IP port on the host\n"
213 " -Z --selinux-context=SECLABEL\n"
214 " Set the SELinux security context to be used by\n"
215 " processes in the container\n"
216 " -L --selinux-apifs-context=SECLABEL\n"
217 " Set the SELinux security context to be used by\n"
218 " API/tmpfs file systems in the container\n"
219 " --capability=CAP In addition to the default, retain specified\n"
221 " --drop-capability=CAP Drop the specified capability from the default set\n"
222 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
223 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
224 " try-guest, try-host\n"
225 " -j Equivalent to --link-journal=try-guest\n"
226 " --read-only Mount the root directory read-only\n"
227 " --bind=PATH[:PATH[:OPTIONS]]\n"
228 " Bind mount a file or directory from the host into\n"
230 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
231 " Similar, but creates a read-only bind mount\n"
232 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
233 " --overlay=PATH[:PATH...]:PATH\n"
234 " Create an overlay mount from the host to \n"
236 " --overlay-ro=PATH[:PATH...]:PATH\n"
237 " Similar, but creates a read-only overlay mount\n"
238 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
239 " --share-system Share system namespaces with host\n"
240 " --register=BOOLEAN Register container as machine\n"
241 " --keep-unit Do not register a scope for the machine, reuse\n"
242 " the service unit nspawn is running in\n"
243 " --volatile[=MODE] Run the system in volatile mode\n"
244 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
245 , program_invocation_short_name
);
249 static int custom_mounts_prepare(void) {
253 /* Ensure the mounts are applied prefix first. */
254 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
256 /* Allocate working directories for the overlay file systems that need it */
257 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
258 CustomMount
*m
= &arg_custom_mounts
[i
];
260 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
261 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
265 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
274 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
276 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
282 static int detect_unified_cgroup_hierarchy(void) {
286 /* Allow the user to control whether the unified hierarchy is used */
287 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
289 r
= parse_boolean(e
);
291 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
293 arg_unified_cgroup_hierarchy
= r
;
297 /* Otherwise inherit the default from the host system */
300 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
302 arg_unified_cgroup_hierarchy
= r
;
306 static int parse_argv(int argc
, char *argv
[]) {
325 ARG_NETWORK_INTERFACE
,
338 static const struct option options
[] = {
339 { "help", no_argument
, NULL
, 'h' },
340 { "version", no_argument
, NULL
, ARG_VERSION
},
341 { "directory", required_argument
, NULL
, 'D' },
342 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
343 { "ephemeral", no_argument
, NULL
, 'x' },
344 { "user", required_argument
, NULL
, 'u' },
345 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
346 { "boot", no_argument
, NULL
, 'b' },
347 { "uuid", required_argument
, NULL
, ARG_UUID
},
348 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
349 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
350 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
351 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
352 { "bind", required_argument
, NULL
, ARG_BIND
},
353 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
354 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
355 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
356 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
357 { "machine", required_argument
, NULL
, 'M' },
358 { "slice", required_argument
, NULL
, 'S' },
359 { "setenv", required_argument
, NULL
, ARG_SETENV
},
360 { "selinux-context", required_argument
, NULL
, 'Z' },
361 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
362 { "quiet", no_argument
, NULL
, 'q' },
363 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
364 { "register", required_argument
, NULL
, ARG_REGISTER
},
365 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
366 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
367 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
368 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
369 { "network-veth", no_argument
, NULL
, 'n' },
370 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
371 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
372 { "image", required_argument
, NULL
, 'i' },
373 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
374 { "port", required_argument
, NULL
, 'p' },
375 { "property", required_argument
, NULL
, ARG_PROPERTY
},
376 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
377 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
378 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
383 uint64_t plus
= 0, minus
= 0;
384 bool mask_all_settings
= false, mask_no_settings
= false;
389 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
401 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
407 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
413 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
419 arg_ephemeral
= true;
423 r
= free_and_strdup(&arg_user
, optarg
);
427 arg_settings_mask
|= SETTING_USER
;
430 case ARG_NETWORK_BRIDGE
:
431 r
= free_and_strdup(&arg_network_bridge
, optarg
);
438 arg_network_veth
= true;
439 arg_private_network
= true;
440 arg_settings_mask
|= SETTING_NETWORK
;
443 case ARG_NETWORK_INTERFACE
:
444 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
447 arg_private_network
= true;
448 arg_settings_mask
|= SETTING_NETWORK
;
451 case ARG_NETWORK_MACVLAN
:
452 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
455 arg_private_network
= true;
456 arg_settings_mask
|= SETTING_NETWORK
;
459 case ARG_NETWORK_IPVLAN
:
460 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
465 case ARG_PRIVATE_NETWORK
:
466 arg_private_network
= true;
467 arg_settings_mask
|= SETTING_NETWORK
;
472 arg_settings_mask
|= SETTING_BOOT
;
476 r
= sd_id128_from_string(optarg
, &arg_uuid
);
478 log_error("Invalid UUID: %s", optarg
);
482 arg_settings_mask
|= SETTING_MACHINE_ID
;
491 arg_machine
= mfree(arg_machine
);
493 if (!machine_name_is_valid(optarg
)) {
494 log_error("Invalid machine name: %s", optarg
);
498 r
= free_and_strdup(&arg_machine
, optarg
);
506 arg_selinux_context
= optarg
;
510 arg_selinux_apifs_context
= optarg
;
514 arg_read_only
= true;
515 arg_settings_mask
|= SETTING_READ_ONLY
;
519 case ARG_DROP_CAPABILITY
: {
520 const char *state
, *word
;
523 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
524 _cleanup_free_
char *t
;
526 t
= strndup(word
, length
);
530 if (streq(t
, "all")) {
531 if (c
== ARG_CAPABILITY
)
532 plus
= (uint64_t) -1;
534 minus
= (uint64_t) -1;
538 cap
= capability_from_name(t
);
540 log_error("Failed to parse capability %s.", t
);
544 if (c
== ARG_CAPABILITY
)
545 plus
|= 1ULL << (uint64_t) cap
;
547 minus
|= 1ULL << (uint64_t) cap
;
551 arg_settings_mask
|= SETTING_CAPABILITY
;
556 arg_link_journal
= LINK_GUEST
;
557 arg_link_journal_try
= true;
560 case ARG_LINK_JOURNAL
:
561 if (streq(optarg
, "auto")) {
562 arg_link_journal
= LINK_AUTO
;
563 arg_link_journal_try
= false;
564 } else if (streq(optarg
, "no")) {
565 arg_link_journal
= LINK_NO
;
566 arg_link_journal_try
= false;
567 } else if (streq(optarg
, "guest")) {
568 arg_link_journal
= LINK_GUEST
;
569 arg_link_journal_try
= false;
570 } else if (streq(optarg
, "host")) {
571 arg_link_journal
= LINK_HOST
;
572 arg_link_journal_try
= false;
573 } else if (streq(optarg
, "try-guest")) {
574 arg_link_journal
= LINK_GUEST
;
575 arg_link_journal_try
= true;
576 } else if (streq(optarg
, "try-host")) {
577 arg_link_journal
= LINK_HOST
;
578 arg_link_journal_try
= true;
580 log_error("Failed to parse link journal mode %s", optarg
);
588 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
590 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
592 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
596 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
598 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
600 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
604 case ARG_OVERLAY_RO
: {
605 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
606 _cleanup_strv_free_
char **lower
= NULL
;
611 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
615 log_error("Invalid overlay specification: %s", optarg
);
619 STRV_FOREACH(i
, lower
) {
620 if (!path_is_absolute(*i
)) {
621 log_error("Overlay path %s is not absolute.", *i
);
629 log_error("--overlay= needs at least two colon-separated directories specified.");
634 /* If two parameters are specified,
635 * the first one is the lower, the
636 * second one the upper directory. And
637 * we'll also define the destination
638 * mount point the same as the upper. */
642 destination
= strdup(upper
);
647 upper
= lower
[n
- 2];
648 destination
= lower
[n
- 1];
652 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
656 m
->destination
= destination
;
659 m
->read_only
= c
== ARG_OVERLAY_RO
;
661 upper
= destination
= NULL
;
664 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
671 if (!env_assignment_is_valid(optarg
)) {
672 log_error("Environment variable assignment '%s' is not valid.", optarg
);
676 n
= strv_env_set(arg_setenv
, optarg
);
680 strv_free(arg_setenv
);
683 arg_settings_mask
|= SETTING_ENVIRONMENT
;
691 case ARG_SHARE_SYSTEM
:
692 arg_share_system
= true;
696 r
= parse_boolean(optarg
);
698 log_error("Failed to parse --register= argument: %s", optarg
);
706 arg_keep_unit
= true;
709 case ARG_PERSONALITY
:
711 arg_personality
= personality_from_string(optarg
);
712 if (arg_personality
== PERSONALITY_INVALID
) {
713 log_error("Unknown or unsupported personality '%s'.", optarg
);
717 arg_settings_mask
|= SETTING_PERSONALITY
;
723 arg_volatile_mode
= VOLATILE_YES
;
727 m
= volatile_mode_from_string(optarg
);
729 log_error("Failed to parse --volatile= argument: %s", optarg
);
732 arg_volatile_mode
= m
;
735 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
739 r
= expose_port_parse(&arg_expose_ports
, optarg
);
741 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
743 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
745 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
749 if (strv_extend(&arg_property
, optarg
) < 0)
754 case ARG_PRIVATE_USERS
:
756 _cleanup_free_
char *buffer
= NULL
;
757 const char *range
, *shift
;
759 range
= strchr(optarg
, ':');
761 buffer
= strndup(optarg
, range
- optarg
);
767 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
768 log_error("Failed to parse UID range: %s", range
);
774 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
775 log_error("Failed to parse UID: %s", optarg
);
783 case ARG_KILL_SIGNAL
:
784 arg_kill_signal
= signal_from_string_try_harder(optarg
);
785 if (arg_kill_signal
< 0) {
786 log_error("Cannot parse signal: %s", optarg
);
790 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
795 /* no → do not read files
796 * yes → read files, do not override cmdline, trust only subset
797 * override → read files, override cmdline, trust only subset
798 * trusted → read files, do not override cmdline, trust all
801 r
= parse_boolean(optarg
);
803 if (streq(optarg
, "trusted")) {
804 mask_all_settings
= false;
805 mask_no_settings
= false;
806 arg_settings_trusted
= true;
808 } else if (streq(optarg
, "override")) {
809 mask_all_settings
= false;
810 mask_no_settings
= true;
811 arg_settings_trusted
= -1;
813 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
816 mask_all_settings
= false;
817 mask_no_settings
= false;
818 arg_settings_trusted
= -1;
821 mask_all_settings
= true;
822 mask_no_settings
= false;
823 arg_settings_trusted
= false;
832 assert_not_reached("Unhandled option");
835 if (arg_share_system
)
836 arg_register
= false;
838 if (arg_boot
&& arg_share_system
) {
839 log_error("--boot and --share-system may not be combined.");
843 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
844 log_error("--keep-unit may not be used when invoked from a user session.");
848 if (arg_directory
&& arg_image
) {
849 log_error("--directory= and --image= may not be combined.");
853 if (arg_template
&& arg_image
) {
854 log_error("--template= and --image= may not be combined.");
858 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
859 log_error("--template= needs --directory= or --machine=.");
863 if (arg_ephemeral
&& arg_template
) {
864 log_error("--ephemeral and --template= may not be combined.");
868 if (arg_ephemeral
&& arg_image
) {
869 log_error("--ephemeral and --image= may not be combined.");
873 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
874 log_error("--ephemeral and --link-journal= may not be combined.");
878 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
879 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
882 arg_parameters
= strv_copy(argv
+ optind
);
886 arg_settings_mask
|= SETTING_BOOT
;
889 /* Load all settings from .nspawn files */
890 if (mask_no_settings
)
891 arg_settings_mask
= 0;
893 /* Don't load any settings from .nspawn files */
894 if (mask_all_settings
)
895 arg_settings_mask
= _SETTINGS_MASK_ALL
;
897 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
899 r
= detect_unified_cgroup_hierarchy();
906 static int verify_arguments(void) {
908 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
909 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
913 if (arg_expose_ports
&& !arg_private_network
) {
914 log_error("Cannot use --port= without private networking.");
918 if (arg_boot
&& arg_kill_signal
<= 0)
919 arg_kill_signal
= SIGRTMIN
+3;
924 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
930 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
933 if (uid
!= UID_INVALID
) {
934 uid
+= arg_uid_shift
;
936 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
940 if (gid
!= GID_INVALID
) {
941 gid
+= (gid_t
) arg_uid_shift
;
943 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
947 if (lchown(p
, uid
, gid
) < 0)
/* Creates directory path below root with the given mode and hands it to
 * userns_lchown() with the given uid/gid.
 *
 * An already existing directory is left untouched (no chown) and reported as
 * success. Returns 0 on success, -errno if mkdir() fails for another reason,
 * or the result of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
966 static int setup_timezone(const char *dest
) {
967 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
968 const char *where
, *check
, *what
;
974 /* Fix the timezone, if possible */
975 r
= readlink_malloc("/etc/localtime", &p
);
977 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
981 z
= path_startswith(p
, "../usr/share/zoneinfo/");
983 z
= path_startswith(p
, "/usr/share/zoneinfo/");
985 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
989 where
= prefix_roota(dest
, "/etc/localtime");
990 r
= readlink_malloc(where
, &q
);
992 y
= path_startswith(q
, "../usr/share/zoneinfo/");
994 y
= path_startswith(q
, "/usr/share/zoneinfo/");
996 /* Already pointing to the right place? Then do nothing .. */
997 if (y
&& streq(y
, z
))
1001 check
= strjoina("/usr/share/zoneinfo/", z
);
1002 check
= prefix_root(dest
, check
);
1003 if (laccess(check
, F_OK
) < 0) {
1004 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1009 if (r
< 0 && errno
!= ENOENT
) {
1010 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1014 what
= strjoina("../usr/share/zoneinfo/", z
);
1015 if (symlink(what
, where
) < 0) {
1016 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1020 r
= userns_lchown(where
, 0, 0);
1022 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1027 static int setup_resolv_conf(const char *dest
) {
1028 const char *where
= NULL
;
1033 if (arg_private_network
)
1036 /* Fix resolv.conf, if possible */
1037 where
= prefix_roota(dest
, "/etc/resolv.conf");
1039 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1041 /* If the file already exists as symlink, let's
1042 * suppress the warning, under the assumption that
1043 * resolved or something similar runs inside and the
1044 * symlink points there.
1046 * If the disk image is read-only, there's also no
1047 * point in complaining.
1049 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1050 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1054 r
= userns_lchown(where
, 0, 0);
1056 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1061 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1065 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1066 SD_ID128_FORMAT_VAL(id
));
1071 static int setup_boot_id(const char *dest
) {
1072 const char *from
, *to
;
1073 sd_id128_t rnd
= {};
1077 if (arg_share_system
)
1080 /* Generate a new randomized boot ID, so that each boot-up of
1081 * the container gets a new one */
1083 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1084 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1086 r
= sd_id128_randomize(&rnd
);
1088 return log_error_errno(r
, "Failed to generate random boot id: %m");
1090 id128_format_as_uuid(rnd
, as_uuid
);
1092 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1094 return log_error_errno(r
, "Failed to write boot id: %m");
1096 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1097 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1098 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1099 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1105 static int copy_devnodes(const char *dest
) {
1107 static const char devnodes
[] =
1118 _cleanup_umask_ mode_t u
;
1124 /* Create /dev/net, so that we can create /dev/net/tun in it */
1125 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1126 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1128 NULSTR_FOREACH(d
, devnodes
) {
1129 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1132 from
= strappend("/dev/", d
);
1133 to
= prefix_root(dest
, from
);
1135 if (stat(from
, &st
) < 0) {
1137 if (errno
!= ENOENT
)
1138 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1140 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1142 log_error("%s is not a char or block device, cannot copy.", from
);
1146 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1148 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1150 /* Some systems abusively restrict mknod but
1151 * allow bind mounts. */
1154 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1155 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1156 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1159 r
= userns_lchown(to
, 0, 0);
1161 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1168 static int setup_pts(const char *dest
) {
1169 _cleanup_free_
char *options
= NULL
;
1173 if (arg_selinux_apifs_context
)
1174 (void) asprintf(&options
,
1175 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1176 arg_uid_shift
+ TTY_GID
,
1177 arg_selinux_apifs_context
);
1180 (void) asprintf(&options
,
1181 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1182 arg_uid_shift
+ TTY_GID
);
1187 /* Mount /dev/pts itself */
1188 p
= prefix_roota(dest
, "/dev/pts");
1189 if (mkdir(p
, 0755) < 0)
1190 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1191 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1192 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1193 if (userns_lchown(p
, 0, 0) < 0)
1194 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1196 /* Create /dev/ptmx symlink */
1197 p
= prefix_roota(dest
, "/dev/ptmx");
1198 if (symlink("pts/ptmx", p
) < 0)
1199 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1200 if (userns_lchown(p
, 0, 0) < 0)
1201 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1203 /* And fix /dev/pts/ptmx ownership */
1204 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1205 if (userns_lchown(p
, 0, 0) < 0)
1206 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
1211 static int setup_dev_console(const char *dest
, const char *console
) {
1212 _cleanup_umask_ mode_t u
;
1221 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1223 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1225 /* We need to bind mount the right tty to /dev/console since
1226 * ptys can only exist on pts file systems. To have something
1227 * to bind mount things on we create a empty regular file. */
1229 to
= prefix_roota(dest
, "/dev/console");
1232 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1234 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1235 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
1240 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1241 const char *from
, *to
;
1242 _cleanup_umask_ mode_t u
;
1245 assert(kmsg_socket
>= 0);
1249 /* We create the kmsg FIFO as /run/kmsg, but immediately
1250 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1251 * on the reading side behave very similar to /proc/kmsg,
1252 * their writing side behaves differently from /dev/kmsg in
1253 * that writing blocks when nothing is reading. In order to
1254 * avoid any problems with containers deadlocking due to this
1255 * we simply make /dev/kmsg unavailable to the container. */
1256 from
= prefix_roota(dest
, "/run/kmsg");
1257 to
= prefix_roota(dest
, "/proc/kmsg");
1259 if (mkfifo(from
, 0600) < 0)
1260 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1261 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1262 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1264 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1266 return log_error_errno(errno
, "Failed to open fifo: %m");
1268 /* Store away the fd in the socket, so that it stays open as
1269 * long as we run the child */
1270 r
= send_one_fd(kmsg_socket
, fd
, 0);
1274 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1276 /* And now make the FIFO unavailable as /run/kmsg... */
1277 (void) unlink(from
);
1282 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1283 union in_addr_union
*exposed
= userdata
;
1289 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
1293 static int setup_hostname(void) {
1295 if (arg_share_system
)
1298 if (sethostname_idempotent(arg_machine
) < 0)
1304 static int setup_journal(const char *directory
) {
1305 sd_id128_t machine_id
, this_id
;
1306 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1307 const char *etc_machine_id
, *p
, *q
;
1311 /* Don't link journals in ephemeral mode */
1315 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1317 r
= read_one_line_file(etc_machine_id
, &b
);
1318 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1321 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1324 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1327 /* Verify validity */
1328 r
= sd_id128_from_string(id
, &machine_id
);
1330 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1332 r
= sd_id128_get_machine(&this_id
);
1334 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1336 if (sd_id128_equal(machine_id
, this_id
)) {
1337 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1338 "Host and machine ids are equal (%s): refusing to link journals", id
);
1339 if (arg_link_journal
== LINK_AUTO
)
1344 if (arg_link_journal
== LINK_NO
)
1347 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1349 return log_error_errno(r
, "Failed to create /var: %m");
1351 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1353 return log_error_errno(r
, "Failed to create /var/log: %m");
1355 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1357 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1359 p
= strjoina("/var/log/journal/", id
);
1360 q
= prefix_roota(directory
, p
);
1362 if (path_is_mount_point(p
, 0) > 0) {
1363 if (arg_link_journal
!= LINK_AUTO
) {
1364 log_error("%s: already a mount point, refusing to use for journal", p
);
1371 if (path_is_mount_point(q
, 0) > 0) {
1372 if (arg_link_journal
!= LINK_AUTO
) {
1373 log_error("%s: already a mount point, refusing to use for journal", q
);
1380 r
= readlink_and_make_absolute(p
, &d
);
1382 if ((arg_link_journal
== LINK_GUEST
||
1383 arg_link_journal
== LINK_AUTO
) &&
1386 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1388 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1393 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1394 } else if (r
== -EINVAL
) {
1396 if (arg_link_journal
== LINK_GUEST
&&
1399 if (errno
== ENOTDIR
) {
1400 log_error("%s already exists and is neither a symlink nor a directory", p
);
1403 log_error_errno(errno
, "Failed to remove %s: %m", p
);
1407 } else if (r
!= -ENOENT
) {
1408 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
1412 if (arg_link_journal
== LINK_GUEST
) {
1414 if (symlink(q
, p
) < 0) {
1415 if (arg_link_journal_try
) {
1416 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1419 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1424 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1426 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1430 if (arg_link_journal
== LINK_HOST
) {
1431 /* don't create parents here -- if the host doesn't have
1432 * permanent journal set up, don't force it here */
1435 if (arg_link_journal_try
) {
1436 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1439 log_error_errno(errno
, "Failed to create %s: %m", p
);
1444 } else if (access(p
, F_OK
) < 0)
1447 if (dir_is_empty(q
) == 0)
1448 log_warning("%s is not empty, proceeding anyway.", q
);
1450 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1452 log_error_errno(errno
, "Failed to create %s: %m", q
);
1456 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1457 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1462 static int drop_capabilities(void) {
1463 return capability_bounding_set_drop(~arg_retain
, false);
/* Resets the audit login UID of the calling process by writing
 * "4294967295" to /proc/self/loginuid, unless the file already contains
 * that value. arg_share_system is tested first (branch body not visible
 * here). A failure to read the file is logged and returned; when the write
 * fails a long explanatory message is logged.
 *
 * NOTE(review): that message reads "or to off auditing"; a verb such as
 * "turn" appears to be missing from the string. */
1466 static int reset_audit_loginuid(void) {
1467 _cleanup_free_
char *p
= NULL
;
1470 if (arg_share_system
)
1473 r
= read_one_line_file("/proc/self/loginuid", &p
);
1477 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1479 /* Already reset? */
1480 if (streq(p
, "4294967295"))
1483 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1486 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1487 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1488 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1489 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1490 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Builds and loads a seccomp filter for the container payload.
 *
 * The filter starts from SCMP_ACT_ALLOW. For every entry of the blacklist
 * table whose capability bit is not set in arg_retain, the associated
 * system call is made to fail with EPERM. In addition,
 * socket(AF_NETLINK, *, NETLINK_AUDIT) is made to fail with EAFNOSUPPORT.
 * SCMP_FLTATR_CTL_NNP is set to 0 before the filter is loaded, and the
 * filter context is released at the end.
 *
 * A seccomp_load() failure that is attributed to a kernel without
 * CONFIG_SECCOMP is only logged at debug level; other failures are logged
 * as errors. */
1498 static int setup_seccomp(void) {
1501 static const struct {
1502 uint64_t capability
;
1505 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1506 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1507 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1508 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1509 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1510 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1511 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1512 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1513 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1514 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1517 scmp_filter_ctx seccomp
;
1521 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1525 r
= seccomp_add_secondary_archs(seccomp
);
1527 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
1531 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1532 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1535 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1537 continue; /* unknown syscall */
1539 log_error_errno(r
, "Failed to block syscall: %m");
1546 Audit is broken in containers, much of the userspace audit
1547 hookup will fail if running inside a container. We don't
1548 care and just turn off creation of audit sockets.
1550 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1551 with EAFNOSUPPORT which audit userspace uses as indication
1552 that audit is disabled in the kernel.
1555 r
= seccomp_rule_add(
1557 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1560 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1561 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1563 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1567 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1569 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1573 r
= seccomp_load(seccomp
);
1575 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1580 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1585 seccomp_release(seccomp
);
1593 static int setup_propagate(const char *root
) {
1596 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1597 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1598 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1599 (void) mkdir_p(p
, 0600);
1601 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
1602 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
1604 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1605 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
1607 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1608 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
1610 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1611 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1612 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1614 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1615 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1620 static int setup_image(char **device_path
, int *loop_nr
) {
1621 struct loop_info64 info
= {
1622 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1624 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1625 _cleanup_free_
char* loopdev
= NULL
;
1629 assert(device_path
);
1633 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1635 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1637 if (fstat(fd
, &st
) < 0)
1638 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1640 if (S_ISBLK(st
.st_mode
)) {
1643 p
= strdup(arg_image
);
1657 if (!S_ISREG(st
.st_mode
)) {
1658 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
1662 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1664 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1666 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1668 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1670 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1673 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1675 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1677 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1678 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1681 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1683 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1684 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1686 *device_path
= loopdev
;
/* Explanatory text describing which partition layouts a disk image must
 * have to be bootable with systemd-nspawn. It is appended to the format
 * strings of the partition-table related error messages of
 * dissect_image(). */
1697 #define PARTITION_TABLE_BLURB \
1698 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1699 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1700 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1701 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1702 "to be bootable with systemd-nspawn."
/* Inspects the partition table of the block device open on fd and picks
 * the partitions to use as root, /home and /srv.
 *
 * The device is probed with libblkid; only GPT ("gpt") and MBR ("dos")
 * partition tables are accepted. The partitions known to blkid are matched
 * against the partition devices enumerated through udev; if the kernel has
 * not created all of them yet, BLKRRPART is issued, retrying up to 20
 * times with a 50ms pause while the result is -EBUSY.
 *
 * On GPT, partitions flagged GPT_FLAG_NO_AUTO are tested for specially
 * (branch body not visible here). The remaining partitions are classified
 * by type UUID: GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE, GPT_ROOT_SECONDARY (the
 * latter two only if defined) and GPT_LINUX_GENERIC. On MBR, the flags
 * value 0x80 (bootable) and the type 0x83 (Linux) are tested for.
 *
 * The root device is taken from the native root partition, else from the
 * secondary root partition, else from a generic Linux partition provided
 * that exactly one was found. The chosen device nodes are handed out via
 * root_device/home_device/srv_device, and the *_rw outputs are derived
 * from the absence of GPT_FLAG_READ_ONLY.
 *
 * A message stating that --image= is unsupported is logged in what appears
 * to be the branch compiled without blkid support. */
1704 static int dissect_image(
1706 char **root_device
, bool *root_device_rw
,
1707 char **home_device
, bool *home_device_rw
,
1708 char **srv_device
, bool *srv_device_rw
,
1712 int home_nr
= -1, srv_nr
= -1;
1713 #ifdef GPT_ROOT_NATIVE
1716 #ifdef GPT_ROOT_SECONDARY
1717 int secondary_root_nr
= -1;
1719 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1720 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1721 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1722 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1723 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1724 struct udev_list_entry
*first
, *item
;
1725 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1726 bool is_gpt
, is_mbr
, multiple_generic
= false;
1727 const char *pttype
= NULL
;
1734 assert(root_device
);
1735 assert(home_device
);
1740 b
= blkid_new_probe();
1745 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1750 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1754 blkid_probe_enable_partitions(b
, 1);
1755 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1758 r
= blkid_do_safeprobe(b
);
1759 if (r
== -2 || r
== 1) {
1760 log_error("Failed to identify any partition table on\n"
1762 PARTITION_TABLE_BLURB
, arg_image
);
1764 } else if (r
!= 0) {
1767 log_error_errno(errno
, "Failed to probe: %m");
1771 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1773 is_gpt
= streq_ptr(pttype
, "gpt");
1774 is_mbr
= streq_ptr(pttype
, "dos");
1776 if (!is_gpt
&& !is_mbr
) {
1777 log_error("No GPT or MBR partition table discovered on\n"
1779 PARTITION_TABLE_BLURB
, arg_image
);
1784 pl
= blkid_probe_get_partitions(b
);
1789 log_error("Failed to list partitions of %s", arg_image
);
1797 if (fstat(fd
, &st
) < 0)
1798 return log_error_errno(errno
, "Failed to stat block device: %m");
1800 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1808 log_error("Kernel partitions never appeared.");
1812 e
= udev_enumerate_new(udev
);
1816 r
= udev_enumerate_add_match_parent(e
, d
);
1820 r
= udev_enumerate_scan_devices(e
);
1822 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1824 /* Count the partitions enumerated by the kernel */
1826 first
= udev_enumerate_get_list_entry(e
);
1827 udev_list_entry_foreach(item
, first
)
1830 /* Count the partitions enumerated by blkid */
1831 m
= blkid_partlist_numof_partitions(pl
);
1835 log_error("blkid and kernel partition list do not match.");
1841 /* The kernel has probed fewer partitions than
1842 * blkid? Maybe the kernel prober is still
1843 * running or it got EBUSY because udev
1844 * already opened the device. Let's reprobe
1845 * the device, which is a synchronous call
1846 * that waits until probing is complete. */
1848 for (j
= 0; j
< 20; j
++) {
1850 r
= ioctl(fd
, BLKRRPART
, 0);
1853 if (r
>= 0 || r
!= -EBUSY
)
1856 /* If something else has the device
1857 * open, such as an udev rule, the
1858 * ioctl will return EBUSY. Since
1859 * there's no way to wait until it
1860 * isn't busy anymore, let's just wait
1861 * a bit, and try again.
1863 * This is really something they
1864 * should fix in the kernel! */
1866 usleep(50 * USEC_PER_MSEC
);
1870 return log_error_errno(r
, "Failed to reread partition table: %m");
1873 e
= udev_enumerate_unref(e
);
1876 first
= udev_enumerate_get_list_entry(e
);
1877 udev_list_entry_foreach(item
, first
) {
1878 _cleanup_udev_device_unref_
struct udev_device
*q
;
1880 unsigned long long flags
;
1886 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1891 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1895 qn
= udev_device_get_devnum(q
);
1899 if (st
.st_rdev
== qn
)
1902 node
= udev_device_get_devnode(q
);
1906 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1910 flags
= blkid_partition_get_flags(pp
);
1912 nr
= blkid_partition_get_partno(pp
);
1920 if (flags
& GPT_FLAG_NO_AUTO
)
1923 stype
= blkid_partition_get_type_string(pp
);
1927 if (sd_id128_from_string(stype
, &type_id
) < 0)
1930 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1932 if (home
&& nr
>= home_nr
)
1936 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1938 r
= free_and_strdup(&home
, node
);
1942 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
1944 if (srv
&& nr
>= srv_nr
)
1948 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1950 r
= free_and_strdup(&srv
, node
);
1954 #ifdef GPT_ROOT_NATIVE
1955 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
1957 if (root
&& nr
>= root_nr
)
1961 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1963 r
= free_and_strdup(&root
, node
);
1968 #ifdef GPT_ROOT_SECONDARY
1969 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
1971 if (secondary_root
&& nr
>= secondary_root_nr
)
1974 secondary_root_nr
= nr
;
1975 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1977 r
= free_and_strdup(&secondary_root
, node
);
1982 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
1985 multiple_generic
= true;
1987 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1989 r
= free_and_strdup(&generic
, node
);
1995 } else if (is_mbr
) {
1998 if (flags
!= 0x80) /* Bootable flag */
2001 type
= blkid_partition_get_type(pp
);
2002 if (type
!= 0x83) /* Linux partition */
2006 multiple_generic
= true;
2010 r
= free_and_strdup(&root
, node
);
2018 *root_device
= root
;
2021 *root_device_rw
= root_rw
;
2023 } else if (secondary_root
) {
2024 *root_device
= secondary_root
;
2025 secondary_root
= NULL
;
2027 *root_device_rw
= secondary_root_rw
;
2029 } else if (generic
) {
2031 /* There were no partitions with precise meanings
2032 * around, but we found generic partitions. In this
2033 * case, if there's only one, we can go ahead and boot
2034 * it, otherwise we bail out, because we really cannot
2035 * make any sense of it. */
2037 if (multiple_generic
) {
2038 log_error("Identified multiple bootable Linux partitions on\n"
2040 PARTITION_TABLE_BLURB
, arg_image
);
2044 *root_device
= generic
;
2047 *root_device_rw
= generic_rw
;
2050 log_error("Failed to identify root partition in disk image\n"
2052 PARTITION_TABLE_BLURB
, arg_image
);
2057 *home_device
= home
;
2060 *home_device_rw
= home_rw
;
2067 *srv_device_rw
= srv_rw
;
2072 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device `what` below `where`.
 *
 * The file system type is detected with libblkid. LUKS containers are
 * refused. The mount is done with MS_NODEV, and read-only unless rw is
 * true and arg_read_only is unset.
 *
 * what:      path of the block device to mount
 * where:     root directory of the container tree
 * directory: path below `where` to mount on, or NULL to mount on `where`
 * rw:        whether a writable mount is requested
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if
         * the result is ambivalent; -1 is a genuine probing error and is
         * handled by the errno-reporting branch below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
/* Mounts the root, home and server data devices into the container tree
 * by calling mount_device() with arg_directory as the base: the root
 * device on the base itself (NULL sub-directory), the home device on
 * "/home" and the server data device on "/srv". Each failure is logged and
 * returned via log_error_errno(). Presumably each device is only mounted
 * if it was found (the guarding conditions are not visible here,
 * confirm). */
2141 static int mount_devices(
2143 const char *root_device
, bool root_device_rw
,
2144 const char *home_device
, bool home_device_rw
,
2145 const char *srv_device
, bool srv_device_rw
) {
2151 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2153 return log_error_errno(r
, "Failed to mount root directory: %m");
2157 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2159 return log_error_errno(r
, "Failed to mount home directory: %m");
2163 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2165 return log_error_errno(r
, "Failed to mount server data directory: %m");
/* Tears down loop device number nr.
 *
 * If image_fd points to a valid fd, LOOP_CLR_FD is issued on it and the fd
 * is closed, with *image_fd updated to the result of safe_close().
 * Afterwards the loop device is removed with LOOP_CTL_REMOVE on
 * /dev/loop-control. All failures are only logged (debug or warning
 * level); the function returns nothing. */
2171 static void loop_remove(int nr
, int *image_fd
) {
2172 _cleanup_close_
int control
= -1;
2178 if (image_fd
&& *image_fd
>= 0) {
2179 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2181 log_debug_errno(errno
, "Failed to close loop image: %m");
2182 *image_fd
= safe_close(*image_fd
);
2185 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2187 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2191 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2193 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2198 * < 0 : wait_for_terminate() failed to get the state of the
2199 * container, the container was terminated by a signal, or
2200 * failed for an unknown reason. No change is made to the
2201 * container argument.
2202 * > 0 : The program executed in the container terminated with an
2203 * error. The exit code of the program executed in the
2204 * container is returned. The container argument has been set
2205 * to CONTAINER_TERMINATED.
2206 * 0 : The container is being rebooted, has been shut down or exited
2207 * successfully. The container argument has been set to either
2208 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2210 * That is, success is indicated by a return value of zero, and an
2211 * error is indicated by a non-zero value.
/* Waits for the container process pid with wait_for_terminate() and
 * translates the result into a ContainerStatus.
 *
 * Depending on status.si_code the visible code distinguishes: an exit
 * (logged as success for status 0, otherwise with the error code;
 * *container is set to CONTAINER_TERMINATED and si_status is returned), a
 * termination with SIGINT (logged as shut down, CONTAINER_TERMINATED) or
 * SIGHUP (logged as reboot, CONTAINER_REBOOTED), any other signal (logged
 * as an error), and an unknown reason (logged as an error).
 *
 * The informational messages are logged at LOG_DEBUG instead of LOG_INFO
 * when arg_quiet is set. */
2213 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2217 r
= wait_for_terminate(pid
, &status
);
2219 return log_warning_errno(r
, "Failed to wait for container: %m");
2221 switch (status
.si_code
) {
2224 if (status
.si_status
== 0) {
2225 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2228 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2230 *container
= CONTAINER_TERMINATED
;
2231 return status
.si_status
;
2234 if (status
.si_status
== SIGINT
) {
2236 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2237 *container
= CONTAINER_TERMINATED
;
2240 } else if (status
.si_status
== SIGHUP
) {
2242 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2243 *container
= CONTAINER_REBOOTED
;
2247 /* CLD_KILLED fallthrough */
2250 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2254 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Signal event handler requesting an orderly shutdown of the container.
 *
 * userdata carries the PID of the container (decoded with PTR_TO_UINT32).
 * arg_kill_signal is sent to that PID; if that succeeds, a hint is logged
 * and the userdata of the event source is reset to NULL, so that a later
 * invocation no longer has a PID to signal. The visible code also calls
 * sd_event_exit() on the event loop of the source with exit code 0;
 * presumably this is the path taken when no signal could be delivered
 * (the control flow around it is not visible here, confirm). */
2261 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2264 pid
= PTR_TO_UINT32(userdata
);
2266 if (kill(pid
, arg_kill_signal
) >= 0) {
2267 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2268 sd_event_source_set_userdata(s
, NULL
);
2273 sd_event_exit(sd_event_source_get_event(s
), 0);
2277 static int determine_names(void) {
2280 if (arg_template
&& !arg_directory
&& arg_machine
) {
2282 /* If --template= was specified then we should not
2283 * search for a machine, but instead create a new one
2284 * in /var/lib/machine. */
2286 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2291 if (!arg_image
&& !arg_directory
) {
2293 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2295 r
= image_find(arg_machine
, &i
);
2297 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2299 log_error("No image for machine '%s': %m", arg_machine
);
2303 if (i
->type
== IMAGE_RAW
)
2304 r
= free_and_strdup(&arg_image
, i
->path
);
2306 r
= free_and_strdup(&arg_directory
, i
->path
);
2308 return log_error_errno(r
, "Invalid image directory: %m");
2311 arg_read_only
= arg_read_only
|| i
->read_only
;
2313 arg_directory
= get_current_dir_name();
2315 if (!arg_directory
&& !arg_machine
) {
2316 log_error("Failed to determine path, please use -D or -i.");
2322 if (arg_directory
&& path_equal(arg_directory
, "/"))
2323 arg_machine
= gethostname_malloc();
2325 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2330 hostname_cleanup(arg_machine
);
2331 if (!machine_name_is_valid(arg_machine
)) {
2332 log_error("Failed to determine machine name automatically, please use -M.");
2336 if (arg_ephemeral
) {
2339 /* Add a random suffix when this is an
2340 * ephemeral machine, so that we can run many
2341 * instances at once without manually having
2342 * to specify -M each time. */
2344 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
/* Determines the UID/GID base (arg_uid_shift) used for user namespacing.
 *
 * If arg_uid_shift is still UID_INVALID, it is derived from the owner of
 * `directory`: the upper 16 bits of st_uid are used as the base, the upper
 * 16 bits of st_gid must match them, and arg_uid_range is set to 0x10000.
 * Afterwards the base is checked so that base + range does not exceed the
 * maximum uid_t value, and the chosen base and range are logged.
 *
 * Failures of stat() are logged and returned via log_error_errno(); the
 * other error conditions are logged with log_error(). */
2355 static int determine_uid_shift(const char *directory
) {
2363 if (arg_uid_shift
== UID_INVALID
) {
2366 r
= stat(directory
, &st
);
2368 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2370 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2372 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2373 log_error("UID and GID base of %s don't match.", directory
);
2377 arg_uid_range
= UINT32_C(0x10000);
2380 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2381 log_error("UID base too high for UID range.");
2385 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
/* Body of the inner child process, i.e. the process that ends up executing
 * the container payload.
 *
 * Visible steps, in order: synchronize with the parent through `barrier`
 * (#1/#2 around the writing of the UID map, #3 once the process has been
 * moved into its cgroup), mount the API file systems (mount_all(),
 * mount_sysfs(), mount_systemd_cgroup_writable()), reset UID/GID, set up
 * the boot ID and the kmsg FIFO, close kmsg_socket, call setsid(), hand
 * the rtnl socket to the parent if ports are exposed, drop capabilities,
 * apply the requested personality and SELinux exec context, switch to
 * arg_user, and build the environment block (PATH, container, TERM, HOME,
 * USER, LOGNAME, container_uuid, LISTEN_FDS, LISTEN_PID, merged with
 * arg_setenv). After barrier #4 all fds not contained in `fds` are closed
 * and the payload is executed: the init system (/usr/lib/systemd/systemd,
 * /lib/systemd/systemd, /sbin/init, tried in this order), or
 * arg_parameters, or a login shell (/bin/bash, then /bin/sh) after
 * changing to the home directory.
 *
 * Only reached on failure: the final statement logs "execv() failed".
 *
 * NOTE(review): mount_all() is called here with
 * (..., true, arg_uid_shift, arg_private_network, arg_uid_range, ...)
 * while outer_child() passes
 * (..., false, arg_private_network, arg_uid_shift, arg_uid_range, ...).
 * One of the two argument orders is presumably wrong; verify against the
 * prototype of mount_all().
 * NOTE(review): the failure of drop_capabilities() is logged with errno,
 * although drop_capabilities() merely forwards the return value of
 * capability_bounding_set_drop(); confirm which of the two carries the
 * error.
 * NOTE(review): the return value of chdir() is ignored. */
2389 static int inner_child(
2391 const char *directory
,
2397 _cleanup_free_
char *home
= NULL
;
2399 const char *envp
[] = {
2400 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2401 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2406 NULL
, /* container_uuid */
2407 NULL
, /* LISTEN_FDS */
2408 NULL
, /* LISTEN_PID */
2412 _cleanup_strv_free_
char **env_use
= NULL
;
2417 assert(kmsg_socket
>= 0);
2422 /* Tell the parent, that it now can write the UID map. */
2423 (void) barrier_place(barrier
); /* #1 */
2425 /* Wait until the parent wrote the UID map */
2426 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2427 log_error("Parent died too early");
2432 r
= mount_all(NULL
, arg_userns
, true, arg_uid_shift
, arg_private_network
, arg_uid_range
, arg_selinux_apifs_context
);
2436 r
= mount_sysfs(NULL
);
2440 /* Wait until we are cgroup-ified, so that we
2441 * can mount the right cgroup path writable */
2442 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2443 log_error("Parent died too early");
2447 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2451 r
= reset_uid_gid();
2453 return log_error_errno(r
, "Couldn't become new root: %m");
2455 r
= setup_boot_id(NULL
);
2459 r
= setup_kmsg(NULL
, kmsg_socket
);
2462 kmsg_socket
= safe_close(kmsg_socket
);
2467 return log_error_errno(errno
, "setsid() failed: %m");
2469 if (arg_private_network
)
2472 if (arg_expose_ports
) {
2473 r
= expose_port_send_rtnl(rtnl_socket
);
2476 rtnl_socket
= safe_close(rtnl_socket
);
2479 if (drop_capabilities() < 0)
2480 return log_error_errno(errno
, "drop_capabilities() failed: %m");
2484 if (arg_personality
!= PERSONALITY_INVALID
) {
2485 if (personality(arg_personality
) < 0)
2486 return log_error_errno(errno
, "personality() failed: %m");
2487 } else if (secondary
) {
2488 if (personality(PER_LINUX32
) < 0)
2489 return log_error_errno(errno
, "personality() failed: %m");
2493 if (arg_selinux_context
)
2494 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2495 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2498 r
= change_uid_gid(arg_user
, &home
);
2502 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2506 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2507 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2508 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2511 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2514 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2518 if (fdset_size(fds
) > 0) {
2519 r
= fdset_cloexec(fds
, false);
2521 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2523 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2524 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2528 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2532 /* Let the parent know that we are ready and
2533 * wait until the parent is ready with the
2535 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2536 log_error("Parent died too early");
2540 /* Now, explicitly close the log, so that we
2541 * then can close all remaining fds. Closing
2542 * the log explicitly first has the benefit
2543 * that the logging subsystem knows about it,
2544 * and is thus ready to be reopened should we
2545 * need it again. Note that the other fds
2546 * closed here are at least the locking and
2549 (void) fdset_close_others(fds
);
2555 /* Automatically search for the init system */
2557 m
= 1 + strv_length(arg_parameters
);
2558 a
= newa(char*, m
+ 1);
2559 if (strv_isempty(arg_parameters
))
2562 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
2564 a
[0] = (char*) "/usr/lib/systemd/systemd";
2565 execve(a
[0], a
, env_use
);
2567 a
[0] = (char*) "/lib/systemd/systemd";
2568 execve(a
[0], a
, env_use
);
2570 a
[0] = (char*) "/sbin/init";
2571 execve(a
[0], a
, env_use
);
2572 } else if (!strv_isempty(arg_parameters
))
2573 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2575 chdir(home
?: "/root");
2576 execle("/bin/bash", "-bash", NULL
, env_use
);
2577 execle("/bin/sh", "-sh", NULL
, env_use
);
2581 return log_error_errno(errno
, "execv() failed: %m");
/* Body of the outer child process, which prepares the container's file
 * system tree and then forks off the inner child.
 *
 * Visible steps, in order: request SIGKILL on parent death
 * (PR_SET_PDEATHSIG), replace stdin/stdout/stderr with the terminal
 * `console`, reset the audit login UID, mark all mounts MS_SLAVE|MS_REC so
 * that mounts are still received from the host but not propagated back,
 * mount the image partitions, determine the UID shift and send it over
 * uid_shift_socket, turn `directory` into a bind mount, set up the
 * volatile modes and the base file system, optionally remount the tree
 * read-only, then populate it (mount_all(), device nodes, pts, propagation
 * directory, /dev/console, seccomp, timezone, resolv.conf, journal, custom
 * mounts, cgroups) and move the root directory.
 *
 * Afterwards the inner child is created with raw_clone(); the namespace
 * flags depend on arg_share_system, arg_private_network and arg_userns.
 * The child runs inner_child() and exits with EXIT_FAILURE or
 * EXIT_SUCCESS. This process sends the PID of the inner child over
 * pid_socket and closes its sockets.
 *
 * pid_socket and kmsg_socket must be >= 0. Errors of the visible system
 * calls are logged and returned via log_error_errno(). */
2584 static int outer_child(
2586 const char *directory
,
2587 const char *console
,
2588 const char *root_device
, bool root_device_rw
,
2589 const char *home_device
, bool home_device_rw
,
2590 const char *srv_device
, bool srv_device_rw
,
2596 int uid_shift_socket
,
2606 assert(pid_socket
>= 0);
2607 assert(kmsg_socket
>= 0);
2611 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2612 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
2615 close_nointr(STDIN_FILENO
);
2616 close_nointr(STDOUT_FILENO
);
2617 close_nointr(STDERR_FILENO
);
2619 r
= open_terminal(console
, O_RDWR
);
2620 if (r
!= STDIN_FILENO
) {
2626 return log_error_errno(r
, "Failed to open console: %m");
2629 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2630 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2631 return log_error_errno(errno
, "Failed to duplicate console: %m");
2634 r
= reset_audit_loginuid();
2638 /* Mark everything as slave, so that we still
2639 * receive mounts from the real root, but don't
2640 * propagate mounts to the real root. */
2641 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2642 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2644 r
= mount_devices(directory
,
2645 root_device
, root_device_rw
,
2646 home_device
, home_device_rw
,
2647 srv_device
, srv_device_rw
);
2651 r
= determine_uid_shift(directory
);
2656 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2658 return log_error_errno(errno
, "Failed to send UID shift: %m");
2659 if (l
!= sizeof(arg_uid_shift
)) {
2660 log_error("Short write while sending UID shift.");
2665 /* Turn directory into bind mount */
2666 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2667 return log_error_errno(errno
, "Failed to make bind mount: %m");
2669 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2673 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2677 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2681 if (arg_read_only
) {
2682 r
= bind_remount_recursive(directory
, true);
2684 return log_error_errno(r
, "Failed to make tree read-only: %m");
2687 r
= mount_all(directory
, arg_userns
, false, arg_private_network
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2691 r
= copy_devnodes(directory
);
2695 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2697 r
= setup_pts(directory
);
2701 r
= setup_propagate(directory
);
2705 r
= setup_dev_console(directory
, console
);
2709 r
= setup_seccomp();
2713 r
= setup_timezone(directory
);
2717 r
= setup_resolv_conf(directory
);
2721 r
= setup_journal(directory
);
2725 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2729 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2733 r
= mount_move_root(directory
);
2735 return log_error_errno(r
, "Failed to move root directory: %m");
2737 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2738 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2739 (arg_private_network
? CLONE_NEWNET
: 0) |
2740 (arg_userns
? CLONE_NEWUSER
: 0),
2743 return log_error_errno(errno
, "Failed to fork inner child: %m");
2745 pid_socket
= safe_close(pid_socket
);
2746 uid_shift_socket
= safe_close(uid_shift_socket
);
2748 /* The inner child has all namespaces that are
2749 * requested, so that we all are owned by the user if
2750 * user namespaces are turned on. */
2752 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2754 _exit(EXIT_FAILURE
);
2756 _exit(EXIT_SUCCESS
);
2759 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2761 return log_error_errno(errno
, "Failed to send PID: %m");
2762 if (l
!= sizeof(pid
)) {
2763 log_error("Short write while sending PID.");
2767 pid_socket
= safe_close(pid_socket
);
2768 kmsg_socket
= safe_close(kmsg_socket
);
2769 rtnl_socket
= safe_close(rtnl_socket
);
/* Writes the UID and GID maps of process pid.
 *
 * A single mapping line "0 <arg_uid_shift> <arg_uid_range>\n" is
 * formatted once and written first to /proc/<pid>/uid_map and then to
 * /proc/<pid>/gid_map, i.e. UIDs and GIDs always get the same range. The
 * uid_map buffer is reused for the gid_map path, which has the same
 * length.
 *
 * Failures of either write are logged and returned via
 * log_error_errno(). */
2774 static int setup_uid_map(pid_t pid
) {
2775 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2780 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2781 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2782 r
= write_string_file(uid_map
, line
, 0);
2784 return log_error_errno(r
, "Failed to write UID map: %m");
2786 /* We always assign the same UID and GID ranges */
2787 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2788 r
= write_string_file(uid_map
, line
, 0);
2790 return log_error_errno(r
, "Failed to write GID map: %m");
2795 static int load_settings(void) {
2796 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2797 _cleanup_fclose_
FILE *f
= NULL
;
2798 _cleanup_free_
char *p
= NULL
;
2802 /* If all settings are masked, there's no point in looking for
2803 * the settings file */
2804 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
2807 fn
= strjoina(arg_machine
, ".nspawn");
2809 /* We first look in the admin's directories in /etc and /run */
2810 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2811 _cleanup_free_
char *j
= NULL
;
2813 j
= strjoin(i
, "/", fn
, NULL
);
2822 /* By default we trust configuration from /etc and /run */
2823 if (arg_settings_trusted
< 0)
2824 arg_settings_trusted
= true;
2829 if (errno
!= ENOENT
)
2830 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2834 /* After that, let's look for a file next to the
2835 * actual image we shall boot. */
2838 p
= file_in_same_dir(arg_image
, fn
);
2841 } else if (arg_directory
) {
2842 p
= file_in_same_dir(arg_directory
, fn
);
2849 if (!f
&& errno
!= ENOENT
)
2850 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2852 /* By default we do not trust configuration from /var/lib/machines */
2853 if (arg_settings_trusted
< 0)
2854 arg_settings_trusted
= false;
2861 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
2863 r
= settings_load(f
, p
, &settings
);
2867 /* Copy over bits from the settings, unless they have been
2868 * explicitly masked by command line switches. */
2870 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
2871 settings
->boot
>= 0) {
2872 arg_boot
= settings
->boot
;
2874 strv_free(arg_parameters
);
2875 arg_parameters
= settings
->parameters
;
2876 settings
->parameters
= NULL
;
2879 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2880 settings
->environment
) {
2881 strv_free(arg_setenv
);
2882 arg_setenv
= settings
->environment
;
2883 settings
->environment
= NULL
;
2886 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2889 arg_user
= settings
->user
;
2890 settings
->user
= NULL
;
2893 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
2896 plus
= settings
->capability
;
2897 if (settings_private_network(settings
))
2898 plus
|= (1ULL << CAP_NET_ADMIN
);
2900 if (!arg_settings_trusted
&& plus
!= 0) {
2901 if (settings
->capability
!= 0)
2902 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
2906 arg_retain
&= ~settings
->drop_capability
;
2909 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2910 settings
->kill_signal
> 0)
2911 arg_kill_signal
= settings
->kill_signal
;
2913 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2914 settings
->personality
!= PERSONALITY_INVALID
)
2915 arg_personality
= settings
->personality
;
2917 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2918 !sd_id128_is_null(settings
->machine_id
)) {
2920 if (!arg_settings_trusted
)
2921 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
2923 arg_uuid
= settings
->machine_id
;
2926 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
2927 settings
->read_only
>= 0)
2928 arg_read_only
= settings
->read_only
;
2930 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
2931 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
2932 arg_volatile_mode
= settings
->volatile_mode
;
2934 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
2935 settings
->n_custom_mounts
> 0) {
2937 if (!arg_settings_trusted
)
2938 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
2940 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
2941 arg_custom_mounts
= settings
->custom_mounts
;
2942 arg_n_custom_mounts
= settings
->n_custom_mounts
;
2944 settings
->custom_mounts
= NULL
;
2945 settings
->n_custom_mounts
= 0;
2949 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
2950 (settings
->private_network
>= 0 ||
2951 settings
->network_veth
>= 0 ||
2952 settings
->network_bridge
||
2953 settings
->network_interfaces
||
2954 settings
->network_macvlan
||
2955 settings
->network_ipvlan
)) {
2957 if (!arg_settings_trusted
)
2958 log_warning("Ignoring network settings, file %s is not trusted.", p
);
2960 arg_network_veth
= settings_private_network(settings
);
2961 arg_private_network
= settings_private_network(settings
);
2963 strv_free(arg_network_interfaces
);
2964 arg_network_interfaces
= settings
->network_interfaces
;
2965 settings
->network_interfaces
= NULL
;
2967 strv_free(arg_network_macvlan
);
2968 arg_network_macvlan
= settings
->network_macvlan
;
2969 settings
->network_macvlan
= NULL
;
2971 strv_free(arg_network_ipvlan
);
2972 arg_network_ipvlan
= settings
->network_ipvlan
;
2973 settings
->network_ipvlan
= NULL
;
2975 free(arg_network_bridge
);
2976 arg_network_bridge
= settings
->network_bridge
;
2977 settings
->network_bridge
= NULL
;
2981 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
2982 settings
->expose_ports
) {
2984 if (!arg_settings_trusted
)
2985 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
2987 expose_port_free_all(arg_expose_ports
);
2988 arg_expose_ports
= settings
->expose_ports
;
2989 settings
->expose_ports
= NULL
;
2996 int main(int argc
, char *argv
[]) {
2998 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
2999 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3000 _cleanup_close_
int master
= -1, image_fd
= -1;
3001 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3002 int r
, n_fd_passed
, loop_nr
= -1;
3003 char veth_name
[IFNAMSIZ
];
3004 bool secondary
= false, remove_subvol
= false;
3007 int ret
= EXIT_SUCCESS
;
3008 union in_addr_union exposed
= {};
3009 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3012 log_parse_environment();
3015 r
= parse_argv(argc
, argv
);
3019 if (geteuid() != 0) {
3020 log_error("Need to be root.");
3024 r
= determine_names();
3028 r
= load_settings();
3032 r
= verify_arguments();
3036 n_fd_passed
= sd_listen_fds(false);
3037 if (n_fd_passed
> 0) {
3038 r
= fdset_new_listen_fds(&fds
, false);
3040 log_error_errno(r
, "Failed to collect file descriptors: %m");
3045 if (arg_directory
) {
3048 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3049 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3054 if (arg_ephemeral
) {
3055 _cleanup_free_
char *np
= NULL
;
3057 /* If the specified path is a mount point we
3058 * generate the new snapshot immediately
3059 * inside it under a random name. However if
3060 * the specified is not a mount point we
3061 * create the new snapshot in the parent
3062 * directory, just next to it. */
3063 r
= path_is_mount_point(arg_directory
, 0);
3065 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3069 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3071 r
= tempfn_random(arg_directory
, "machine.", &np
);
3073 log_error_errno(r
, "Failed to generate name for snapshot: %m");
3077 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3079 log_error_errno(r
, "Failed to lock %s: %m", np
);
3083 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3085 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3089 free(arg_directory
);
3093 remove_subvol
= true;
3096 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3098 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3102 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
3107 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3110 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3112 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3116 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
3122 if (path_is_os_tree(arg_directory
) <= 0) {
3123 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3130 p
= strjoina(arg_directory
, "/usr/");
3131 if (laccess(p
, F_OK
) < 0) {
3132 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
3139 char template[] = "/tmp/nspawn-root-XXXXXX";
3142 assert(!arg_template
);
3144 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3146 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3150 r
= log_error_errno(r
, "Failed to create image lock: %m");
3154 if (!mkdtemp(template)) {
3155 log_error_errno(errno
, "Failed to create temporary directory: %m");
3160 arg_directory
= strdup(template);
3161 if (!arg_directory
) {
3166 image_fd
= setup_image(&device_path
, &loop_nr
);
3172 r
= dissect_image(image_fd
,
3173 &root_device
, &root_device_rw
,
3174 &home_device
, &home_device_rw
,
3175 &srv_device
, &srv_device_rw
,
3181 r
= custom_mounts_prepare();
3186 isatty(STDIN_FILENO
) > 0 &&
3187 isatty(STDOUT_FILENO
) > 0;
3189 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3191 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3195 r
= ptsname_malloc(master
, &console
);
3197 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3201 if (unlockpt(master
) < 0) {
3202 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3207 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3208 arg_machine
, arg_image
?: arg_directory
);
3210 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3212 assert_se(sigemptyset(&mask_chld
) == 0);
3213 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3215 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3216 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
3221 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
3222 uid_shift_socket_pair
[2] = { -1, -1 };
3223 ContainerStatus container_status
;
3224 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3225 static const struct sigaction sa
= {
3226 .sa_handler
= nop_signal_handler
,
3227 .sa_flags
= SA_NOCLDSTOP
,
3231 _cleanup_event_unref_ sd_event
*event
= NULL
;
3232 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3233 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3236 r
= barrier_create(&barrier
);
3238 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3242 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3243 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3247 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3248 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3252 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3253 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3258 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3259 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3263 /* Child can be killed before execv(), so handle SIGCHLD
3264 * in order to interrupt parent's blocking calls and
3265 * give it a chance to call wait() and terminate. */
3266 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3268 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3272 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3274 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
3278 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3280 if (errno
== EINVAL
)
3281 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3283 r
= log_error_errno(errno
, "clone() failed: %m");
3289 /* The outer child only has a file system namespace. */
3290 barrier_set_role(&barrier
, BARRIER_CHILD
);
3292 master
= safe_close(master
);
3294 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3295 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3296 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3297 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3299 (void) reset_all_signal_handlers();
3300 (void) reset_signal_mask();
3302 r
= outer_child(&barrier
,
3305 root_device
, root_device_rw
,
3306 home_device
, home_device_rw
,
3307 srv_device
, srv_device_rw
,
3311 kmsg_socket_pair
[1],
3312 rtnl_socket_pair
[1],
3313 uid_shift_socket_pair
[1],
3316 _exit(EXIT_FAILURE
);
3318 _exit(EXIT_SUCCESS
);
3321 barrier_set_role(&barrier
, BARRIER_PARENT
);
3323 fds
= fdset_free(fds
);
3325 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3326 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3327 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3328 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3330 /* Wait for the outer child. */
3331 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3340 /* And now retrieve the PID of the inner child. */
3341 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3343 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3346 if (l
!= sizeof(pid
)) {
3347 log_error("Short read while reading inner child PID.");
3352 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3355 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3356 log_error("Child died too early.");
3361 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3363 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3366 if (l
!= sizeof(arg_uid_shift
)) {
3367 log_error("Short read while reading UID shift.");
3372 r
= setup_uid_map(pid
);
3376 (void) barrier_place(&barrier
); /* #2 */
3379 if (arg_private_network
) {
3381 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3385 if (arg_network_veth
) {
3386 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3392 if (arg_network_bridge
) {
3393 r
= setup_bridge(veth_name
, arg_network_bridge
);
3401 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3405 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
3411 r
= register_machine(
3418 arg_custom_mounts
, arg_n_custom_mounts
,
3426 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3430 if (arg_keep_unit
) {
3431 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3436 r
= chown_cgroup(pid
, arg_uid_shift
);
3440 /* Notify the child that the parent is ready with all
3441 * its setup (including cgroup-ification), and that
3442 * the child can now hand over control to the code to
3443 * run inside the container. */
3444 (void) barrier_place(&barrier
); /* #3 */
3446 /* Block SIGCHLD here, before notifying child.
3447 * process_pty() will handle it with the other signals. */
3448 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3450 /* Reset signal to default */
3451 r
= default_signals(SIGCHLD
, -1);
3453 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3457 /* Let the child know that we are ready and wait that the child is completely ready now. */
3458 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3459 log_error("Child died too early.");
3466 "STATUS=Container running.\n"
3467 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
3469 r
= sd_event_new(&event
);
3471 log_error_errno(r
, "Failed to get default event source: %m");
3475 if (arg_kill_signal
> 0) {
3476 /* Try to kill the init system on SIGINT or SIGTERM */
3477 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3478 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3480 /* Immediately exit */
3481 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3482 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3485 /* simply exit on sigchld */
3486 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3488 if (arg_expose_ports
) {
3489 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3493 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3496 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3498 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
3500 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3504 r
= sd_event_loop(event
);
3506 log_error_errno(r
, "Failed to run event loop: %m");
3510 pty_forward_get_last_char(forward
, &last_char
);
3512 forward
= pty_forward_free(forward
);
3514 if (!arg_quiet
&& last_char
!= '\n')
3517 /* Kill if it is not dead yet anyway */
3518 if (arg_register
&& !arg_keep_unit
)
3519 terminate_machine(pid
);
3521 /* Normally redundant, but better safe than sorry */
3524 r
= wait_for_container(pid
, &container_status
);
3528 /* We failed to wait for the container, or the
3529 * container exited abnormally */
3531 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3532 /* The container exited with a non-zero
3533 * status, or with zero status and no reboot
3539 /* CONTAINER_REBOOTED, loop again */
3541 if (arg_keep_unit
) {
3542 /* Special handling if we are running as a
3543 * service: instead of simply restarting the
3544 * machine we want to restart the entire
3545 * service, so let's inform systemd about this
3546 * with the special exit code 133. The service
3547 * file uses RestartForceExitStatus=133 so
3548 * that this results in a full nspawn
3549 * restart. This is necessary since we might
3550 * have cgroup parameters set we want to have
3557 expose_port_flush(arg_expose_ports
, &exposed
);
3563 "STATUS=Terminating...");
3568 /* Try to flush whatever is still queued in the pty */
3570 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
3572 loop_remove(loop_nr
, &image_fd
);
3574 if (remove_subvol
&& arg_directory
) {
3577 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
3579 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
3585 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3586 (void) rm_rf(p
, REMOVE_ROOT
);
3589 expose_port_flush(arg_expose_ports
, &exposed
);
3591 free(arg_directory
);
3596 strv_free(arg_setenv
);
3597 free(arg_network_bridge
);
3598 strv_free(arg_network_interfaces
);
3599 strv_free(arg_network_macvlan
);
3600 strv_free(arg_network_ipvlan
);
3601 strv_free(arg_parameters
);
3602 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3603 expose_port_free_all(arg_expose_ports
);
3605 return r
< 0 ? EXIT_FAILURE
: ret
;