1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <blkid/blkid.h>
27 #include <linux/loop.h>
33 #include <selinux/selinux.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
46 #include "sd-daemon.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
55 #include "capability.h"
56 #include "cgroup-util.h"
58 #include "dev-setup.h"
60 #include "event-util.h"
63 #include "formats-util.h"
65 #include "hostname-util.h"
67 #include "loopback-setup.h"
68 #include "machine-image.h"
72 #include "netlink-util.h"
73 #include "path-util.h"
74 #include "process-util.h"
76 #include "random-util.h"
79 #include "seccomp-util.h"
81 #include "signal-util.h"
83 #include "terminal-util.h"
84 #include "udev-util.h"
87 #include "nspawn-settings.h"
88 #include "nspawn-mount.h"
89 #include "nspawn-network.h"
90 #include "nspawn-expose-ports.h"
91 #include "nspawn-cgroup.h"
92 #include "nspawn-register.h"
93 #include "nspawn-setuid.h"
95 typedef enum ContainerStatus
{
100 typedef enum LinkJournal
{
107 static char *arg_directory
= NULL
;
108 static char *arg_template
= NULL
;
109 static char *arg_user
= NULL
;
110 static sd_id128_t arg_uuid
= {};
111 static char *arg_machine
= NULL
;
112 static const char *arg_selinux_context
= NULL
;
113 static const char *arg_selinux_apifs_context
= NULL
;
114 static const char *arg_slice
= NULL
;
115 static bool arg_private_network
= false;
116 static bool arg_read_only
= false;
117 static bool arg_boot
= false;
118 static bool arg_ephemeral
= false;
119 static LinkJournal arg_link_journal
= LINK_AUTO
;
120 static bool arg_link_journal_try
= false;
121 static uint64_t arg_retain
=
122 (1ULL << CAP_CHOWN
) |
123 (1ULL << CAP_DAC_OVERRIDE
) |
124 (1ULL << CAP_DAC_READ_SEARCH
) |
125 (1ULL << CAP_FOWNER
) |
126 (1ULL << CAP_FSETID
) |
127 (1ULL << CAP_IPC_OWNER
) |
129 (1ULL << CAP_LEASE
) |
130 (1ULL << CAP_LINUX_IMMUTABLE
) |
131 (1ULL << CAP_NET_BIND_SERVICE
) |
132 (1ULL << CAP_NET_BROADCAST
) |
133 (1ULL << CAP_NET_RAW
) |
134 (1ULL << CAP_SETGID
) |
135 (1ULL << CAP_SETFCAP
) |
136 (1ULL << CAP_SETPCAP
) |
137 (1ULL << CAP_SETUID
) |
138 (1ULL << CAP_SYS_ADMIN
) |
139 (1ULL << CAP_SYS_CHROOT
) |
140 (1ULL << CAP_SYS_NICE
) |
141 (1ULL << CAP_SYS_PTRACE
) |
142 (1ULL << CAP_SYS_TTY_CONFIG
) |
143 (1ULL << CAP_SYS_RESOURCE
) |
144 (1ULL << CAP_SYS_BOOT
) |
145 (1ULL << CAP_AUDIT_WRITE
) |
146 (1ULL << CAP_AUDIT_CONTROL
) |
148 static CustomMount
*arg_custom_mounts
= NULL
;
149 static unsigned arg_n_custom_mounts
= 0;
150 static char **arg_setenv
= NULL
;
151 static bool arg_quiet
= false;
152 static bool arg_share_system
= false;
153 static bool arg_register
= true;
154 static bool arg_keep_unit
= false;
155 static char **arg_network_interfaces
= NULL
;
156 static char **arg_network_macvlan
= NULL
;
157 static char **arg_network_ipvlan
= NULL
;
158 static bool arg_network_veth
= false;
159 static char *arg_network_bridge
= NULL
;
160 static unsigned long arg_personality
= PERSONALITY_INVALID
;
161 static char *arg_image
= NULL
;
162 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
163 static ExposePort
*arg_expose_ports
= NULL
;
164 static char **arg_property
= NULL
;
165 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
166 static bool arg_userns
= false;
167 static int arg_kill_signal
= 0;
168 static bool arg_unified_cgroup_hierarchy
= false;
169 static SettingsMask arg_settings_mask
= 0;
170 static int arg_settings_trusted
= -1;
171 static char **arg_parameters
= NULL
;
173 static void help(void) {
174 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
175 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
176 " -h --help Show this help\n"
177 " --version Print version string\n"
178 " -q --quiet Do not show status information\n"
179 " -D --directory=PATH Root directory for the container\n"
180 " --template=PATH Initialize root directory from template directory,\n"
182 " -x --ephemeral Run container with snapshot of root directory, and\n"
183 " remove it after exit\n"
184 " -i --image=PATH File system device or disk image for the container\n"
185 " -b --boot Boot up full system (i.e. invoke init)\n"
186 " -u --user=USER Run the command under specified user or uid\n"
187 " -M --machine=NAME Set the machine name for the container\n"
188 " --uuid=UUID Set a specific machine UUID for the container\n"
189 " -S --slice=SLICE Place the container in the specified slice\n"
190 " --property=NAME=VALUE Set scope unit property\n"
191 " --private-users[=UIDBASE[:NUIDS]]\n"
192 " Run within user namespace\n"
193 " --private-network Disable network in container\n"
194 " --network-interface=INTERFACE\n"
195 " Assign an existing network interface to the\n"
197 " --network-macvlan=INTERFACE\n"
198 " Create a macvlan network interface based on an\n"
199 " existing network interface to the container\n"
200 " --network-ipvlan=INTERFACE\n"
201 " Create a ipvlan network interface based on an\n"
202 " existing network interface to the container\n"
203 " -n --network-veth Add a virtual ethernet connection between host\n"
205 " --network-bridge=INTERFACE\n"
206 " Add a virtual ethernet connection between host\n"
207 " and container and add it to an existing bridge on\n"
209 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
210 " Expose a container IP port on the host\n"
211 " -Z --selinux-context=SECLABEL\n"
212 " Set the SELinux security context to be used by\n"
213 " processes in the container\n"
214 " -L --selinux-apifs-context=SECLABEL\n"
215 " Set the SELinux security context to be used by\n"
216 " API/tmpfs file systems in the container\n"
217 " --capability=CAP In addition to the default, retain specified\n"
219 " --drop-capability=CAP Drop the specified capability from the default set\n"
220 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
221 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
222 " try-guest, try-host\n"
223 " -j Equivalent to --link-journal=try-guest\n"
224 " --read-only Mount the root directory read-only\n"
225 " --bind=PATH[:PATH[:OPTIONS]]\n"
226 " Bind mount a file or directory from the host into\n"
228 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
229 " Similar, but creates a read-only bind mount\n"
230 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
231 " --overlay=PATH[:PATH...]:PATH\n"
232 " Create an overlay mount from the host to \n"
234 " --overlay-ro=PATH[:PATH...]:PATH\n"
235 " Similar, but creates a read-only overlay mount\n"
236 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
237 " --share-system Share system namespaces with host\n"
238 " --register=BOOLEAN Register container as machine\n"
239 " --keep-unit Do not register a scope for the machine, reuse\n"
240 " the service unit nspawn is running in\n"
241 " --volatile[=MODE] Run the system in volatile mode\n"
242 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
243 , program_invocation_short_name
);
247 static int custom_mounts_prepare(void) {
251 /* Ensure the mounts are applied prefix first. */
252 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
254 /* Allocate working directories for the overlay file systems that need it */
255 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
256 CustomMount
*m
= &arg_custom_mounts
[i
];
258 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
259 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
263 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
272 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
274 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Replaces the string in *b with a cleaned-up form of 'path'.
 *
 * The path is canonicalized with canonicalize_file_name(); if that fails with
 * ENOENT, path_make_absolute_cwd() is used instead. The previous value of *b
 * is freed. Returns 0 on success and a negative errno-style value otherwise. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* The path does not exist (yet), fall back to making it absolute */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
301 static int detect_unified_cgroup_hierarchy(void) {
305 /* Allow the user to control whether the unified hierarchy is used */
306 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
308 r
= parse_boolean(e
);
310 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
312 arg_unified_cgroup_hierarchy
= r
;
316 /* Otherwise inherit the default from the host system */
319 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
321 arg_unified_cgroup_hierarchy
= r
;
325 static int parse_argv(int argc
, char *argv
[]) {
344 ARG_NETWORK_INTERFACE
,
357 static const struct option options
[] = {
358 { "help", no_argument
, NULL
, 'h' },
359 { "version", no_argument
, NULL
, ARG_VERSION
},
360 { "directory", required_argument
, NULL
, 'D' },
361 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
362 { "ephemeral", no_argument
, NULL
, 'x' },
363 { "user", required_argument
, NULL
, 'u' },
364 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
365 { "boot", no_argument
, NULL
, 'b' },
366 { "uuid", required_argument
, NULL
, ARG_UUID
},
367 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
368 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
369 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
370 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
371 { "bind", required_argument
, NULL
, ARG_BIND
},
372 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
373 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
374 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
375 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
376 { "machine", required_argument
, NULL
, 'M' },
377 { "slice", required_argument
, NULL
, 'S' },
378 { "setenv", required_argument
, NULL
, ARG_SETENV
},
379 { "selinux-context", required_argument
, NULL
, 'Z' },
380 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
381 { "quiet", no_argument
, NULL
, 'q' },
382 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
383 { "register", required_argument
, NULL
, ARG_REGISTER
},
384 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
385 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
386 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
387 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
388 { "network-veth", no_argument
, NULL
, 'n' },
389 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
390 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
391 { "image", required_argument
, NULL
, 'i' },
392 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
393 { "port", required_argument
, NULL
, 'p' },
394 { "property", required_argument
, NULL
, ARG_PROPERTY
},
395 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
396 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
397 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
402 uint64_t plus
= 0, minus
= 0;
403 bool mask_all_settings
= false, mask_no_settings
= false;
408 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
417 puts(PACKAGE_STRING
);
418 puts(SYSTEMD_FEATURES
);
422 r
= set_sanitized_path(&arg_directory
, optarg
);
424 return log_error_errno(r
, "Invalid root directory: %m");
429 r
= set_sanitized_path(&arg_template
, optarg
);
431 return log_error_errno(r
, "Invalid template directory: %m");
436 r
= set_sanitized_path(&arg_image
, optarg
);
438 return log_error_errno(r
, "Invalid image path: %m");
443 arg_ephemeral
= true;
447 r
= free_and_strdup(&arg_user
, optarg
);
451 arg_settings_mask
|= SETTING_USER
;
454 case ARG_NETWORK_BRIDGE
:
455 r
= free_and_strdup(&arg_network_bridge
, optarg
);
462 arg_network_veth
= true;
463 arg_private_network
= true;
464 arg_settings_mask
|= SETTING_NETWORK
;
467 case ARG_NETWORK_INTERFACE
:
468 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
471 arg_private_network
= true;
472 arg_settings_mask
|= SETTING_NETWORK
;
475 case ARG_NETWORK_MACVLAN
:
476 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
479 arg_private_network
= true;
480 arg_settings_mask
|= SETTING_NETWORK
;
483 case ARG_NETWORK_IPVLAN
:
484 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
489 case ARG_PRIVATE_NETWORK
:
490 arg_private_network
= true;
491 arg_settings_mask
|= SETTING_NETWORK
;
496 arg_settings_mask
|= SETTING_BOOT
;
500 r
= sd_id128_from_string(optarg
, &arg_uuid
);
502 log_error("Invalid UUID: %s", optarg
);
506 arg_settings_mask
|= SETTING_MACHINE_ID
;
515 arg_machine
= mfree(arg_machine
);
517 if (!machine_name_is_valid(optarg
)) {
518 log_error("Invalid machine name: %s", optarg
);
522 r
= free_and_strdup(&arg_machine
, optarg
);
530 arg_selinux_context
= optarg
;
534 arg_selinux_apifs_context
= optarg
;
538 arg_read_only
= true;
539 arg_settings_mask
|= SETTING_READ_ONLY
;
543 case ARG_DROP_CAPABILITY
: {
544 const char *state
, *word
;
547 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
548 _cleanup_free_
char *t
;
550 t
= strndup(word
, length
);
554 if (streq(t
, "all")) {
555 if (c
== ARG_CAPABILITY
)
556 plus
= (uint64_t) -1;
558 minus
= (uint64_t) -1;
562 cap
= capability_from_name(t
);
564 log_error("Failed to parse capability %s.", t
);
568 if (c
== ARG_CAPABILITY
)
569 plus
|= 1ULL << (uint64_t) cap
;
571 minus
|= 1ULL << (uint64_t) cap
;
575 arg_settings_mask
|= SETTING_CAPABILITY
;
580 arg_link_journal
= LINK_GUEST
;
581 arg_link_journal_try
= true;
584 case ARG_LINK_JOURNAL
:
585 if (streq(optarg
, "auto")) {
586 arg_link_journal
= LINK_AUTO
;
587 arg_link_journal_try
= false;
588 } else if (streq(optarg
, "no")) {
589 arg_link_journal
= LINK_NO
;
590 arg_link_journal_try
= false;
591 } else if (streq(optarg
, "guest")) {
592 arg_link_journal
= LINK_GUEST
;
593 arg_link_journal_try
= false;
594 } else if (streq(optarg
, "host")) {
595 arg_link_journal
= LINK_HOST
;
596 arg_link_journal_try
= false;
597 } else if (streq(optarg
, "try-guest")) {
598 arg_link_journal
= LINK_GUEST
;
599 arg_link_journal_try
= true;
600 } else if (streq(optarg
, "try-host")) {
601 arg_link_journal
= LINK_HOST
;
602 arg_link_journal_try
= true;
604 log_error("Failed to parse link journal mode %s", optarg
);
612 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
614 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
616 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
620 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
622 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
624 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
628 case ARG_OVERLAY_RO
: {
629 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
630 _cleanup_strv_free_
char **lower
= NULL
;
635 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
639 log_error("Invalid overlay specification: %s", optarg
);
643 STRV_FOREACH(i
, lower
) {
644 if (!path_is_absolute(*i
)) {
645 log_error("Overlay path %s is not absolute.", *i
);
653 log_error("--overlay= needs at least two colon-separated directories specified.");
658 /* If two parameters are specified,
659 * the first one is the lower, the
660 * second one the upper directory. And
661 * we'll also define the destination
662 * mount point the same as the upper. */
666 destination
= strdup(upper
);
671 upper
= lower
[n
- 2];
672 destination
= lower
[n
- 1];
676 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
680 m
->destination
= destination
;
683 m
->read_only
= c
== ARG_OVERLAY_RO
;
685 upper
= destination
= NULL
;
688 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
695 if (!env_assignment_is_valid(optarg
)) {
696 log_error("Environment variable assignment '%s' is not valid.", optarg
);
700 n
= strv_env_set(arg_setenv
, optarg
);
704 strv_free(arg_setenv
);
707 arg_settings_mask
|= SETTING_ENVIRONMENT
;
715 case ARG_SHARE_SYSTEM
:
716 arg_share_system
= true;
720 r
= parse_boolean(optarg
);
722 log_error("Failed to parse --register= argument: %s", optarg
);
730 arg_keep_unit
= true;
733 case ARG_PERSONALITY
:
735 arg_personality
= personality_from_string(optarg
);
736 if (arg_personality
== PERSONALITY_INVALID
) {
737 log_error("Unknown or unsupported personality '%s'.", optarg
);
741 arg_settings_mask
|= SETTING_PERSONALITY
;
747 arg_volatile_mode
= VOLATILE_YES
;
751 m
= volatile_mode_from_string(optarg
);
753 log_error("Failed to parse --volatile= argument: %s", optarg
);
756 arg_volatile_mode
= m
;
759 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
763 r
= expose_port_parse(&arg_expose_ports
, optarg
);
765 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
767 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
769 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
773 if (strv_extend(&arg_property
, optarg
) < 0)
778 case ARG_PRIVATE_USERS
:
780 _cleanup_free_
char *buffer
= NULL
;
781 const char *range
, *shift
;
783 range
= strchr(optarg
, ':');
785 buffer
= strndup(optarg
, range
- optarg
);
791 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
792 log_error("Failed to parse UID range: %s", range
);
798 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
799 log_error("Failed to parse UID: %s", optarg
);
807 case ARG_KILL_SIGNAL
:
808 arg_kill_signal
= signal_from_string_try_harder(optarg
);
809 if (arg_kill_signal
< 0) {
810 log_error("Cannot parse signal: %s", optarg
);
814 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
819 /* no → do not read files
820 * yes → read files, do not override cmdline, trust only subset
821 * override → read files, override cmdline, trust only subset
822 * trusted → read files, do not override cmdline, trust all
825 r
= parse_boolean(optarg
);
827 if (streq(optarg
, "trusted")) {
828 mask_all_settings
= false;
829 mask_no_settings
= false;
830 arg_settings_trusted
= true;
832 } else if (streq(optarg
, "override")) {
833 mask_all_settings
= false;
834 mask_no_settings
= true;
835 arg_settings_trusted
= -1;
837 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
840 mask_all_settings
= false;
841 mask_no_settings
= false;
842 arg_settings_trusted
= -1;
845 mask_all_settings
= true;
846 mask_no_settings
= false;
847 arg_settings_trusted
= false;
856 assert_not_reached("Unhandled option");
859 if (arg_share_system
)
860 arg_register
= false;
862 if (arg_boot
&& arg_share_system
) {
863 log_error("--boot and --share-system may not be combined.");
867 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
868 log_error("--keep-unit may not be used when invoked from a user session.");
872 if (arg_directory
&& arg_image
) {
873 log_error("--directory= and --image= may not be combined.");
877 if (arg_template
&& arg_image
) {
878 log_error("--template= and --image= may not be combined.");
882 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
883 log_error("--template= needs --directory= or --machine=.");
887 if (arg_ephemeral
&& arg_template
) {
888 log_error("--ephemeral and --template= may not be combined.");
892 if (arg_ephemeral
&& arg_image
) {
893 log_error("--ephemeral and --image= may not be combined.");
897 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
898 log_error("--ephemeral and --link-journal= may not be combined.");
902 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
903 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
906 arg_parameters
= strv_copy(argv
+ optind
);
910 arg_settings_mask
|= SETTING_BOOT
;
913 /* Load all settings from .nspawn files */
914 if (mask_no_settings
)
915 arg_settings_mask
= 0;
917 /* Don't load any settings from .nspawn files */
918 if (mask_all_settings
)
919 arg_settings_mask
= _SETTINGS_MASK_ALL
;
921 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
923 r
= detect_unified_cgroup_hierarchy();
930 static int verify_arguments(void) {
932 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
933 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
937 if (arg_expose_ports
&& !arg_private_network
) {
938 log_error("Cannot use --port= without private networking.");
942 if (arg_boot
&& arg_kill_signal
<= 0)
943 arg_kill_signal
= SIGRTMIN
+3;
948 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
954 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
957 if (uid
!= UID_INVALID
) {
958 uid
+= arg_uid_shift
;
960 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
964 if (gid
!= GID_INVALID
) {
965 gid
+= (gid_t
) arg_uid_shift
;
967 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
971 if (lchown(p
, uid
, gid
) < 0)
/* Creates directory 'path' below 'root' with the given mode and passes the
 * new directory to userns_lchown() with the given uid/gid.
 *
 * An already existing directory is left untouched and reported as success.
 * Returns 0 on success, a negative errno-style value on failure. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        /* Do not touch ownership of directories we did not create */
        return errno == EEXIST ? 0 : -errno;
}
990 static int setup_timezone(const char *dest
) {
991 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
992 const char *where
, *check
, *what
;
998 /* Fix the timezone, if possible */
999 r
= readlink_malloc("/etc/localtime", &p
);
1001 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1005 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1007 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1009 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1013 where
= prefix_roota(dest
, "/etc/localtime");
1014 r
= readlink_malloc(where
, &q
);
1016 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1018 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1020 /* Already pointing to the right place? Then do nothing .. */
1021 if (y
&& streq(y
, z
))
1025 check
= strjoina("/usr/share/zoneinfo/", z
);
1026 check
= prefix_root(dest
, check
);
1027 if (laccess(check
, F_OK
) < 0) {
1028 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1033 if (r
< 0 && errno
!= ENOENT
) {
1034 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1038 what
= strjoina("../usr/share/zoneinfo/", z
);
1039 if (symlink(what
, where
) < 0) {
1040 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1044 r
= userns_lchown(where
, 0, 0);
1046 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1051 static int setup_resolv_conf(const char *dest
) {
1052 const char *where
= NULL
;
1057 if (arg_private_network
)
1060 /* Fix resolv.conf, if possible */
1061 where
= prefix_roota(dest
, "/etc/resolv.conf");
1063 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1065 /* If the file already exists as symlink, let's
1066 * suppress the warning, under the assumption that
1067 * resolved or something similar runs inside and the
1068 * symlink points there.
1070 * If the disk image is read-only, there's also no
1071 * point in complaining.
1073 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1074 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1078 r
= userns_lchown(where
, 0, 0);
1080 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1085 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1089 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1090 SD_ID128_FORMAT_VAL(id
));
1095 static int setup_boot_id(const char *dest
) {
1096 const char *from
, *to
;
1097 sd_id128_t rnd
= {};
1101 if (arg_share_system
)
1104 /* Generate a new randomized boot ID, so that each boot-up of
1105 * the container gets a new one */
1107 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1108 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1110 r
= sd_id128_randomize(&rnd
);
1112 return log_error_errno(r
, "Failed to generate random boot id: %m");
1114 id128_format_as_uuid(rnd
, as_uuid
);
1116 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1118 return log_error_errno(r
, "Failed to write boot id: %m");
1120 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1121 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1122 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1123 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1129 static int copy_devnodes(const char *dest
) {
1131 static const char devnodes
[] =
1142 _cleanup_umask_ mode_t u
;
1148 /* Create /dev/net, so that we can create /dev/net/tun in it */
1149 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1150 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1152 NULSTR_FOREACH(d
, devnodes
) {
1153 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1156 from
= strappend("/dev/", d
);
1157 to
= prefix_root(dest
, from
);
1159 if (stat(from
, &st
) < 0) {
1161 if (errno
!= ENOENT
)
1162 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1164 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1166 log_error("%s is not a char or block device, cannot copy.", from
);
1170 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1172 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1174 /* Some systems abusively restrict mknod but
1175 * allow bind mounts. */
1178 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1179 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1180 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1183 r
= userns_lchown(to
, 0, 0);
1185 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1192 static int setup_pts(const char *dest
) {
1193 _cleanup_free_
char *options
= NULL
;
1197 if (arg_selinux_apifs_context
)
1198 (void) asprintf(&options
,
1199 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1200 arg_uid_shift
+ TTY_GID
,
1201 arg_selinux_apifs_context
);
1204 (void) asprintf(&options
,
1205 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1206 arg_uid_shift
+ TTY_GID
);
1211 /* Mount /dev/pts itself */
1212 p
= prefix_roota(dest
, "/dev/pts");
1213 if (mkdir(p
, 0755) < 0)
1214 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1215 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1216 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1217 if (userns_lchown(p
, 0, 0) < 0)
1218 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1220 /* Create /dev/ptmx symlink */
1221 p
= prefix_roota(dest
, "/dev/ptmx");
1222 if (symlink("pts/ptmx", p
) < 0)
1223 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1224 if (userns_lchown(p
, 0, 0) < 0)
1225 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1227 /* And fix /dev/pts/ptmx ownership */
1228 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1229 if (userns_lchown(p
, 0, 0) < 0)
1230 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
1235 static int setup_dev_console(const char *dest
, const char *console
) {
1236 _cleanup_umask_ mode_t u
;
1245 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1247 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1249 /* We need to bind mount the right tty to /dev/console since
1250 * ptys can only exist on pts file systems. To have something
1251 * to bind mount things on we create a empty regular file. */
1253 to
= prefix_roota(dest
, "/dev/console");
1256 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1258 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1259 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
1264 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1265 const char *from
, *to
;
1266 _cleanup_umask_ mode_t u
;
1269 assert(kmsg_socket
>= 0);
1273 /* We create the kmsg FIFO as /run/kmsg, but immediately
1274 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1275 * on the reading side behave very similar to /proc/kmsg,
1276 * their writing side behaves differently from /dev/kmsg in
1277 * that writing blocks when nothing is reading. In order to
1278 * avoid any problems with containers deadlocking due to this
1279 * we simply make /dev/kmsg unavailable to the container. */
1280 from
= prefix_roota(dest
, "/run/kmsg");
1281 to
= prefix_roota(dest
, "/proc/kmsg");
1283 if (mkfifo(from
, 0600) < 0)
1284 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1285 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1286 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1288 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1290 return log_error_errno(errno
, "Failed to open fifo: %m");
1292 /* Store away the fd in the socket, so that it stays open as
1293 * long as we run the child */
1294 r
= send_one_fd(kmsg_socket
, fd
);
1298 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1300 /* And now make the FIFO unavailable as /run/kmsg... */
1301 (void) unlink(from
);
1306 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1307 union in_addr_union
*exposed
= userdata
;
1313 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
1317 static int setup_hostname(void) {
1319 if (arg_share_system
)
1322 if (sethostname_idempotent(arg_machine
) < 0)
1328 static int setup_journal(const char *directory
) {
1329 sd_id128_t machine_id
, this_id
;
1330 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1331 const char *etc_machine_id
, *p
, *q
;
1335 /* Don't link journals in ephemeral mode */
1339 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1341 r
= read_one_line_file(etc_machine_id
, &b
);
1342 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1345 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1348 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1351 /* Verify validity */
1352 r
= sd_id128_from_string(id
, &machine_id
);
1354 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1356 r
= sd_id128_get_machine(&this_id
);
1358 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1360 if (sd_id128_equal(machine_id
, this_id
)) {
1361 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1362 "Host and machine ids are equal (%s): refusing to link journals", id
);
1363 if (arg_link_journal
== LINK_AUTO
)
1368 if (arg_link_journal
== LINK_NO
)
1371 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1373 return log_error_errno(r
, "Failed to create /var: %m");
1375 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1377 return log_error_errno(r
, "Failed to create /var/log: %m");
1379 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1381 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1383 p
= strjoina("/var/log/journal/", id
);
1384 q
= prefix_roota(directory
, p
);
1386 if (path_is_mount_point(p
, 0) > 0) {
1387 if (arg_link_journal
!= LINK_AUTO
) {
1388 log_error("%s: already a mount point, refusing to use for journal", p
);
1395 if (path_is_mount_point(q
, 0) > 0) {
1396 if (arg_link_journal
!= LINK_AUTO
) {
1397 log_error("%s: already a mount point, refusing to use for journal", q
);
1404 r
= readlink_and_make_absolute(p
, &d
);
1406 if ((arg_link_journal
== LINK_GUEST
||
1407 arg_link_journal
== LINK_AUTO
) &&
1410 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1412 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1417 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1418 } else if (r
== -EINVAL
) {
1420 if (arg_link_journal
== LINK_GUEST
&&
1423 if (errno
== ENOTDIR
) {
1424 log_error("%s already exists and is neither a symlink nor a directory", p
);
1427 log_error_errno(errno
, "Failed to remove %s: %m", p
);
1431 } else if (r
!= -ENOENT
) {
1432 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
1436 if (arg_link_journal
== LINK_GUEST
) {
1438 if (symlink(q
, p
) < 0) {
1439 if (arg_link_journal_try
) {
1440 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1443 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1448 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1450 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1454 if (arg_link_journal
== LINK_HOST
) {
1455 /* don't create parents here -- if the host doesn't have
1456 * permanent journal set up, don't force it here */
1459 if (arg_link_journal_try
) {
1460 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1463 log_error_errno(errno
, "Failed to create %s: %m", p
);
1468 } else if (access(p
, F_OK
) < 0)
1471 if (dir_is_empty(q
) == 0)
1472 log_warning("%s is not empty, proceeding anyway.", q
);
1474 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1476 log_error_errno(errno
, "Failed to create %s: %m", q
);
1480 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1481 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1486 static int drop_capabilities(void) {
1487 return capability_bounding_set_drop(~arg_retain
, false);
1490 static int reset_audit_loginuid(void) {
1491 _cleanup_free_
char *p
= NULL
;
1494 if (arg_share_system
)
1497 r
= read_one_line_file("/proc/self/loginuid", &p
);
1501 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1503 /* Already reset? */
1504 if (streq(p
, "4294967295"))
1507 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1510 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1511 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1512 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1513 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1514 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Installs a seccomp filter for the container: each system call in the table
 * below fails with EPERM unless the capability listed next to it is set in
 * arg_retain, and creating NETLINK_AUDIT sockets fails with EAFNOSUPPORT.
 * Compiles to a no-op returning 0 without HAVE_SECCOMP. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blacklist); n++) {
                bool retained;

                /* Leave the call alone if its capability is retained */
                retained = !!(arg_retain & (1ULL << blacklist[n].capability));
                if (retained)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[n].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit does not work in containers, and much of the userspace
           audit hookup fails when run inside one. We do not care and
           simply turn off creation of audit sockets.

           This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
           EAFNOSUPPORT, which audit userspace takes as indication that
           audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                /* Treated as "no seccomp support": not an error */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
                goto finish;
        }
        if (r < 0) {
                log_error_errno(r, "Failed to install seccomp audit filter: %m");
                goto finish;
        }

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1617 static int setup_propagate(const char *root
) {
1620 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1621 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1622 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1623 (void) mkdir_p(p
, 0600);
1625 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
1626 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
1628 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1629 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
1631 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1632 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
1634 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1635 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1636 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1638 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1639 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1644 static int setup_image(char **device_path
, int *loop_nr
) {
1645 struct loop_info64 info
= {
1646 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1648 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1649 _cleanup_free_
char* loopdev
= NULL
;
1653 assert(device_path
);
1657 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1659 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1661 if (fstat(fd
, &st
) < 0)
1662 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1664 if (S_ISBLK(st
.st_mode
)) {
1667 p
= strdup(arg_image
);
1681 if (!S_ISREG(st
.st_mode
)) {
1682 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
1686 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1688 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1690 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1692 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1694 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1697 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1699 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1701 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1702 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1705 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1707 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1708 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1710 *device_path
= loopdev
;
1721 #define PARTITION_TABLE_BLURB \
1722 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1723 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1724 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1725 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1726 "to be bootable with systemd-nspawn."
1728 static int dissect_image(
1730 char **root_device
, bool *root_device_rw
,
1731 char **home_device
, bool *home_device_rw
,
1732 char **srv_device
, bool *srv_device_rw
,
1736 int home_nr
= -1, srv_nr
= -1;
1737 #ifdef GPT_ROOT_NATIVE
1740 #ifdef GPT_ROOT_SECONDARY
1741 int secondary_root_nr
= -1;
1743 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1744 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1745 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1746 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1747 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1748 struct udev_list_entry
*first
, *item
;
1749 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1750 bool is_gpt
, is_mbr
, multiple_generic
= false;
1751 const char *pttype
= NULL
;
1758 assert(root_device
);
1759 assert(home_device
);
1764 b
= blkid_new_probe();
1769 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1774 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1778 blkid_probe_enable_partitions(b
, 1);
1779 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1782 r
= blkid_do_safeprobe(b
);
1783 if (r
== -2 || r
== 1) {
1784 log_error("Failed to identify any partition table on\n"
1786 PARTITION_TABLE_BLURB
, arg_image
);
1788 } else if (r
!= 0) {
1791 log_error_errno(errno
, "Failed to probe: %m");
1795 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1797 is_gpt
= streq_ptr(pttype
, "gpt");
1798 is_mbr
= streq_ptr(pttype
, "dos");
1800 if (!is_gpt
&& !is_mbr
) {
1801 log_error("No GPT or MBR partition table discovered on\n"
1803 PARTITION_TABLE_BLURB
, arg_image
);
1808 pl
= blkid_probe_get_partitions(b
);
1813 log_error("Failed to list partitions of %s", arg_image
);
1821 if (fstat(fd
, &st
) < 0)
1822 return log_error_errno(errno
, "Failed to stat block device: %m");
1824 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1832 log_error("Kernel partitions never appeared.");
1836 e
= udev_enumerate_new(udev
);
1840 r
= udev_enumerate_add_match_parent(e
, d
);
1844 r
= udev_enumerate_scan_devices(e
);
1846 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1848 /* Count the partitions enumerated by the kernel */
1850 first
= udev_enumerate_get_list_entry(e
);
1851 udev_list_entry_foreach(item
, first
)
1854 /* Count the partitions enumerated by blkid */
1855 m
= blkid_partlist_numof_partitions(pl
);
1859 log_error("blkid and kernel partition list do not match.");
1865 /* The kernel has probed fewer partitions than
1866 * blkid? Maybe the kernel prober is still
1867 * running or it got EBUSY because udev
1868 * already opened the device. Let's reprobe
1869 * the device, which is a synchronous call
1870 * that waits until probing is complete. */
1872 for (j
= 0; j
< 20; j
++) {
1874 r
= ioctl(fd
, BLKRRPART
, 0);
1877 if (r
>= 0 || r
!= -EBUSY
)
1880 /* If something else has the device
1881 * open, such as an udev rule, the
1882 * ioctl will return EBUSY. Since
1883 * there's no way to wait until it
1884 * isn't busy anymore, let's just wait
1885 * a bit, and try again.
1887 * This is really something they
1888 * should fix in the kernel! */
1890 usleep(50 * USEC_PER_MSEC
);
1894 return log_error_errno(r
, "Failed to reread partition table: %m");
1897 e
= udev_enumerate_unref(e
);
1900 first
= udev_enumerate_get_list_entry(e
);
1901 udev_list_entry_foreach(item
, first
) {
1902 _cleanup_udev_device_unref_
struct udev_device
*q
;
1904 unsigned long long flags
;
1910 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1915 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1919 qn
= udev_device_get_devnum(q
);
1923 if (st
.st_rdev
== qn
)
1926 node
= udev_device_get_devnode(q
);
1930 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1934 flags
= blkid_partition_get_flags(pp
);
1936 nr
= blkid_partition_get_partno(pp
);
1944 if (flags
& GPT_FLAG_NO_AUTO
)
1947 stype
= blkid_partition_get_type_string(pp
);
1951 if (sd_id128_from_string(stype
, &type_id
) < 0)
1954 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1956 if (home
&& nr
>= home_nr
)
1960 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1962 r
= free_and_strdup(&home
, node
);
1966 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
1968 if (srv
&& nr
>= srv_nr
)
1972 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1974 r
= free_and_strdup(&srv
, node
);
1978 #ifdef GPT_ROOT_NATIVE
1979 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
1981 if (root
&& nr
>= root_nr
)
1985 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1987 r
= free_and_strdup(&root
, node
);
1992 #ifdef GPT_ROOT_SECONDARY
1993 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
1995 if (secondary_root
&& nr
>= secondary_root_nr
)
1998 secondary_root_nr
= nr
;
1999 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2001 r
= free_and_strdup(&secondary_root
, node
);
2006 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2009 multiple_generic
= true;
2011 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2013 r
= free_and_strdup(&generic
, node
);
2019 } else if (is_mbr
) {
2022 if (flags
!= 0x80) /* Bootable flag */
2025 type
= blkid_partition_get_type(pp
);
2026 if (type
!= 0x83) /* Linux partition */
2030 multiple_generic
= true;
2034 r
= free_and_strdup(&root
, node
);
2042 *root_device
= root
;
2045 *root_device_rw
= root_rw
;
2047 } else if (secondary_root
) {
2048 *root_device
= secondary_root
;
2049 secondary_root
= NULL
;
2051 *root_device_rw
= secondary_root_rw
;
2053 } else if (generic
) {
2055 /* There were no partitions with precise meanings
2056 * around, but we found generic partitions. In this
2057 * case, if there's only one, we can go ahead and boot
2058 * it, otherwise we bail out, because we really cannot
2059 * make any sense of it. */
2061 if (multiple_generic
) {
2062 log_error("Identified multiple bootable Linux partitions on\n"
2064 PARTITION_TABLE_BLURB
, arg_image
);
2068 *root_device
= generic
;
2071 *root_device_rw
= generic_rw
;
2074 log_error("Failed to identify root partition in disk image\n"
2076 PARTITION_TABLE_BLURB
, arg_image
);
2081 *home_device
= home
;
2084 *home_device_rw
= home_rw
;
2091 *srv_device_rw
= srv_rw
;
2096 log_error("--image= is not supported, compiled without blkid support.");
/* Probes the file system type of block device "what" with libblkid and
 * mounts it at "where" (with "directory" appended when non-NULL). The mount
 * is read-only if "rw" is false or arg_read_only is set. LUKS volumes are
 * refused. Without HAVE_BLKID this always fails with -EOPNOTSUPP.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno; clear it first so that
         * "still zero" can be told apart from a real error code. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambiguous result, 1: nothing detected. This matches
                 * how dissect_image() interprets the same call; -1 is a
                 * real probing error and is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                return log_error_errno(errno, "Failed to determine file system type of %s", what);
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2165 static int mount_devices(
2167 const char *root_device
, bool root_device_rw
,
2168 const char *home_device
, bool home_device_rw
,
2169 const char *srv_device
, bool srv_device_rw
) {
2175 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2177 return log_error_errno(r
, "Failed to mount root directory: %m");
2181 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2183 return log_error_errno(r
, "Failed to mount home directory: %m");
2187 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2189 return log_error_errno(r
, "Failed to mount server data directory: %m");
2195 static void loop_remove(int nr
, int *image_fd
) {
2196 _cleanup_close_
int control
= -1;
2202 if (image_fd
&& *image_fd
>= 0) {
2203 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2205 log_debug_errno(errno
, "Failed to close loop image: %m");
2206 *image_fd
= safe_close(*image_fd
);
2209 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2211 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2215 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2217 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2222 * < 0 : wait_for_terminate() failed to get the state of the
2223 * container, the container was terminated by a signal, or
2224 * failed for an unknown reason. No change is made to the
2225 * container argument.
2226 * > 0 : The program executed in the container terminated with an
2227 * error. The exit code of the program executed in the
2228 * container is returned. The container argument has been set
2229 * to CONTAINER_TERMINATED.
2230 * 0 : The container is being rebooted, has been shut down or exited
2231 * successfully. The container argument has been set to either
2232 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2234 * That is, success is indicated by a return value of zero, and an
2235 * error is indicated by a non-zero value.
2237 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2241 r
= wait_for_terminate(pid
, &status
);
2243 return log_warning_errno(r
, "Failed to wait for container: %m");
2245 switch (status
.si_code
) {
2248 if (status
.si_status
== 0) {
2249 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2252 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2254 *container
= CONTAINER_TERMINATED
;
2255 return status
.si_status
;
2258 if (status
.si_status
== SIGINT
) {
2260 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2261 *container
= CONTAINER_TERMINATED
;
2264 } else if (status
.si_status
== SIGHUP
) {
2266 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2267 *container
= CONTAINER_REBOOTED
;
2271 /* CLD_KILLED fallthrough */
2274 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2278 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
        /* intentionally empty */
}
2287 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2290 pid
= PTR_TO_UINT32(userdata
);
2292 if (kill(pid
, arg_kill_signal
) >= 0) {
2293 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2294 sd_event_source_set_userdata(s
, NULL
);
2299 sd_event_exit(sd_event_source_get_event(s
), 0);
2303 static int determine_names(void) {
2306 if (arg_template
&& !arg_directory
&& arg_machine
) {
2308 /* If --template= was specified then we should not
2309 * search for a machine, but instead create a new one
2310 * in /var/lib/machine. */
2312 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2317 if (!arg_image
&& !arg_directory
) {
2319 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2321 r
= image_find(arg_machine
, &i
);
2323 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2325 log_error("No image for machine '%s': %m", arg_machine
);
2329 if (i
->type
== IMAGE_RAW
)
2330 r
= set_sanitized_path(&arg_image
, i
->path
);
2332 r
= set_sanitized_path(&arg_directory
, i
->path
);
2334 return log_error_errno(r
, "Invalid image directory: %m");
2337 arg_read_only
= arg_read_only
|| i
->read_only
;
2339 arg_directory
= get_current_dir_name();
2341 if (!arg_directory
&& !arg_machine
) {
2342 log_error("Failed to determine path, please use -D or -i.");
2348 if (arg_directory
&& path_equal(arg_directory
, "/"))
2349 arg_machine
= gethostname_malloc();
2351 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2356 hostname_cleanup(arg_machine
);
2357 if (!machine_name_is_valid(arg_machine
)) {
2358 log_error("Failed to determine machine name automatically, please use -M.");
2362 if (arg_ephemeral
) {
2365 /* Add a random suffix when this is an
2366 * ephemeral machine, so that we can run many
2367 * instances at once without manually having
2368 * to specify -M each time. */
2370 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
2381 static int determine_uid_shift(const char *directory
) {
2389 if (arg_uid_shift
== UID_INVALID
) {
2392 r
= stat(directory
, &st
);
2394 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2396 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2398 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2399 log_error("UID and GID base of %s don't match.", directory
);
2403 arg_uid_range
= UINT32_C(0x10000);
2406 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2407 log_error("UID base too high for UID range.");
2411 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
2415 static int inner_child(
2417 const char *directory
,
2423 _cleanup_free_
char *home
= NULL
;
2425 const char *envp
[] = {
2426 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2427 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2432 NULL
, /* container_uuid */
2433 NULL
, /* LISTEN_FDS */
2434 NULL
, /* LISTEN_PID */
2438 _cleanup_strv_free_
char **env_use
= NULL
;
2443 assert(kmsg_socket
>= 0);
2448 /* Tell the parent, that it now can write the UID map. */
2449 (void) barrier_place(barrier
); /* #1 */
2451 /* Wait until the parent wrote the UID map */
2452 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2453 log_error("Parent died too early");
2458 r
= mount_all(NULL
, true, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2462 /* Wait until we are cgroup-ified, so that we
2463 * can mount the right cgroup path writable */
2464 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2465 log_error("Parent died too early");
2469 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2473 r
= reset_uid_gid();
2475 return log_error_errno(r
, "Couldn't become new root: %m");
2477 r
= setup_boot_id(NULL
);
2481 r
= setup_kmsg(NULL
, kmsg_socket
);
2484 kmsg_socket
= safe_close(kmsg_socket
);
2489 return log_error_errno(errno
, "setsid() failed: %m");
2491 if (arg_private_network
)
2494 if (arg_expose_ports
) {
2495 r
= expose_port_send_rtnl(rtnl_socket
);
2498 rtnl_socket
= safe_close(rtnl_socket
);
2501 if (drop_capabilities() < 0)
2502 return log_error_errno(errno
, "drop_capabilities() failed: %m");
2506 if (arg_personality
!= PERSONALITY_INVALID
) {
2507 if (personality(arg_personality
) < 0)
2508 return log_error_errno(errno
, "personality() failed: %m");
2509 } else if (secondary
) {
2510 if (personality(PER_LINUX32
) < 0)
2511 return log_error_errno(errno
, "personality() failed: %m");
2515 if (arg_selinux_context
)
2516 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2517 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2520 r
= change_uid_gid(arg_user
, &home
);
2524 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2528 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2529 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2530 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2533 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2536 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2540 if (fdset_size(fds
) > 0) {
2541 r
= fdset_cloexec(fds
, false);
2543 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2545 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2546 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2550 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2554 /* Let the parent know that we are ready and
2555 * wait until the parent is ready with the
2557 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2558 log_error("Parent died too early");
2562 /* Now, explicitly close the log, so that we
2563 * then can close all remaining fds. Closing
2564 * the log explicitly first has the benefit
2565 * that the logging subsystem knows about it,
2566 * and is thus ready to be reopened should we
2567 * need it again. Note that the other fds
2568 * closed here are at least the locking and
2571 (void) fdset_close_others(fds
);
2577 /* Automatically search for the init system */
2579 m
= 1 + strv_length(arg_parameters
);
2580 a
= newa(char*, m
+ 1);
2581 if (strv_isempty(arg_parameters
))
2584 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
2586 a
[0] = (char*) "/usr/lib/systemd/systemd";
2587 execve(a
[0], a
, env_use
);
2589 a
[0] = (char*) "/lib/systemd/systemd";
2590 execve(a
[0], a
, env_use
);
2592 a
[0] = (char*) "/sbin/init";
2593 execve(a
[0], a
, env_use
);
2594 } else if (!strv_isempty(arg_parameters
))
2595 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2597 chdir(home
?: "/root");
2598 execle("/bin/bash", "-bash", NULL
, env_use
);
2599 execle("/bin/sh", "-sh", NULL
, env_use
);
2603 return log_error_errno(errno
, "execv() failed: %m");
2606 static int outer_child(
2608 const char *directory
,
2609 const char *console
,
2610 const char *root_device
, bool root_device_rw
,
2611 const char *home_device
, bool home_device_rw
,
2612 const char *srv_device
, bool srv_device_rw
,
2618 int uid_shift_socket
,
2628 assert(pid_socket
>= 0);
2629 assert(kmsg_socket
>= 0);
2633 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2634 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
2637 close_nointr(STDIN_FILENO
);
2638 close_nointr(STDOUT_FILENO
);
2639 close_nointr(STDERR_FILENO
);
2641 r
= open_terminal(console
, O_RDWR
);
2642 if (r
!= STDIN_FILENO
) {
2648 return log_error_errno(r
, "Failed to open console: %m");
2651 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2652 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2653 return log_error_errno(errno
, "Failed to duplicate console: %m");
2656 r
= reset_audit_loginuid();
2660 /* Mark everything as slave, so that we still
2661 * receive mounts from the real root, but don't
2662 * propagate mounts to the real root. */
2663 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2664 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2666 r
= mount_devices(directory
,
2667 root_device
, root_device_rw
,
2668 home_device
, home_device_rw
,
2669 srv_device
, srv_device_rw
);
2673 r
= determine_uid_shift(directory
);
2678 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2680 return log_error_errno(errno
, "Failed to send UID shift: %m");
2681 if (l
!= sizeof(arg_uid_shift
)) {
2682 log_error("Short write while sending UID shift.");
2687 /* Turn directory into bind mount */
2688 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2689 return log_error_errno(errno
, "Failed to make bind mount: %m");
2691 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2695 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2699 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2703 if (arg_read_only
) {
2704 r
= bind_remount_recursive(directory
, true);
2706 return log_error_errno(r
, "Failed to make tree read-only: %m");
2709 r
= mount_all(directory
, false, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2713 r
= copy_devnodes(directory
);
2717 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2719 r
= setup_pts(directory
);
2723 r
= setup_propagate(directory
);
2727 r
= setup_dev_console(directory
, console
);
2731 r
= setup_seccomp();
2735 r
= setup_timezone(directory
);
2739 r
= setup_resolv_conf(directory
);
2743 r
= setup_journal(directory
);
2747 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2751 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2755 r
= mount_move_root(directory
);
2757 return log_error_errno(r
, "Failed to move root directory: %m");
2759 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2760 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2761 (arg_private_network
? CLONE_NEWNET
: 0) |
2762 (arg_userns
? CLONE_NEWUSER
: 0),
2765 return log_error_errno(errno
, "Failed to fork inner child: %m");
2767 pid_socket
= safe_close(pid_socket
);
2768 uid_shift_socket
= safe_close(uid_shift_socket
);
2770 /* The inner child has all namespaces that are
2771 * requested, so that we all are owned by the user if
2772 * user namespaces are turned on. */
2774 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2776 _exit(EXIT_FAILURE
);
2778 _exit(EXIT_SUCCESS
);
2781 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2783 return log_error_errno(errno
, "Failed to send PID: %m");
2784 if (l
!= sizeof(pid
)) {
2785 log_error("Short write while sending PID.");
2789 pid_socket
= safe_close(pid_socket
);
2794 static int setup_uid_map(pid_t pid
) {
2795 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2800 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2801 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2802 r
= write_string_file(uid_map
, line
, 0);
2804 return log_error_errno(r
, "Failed to write UID map: %m");
2806 /* We always assign the same UID and GID ranges */
2807 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2808 r
= write_string_file(uid_map
, line
, 0);
2810 return log_error_errno(r
, "Failed to write GID map: %m");
/* load_settings: locate the "<arg_machine>.nspawn" settings file, parse it
 * with settings_load() and copy its values into the arg_* globals.
 *
 * Lookup order as visible here:
 *   1. "/etc/systemd/nspawn" and "/run/systemd/nspawn"; when the file is
 *      taken from there arg_settings_trusted defaults to true.
 *   2. A file of the same name next to arg_image, or otherwise next to
 *      arg_directory (via file_in_same_dir()); arg_settings_trusted then
 *      defaults to false.
 * The default is only applied while arg_settings_trusted is still
 * negative, i.e. an explicit earlier choice is kept.
 *
 * Each group of values is copied only if its SETTING_* bit is clear in
 * arg_settings_mask. For capabilities, machine ID, custom mounts, network
 * and port settings a warning is logged when the file is not trusted.
 *
 * NOTE(review): many lines (declarations of fn/i/r, error checks, else
 * branches, returns, closing braces) are not visible in this excerpt, so
 * the exact control flow between the visible statements must be
 * confirmed against the full file. */
2815 static int load_settings(void) {
2816 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2817 _cleanup_fclose_
FILE *f
= NULL
;
2818 _cleanup_free_
char *p
= NULL
;
2822 /* If all settings are masked, there's no point in looking for
2823 * the settings file */
2824 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* fn is the file name to search for: the machine name plus ".nspawn". */
2827 fn
= strjoina(arg_machine
, ".nspawn");
2829 /* We first look in the admin's directories in /etc and /run */
2830 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2831 _cleanup_free_
char *j
= NULL
;
/* j is the candidate path "<dir>/<fn>". */
2833 j
= strjoin(i
, "/", fn
, NULL
);
2842 /* By default we trust configuration from /etc and /run */
2843 if (arg_settings_trusted
< 0)
2844 arg_settings_trusted
= true;
/* A missing file (ENOENT) is not an error; any other errno is reported. */
2849 if (errno
!= ENOENT
)
2850 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2854 /* After that, let's look for a file next to the
2855 * actual image we shall boot. */
2858 p
= file_in_same_dir(arg_image
, fn
);
2861 } else if (arg_directory
) {
2862 p
= file_in_same_dir(arg_directory
, fn
);
2869 if (!f
&& errno
!= ENOENT
)
2870 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2872 /* By default we do not trust configuration from /var/lib/machines */
2873 if (arg_settings_trusted
< 0)
2874 arg_settings_trusted
= false;
2881 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
/* Parse the opened file; p is passed along as its path. */
2883 r
= settings_load(f
, p
, &settings
);
2887 /* Copy over bits from the settings, unless they have been
2888 * explicitly masked by command line switches. */
/* Boot mode and parameters: the parameter vector is handed over to
 * arg_parameters and the pointer in settings is cleared (presumably so
 * the settings_freep cleanup does not free it as well -- confirm). */
2890 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
2891 settings
->boot
>= 0) {
2892 arg_boot
= settings
->boot
;
2894 strv_free(arg_parameters
);
2895 arg_parameters
= settings
->parameters
;
2896 settings
->parameters
= NULL
;
/* Environment: same hand-over pattern, replacing arg_setenv. */
2899 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2900 settings
->environment
) {
2901 strv_free(arg_setenv
);
2902 arg_setenv
= settings
->environment
;
2903 settings
->environment
= NULL
;
/* User: the second half of this condition and any release of the previous
 * arg_user value are not visible in this excerpt. */
2906 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2909 arg_user
= settings
->user
;
2910 settings
->user
= NULL
;
/* Capabilities: arg_retain gains the bits in settings->capability and
 * loses the bits in settings->drop_capability.
 * NOTE(review): the branch structure between the warning and the |= is
 * not visible here; confirm that untrusted files cannot add
 * capabilities. */
2913 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
2915 if (!arg_settings_trusted
&& settings
->capability
!= 0)
2916 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
2918 arg_retain
|= settings
->capability
;
2920 arg_retain
&= ~settings
->drop_capability
;
/* Plain value settings: copied only when the file set them to something
 * other than the "unset" marker tested in each condition. */
2923 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2924 settings
->kill_signal
> 0)
2925 arg_kill_signal
= settings
->kill_signal
;
2927 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2928 settings
->personality
!= PERSONALITY_INVALID
)
2929 arg_personality
= settings
->personality
;
2931 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2932 !sd_id128_is_null(settings
->machine_id
)) {
2934 if (!arg_settings_trusted
)
2935 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
2937 arg_uuid
= settings
->machine_id
;
2940 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
2941 settings
->read_only
>= 0)
2942 arg_read_only
= settings
->read_only
;
2944 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
2945 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
2946 arg_volatile_mode
= settings
->volatile_mode
;
/* Custom mounts: the existing list is freed, then the array and its
 * count are taken from settings and cleared there. */
2948 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
2949 settings
->n_custom_mounts
> 0) {
2951 if (!arg_settings_trusted
)
2952 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
2954 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
2955 arg_custom_mounts
= settings
->custom_mounts
;
2956 arg_n_custom_mounts
= settings
->n_custom_mounts
;
2958 settings
->custom_mounts
= NULL
;
2959 settings
->n_custom_mounts
= 0;
/* Network: entered when any network-related value is set in the file. */
2963 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
2964 (settings
->private_network
>= 0 ||
2965 settings
->network_veth
>= 0 ||
2966 settings
->network_bridge
||
2967 settings
->network_interfaces
||
2968 settings
->network_macvlan
||
2969 settings
->network_ipvlan
)) {
2971 if (!arg_settings_trusted
)
2972 log_warning("Ignoring network settings, file %s is not trusted.", p
);
2974 strv_free(arg_network_interfaces
);
2975 arg_network_interfaces
= settings
->network_interfaces
;
2976 settings
->network_interfaces
= NULL
;
2978 strv_free(arg_network_macvlan
);
2979 arg_network_macvlan
= settings
->network_macvlan
;
2980 settings
->network_macvlan
= NULL
;
2982 strv_free(arg_network_ipvlan
);
2983 arg_network_ipvlan
= settings
->network_ipvlan
;
2984 settings
->network_ipvlan
= NULL
;
2986 free(arg_network_bridge
);
2987 arg_network_bridge
= settings
->network_bridge
;
2988 settings
->network_bridge
= NULL
;
/* NOTE(review): settings->network_bridge was set to NULL just above, so
 * the second operand below is always false at this point and a bridge
 * from the file no longer turns on arg_network_veth. Testing
 * arg_network_bridge, or evaluating this before the hand-over, looks
 * like what was intended -- confirm and fix. */
2990 arg_network_veth
= settings
->network_veth
> 0 || settings
->network_bridge
;
2992 arg_private_network
= true; /* all these settings imply private networking */
/* Exposed ports: the existing list is freed and replaced by the one from
 * settings, which is then cleared there. */
2996 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
2997 settings
->expose_ports
) {
2999 if (!arg_settings_trusted
)
3000 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3002 expose_port_free_all(arg_expose_ports
);
3003 arg_expose_ports
= settings
->expose_ports
;
3004 settings
->expose_ports
= NULL
;
3011 int main(int argc
, char *argv
[]) {
3013 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3014 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3015 _cleanup_close_
int master
= -1, image_fd
= -1;
3016 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3017 int r
, n_fd_passed
, loop_nr
= -1;
3018 char veth_name
[IFNAMSIZ
];
3019 bool secondary
= false, remove_subvol
= false;
3022 int ret
= EXIT_SUCCESS
;
3023 union in_addr_union exposed
= {};
3024 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3027 log_parse_environment();
3030 r
= parse_argv(argc
, argv
);
3034 if (geteuid() != 0) {
3035 log_error("Need to be root.");
3039 r
= determine_names();
3043 r
= load_settings();
3047 r
= verify_arguments();
3051 n_fd_passed
= sd_listen_fds(false);
3052 if (n_fd_passed
> 0) {
3053 r
= fdset_new_listen_fds(&fds
, false);
3055 log_error_errno(r
, "Failed to collect file descriptors: %m");
3060 if (arg_directory
) {
3063 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3064 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3069 if (arg_ephemeral
) {
3070 _cleanup_free_
char *np
= NULL
;
3072 /* If the specified path is a mount point we
3073 * generate the new snapshot immediately
3074 * inside it under a random name. However, if
3075 * the specified path is not a mount point we
3076 * create the new snapshot in the parent
3077 * directory, just next to it. */
3078 r
= path_is_mount_point(arg_directory
, 0);
3080 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3084 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3086 r
= tempfn_random(arg_directory
, "machine.", &np
);
3088 log_error_errno(r
, "Failed to generate name for snapshot: %m");
3092 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3094 log_error_errno(r
, "Failed to lock %s: %m", np
);
3098 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
3100 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3104 free(arg_directory
);
3108 remove_subvol
= true;
3111 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3113 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3117 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
3122 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
3125 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3127 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3131 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
3137 if (path_is_os_tree(arg_directory
) <= 0) {
3138 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3145 p
= strjoina(arg_directory
,
3146 argc
> optind
&& path_is_absolute(argv
[optind
]) ? argv
[optind
] : "/usr/bin/");
3147 if (access(p
, F_OK
) < 0) {
3148 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory
);
3155 char template[] = "/tmp/nspawn-root-XXXXXX";
3158 assert(!arg_template
);
3160 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3162 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3166 r
= log_error_errno(r
, "Failed to create image lock: %m");
3170 if (!mkdtemp(template)) {
3171 log_error_errno(errno
, "Failed to create temporary directory: %m");
3176 arg_directory
= strdup(template);
3177 if (!arg_directory
) {
3182 image_fd
= setup_image(&device_path
, &loop_nr
);
3188 r
= dissect_image(image_fd
,
3189 &root_device
, &root_device_rw
,
3190 &home_device
, &home_device_rw
,
3191 &srv_device
, &srv_device_rw
,
3197 r
= custom_mounts_prepare();
3202 isatty(STDIN_FILENO
) > 0 &&
3203 isatty(STDOUT_FILENO
) > 0;
3205 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3207 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3211 r
= ptsname_malloc(master
, &console
);
3213 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3217 if (unlockpt(master
) < 0) {
3218 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3223 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3224 arg_machine
, arg_image
?: arg_directory
);
3226 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3228 assert_se(sigemptyset(&mask_chld
) == 0);
3229 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3231 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3232 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
3237 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
3238 uid_shift_socket_pair
[2] = { -1, -1 };
3239 ContainerStatus container_status
;
3240 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3241 static const struct sigaction sa
= {
3242 .sa_handler
= nop_handler
,
3243 .sa_flags
= SA_NOCLDSTOP
,
3247 _cleanup_event_unref_ sd_event
*event
= NULL
;
3248 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3249 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3252 r
= barrier_create(&barrier
);
3254 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3258 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3259 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3263 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3264 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3268 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3269 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3274 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3275 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3279 /* Child can be killed before execv(), so handle SIGCHLD
3280 * in order to interrupt parent's blocking calls and
3281 * give it a chance to call wait() and terminate. */
3282 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3284 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3288 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3290 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
3294 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3296 if (errno
== EINVAL
)
3297 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3299 r
= log_error_errno(errno
, "clone() failed: %m");
3305 /* The outer child only has a file system namespace. */
3306 barrier_set_role(&barrier
, BARRIER_CHILD
);
3308 master
= safe_close(master
);
3310 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3311 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3312 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3313 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3315 (void) reset_all_signal_handlers();
3316 (void) reset_signal_mask();
3318 r
= outer_child(&barrier
,
3321 root_device
, root_device_rw
,
3322 home_device
, home_device_rw
,
3323 srv_device
, srv_device_rw
,
3327 kmsg_socket_pair
[1],
3328 rtnl_socket_pair
[1],
3329 uid_shift_socket_pair
[1],
3332 _exit(EXIT_FAILURE
);
3334 _exit(EXIT_SUCCESS
);
3337 barrier_set_role(&barrier
, BARRIER_PARENT
);
3342 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3343 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3344 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3345 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3347 /* Wait for the outer child. */
3348 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3357 /* And now retrieve the PID of the inner child. */
3358 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3360 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3363 if (l
!= sizeof(pid
)) {
3364 log_error("Short read while reading inner child PID.");
3369 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3372 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3373 log_error("Child died too early.");
3378 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3380 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3383 if (l
!= sizeof(arg_uid_shift
)) {
3384 log_error("Short read while reading UID shift.");
3389 r
= setup_uid_map(pid
);
3393 (void) barrier_place(&barrier
); /* #2 */
3396 if (arg_private_network
) {
3398 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3402 if (arg_network_veth
) {
3403 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3409 if (arg_network_bridge
) {
3410 r
= setup_bridge(veth_name
, arg_network_bridge
);
3418 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3422 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
3428 r
= register_machine(
3435 arg_custom_mounts
, arg_n_custom_mounts
,
3443 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3447 if (arg_keep_unit
) {
3448 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3453 r
= chown_cgroup(pid
, arg_uid_shift
);
3457 /* Notify the child that the parent is ready with all
3458 * its setup (including cgroup-ification), and that
3459 * the child can now hand over control to the code to
3460 * run inside the container. */
3461 (void) barrier_place(&barrier
); /* #3 */
3463 /* Block SIGCHLD here, before notifying child.
3464 * process_pty() will handle it with the other signals. */
3465 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3467 /* Reset signal to default */
3468 r
= default_signals(SIGCHLD
, -1);
3470 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3474 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3475 if (!barrier_place_and_sync(&barrier
)) { /* #5 */
3476 log_error("Client died too early.");
3483 "STATUS=Container running.\n"
3484 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
3486 r
= sd_event_new(&event
);
3488 log_error_errno(r
, "Failed to get default event source: %m");
3492 if (arg_kill_signal
> 0) {
3493 /* Try to kill the init system on SIGINT or SIGTERM */
3494 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3495 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3497 /* Immediately exit */
3498 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3499 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3502 /* simply exit on sigchld */
3503 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3505 if (arg_expose_ports
) {
3506 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3510 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3513 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3515 r
= pty_forward_new(event
, master
, true, !interactive
, &forward
);
3517 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3521 r
= sd_event_loop(event
);
3523 log_error_errno(r
, "Failed to run event loop: %m");
3527 pty_forward_get_last_char(forward
, &last_char
);
3529 forward
= pty_forward_free(forward
);
3531 if (!arg_quiet
&& last_char
!= '\n')
3534 /* Kill if it is not dead yet anyway */
3535 if (arg_register
&& !arg_keep_unit
)
3536 terminate_machine(pid
);
3538 /* Normally redundant, but better safe than sorry */
3541 r
= wait_for_container(pid
, &container_status
);
3545 /* We failed to wait for the container, or the
3546 * container exited abnormally */
3548 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3549 /* The container exited with a non-zero
3550 * status, or with zero status and no reboot
3556 /* CONTAINER_REBOOTED, loop again */
3558 if (arg_keep_unit
) {
3559 /* Special handling if we are running as a
3560 * service: instead of simply restarting the
3561 * machine we want to restart the entire
3562 * service, so let's inform systemd about this
3563 * with the special exit code 133. The service
3564 * file uses RestartForceExitStatus=133 so
3565 * that this results in a full nspawn
3566 * restart. This is necessary since we might
3567 * have cgroup parameters set we want to have
3574 expose_port_flush(arg_expose_ports
, &exposed
);
3580 "STATUS=Terminating...");
3585 /* Try to flush whatever is still queued in the pty */
3587 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
3589 loop_remove(loop_nr
, &image_fd
);
3591 if (remove_subvol
&& arg_directory
) {
3594 k
= btrfs_subvol_remove(arg_directory
, true);
3596 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
3602 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3603 (void) rm_rf(p
, REMOVE_ROOT
);
3606 expose_port_flush(arg_expose_ports
, &exposed
);
3608 free(arg_directory
);
3613 strv_free(arg_setenv
);
3614 free(arg_network_bridge
);
3615 strv_free(arg_network_interfaces
);
3616 strv_free(arg_network_macvlan
);
3617 strv_free(arg_network_ipvlan
);
3618 strv_free(arg_parameters
);
3619 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3620 expose_port_free_all(arg_expose_ports
);
3622 return r
< 0 ? EXIT_FAILURE
: ret
;