1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <blkid/blkid.h>
27 #include <linux/loop.h>
33 #include <selinux/selinux.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
46 #include "sd-daemon.h"
49 #include "alloc-util.h"
51 #include "base-filesystem.h"
52 #include "blkid-util.h"
53 #include "btrfs-util.h"
55 #include "capability-util.h"
56 #include "cgroup-util.h"
58 #include "dev-setup.h"
63 #include "formats-util.h"
66 #include "hostname-util.h"
68 #include "loopback-setup.h"
69 #include "machine-image.h"
73 #include "mount-util.h"
74 #include "netlink-util.h"
75 #include "nspawn-cgroup.h"
76 #include "nspawn-expose-ports.h"
77 #include "nspawn-mount.h"
78 #include "nspawn-network.h"
79 #include "nspawn-register.h"
80 #include "nspawn-settings.h"
81 #include "nspawn-setuid.h"
82 #include "parse-util.h"
83 #include "path-util.h"
84 #include "process-util.h"
86 #include "random-util.h"
89 #include "seccomp-util.h"
91 #include "signal-util.h"
92 #include "socket-util.h"
93 #include "stat-util.h"
94 #include "stdio-util.h"
95 #include "string-util.h"
97 #include "terminal-util.h"
98 #include "udev-util.h"
99 #include "umask-util.h"
100 #include "user-util.h"
/* Status values for the container. Only the first enumerator,
 * CONTAINER_TERMINATED, is visible in this excerpt. */
103 typedef enum ContainerStatus
{
104 CONTAINER_TERMINATED
,
/* Journal link mode; arg_link_journal has this type. The enumerators
 * referenced elsewhere in this file are LINK_NO, LINK_AUTO, LINK_GUEST and
 * LINK_HOST. */
108 typedef enum LinkJournal
{
/* File-scope option state, first part. The visible parts of parse_argv()
 * assign most of these from the command line. */

/* Paths filled in through parse_path_argument_and_warn() in parse_argv(). */
115 static char *arg_directory
= NULL
;
116 static char *arg_template
= NULL
;
/* Duplicated from optarg with free_and_strdup() in parse_argv(). */
117 static char *arg_user
= NULL
;
/* Parsed with sd_id128_from_string() in parse_argv(). */
118 static sd_id128_t arg_uuid
= {};
/* Validated with machine_name_is_valid() before being stored. */
119 static char *arg_machine
= NULL
;
/* These two point directly at optarg (no copy is made in parse_argv()). */
120 static const char *arg_selinux_context
= NULL
;
121 static const char *arg_selinux_apifs_context
= NULL
;
122 static const char *arg_slice
= NULL
;
/* Also switched on by the --network-* options handled in parse_argv(). */
123 static bool arg_private_network
= false;
124 static bool arg_read_only
= false;
125 static bool arg_boot
= false;
126 static bool arg_ephemeral
= false;
/* Mode plus a separate "try" flag; --link-journal=try-guest/try-host set
 * the flag, the other modes clear it. */
127 static LinkJournal arg_link_journal
= LINK_AUTO
;
128 static bool arg_link_journal_try
= false;
/* Bit mask of capabilities, one bit per CAP_* number (1ULL << CAP_x).
 * parse_argv() later ORs in the --capability= set (plus CAP_NET_ADMIN when
 * arg_private_network is set) and clears the --drop-capability= set.
 * NOTE(review): the initializer is not terminated in this excerpt and the
 * embedded numbering skips entries, so part of the list appears to be
 * missing - confirm against the complete file. */
129 static uint64_t arg_retain
=
130 (1ULL << CAP_CHOWN
) |
131 (1ULL << CAP_DAC_OVERRIDE
) |
132 (1ULL << CAP_DAC_READ_SEARCH
) |
133 (1ULL << CAP_FOWNER
) |
134 (1ULL << CAP_FSETID
) |
135 (1ULL << CAP_IPC_OWNER
) |
137 (1ULL << CAP_LEASE
) |
138 (1ULL << CAP_LINUX_IMMUTABLE
) |
139 (1ULL << CAP_NET_BIND_SERVICE
) |
140 (1ULL << CAP_NET_BROADCAST
) |
141 (1ULL << CAP_NET_RAW
) |
142 (1ULL << CAP_SETGID
) |
143 (1ULL << CAP_SETFCAP
) |
144 (1ULL << CAP_SETPCAP
) |
145 (1ULL << CAP_SETUID
) |
146 (1ULL << CAP_SYS_ADMIN
) |
147 (1ULL << CAP_SYS_CHROOT
) |
148 (1ULL << CAP_SYS_NICE
) |
149 (1ULL << CAP_SYS_PTRACE
) |
150 (1ULL << CAP_SYS_TTY_CONFIG
) |
151 (1ULL << CAP_SYS_RESOURCE
) |
152 (1ULL << CAP_SYS_BOOT
) |
153 (1ULL << CAP_AUDIT_WRITE
) |
154 (1ULL << CAP_AUDIT_CONTROL
) |
/* File-scope option state, second part. */

/* Array of custom mounts and its element count; extended by the --bind=,
 * --tmpfs= and --overlay= handling in parse_argv() and sorted in
 * custom_mounts_prepare(). */
156 static CustomMount
*arg_custom_mounts
= NULL
;
157 static unsigned arg_n_custom_mounts
= 0;
158 static char **arg_setenv
= NULL
;
159 static bool arg_quiet
= false;
160 static bool arg_share_system
= false;
/* Defaults to true; parse_argv() resets it to false when arg_share_system
 * is set. */
161 static bool arg_register
= true;
162 static bool arg_keep_unit
= false;
/* String lists extended with strv_extend() / veth_extra_parse(). */
163 static char **arg_network_interfaces
= NULL
;
164 static char **arg_network_macvlan
= NULL
;
165 static char **arg_network_ipvlan
= NULL
;
166 static bool arg_network_veth
= false;
167 static char **arg_network_veth_extra
= NULL
;
168 static char *arg_network_bridge
= NULL
;
169 static unsigned long arg_personality
= PERSONALITY_INVALID
;
170 static char *arg_image
= NULL
;
171 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
172 static ExposePort
*arg_expose_ports
= NULL
;
173 static char **arg_property
= NULL
;
/* UID shift and range used by userns_lchown(). A shift of UID_INVALID is
 * what custom_mounts_prepare() reports as "automatic UID shift". */
174 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
175 static bool arg_userns
= false;
/* verify_arguments() replaces a value <= 0 with SIGRTMIN+3 when arg_boot
 * is set. */
176 static int arg_kill_signal
= 0;
177 static bool arg_unified_cgroup_hierarchy
= false;
178 static SettingsMask arg_settings_mask
= 0;
/* Tri-state: starts at -1; the --settings= handling stores true, false or
 * -1 depending on the mode given. */
179 static int arg_settings_trusted
= -1;
180 static char **arg_parameters
= NULL
;
/* parse_argv() reads $SYSTEMD_NSPAWN_CONTAINER_SERVICE and may store its
 * value here. */
181 static const char *arg_container_service_name
= "systemd-nspawn";
/* Print the usage text with printf(); the leading %s is filled with
 * program_invocation_short_name. Takes no arguments and returns nothing.
 * NOTE(review): several continuation lines of the option descriptions are
 * not present in this excerpt. */
183 static void help(void) {
184 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
185 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
186 " -h --help Show this help\n"
187 " --version Print version string\n"
188 " -q --quiet Do not show status information\n"
189 " -D --directory=PATH Root directory for the container\n"
190 " --template=PATH Initialize root directory from template directory,\n"
192 " -x --ephemeral Run container with snapshot of root directory, and\n"
193 " remove it after exit\n"
194 " -i --image=PATH File system device or disk image for the container\n"
195 " -b --boot Boot up full system (i.e. invoke init)\n"
196 " -u --user=USER Run the command under specified user or uid\n"
197 " -M --machine=NAME Set the machine name for the container\n"
198 " --uuid=UUID Set a specific machine UUID for the container\n"
199 " -S --slice=SLICE Place the container in the specified slice\n"
200 " --property=NAME=VALUE Set scope unit property\n"
201 " --private-users[=UIDBASE[:NUIDS]]\n"
202 " Run within user namespace\n"
203 " --private-network Disable network in container\n"
204 " --network-interface=INTERFACE\n"
205 " Assign an existing network interface to the\n"
207 " --network-macvlan=INTERFACE\n"
208 " Create a macvlan network interface based on an\n"
209 " existing network interface to the container\n"
210 " --network-ipvlan=INTERFACE\n"
211 " Create a ipvlan network interface based on an\n"
212 " existing network interface to the container\n"
213 " -n --network-veth Add a virtual Ethernet connection between host\n"
215 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
216 " Add an additional virtual Ethernet link between\n"
217 " host and container\n"
218 " --network-bridge=INTERFACE\n"
219 " Add a virtual Ethernet connection between host\n"
220 " and container and add it to an existing bridge on\n"
222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
223 " Expose a container IP port on the host\n"
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
230 " --capability=CAP In addition to the default, retain specified\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
237 " --read-only Mount the root directory read-only\n"
238 " --bind=PATH[:PATH[:OPTIONS]]\n"
239 " Bind mount a file or directory from the host into\n"
/* NOTE(review): the --bind-ro= synopsis below has two '[' but only one ']';
 * the --bind= line above closes both. Looks like a missing ']' in the
 * user-visible text. */
241 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
242 " Similar, but creates a read-only bind mount\n"
243 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
244 " --overlay=PATH[:PATH...]:PATH\n"
/* NOTE(review): the next literal ends in a blank before "\n", i.e. the help
 * output carries trailing whitespace on that line. */
245 " Create an overlay mount from the host to \n"
247 " --overlay-ro=PATH[:PATH...]:PATH\n"
248 " Similar, but creates a read-only overlay mount\n"
249 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
250 " --share-system Share system namespaces with host\n"
251 " --register=BOOLEAN Register container as machine\n"
252 " --keep-unit Do not register a scope for the machine, reuse\n"
253 " the service unit nspawn is running in\n"
254 " --volatile[=MODE] Run the system in volatile mode\n"
255 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
256 , program_invocation_short_name
);
/* Prepare the custom mount table before it is applied.
 *
 * Sorts arg_custom_mounts with custom_mount_compare, then walks all
 * arg_n_custom_mounts entries. An entry whose destination is "/" is
 * rejected with an error message when arg_userns is set and arg_uid_shift
 * is UID_INVALID. For overlay entries a work directory name is derived from
 * m->source with tempfn_random() and stored in m->work_dir.
 *
 * Returns an int; a tempfn_random() failure is reported through
 * log_error_errno(). NOTE(review): the remaining return statements and the
 * exact condition guarding tempfn_random() are not visible in this
 * excerpt. */
260 static int custom_mounts_prepare(void) {
264 /* Ensure the mounts are applied prefix first. */
265 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
267 /* Allocate working directories for the overlay file systems that need it */
268 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
269 CustomMount
*m
= &arg_custom_mounts
[i
];
271 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
272 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
276 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
285 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
287 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Decide whether the unified cgroup hierarchy is used and record the answer
 * in arg_unified_cgroup_hierarchy.
 *
 * $UNIFIED_CGROUP_HIERARCHY is read with getenv() and parsed with
 * parse_boolean(); otherwise the setting is taken from the host. Both
 * failure cases are reported through log_error_errno().
 * NOTE(review): the host-side query call and the branch conditions are not
 * visible in this excerpt. */
293 static int detect_unified_cgroup_hierarchy(void) {
297 /* Allow the user to control whether the unified hierarchy is used */
298 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
300 r
= parse_boolean(e
);
302 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
304 arg_unified_cgroup_hierarchy
= r
;
308 /* Otherwise inherit the default from the host system */
311 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
313 arg_unified_cgroup_hierarchy
= r
;
/* Parse the command line into the file-scope arg_* variables.
 *
 * argc/argv are handed to getopt_long() together with the short option
 * string "+hD:u:bL:M:jS:Z:qi:xp:n" and the long option table below. Most
 * handlers also set the matching SETTING_* bit in arg_settings_mask.
 * --capability=/--drop-capability= are collected in the local masks
 * plus/minus and folded into arg_retain after the loop. After the loop the
 * function also rejects conflicting option combinations with log_error(),
 * copies the remaining arguments (argv + optind) into arg_parameters,
 * applies the --settings= masking, calls detect_unified_cgroup_hierarchy()
 * and reads $SYSTEMD_NSPAWN_CONTAINER_SERVICE.
 *
 * Returns an int; visible failure paths go through log_error_errno().
 * NOTE(review): case labels, break statements and most return statements
 * are missing from this excerpt, so the control flow between the visible
 * statements is only partly shown. */
317 static int parse_argv(int argc
, char *argv
[]) {
336 ARG_NETWORK_INTERFACE
,
340 ARG_NETWORK_VETH_EXTRA
,
350 static const struct option options
[] = {
351 { "help", no_argument
, NULL
, 'h' },
352 { "version", no_argument
, NULL
, ARG_VERSION
},
353 { "directory", required_argument
, NULL
, 'D' },
354 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
355 { "ephemeral", no_argument
, NULL
, 'x' },
356 { "user", required_argument
, NULL
, 'u' },
357 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
358 { "boot", no_argument
, NULL
, 'b' },
359 { "uuid", required_argument
, NULL
, ARG_UUID
},
360 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
361 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
362 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
363 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
364 { "bind", required_argument
, NULL
, ARG_BIND
},
365 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
366 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
367 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
368 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
369 { "machine", required_argument
, NULL
, 'M' },
370 { "slice", required_argument
, NULL
, 'S' },
371 { "setenv", required_argument
, NULL
, ARG_SETENV
},
372 { "selinux-context", required_argument
, NULL
, 'Z' },
373 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
374 { "quiet", no_argument
, NULL
, 'q' },
375 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
376 { "register", required_argument
, NULL
, ARG_REGISTER
},
377 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
378 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
379 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
380 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
381 { "network-veth", no_argument
, NULL
, 'n' },
382 { "network-veth-extra", required_argument
, NULL
, ARG_NETWORK_VETH_EXTRA
},
383 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
384 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
385 { "image", required_argument
, NULL
, 'i' },
386 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
387 { "port", required_argument
, NULL
, 'p' },
388 { "property", required_argument
, NULL
, ARG_PROPERTY
},
389 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
390 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
391 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
/* Capability bits requested via --capability= (plus) and
 * --drop-capability= (minus); applied to arg_retain after the loop. */
397 uint64_t plus
= 0, minus
= 0;
398 bool mask_all_settings
= false, mask_no_settings
= false;
403 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
415 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
421 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
427 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
433 arg_ephemeral
= true;
437 r
= free_and_strdup(&arg_user
, optarg
);
441 arg_settings_mask
|= SETTING_USER
;
444 case ARG_NETWORK_BRIDGE
:
445 r
= free_and_strdup(&arg_network_bridge
, optarg
);
452 arg_network_veth
= true;
453 arg_private_network
= true;
454 arg_settings_mask
|= SETTING_NETWORK
;
457 case ARG_NETWORK_VETH_EXTRA
:
458 r
= veth_extra_parse(&arg_network_veth_extra
, optarg
);
460 return log_error_errno(r
, "Failed to parse --network-veth-extra= parameter: %s", optarg
);
462 arg_private_network
= true;
463 arg_settings_mask
|= SETTING_NETWORK
;
466 case ARG_NETWORK_INTERFACE
:
467 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
470 arg_private_network
= true;
471 arg_settings_mask
|= SETTING_NETWORK
;
474 case ARG_NETWORK_MACVLAN
:
475 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
478 arg_private_network
= true;
479 arg_settings_mask
|= SETTING_NETWORK
;
482 case ARG_NETWORK_IPVLAN
:
483 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
488 case ARG_PRIVATE_NETWORK
:
489 arg_private_network
= true;
490 arg_settings_mask
|= SETTING_NETWORK
;
495 arg_settings_mask
|= SETTING_BOOT
;
499 r
= sd_id128_from_string(optarg
, &arg_uuid
);
501 log_error("Invalid UUID: %s", optarg
);
505 arg_settings_mask
|= SETTING_MACHINE_ID
;
514 arg_machine
= mfree(arg_machine
);
516 if (!machine_name_is_valid(optarg
)) {
517 log_error("Invalid machine name: %s", optarg
);
521 r
= free_and_strdup(&arg_machine
, optarg
);
/* The SELinux contexts keep pointing at optarg; no copy is made. */
529 arg_selinux_context
= optarg
;
533 arg_selinux_apifs_context
= optarg
;
537 arg_read_only
= true;
538 arg_settings_mask
|= SETTING_READ_ONLY
;
542 case ARG_DROP_CAPABILITY
: {
545 _cleanup_free_
char *t
= NULL
;
547 r
= extract_first_word(&p
, &t
, ",", 0);
/* NOTE(review): t is initialised to NULL above. If the return below is the
 * failure path of extract_first_word(), t may still be NULL when it is
 * passed to "%s" - confirm against the complete file. */
549 return log_error_errno(r
, "Failed to parse capability %s.", t
);
554 if (streq(t
, "all")) {
555 if (c
== ARG_CAPABILITY
)
556 plus
= (uint64_t) -1;
558 minus
= (uint64_t) -1;
562 cap
= capability_from_name(t
);
564 log_error("Failed to parse capability %s.", t
);
568 if (c
== ARG_CAPABILITY
)
569 plus
|= 1ULL << (uint64_t) cap
;
571 minus
|= 1ULL << (uint64_t) cap
;
575 arg_settings_mask
|= SETTING_CAPABILITY
;
580 arg_link_journal
= LINK_GUEST
;
581 arg_link_journal_try
= true;
584 case ARG_LINK_JOURNAL
:
585 if (streq(optarg
, "auto")) {
586 arg_link_journal
= LINK_AUTO
;
587 arg_link_journal_try
= false;
588 } else if (streq(optarg
, "no")) {
589 arg_link_journal
= LINK_NO
;
590 arg_link_journal_try
= false;
591 } else if (streq(optarg
, "guest")) {
592 arg_link_journal
= LINK_GUEST
;
593 arg_link_journal_try
= false;
594 } else if (streq(optarg
, "host")) {
595 arg_link_journal
= LINK_HOST
;
596 arg_link_journal_try
= false;
597 } else if (streq(optarg
, "try-guest")) {
598 arg_link_journal
= LINK_GUEST
;
599 arg_link_journal_try
= true;
600 } else if (streq(optarg
, "try-host")) {
601 arg_link_journal
= LINK_HOST
;
602 arg_link_journal_try
= true;
604 log_error("Failed to parse link journal mode %s", optarg
);
/* The last argument selects a read-only bind mount for --bind-ro=. */
612 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
614 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
616 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
620 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
622 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
624 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
628 case ARG_OVERLAY_RO
: {
629 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
630 _cleanup_strv_free_
char **lower
= NULL
;
/* Split the colon-separated directory list without merging empty fields. */
635 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
639 log_error("Invalid overlay specification: %s", optarg
);
643 STRV_FOREACH(i
, lower
) {
644 if (!path_is_absolute(*i
)) {
645 log_error("Overlay path %s is not absolute.", *i
);
653 log_error("--overlay= needs at least two colon-separated directories specified.");
658 /* If two parameters are specified,
659 * the first one is the lower, the
660 * second one the upper directory. And
661 * we'll also define the destination
662 * mount point the same as the upper. */
666 destination
= strdup(upper
);
671 upper
= lower
[n
- 2];
672 destination
= lower
[n
- 1];
676 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
680 m
->destination
= destination
;
683 m
->read_only
= c
== ARG_OVERLAY_RO
;
/* The strings now belong to the mount entry; clear the locals so the
 * _cleanup_free_ handlers do not release them. */
685 upper
= destination
= NULL
;
688 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
695 if (!env_assignment_is_valid(optarg
)) {
696 log_error("Environment variable assignment '%s' is not valid.", optarg
);
700 n
= strv_env_set(arg_setenv
, optarg
);
704 strv_free(arg_setenv
);
707 arg_settings_mask
|= SETTING_ENVIRONMENT
;
715 case ARG_SHARE_SYSTEM
:
716 arg_share_system
= true;
720 r
= parse_boolean(optarg
);
722 log_error("Failed to parse --register= argument: %s", optarg
);
730 arg_keep_unit
= true;
733 case ARG_PERSONALITY
:
735 arg_personality
= personality_from_string(optarg
);
736 if (arg_personality
== PERSONALITY_INVALID
) {
737 log_error("Unknown or unsupported personality '%s'.", optarg
);
741 arg_settings_mask
|= SETTING_PERSONALITY
;
747 arg_volatile_mode
= VOLATILE_YES
;
751 m
= volatile_mode_from_string(optarg
);
753 log_error("Failed to parse --volatile= argument: %s", optarg
);
756 arg_volatile_mode
= m
;
759 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
763 r
= expose_port_parse(&arg_expose_ports
, optarg
);
/* Two distinct messages follow: one for a duplicate port specification and
 * one for any other parse failure (the selecting conditions are not visible
 * in this excerpt). */
765 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
767 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
769 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
773 if (strv_extend(&arg_property
, optarg
) < 0)
778 case ARG_PRIVATE_USERS
:
780 _cleanup_free_
char *buffer
= NULL
;
781 const char *range
, *shift
;
/* optarg has the form UIDBASE[:NUIDS]; look for the separating colon. */
783 range
= strchr(optarg
, ':');
785 buffer
= strndup(optarg
, range
- optarg
);
791 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
792 log_error("Failed to parse UID range: %s", range
);
798 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
799 log_error("Failed to parse UID: %s", optarg
);
807 case ARG_KILL_SIGNAL
:
808 arg_kill_signal
= signal_from_string_try_harder(optarg
);
809 if (arg_kill_signal
< 0) {
810 log_error("Cannot parse signal: %s", optarg
);
814 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
819 /* no → do not read files
820 * yes → read files, do not override cmdline, trust only subset
821 * override → read files, override cmdline, trust only subset
822 * trusted → read files, do not override cmdline, trust all
825 r
= parse_boolean(optarg
);
827 if (streq(optarg
, "trusted")) {
828 mask_all_settings
= false;
829 mask_no_settings
= false;
830 arg_settings_trusted
= true;
832 } else if (streq(optarg
, "override")) {
833 mask_all_settings
= false;
834 mask_no_settings
= true;
835 arg_settings_trusted
= -1;
837 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
840 mask_all_settings
= false;
841 mask_no_settings
= false;
842 arg_settings_trusted
= -1;
845 mask_all_settings
= true;
846 mask_no_settings
= false;
847 arg_settings_trusted
= false;
856 assert_not_reached("Unhandled option");
859 if (arg_share_system
)
860 arg_register
= false;
862 if (arg_boot
&& arg_share_system
) {
863 log_error("--boot and --share-system may not be combined.");
867 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
868 log_error("--keep-unit may not be used when invoked from a user session.");
872 if (arg_directory
&& arg_image
) {
873 log_error("--directory= and --image= may not be combined.");
877 if (arg_template
&& arg_image
) {
878 log_error("--template= and --image= may not be combined.");
882 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
883 log_error("--template= needs --directory= or --machine=.");
887 if (arg_ephemeral
&& arg_template
) {
888 log_error("--ephemeral and --template= may not be combined.");
892 if (arg_ephemeral
&& arg_image
) {
893 log_error("--ephemeral and --image= may not be combined.");
897 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
898 log_error("--ephemeral and --link-journal= may not be combined.");
902 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
903 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
906 arg_parameters
= strv_copy(argv
+ optind
);
910 arg_settings_mask
|= SETTING_BOOT
;
913 /* Load all settings from .nspawn files */
914 if (mask_no_settings
)
915 arg_settings_mask
= 0;
917 /* Don't load any settings from .nspawn files */
918 if (mask_all_settings
)
919 arg_settings_mask
= _SETTINGS_MASK_ALL
;
/* Final capability set: defaults plus requested ones, plus CAP_NET_ADMIN
 * with private networking, minus everything that was dropped. */
921 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
923 r
= detect_unified_cgroup_hierarchy();
927 e
= getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
929 arg_container_service_name
= e
;
/* Cross-check option state once all settings are known.
 *
 * Logs an error when a volatile mode other than VOLATILE_NO is combined
 * with arg_read_only, and when ports are exposed without private
 * networking. When arg_boot is set and no kill signal was chosen
 * (arg_kill_signal <= 0), the signal defaults to SIGRTMIN+3.
 * NOTE(review): the return statements are not visible in this excerpt. */
934 static int verify_arguments(void) {
936 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
937 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
941 if (arg_expose_ports
&& !arg_private_network
) {
942 log_error("Cannot use --port= without private networking.");
946 if (arg_boot
&& arg_kill_signal
<= 0)
947 arg_kill_signal
= SIGRTMIN
+3;
/* lchown() path p after translating container IDs to host IDs.
 *
 * p:   path to change (the symlink itself, since lchown() is used)
 * uid: container UID, or UID_INVALID to skip the UID translation
 * gid: container GID, or GID_INVALID to skip the GID translation
 *
 * Each valid ID has arg_uid_shift added and is then checked against the
 * window [arg_uid_shift, arg_uid_shift + arg_uid_range); the first half of
 * the check also catches wrap-around of the unsigned addition.
 * NOTE(review): the values returned on the early-out, out-of-range and
 * lchown() failure paths are not visible in this excerpt. */
952 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
958 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
961 if (uid
!= UID_INVALID
) {
962 uid
+= arg_uid_shift
;
964 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
968 if (gid
!= GID_INVALID
) {
969 gid
+= (gid_t
) arg_uid_shift
;
971 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
975 if (lchown(p
, uid
, gid
) < 0)
/* Create directory `path` below `root` and hand it to userns_lchown().
 *
 * root/path: joined with prefix_roota() to form the directory to create
 * mode:      passed to mkdir()
 * uid/gid:   passed unchanged to userns_lchown()
 *
 * Returns the result of userns_lchown() on the success path.
 * NOTE(review): the handling of a failing mkdir() is not visible in this
 * excerpt. */
981 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
984 q
= prefix_roota(root
, path
);
985 if (mkdir(q
, mode
) < 0) {
991 return userns_lchown(q
, uid
, gid
);
/* Make /etc/localtime inside the container at `dest` point at the same
 * zone as the host's /etc/localtime symlink.
 *
 * The host link is read with readlink_malloc() and must lead into
 * usr/share/zoneinfo/ (relative or absolute form); z is the zone name
 * behind that prefix. The container's current link is read the same way
 * into y; nothing is done when both name the same zone. The zone file must
 * exist in the container (laccess()), then a relative symlink
 * "../usr/share/zoneinfo/<zone>" is created at the container's
 * /etc/localtime and chowned through userns_lchown(where, 0, 0).
 *
 * Most problems are reported with log_warning()/log_error_errno().
 * NOTE(review): the branch conditions and return values between the
 * visible statements are not shown in this excerpt. */
994 static int setup_timezone(const char *dest
) {
995 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
996 const char *where
, *check
, *what
;
1002 /* Fix the timezone, if possible */
1003 r
= readlink_malloc("/etc/localtime", &p
);
1005 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1009 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1011 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1013 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1017 where
= prefix_roota(dest
, "/etc/localtime");
1018 r
= readlink_malloc(where
, &q
);
1020 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1022 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1024 /* Already pointing to the right place? Then do nothing .. */
1025 if (y
&& streq(y
, z
))
/* Build the zone file path, then re-base it onto the container root. */
1029 check
= strjoina("/usr/share/zoneinfo/", z
);
1030 check
= prefix_roota(dest
, check
);
1031 if (laccess(check
, F_OK
) < 0) {
1032 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
/* A missing old file (ENOENT) is not treated as an error here. */
1037 if (r
< 0 && errno
!= ENOENT
) {
1038 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1042 what
= strjoina("../usr/share/zoneinfo/", z
);
1043 if (symlink(what
, where
) < 0) {
1044 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1048 r
= userns_lchown(where
, 0, 0);
1050 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
/* Copy the host's /etc/resolv.conf to the same path below `dest`.
 *
 * arg_private_network is checked first. The copy uses copy_file() with
 * O_TRUNC|O_NOFOLLOW and mode 0644. A failed copy is logged at LOG_DEBUG
 * for -ELOOP and -EROFS and at LOG_WARNING otherwise; the copied file is
 * then chowned through userns_lchown(where, 0, 0), and a chown failure is
 * logged as a warning.
 * NOTE(review): the action taken when arg_private_network is set and the
 * return statements are not visible in this excerpt. */
1055 static int setup_resolv_conf(const char *dest
) {
1056 const char *where
= NULL
;
1061 if (arg_private_network
)
1064 /* Fix resolv.conf, if possible */
1065 where
= prefix_roota(dest
, "/etc/resolv.conf");
1067 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1069 /* If the file already exists as symlink, let's
1070 * suppress the warning, under the assumption that
1071 * resolved or something similar runs inside and the
1072 * symlink points there.
1074 * If the disk image is read-only, there's also no
1075 * point in complaining.
1077 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1078 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1082 r
= userns_lchown(where
, 0, 0);
1084 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
/* Render the 128-bit id as text in the 8-4-4-4-12 hex digit layout given
 * by the format string below (36 characters, which fits s[37] together
 * with a terminating NUL).
 *
 * id: value to format, expanded with SD_ID128_FORMAT_VAL()
 * s:  caller-provided buffer of 37 bytes
 *
 * Returns a char pointer. NOTE(review): the formatting call itself and the
 * return statement are not visible in this excerpt; presumably s is
 * returned - confirm. */
1089 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1093 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1094 SD_ID128_FORMAT_VAL(id
));
/* Give the container below `dest` its own random boot ID.
 *
 * arg_share_system is checked first. A random sd_id128_t is generated,
 * formatted with id128_format_as_uuid() and written to
 * <dest>/run/proc-sys-kernel-random-boot-id. That file is bind mounted
 * over <dest>/proc/sys/kernel/random/boot_id and then remounted read-only
 * (with nosuid and nodev). Failing to make it read-only only produces a
 * warning; the other failures go through log_error_errno().
 * NOTE(review): the action taken when arg_share_system is set and the
 * statements after the mounts are not visible in this excerpt. */
1099 static int setup_boot_id(const char *dest
) {
1100 const char *from
, *to
;
1101 sd_id128_t rnd
= {};
1105 if (arg_share_system
)
1108 /* Generate a new randomized boot ID, so that each boot-up of
1109 * the container gets a new one */
1111 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1112 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1114 r
= sd_id128_randomize(&rnd
);
1116 return log_error_errno(r
, "Failed to generate random boot id: %m");
1118 id128_format_as_uuid(rnd
, as_uuid
);
1120 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1122 return log_error_errno(r
, "Failed to write boot id: %m");
/* Unlike the errors above, a failed bind mount stores the error in r
 * instead of returning immediately. */
1124 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1125 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1126 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1127 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
/* Populate /dev below `dest` with device nodes copied from the host.
 *
 * /dev/net is created first through userns_mkdir(). For every name in the
 * NUL-separated devnodes list the host node /dev/<name> is stat()ed; a
 * missing node (ENOENT) is not an error, a node that is neither a char nor
 * a block device is rejected. Otherwise the node is recreated with mknod()
 * using the host's st_mode and st_rdev; the visible fallback bind mounts
 * the host node onto the target path. The result is chowned through
 * userns_lchown(to, 0, 0).
 * NOTE(review): the contents of the devnodes list, the condition selecting
 * the bind mount fallback and the return statements are not visible in
 * this excerpt. */
1133 static int copy_devnodes(const char *dest
) {
1135 static const char devnodes
[] =
1146 _cleanup_umask_ mode_t u
;
1152 /* Create /dev/net, so that we can create /dev/net/tun in it */
/* NOTE(review): the userns_mkdir() result is tested directly and is not
 * stored in r, yet r is what gets passed to log_error_errno() below. Unless
 * r is assigned on a line missing from this excerpt, the logged and
 * returned error code does not come from the failed call. The other call
 * sites in this function use "r = ...; return log_error_errno(r, ...)". */
1153 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1154 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1156 NULSTR_FOREACH(d
, devnodes
) {
1157 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1160 from
= strappend("/dev/", d
);
1161 to
= prefix_root(dest
, from
);
1163 if (stat(from
, &st
) < 0) {
1165 if (errno
!= ENOENT
)
1166 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1168 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1170 log_error("%s is not a char or block device, cannot copy.", from
);
1174 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1176 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1178 /* Some systems abusively restrict mknod but
1179 * allow bind mounts. */
1182 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1183 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1184 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1187 r
= userns_lchown(to
, 0, 0);
1189 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
/* Mount a private devpts instance at <dest>/dev/pts and create the
 * /dev/ptmx symlink.
 *
 * The mount options are built with asprintf(): newinstance, ptmxmode=0666,
 * mode=620 and gid=arg_uid_shift + TTY_GID, plus a context= option when
 * arg_selinux_apifs_context is set. After the mount, /dev/pts, the
 * /dev/ptmx symlink (target "pts/ptmx") and /dev/pts/ptmx are each chowned
 * through userns_lchown(p, 0, 0). Every failure is reported through
 * log_error_errno().
 * NOTE(review): the asprintf() results are cast to void; whether `options`
 * is checked for NULL afterwards is not visible in this excerpt. */
1196 static int setup_pts(const char *dest
) {
1197 _cleanup_free_
char *options
= NULL
;
1202 if (arg_selinux_apifs_context
)
1203 (void) asprintf(&options
,
1204 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1205 arg_uid_shift
+ TTY_GID
,
1206 arg_selinux_apifs_context
);
1209 (void) asprintf(&options
,
1210 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1211 arg_uid_shift
+ TTY_GID
);
1216 /* Mount /dev/pts itself */
1217 p
= prefix_roota(dest
, "/dev/pts");
1218 if (mkdir(p
, 0755) < 0)
1219 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1220 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1221 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1222 r
= userns_lchown(p
, 0, 0);
1224 return log_error_errno(r
, "Failed to chown /dev/pts: %m");
1226 /* Create /dev/ptmx symlink */
1227 p
= prefix_roota(dest
, "/dev/ptmx");
1228 if (symlink("pts/ptmx", p
) < 0)
1229 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1230 r
= userns_lchown(p
, 0, 0);
1232 return log_error_errno(r
, "Failed to chown /dev/ptmx: %m");
1234 /* And fix /dev/pts/ptmx ownership */
1235 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1236 r
= userns_lchown(p
, 0, 0);
1238 return log_error_errno(r
, "Failed to chown /dev/pts/ptmx: %m");
/* Expose the TTY `console` as /dev/console inside the container at `dest`.
 *
 * The TTY is set to mode 0600 with arg_uid_shift as both owner and group
 * via chmod_and_chown(). It is then bind mounted onto <dest>/dev/console.
 * All visible failures are reported through log_error_errno().
 * NOTE(review): the call that creates the mount target (the error text
 * names touch()) and the final return are not visible in this excerpt. */
1243 static int setup_dev_console(const char *dest
, const char *console
) {
1244 _cleanup_umask_ mode_t u
;
1253 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1255 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1257 /* We need to bind mount the right tty to /dev/console since
1258 * ptys can only exist on pts file systems. To have something
1259 * to bind mount things on we create a empty regular file. */
1261 to
= prefix_roota(dest
, "/dev/console");
1264 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1266 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1267 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
/* Provide /proc/kmsg inside the container at `dest` as a FIFO.
 *
 * dest:        container root
 * kmsg_socket: socket (asserted >= 0) over which the FIFO fd is sent
 *
 * A FIFO is created at <dest>/run/kmsg with mode 0600 and bind mounted
 * over <dest>/proc/kmsg. It is opened with O_RDWR|O_NDELAY|O_CLOEXEC and
 * the descriptor is passed on with send_one_fd(); finally the /run/kmsg
 * name is unlinked (result deliberately ignored). Failures are reported
 * through log_error_errno().
 * NOTE(review): whether fd is closed locally and the final return are not
 * visible in this excerpt. */
1272 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1273 const char *from
, *to
;
1274 _cleanup_umask_ mode_t u
;
1277 assert(kmsg_socket
>= 0);
1281 /* We create the kmsg FIFO as /run/kmsg, but immediately
1282 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1283 * on the reading side behave very similar to /proc/kmsg,
1284 * their writing side behaves differently from /dev/kmsg in
1285 * that writing blocks when nothing is reading. In order to
1286 * avoid any problems with containers deadlocking due to this
1287 * we simply make /dev/kmsg unavailable to the container. */
1288 from
= prefix_roota(dest
, "/run/kmsg");
1289 to
= prefix_roota(dest
, "/proc/kmsg");
1291 if (mkfifo(from
, 0600) < 0)
1292 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1293 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1294 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1296 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1298 return log_error_errno(errno
, "Failed to open fifo: %m");
1300 /* Store away the fd in the socket, so that it stays open as
1301 * long as we run the child */
1302 r
= send_one_fd(kmsg_socket
, fd
, 0);
1306 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1308 /* And now make the FIFO unavailable as /run/kmsg... */
1309 (void) unlink(from
);
/* Netlink callback that re-applies the exposed ports.
 *
 * rtnl:     netlink connection, passed on to expose_port_execute()
 * m:        the received message (not used by the visible statements)
 * userdata: interpreted as a union in_addr_union pointer (`exposed`)
 *
 * Calls expose_port_execute(rtnl, arg_expose_ports, exposed).
 * NOTE(review): the return statement is not visible in this excerpt. */
1314 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1315 union in_addr_union
*exposed
= userdata
;
1321 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
/* Set the hostname to arg_machine via sethostname_idempotent().
 * arg_share_system is checked first.
 * NOTE(review): the action taken when arg_share_system is set and the
 * return values are not visible in this excerpt. */
1325 static int setup_hostname(void) {
1327 if (arg_share_system
)
1330 if (sethostname_idempotent(arg_machine
) < 0)
1336 static int setup_journal(const char *directory
) {
1337 sd_id128_t machine_id
, this_id
;
1338 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1339 const char *etc_machine_id
, *p
, *q
;
1344 /* Don't link journals in ephemeral mode */
1348 if (arg_link_journal
== LINK_NO
)
1351 try = arg_link_journal_try
|| arg_link_journal
== LINK_AUTO
;
1353 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1355 r
= read_one_line_file(etc_machine_id
, &b
);
1356 if (r
== -ENOENT
&& try)
1359 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1362 if (isempty(id
) && try)
1365 /* Verify validity */
1366 r
= sd_id128_from_string(id
, &machine_id
);
1368 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1370 r
= sd_id128_get_machine(&this_id
);
1372 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1374 if (sd_id128_equal(machine_id
, this_id
)) {
1375 log_full(try ? LOG_WARNING
: LOG_ERR
,
1376 "Host and machine ids are equal (%s): refusing to link journals", id
);
1382 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1384 return log_error_errno(r
, "Failed to create /var: %m");
1386 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1388 return log_error_errno(r
, "Failed to create /var/log: %m");
1390 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1392 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1394 p
= strjoina("/var/log/journal/", id
);
1395 q
= prefix_roota(directory
, p
);
1397 if (path_is_mount_point(p
, 0) > 0) {
1401 log_error("%s: already a mount point, refusing to use for journal", p
);
1405 if (path_is_mount_point(q
, 0) > 0) {
1409 log_error("%s: already a mount point, refusing to use for journal", q
);
1413 r
= readlink_and_make_absolute(p
, &d
);
1415 if ((arg_link_journal
== LINK_GUEST
||
1416 arg_link_journal
== LINK_AUTO
) &&
1419 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1421 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1426 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1427 } else if (r
== -EINVAL
) {
1429 if (arg_link_journal
== LINK_GUEST
&&
1432 if (errno
== ENOTDIR
) {
1433 log_error("%s already exists and is neither a symlink nor a directory", p
);
1436 return log_error_errno(errno
, "Failed to remove %s: %m", p
);
1438 } else if (r
!= -ENOENT
)
1439 return log_error_errno(r
, "readlink(%s) failed: %m", p
);
1441 if (arg_link_journal
== LINK_GUEST
) {
1443 if (symlink(q
, p
) < 0) {
1445 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1448 return log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1451 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1453 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1457 if (arg_link_journal
== LINK_HOST
) {
1458 /* don't create parents here -- if the host doesn't have
1459 * permanent journal set up, don't force it here */
1461 if (mkdir(p
, 0755) < 0 && errno
!= EEXIST
) {
1463 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1466 return log_error_errno(errno
, "Failed to create %s: %m", p
);
1469 } else if (access(p
, F_OK
) < 0)
1472 if (dir_is_empty(q
) == 0)
1473 log_warning("%s is not empty, proceeding anyway.", q
);
1475 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1477 return log_error_errno(r
, "Failed to create %s: %m", q
);
1479 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1480 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1485 static int drop_capabilities(void) {
1486 return capability_bounding_set_drop(arg_retain
, false);
1489 static int reset_audit_loginuid(void) {
1490 _cleanup_free_
char *p
= NULL
;
1493 if (arg_share_system
)
1496 r
= read_one_line_file("/proc/self/loginuid", &p
);
1500 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1502 /* Already reset? */
1503 if (streq(p
, "4294967295"))
1506 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1509 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1510 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1511 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1512 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1513 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Installs a seccomp filter for the container payload.
 *
 * The filter allows everything by default, then makes a fixed list of system
 * calls fail with EPERM unless the capability associated with each one is in
 * arg_retain, and makes creation of NETLINK_AUDIT sockets fail with
 * EAFNOSUPPORT.
 *
 * Returns 0 on success (also when seccomp_load() reports -EINVAL, or when
 * built without HAVE_SECCOMP), a negative errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* System calls to block, each keyed by the capability that, when
         * retained, exempts the call from blocking. */
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx seccomp;
        unsigned idx;
        int r;

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return log_oom();

        r = seccomp_add_secondary_archs(seccomp);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(blacklist); idx++) {
                /* Capability retained: leave the system call alone. */
                if (arg_retain & (1ULL << blacklist[idx].capability))
                        continue;

                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[idx].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        seccomp,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(seccomp);
        if (r == -EINVAL) {
                /* Treated as "seccomp not available" rather than as a failure. */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
                goto finish;
        }
        if (r < 0) {
                log_error_errno(r, "Failed to install seccomp audit filter: %m");
                goto finish;
        }

finish:
        seccomp_release(seccomp);
        return r;
#else
        return 0;
#endif

}
1616 static int setup_propagate(const char *root
) {
1620 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1621 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1622 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1623 (void) mkdir_p(p
, 0600);
1625 r
= userns_mkdir(root
, "/run/systemd", 0755, 0, 0);
1627 return log_error_errno(r
, "Failed to create /run/systemd: %m");
1629 r
= userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0);
1631 return log_error_errno(r
, "Failed to create /run/systemd/nspawn: %m");
1633 r
= userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1635 return log_error_errno(r
, "Failed to create /run/systemd/nspawn/incoming: %m");
1637 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1638 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1639 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1641 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1642 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1647 static int setup_image(char **device_path
, int *loop_nr
) {
1648 struct loop_info64 info
= {
1649 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1651 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1652 _cleanup_free_
char* loopdev
= NULL
;
1656 assert(device_path
);
1660 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1662 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1664 if (fstat(fd
, &st
) < 0)
1665 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1667 if (S_ISBLK(st
.st_mode
)) {
1670 p
= strdup(arg_image
);
1684 if (!S_ISREG(st
.st_mode
)) {
1685 log_error("%s is not a regular file or block device.", arg_image
);
1689 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1691 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1693 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1695 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1697 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1700 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1702 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1704 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1705 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1708 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1710 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1711 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1713 *device_path
= loopdev
;
/* Explanatory text appended to the error messages that are logged when a disk
 * image's partition table cannot be used to locate a root partition. */
#define PARTITION_TABLE_BLURB                                                                     \
        "Note that the disk image needs to either contain only a single MBR partition of\n"      \
        "type 0x83 that is marked bootable, or a single GPT partition of type "                  \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                                        \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"          \
        "to be bootable with systemd-nspawn."
/* Probes the partition table of the block device open as "fd" (the image named
 * by arg_image) and picks the partitions to mount.
 *
 * On a GPT table, partitions are classified by their type UUID (home, srv,
 * native root, secondary-architecture root, generic Linux); partitions with
 * GPT_FLAG_NO_AUTO set are ignored, and for each typed kind the partition
 * with the lowest partition number wins. On an MBR ("dos") table only
 * partitions flagged bootable (0x80) with type 0x83 are considered, and they
 * are treated as generic Linux partitions.
 *
 * The root device is chosen in this order: native root, secondary root,
 * single generic partition. More than one generic candidate is an error.
 *
 * On success the matching *_device arguments receive newly allocated device
 * node paths (home/srv only if found), the *_device_rw arguments the
 * corresponding read-write state, and *secondary tells whether the secondary
 * architecture root was selected.
 *
 * Returns 0 on success, a negative errno-style value otherwise; -EOPNOTSUPP
 * if built without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* libblkid does not reliably set errno; clear it first so that
         * "no errno" can be told apart from a real error code. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambiguous result, or nothing detected at all. */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel's view of the partitions matches what blkid
         * found. The enumeration lists the children of "d" plus "d" itself,
         * hence the "+ 1" in the comparisons below. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself. */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number. */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* Record the candidate as "generic": the duplicate
                         * check tests that same variable, so a second
                         * bootable Linux partition is detected and reported
                         * below instead of silently replacing the first. */
                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
/* Mounts the block device "what" below "where".
 *
 * The mount point is "where" with "directory" appended, or "where" itself if
 * "directory" is NULL. The file system type is detected with libblkid; LUKS
 * containers are refused. The mount is done with MS_NODEV, and read-only
 * unless "rw" is true and arg_read_only is unset.
 *
 * Returns 0 on success, a negative errno-style value otherwise; -EOPNOTSUPP
 * if built without HAVE_BLKID. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno; clear it first so that
         * "no errno" can be told apart from a real error code. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambiguous result, 1: nothing detected. A hard probe
                 * error (-1) is reported through the branch below, like in
                 * dissect_image(). */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2163 static int mount_devices(
2165 const char *root_device
, bool root_device_rw
,
2166 const char *home_device
, bool home_device_rw
,
2167 const char *srv_device
, bool srv_device_rw
) {
2173 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2175 return log_error_errno(r
, "Failed to mount root directory: %m");
2179 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2181 return log_error_errno(r
, "Failed to mount home directory: %m");
2185 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2187 return log_error_errno(r
, "Failed to mount server data directory: %m");
2193 static void loop_remove(int nr
, int *image_fd
) {
2194 _cleanup_close_
int control
= -1;
2200 if (image_fd
&& *image_fd
>= 0) {
2201 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2203 log_debug_errno(errno
, "Failed to close loop image: %m");
2204 *image_fd
= safe_close(*image_fd
);
2207 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2209 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2213 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2215 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2220 * < 0 : wait_for_terminate() failed to get the state of the
2221 * container, the container was terminated by a signal, or
2222 * failed for an unknown reason. No change is made to the
2223 * container argument.
2224 * > 0 : The program executed in the container terminated with an
2225 * error. The exit code of the program executed in the
2226 * container is returned. The container argument has been set
2227 * to CONTAINER_TERMINATED.
2228 * 0 : The container is being rebooted, has been shut down or exited
2229 * successfully. The container argument has been set to either
2230 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2232 * That is, success is indicated by a return value of zero, and an
2233 * error is indicated by a non-zero value.
2235 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2239 r
= wait_for_terminate(pid
, &status
);
2241 return log_warning_errno(r
, "Failed to wait for container: %m");
2243 switch (status
.si_code
) {
2246 if (status
.si_status
== 0) {
2247 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2250 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2252 *container
= CONTAINER_TERMINATED
;
2253 return status
.si_status
;
2256 if (status
.si_status
== SIGINT
) {
2258 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2259 *container
= CONTAINER_TERMINATED
;
2262 } else if (status
.si_status
== SIGHUP
) {
2264 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2265 *container
= CONTAINER_REBOOTED
;
2269 /* CLD_KILLED fallthrough */
2272 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2276 log_error("Container %s failed due to unknown reason.", arg_machine
);
2283 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2286 pid
= PTR_TO_PID(userdata
);
2288 if (kill(pid
, arg_kill_signal
) >= 0) {
2289 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2290 sd_event_source_set_userdata(s
, NULL
);
2295 sd_event_exit(sd_event_source_get_event(s
), 0);
2299 static int determine_names(void) {
2302 if (arg_template
&& !arg_directory
&& arg_machine
) {
2304 /* If --template= was specified then we should not
2305 * search for a machine, but instead create a new one
2306 * in /var/lib/machine. */
2308 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2313 if (!arg_image
&& !arg_directory
) {
2315 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2317 r
= image_find(arg_machine
, &i
);
2319 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2321 log_error("No image for machine '%s': %m", arg_machine
);
2325 if (i
->type
== IMAGE_RAW
)
2326 r
= free_and_strdup(&arg_image
, i
->path
);
2328 r
= free_and_strdup(&arg_directory
, i
->path
);
2330 return log_error_errno(r
, "Invalid image directory: %m");
2333 arg_read_only
= arg_read_only
|| i
->read_only
;
2335 arg_directory
= get_current_dir_name();
2337 if (!arg_directory
&& !arg_machine
) {
2338 log_error("Failed to determine path, please use -D or -i.");
2344 if (arg_directory
&& path_equal(arg_directory
, "/"))
2345 arg_machine
= gethostname_malloc();
2347 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2352 hostname_cleanup(arg_machine
);
2353 if (!machine_name_is_valid(arg_machine
)) {
2354 log_error("Failed to determine machine name automatically, please use -M.");
2358 if (arg_ephemeral
) {
2361 /* Add a random suffix when this is an
2362 * ephemeral machine, so that we can run many
2363 * instances at once without manually having
2364 * to specify -M each time. */
2366 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
2377 static int determine_uid_shift(const char *directory
) {
2385 if (arg_uid_shift
== UID_INVALID
) {
2388 r
= stat(directory
, &st
);
2390 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2392 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2394 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2395 log_error("UID and GID base of %s don't match.", directory
);
2399 arg_uid_range
= UINT32_C(0x10000);
2402 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2403 log_error("UID base too high for UID range.");
2407 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
2411 static int inner_child(
2413 const char *directory
,
2419 _cleanup_free_
char *home
= NULL
;
2421 const char *envp
[] = {
2422 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2423 NULL
, /* container */
2428 NULL
, /* container_uuid */
2429 NULL
, /* LISTEN_FDS */
2430 NULL
, /* LISTEN_PID */
2434 _cleanup_strv_free_
char **env_use
= NULL
;
2439 assert(kmsg_socket
>= 0);
2444 /* Tell the parent, that it now can write the UID map. */
2445 (void) barrier_place(barrier
); /* #1 */
2447 /* Wait until the parent wrote the UID map */
2448 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2449 log_error("Parent died too early");
2454 r
= mount_all(NULL
, arg_userns
, true, arg_uid_shift
, arg_private_network
, arg_uid_range
, arg_selinux_apifs_context
);
2458 r
= mount_sysfs(NULL
);
2462 /* Wait until we are cgroup-ified, so that we
2463 * can mount the right cgroup path writable */
2464 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2465 log_error("Parent died too early");
2469 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2473 r
= reset_uid_gid();
2475 return log_error_errno(r
, "Couldn't become new root: %m");
2477 r
= setup_boot_id(NULL
);
2481 r
= setup_kmsg(NULL
, kmsg_socket
);
2484 kmsg_socket
= safe_close(kmsg_socket
);
2489 return log_error_errno(errno
, "setsid() failed: %m");
2491 if (arg_private_network
)
2494 if (arg_expose_ports
) {
2495 r
= expose_port_send_rtnl(rtnl_socket
);
2498 rtnl_socket
= safe_close(rtnl_socket
);
2501 r
= drop_capabilities();
2503 return log_error_errno(r
, "drop_capabilities() failed: %m");
2507 if (arg_personality
!= PERSONALITY_INVALID
) {
2508 if (personality(arg_personality
) < 0)
2509 return log_error_errno(errno
, "personality() failed: %m");
2510 } else if (secondary
) {
2511 if (personality(PER_LINUX32
) < 0)
2512 return log_error_errno(errno
, "personality() failed: %m");
2516 if (arg_selinux_context
)
2517 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2518 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2521 r
= change_uid_gid(arg_user
, &home
);
2525 /* LXC sets container=lxc, so follow the scheme here */
2526 envp
[n_env
++] = strjoina("container=", arg_container_service_name
);
2528 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2532 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2533 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2534 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2537 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2540 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2544 if (fdset_size(fds
) > 0) {
2545 r
= fdset_cloexec(fds
, false);
2547 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2549 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2550 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2554 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2558 /* Let the parent know that we are ready and
2559 * wait until the parent is ready with the
2561 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2562 log_error("Parent died too early");
2566 /* Now, explicitly close the log, so that we
2567 * then can close all remaining fds. Closing
2568 * the log explicitly first has the benefit
2569 * that the logging subsystem knows about it,
2570 * and is thus ready to be reopened should we
2571 * need it again. Note that the other fds
2572 * closed here are at least the locking and
2575 (void) fdset_close_others(fds
);
2581 /* Automatically search for the init system */
2583 m
= 1 + strv_length(arg_parameters
);
2584 a
= newa(char*, m
+ 1);
2585 if (strv_isempty(arg_parameters
))
2588 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
2590 a
[0] = (char*) "/usr/lib/systemd/systemd";
2591 execve(a
[0], a
, env_use
);
2593 a
[0] = (char*) "/lib/systemd/systemd";
2594 execve(a
[0], a
, env_use
);
2596 a
[0] = (char*) "/sbin/init";
2597 execve(a
[0], a
, env_use
);
2598 } else if (!strv_isempty(arg_parameters
))
2599 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2601 chdir(home
?: "/root");
2602 execle("/bin/bash", "-bash", NULL
, env_use
);
2603 execle("/bin/sh", "-sh", NULL
, env_use
);
2608 return log_error_errno(r
, "execv() failed: %m");
2611 static int outer_child(
2613 const char *directory
,
2614 const char *console
,
2615 const char *root_device
, bool root_device_rw
,
2616 const char *home_device
, bool home_device_rw
,
2617 const char *srv_device
, bool srv_device_rw
,
2623 int uid_shift_socket
,
2633 assert(pid_socket
>= 0);
2634 assert(kmsg_socket
>= 0);
2638 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2639 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
2642 close_nointr(STDIN_FILENO
);
2643 close_nointr(STDOUT_FILENO
);
2644 close_nointr(STDERR_FILENO
);
2646 r
= open_terminal(console
, O_RDWR
);
2647 if (r
!= STDIN_FILENO
) {
2653 return log_error_errno(r
, "Failed to open console: %m");
2656 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2657 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2658 return log_error_errno(errno
, "Failed to duplicate console: %m");
2661 r
= reset_audit_loginuid();
2665 /* Mark everything as slave, so that we still
2666 * receive mounts from the real root, but don't
2667 * propagate mounts to the real root. */
2668 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2669 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2671 r
= mount_devices(directory
,
2672 root_device
, root_device_rw
,
2673 home_device
, home_device_rw
,
2674 srv_device
, srv_device_rw
);
2678 r
= determine_uid_shift(directory
);
2683 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2685 return log_error_errno(errno
, "Failed to send UID shift: %m");
2686 if (l
!= sizeof(arg_uid_shift
)) {
2687 log_error("Short write while sending UID shift.");
2692 /* Turn directory into bind mount */
2693 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2694 return log_error_errno(errno
, "Failed to make bind mount: %m");
2696 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2700 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2704 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2708 if (arg_read_only
) {
2709 r
= bind_remount_recursive(directory
, true);
2711 return log_error_errno(r
, "Failed to make tree read-only: %m");
2714 r
= mount_all(directory
, arg_userns
, false, arg_private_network
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2718 r
= copy_devnodes(directory
);
2722 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2724 r
= setup_pts(directory
);
2728 r
= setup_propagate(directory
);
2732 r
= setup_dev_console(directory
, console
);
2736 r
= setup_seccomp();
2740 r
= setup_timezone(directory
);
2744 r
= setup_resolv_conf(directory
);
2748 r
= setup_journal(directory
);
2752 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2756 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2760 r
= mount_move_root(directory
);
2762 return log_error_errno(r
, "Failed to move root directory: %m");
2764 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2765 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2766 (arg_private_network
? CLONE_NEWNET
: 0) |
2767 (arg_userns
? CLONE_NEWUSER
: 0),
2770 return log_error_errno(errno
, "Failed to fork inner child: %m");
2772 pid_socket
= safe_close(pid_socket
);
2773 uid_shift_socket
= safe_close(uid_shift_socket
);
2775 /* The inner child has all namespaces that are
2776 * requested, so that we all are owned by the user if
2777 * user namespaces are turned on. */
2779 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2781 _exit(EXIT_FAILURE
);
2783 _exit(EXIT_SUCCESS
);
2786 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2788 return log_error_errno(errno
, "Failed to send PID: %m");
2789 if (l
!= sizeof(pid
)) {
2790 log_error("Short write while sending PID.");
2794 pid_socket
= safe_close(pid_socket
);
2795 kmsg_socket
= safe_close(kmsg_socket
);
2796 rtnl_socket
= safe_close(rtnl_socket
);
2801 static int setup_uid_map(pid_t pid
) {
2802 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2807 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2808 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2809 r
= write_string_file(uid_map
, line
, 0);
2811 return log_error_errno(r
, "Failed to write UID map: %m");
2813 /* We always assign the same UID and GID ranges */
2814 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2815 r
= write_string_file(uid_map
, line
, 0);
2817 return log_error_errno(r
, "Failed to write GID map: %m");
2822 static int load_settings(void) {
2823 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2824 _cleanup_fclose_
FILE *f
= NULL
;
2825 _cleanup_free_
char *p
= NULL
;
2829 /* If all settings are masked, there's no point in looking for
2830 * the settings file */
2831 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
2834 fn
= strjoina(arg_machine
, ".nspawn");
2836 /* We first look in the admin's directories in /etc and /run */
2837 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2838 _cleanup_free_
char *j
= NULL
;
2840 j
= strjoin(i
, "/", fn
, NULL
);
2849 /* By default, we trust configuration from /etc and /run */
2850 if (arg_settings_trusted
< 0)
2851 arg_settings_trusted
= true;
2856 if (errno
!= ENOENT
)
2857 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2861 /* After that, let's look for a file next to the
2862 * actual image we shall boot. */
2865 p
= file_in_same_dir(arg_image
, fn
);
2868 } else if (arg_directory
) {
2869 p
= file_in_same_dir(arg_directory
, fn
);
2876 if (!f
&& errno
!= ENOENT
)
2877 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2879 /* By default, we do not trust configuration from /var/lib/machines */
2880 if (arg_settings_trusted
< 0)
2881 arg_settings_trusted
= false;
2888 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
2890 r
= settings_load(f
, p
, &settings
);
2894 /* Copy over bits from the settings, unless they have been
2895 * explicitly masked by command line switches. */
2897 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
2898 settings
->boot
>= 0) {
2899 arg_boot
= settings
->boot
;
2901 strv_free(arg_parameters
);
2902 arg_parameters
= settings
->parameters
;
2903 settings
->parameters
= NULL
;
2906 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2907 settings
->environment
) {
2908 strv_free(arg_setenv
);
2909 arg_setenv
= settings
->environment
;
2910 settings
->environment
= NULL
;
2913 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2916 arg_user
= settings
->user
;
2917 settings
->user
= NULL
;
2920 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
2923 plus
= settings
->capability
;
2924 if (settings_private_network(settings
))
2925 plus
|= (1ULL << CAP_NET_ADMIN
);
2927 if (!arg_settings_trusted
&& plus
!= 0) {
2928 if (settings
->capability
!= 0)
2929 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
2933 arg_retain
&= ~settings
->drop_capability
;
2936 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2937 settings
->kill_signal
> 0)
2938 arg_kill_signal
= settings
->kill_signal
;
2940 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2941 settings
->personality
!= PERSONALITY_INVALID
)
2942 arg_personality
= settings
->personality
;
2944 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2945 !sd_id128_is_null(settings
->machine_id
)) {
2947 if (!arg_settings_trusted
)
2948 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
2950 arg_uuid
= settings
->machine_id
;
2953 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
2954 settings
->read_only
>= 0)
2955 arg_read_only
= settings
->read_only
;
2957 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
2958 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
2959 arg_volatile_mode
= settings
->volatile_mode
;
2961 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
2962 settings
->n_custom_mounts
> 0) {
2964 if (!arg_settings_trusted
)
2965 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
2967 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
2968 arg_custom_mounts
= settings
->custom_mounts
;
2969 arg_n_custom_mounts
= settings
->n_custom_mounts
;
2971 settings
->custom_mounts
= NULL
;
2972 settings
->n_custom_mounts
= 0;
2976 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
2977 (settings
->private_network
>= 0 ||
2978 settings
->network_veth
>= 0 ||
2979 settings
->network_bridge
||
2980 settings
->network_interfaces
||
2981 settings
->network_macvlan
||
2982 settings
->network_ipvlan
||
2983 settings
->network_veth_extra
)) {
2985 if (!arg_settings_trusted
)
2986 log_warning("Ignoring network settings, file %s is not trusted.", p
);
2988 arg_network_veth
= settings_network_veth(settings
);
2989 arg_private_network
= settings_private_network(settings
);
2991 strv_free(arg_network_interfaces
);
2992 arg_network_interfaces
= settings
->network_interfaces
;
2993 settings
->network_interfaces
= NULL
;
2995 strv_free(arg_network_macvlan
);
2996 arg_network_macvlan
= settings
->network_macvlan
;
2997 settings
->network_macvlan
= NULL
;
2999 strv_free(arg_network_ipvlan
);
3000 arg_network_ipvlan
= settings
->network_ipvlan
;
3001 settings
->network_ipvlan
= NULL
;
3003 strv_free(arg_network_veth_extra
);
3004 arg_network_veth_extra
= settings
->network_veth_extra
;
3005 settings
->network_veth_extra
= NULL
;
3007 free(arg_network_bridge
);
3008 arg_network_bridge
= settings
->network_bridge
;
3009 settings
->network_bridge
= NULL
;
3013 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3014 settings
->expose_ports
) {
3016 if (!arg_settings_trusted
)
3017 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3019 expose_port_free_all(arg_expose_ports
);
3020 arg_expose_ports
= settings
->expose_ports
;
3021 settings
->expose_ports
= NULL
;
3028 int main(int argc
, char *argv
[]) {
3030 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3031 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3032 _cleanup_close_
int master
= -1, image_fd
= -1;
3033 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3034 int r
, n_fd_passed
, loop_nr
= -1;
3035 char veth_name
[IFNAMSIZ
];
3036 bool secondary
= false, remove_subvol
= false;
3039 int ret
= EXIT_SUCCESS
;
3040 union in_addr_union exposed
= {};
3041 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3044 log_parse_environment();
3047 r
= parse_argv(argc
, argv
);
3051 if (geteuid() != 0) {
3052 log_error("Need to be root.");
3056 r
= determine_names();
3060 r
= load_settings();
3064 r
= verify_arguments();
3068 n_fd_passed
= sd_listen_fds(false);
3069 if (n_fd_passed
> 0) {
3070 r
= fdset_new_listen_fds(&fds
, false);
3072 log_error_errno(r
, "Failed to collect file descriptors: %m");
3077 if (arg_directory
) {
3080 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3081 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3086 if (arg_ephemeral
) {
3087 _cleanup_free_
char *np
= NULL
;
3089 /* If the specified path is a mount point we
3090 * generate the new snapshot immediately
3091 * inside it under a random name. However if
3092 * the specified path is not a mount point we
3093 * create the new snapshot in the parent
3094 * directory, just next to it. */
3095 r
= path_is_mount_point(arg_directory
, 0);
3097 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3101 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3103 r
= tempfn_random(arg_directory
, "machine.", &np
);
3105 log_error_errno(r
, "Failed to generate name for snapshot: %m");
3109 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3111 log_error_errno(r
, "Failed to lock %s: %m", np
);
3115 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3117 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3121 free(arg_directory
);
3125 remove_subvol
= true;
3128 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3130 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3134 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
3139 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3142 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3144 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3148 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
3154 if (path_is_os_tree(arg_directory
) <= 0) {
3155 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3162 p
= strjoina(arg_directory
, "/usr/");
3163 if (laccess(p
, F_OK
) < 0) {
3164 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
3171 char template[] = "/tmp/nspawn-root-XXXXXX";
3174 assert(!arg_template
);
3176 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3178 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3182 r
= log_error_errno(r
, "Failed to create image lock: %m");
3186 if (!mkdtemp(template)) {
3187 log_error_errno(errno
, "Failed to create temporary directory: %m");
3192 arg_directory
= strdup(template);
3193 if (!arg_directory
) {
3198 image_fd
= setup_image(&device_path
, &loop_nr
);
3204 r
= dissect_image(image_fd
,
3205 &root_device
, &root_device_rw
,
3206 &home_device
, &home_device_rw
,
3207 &srv_device
, &srv_device_rw
,
3213 r
= custom_mounts_prepare();
3218 isatty(STDIN_FILENO
) > 0 &&
3219 isatty(STDOUT_FILENO
) > 0;
3221 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3223 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3227 r
= ptsname_malloc(master
, &console
);
3229 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3233 if (unlockpt(master
) < 0) {
3234 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3239 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3240 arg_machine
, arg_image
?: arg_directory
);
3242 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3244 assert_se(sigemptyset(&mask_chld
) == 0);
3245 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3247 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3248 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
3253 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 }, uid_shift_socket_pair
[2] = { -1, -1 };
3254 ContainerStatus container_status
;
3255 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3256 static const struct sigaction sa
= {
3257 .sa_handler
= nop_signal_handler
,
3258 .sa_flags
= SA_NOCLDSTOP
,
3262 _cleanup_(sd_event_unrefp
) sd_event
*event
= NULL
;
3263 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3264 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
3267 r
= barrier_create(&barrier
);
3269 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3273 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3274 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3278 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3279 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3283 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3284 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3289 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3290 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3294 /* Child can be killed before execv(), so handle SIGCHLD
3295 * in order to interrupt parent's blocking calls and
3296 * give it a chance to call wait() and terminate. */
3297 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3299 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3303 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3305 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
3309 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3311 if (errno
== EINVAL
)
3312 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3314 r
= log_error_errno(errno
, "clone() failed: %m");
3320 /* The outer child only has a file system namespace. */
3321 barrier_set_role(&barrier
, BARRIER_CHILD
);
3323 master
= safe_close(master
);
3325 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3326 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3327 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3328 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3330 (void) reset_all_signal_handlers();
3331 (void) reset_signal_mask();
3333 r
= outer_child(&barrier
,
3336 root_device
, root_device_rw
,
3337 home_device
, home_device_rw
,
3338 srv_device
, srv_device_rw
,
3342 kmsg_socket_pair
[1],
3343 rtnl_socket_pair
[1],
3344 uid_shift_socket_pair
[1],
3347 _exit(EXIT_FAILURE
);
3349 _exit(EXIT_SUCCESS
);
3352 barrier_set_role(&barrier
, BARRIER_PARENT
);
3354 fds
= fdset_free(fds
);
3356 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3357 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3358 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3359 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3361 /* Wait for the outer child. */
3362 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3371 /* And now retrieve the PID of the inner child. */
3372 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3374 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3377 if (l
!= sizeof(pid
)) {
3378 log_error("Short read while reading inner child PID.");
3383 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3386 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3387 log_error("Child died too early.");
3392 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3394 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3397 if (l
!= sizeof(arg_uid_shift
)) {
3398 log_error("Short read while reading UID shift.");
3403 r
= setup_uid_map(pid
);
3407 (void) barrier_place(&barrier
); /* #2 */
3410 if (arg_private_network
) {
3412 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3416 if (arg_network_veth
) {
3417 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3423 if (arg_network_bridge
) {
3424 r
= setup_bridge(veth_name
, arg_network_bridge
);
3432 r
= setup_veth_extra(arg_machine
, pid
, arg_network_veth_extra
);
3436 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3440 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
3446 r
= register_machine(
3453 arg_custom_mounts
, arg_n_custom_mounts
,
3457 arg_container_service_name
);
3462 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3466 if (arg_keep_unit
) {
3467 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3472 r
= chown_cgroup(pid
, arg_uid_shift
);
3476 /* Notify the child that the parent is ready with all
3477 * its setup (including cgroup-ification), and that
3478 * the child can now hand over control to the code to
3479 * run inside the container. */
3480 (void) barrier_place(&barrier
); /* #3 */
3482 /* Block SIGCHLD here, before notifying child.
3483 * process_pty() will handle it with the other signals. */
3484 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3486 /* Reset signal to default */
3487 r
= default_signals(SIGCHLD
, -1);
3489 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3493 /* Let the child know that we are ready, and wait until the child is completely ready too. */
3494 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3495 log_error("Child died too early.");
3502 "STATUS=Container running.\n"
3503 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
3505 r
= sd_event_new(&event
);
3507 log_error_errno(r
, "Failed to get default event source: %m");
3511 if (arg_kill_signal
> 0) {
3512 /* Try to kill the init system on SIGINT or SIGTERM */
3513 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3514 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3516 /* Immediately exit */
3517 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3518 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3521 /* simply exit on sigchld */
3522 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3524 if (arg_expose_ports
) {
3525 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3529 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3532 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3534 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
3536 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3540 r
= sd_event_loop(event
);
3542 log_error_errno(r
, "Failed to run event loop: %m");
3546 pty_forward_get_last_char(forward
, &last_char
);
3548 forward
= pty_forward_free(forward
);
3550 if (!arg_quiet
&& last_char
!= '\n')
3553 /* Kill if it is not dead yet anyway */
3554 if (arg_register
&& !arg_keep_unit
)
3555 terminate_machine(pid
);
3557 /* Normally redundant, but better safe than sorry */
3560 r
= wait_for_container(pid
, &container_status
);
3564 /* We failed to wait for the container, or the
3565 * container exited abnormally */
3567 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3568 /* The container exited with a non-zero
3569 * status, or with zero status and no reboot
3575 /* CONTAINER_REBOOTED, loop again */
3577 if (arg_keep_unit
) {
3578 /* Special handling if we are running as a
3579 * service: instead of simply restarting the
3580 * machine we want to restart the entire
3581 * service, so let's inform systemd about this
3582 * with the special exit code 133. The service
3583 * file uses RestartForceExitStatus=133 so
3584 * that this results in a full nspawn
3585 * restart. This is necessary since we might
3586 * have cgroup parameters set we want to have
3593 expose_port_flush(arg_expose_ports
, &exposed
);
3599 "STATUS=Terminating...");
3604 /* Try to flush whatever is still queued in the pty */
3606 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
3608 loop_remove(loop_nr
, &image_fd
);
3610 if (remove_subvol
&& arg_directory
) {
3613 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
3615 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
3621 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3622 (void) rm_rf(p
, REMOVE_ROOT
);
3625 expose_port_flush(arg_expose_ports
, &exposed
);
3627 free(arg_directory
);
3632 strv_free(arg_setenv
);
3633 free(arg_network_bridge
);
3634 strv_free(arg_network_interfaces
);
3635 strv_free(arg_network_macvlan
);
3636 strv_free(arg_network_ipvlan
);
3637 strv_free(arg_network_veth_extra
);
3638 strv_free(arg_parameters
);
3639 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3640 expose_port_free_all(arg_expose_ports
);
3642 return r
< 0 ? EXIT_FAILURE
: ret
;