1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <blkid/blkid.h>
27 #include <linux/loop.h>
33 #include <selinux/selinux.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
46 #include "sd-daemon.h"
49 #include "alloc-util.h"
51 #include "base-filesystem.h"
52 #include "blkid-util.h"
53 #include "btrfs-util.h"
55 #include "capability-util.h"
56 #include "cgroup-util.h"
58 #include "dev-setup.h"
63 #include "formats-util.h"
66 #include "hostname-util.h"
68 #include "loopback-setup.h"
69 #include "machine-image.h"
73 #include "mount-util.h"
74 #include "netlink-util.h"
75 #include "nspawn-cgroup.h"
76 #include "nspawn-expose-ports.h"
77 #include "nspawn-mount.h"
78 #include "nspawn-network.h"
79 #include "nspawn-register.h"
80 #include "nspawn-settings.h"
81 #include "nspawn-setuid.h"
82 #include "parse-util.h"
83 #include "path-util.h"
84 #include "process-util.h"
86 #include "random-util.h"
89 #include "seccomp-util.h"
91 #include "signal-util.h"
92 #include "socket-util.h"
93 #include "stat-util.h"
94 #include "stdio-util.h"
95 #include "string-util.h"
97 #include "terminal-util.h"
98 #include "udev-util.h"
99 #include "umask-util.h"
100 #include "user-util.h"
103 typedef enum ContainerStatus
{
104 CONTAINER_TERMINATED
,
108 typedef enum LinkJournal
{
/* File-scope configuration state for this program, shown here with its
 * default values. parse_argv() further down assigns many of these from
 * command-line options. */
115 static char *arg_directory
= NULL
;
116 static char *arg_template
= NULL
;
117 static char *arg_user
= NULL
;
118 static sd_id128_t arg_uuid
= {};
119 static char *arg_machine
= NULL
;
120 static const char *arg_selinux_context
= NULL
;
121 static const char *arg_selinux_apifs_context
= NULL
;
122 static const char *arg_slice
= NULL
;
123 static bool arg_private_network
= false;
124 static bool arg_read_only
= false;
125 static bool arg_boot
= false;
126 static bool arg_ephemeral
= false;
127 static LinkJournal arg_link_journal
= LINK_AUTO
;
128 static bool arg_link_journal_try
= false;
/* Default capability bitmask: every capability listed below contributes the
 * bit (1ULL << CAP_x). parse_argv() later ORs in the capabilities requested
 * for addition and masks out the ones requested for removal.
 * NOTE(review): the end of this initializer is not visible in this excerpt. */
129 static uint64_t arg_retain
=
130 (1ULL << CAP_CHOWN
) |
131 (1ULL << CAP_DAC_OVERRIDE
) |
132 (1ULL << CAP_DAC_READ_SEARCH
) |
133 (1ULL << CAP_FOWNER
) |
134 (1ULL << CAP_FSETID
) |
135 (1ULL << CAP_IPC_OWNER
) |
137 (1ULL << CAP_LEASE
) |
138 (1ULL << CAP_LINUX_IMMUTABLE
) |
139 (1ULL << CAP_NET_BIND_SERVICE
) |
140 (1ULL << CAP_NET_BROADCAST
) |
141 (1ULL << CAP_NET_RAW
) |
142 (1ULL << CAP_SETGID
) |
143 (1ULL << CAP_SETFCAP
) |
144 (1ULL << CAP_SETPCAP
) |
145 (1ULL << CAP_SETUID
) |
146 (1ULL << CAP_SYS_ADMIN
) |
147 (1ULL << CAP_SYS_CHROOT
) |
148 (1ULL << CAP_SYS_NICE
) |
149 (1ULL << CAP_SYS_PTRACE
) |
150 (1ULL << CAP_SYS_TTY_CONFIG
) |
151 (1ULL << CAP_SYS_RESOURCE
) |
152 (1ULL << CAP_SYS_BOOT
) |
153 (1ULL << CAP_AUDIT_WRITE
) |
154 (1ULL << CAP_AUDIT_CONTROL
) |
/* Array of custom mounts and its element count; entries are appended by the
 * bind/tmpfs/overlay option handlers in parse_argv(). */
156 static CustomMount
*arg_custom_mounts
= NULL
;
157 static unsigned arg_n_custom_mounts
= 0;
158 static char **arg_setenv
= NULL
;
159 static bool arg_quiet
= false;
160 static bool arg_share_system
= false;
161 static bool arg_register
= true;
162 static bool arg_keep_unit
= false;
163 static char **arg_network_interfaces
= NULL
;
164 static char **arg_network_macvlan
= NULL
;
165 static char **arg_network_ipvlan
= NULL
;
166 static bool arg_network_veth
= false;
167 static char **arg_network_veth_extra
= NULL
;
168 static char *arg_network_bridge
= NULL
;
169 static unsigned long arg_personality
= PERSONALITY_INVALID
;
170 static char *arg_image
= NULL
;
171 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
172 static ExposePort
*arg_expose_ports
= NULL
;
173 static char **arg_property
= NULL
;
/* UID shift and range used by the userns_*() helpers below. UID_INVALID here
 * presumably means that no explicit shift was given (custom_mounts_prepare()
 * calls that case the "automatic UID shift") - TODO confirm. */
174 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
175 static bool arg_userns
= false;
176 static int arg_kill_signal
= 0;
177 static bool arg_unified_cgroup_hierarchy
= false;
/* Bitmask of SETTING_* flags; parse_argv() ORs a flag in for each setting
 * that was given on the command line. */
178 static SettingsMask arg_settings_mask
= 0;
/* Tri-state: parse_argv() stores true, false or -1 here depending on the
 * --settings= argument. */
179 static int arg_settings_trusted
= -1;
180 static char **arg_parameters
= NULL
;
/* Default name; parse_argv() replaces it with the value of
 * $SYSTEMD_NSPAWN_CONTAINER_SERVICE (the guarding condition is not visible
 * in this excerpt). */
181 static const char *arg_container_service_name
= "systemd-nspawn";
/* Prints the usage summary to standard output with printf().
 *
 * The single %s in the format string is filled with
 * program_invocation_short_name.
 *
 * NOTE(review): the "--bind-ro=PATH[:PATH[:OPTIONS]" line has an unbalanced
 * '[' compared with the "--bind=" line above it. Several continuation lines
 * of the help text are not visible in this excerpt. */
183 static void help(void) {
184 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
185 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
186 " -h --help Show this help\n"
187 " --version Print version string\n"
188 " -q --quiet Do not show status information\n"
189 " -D --directory=PATH Root directory for the container\n"
190 " --template=PATH Initialize root directory from template directory,\n"
192 " -x --ephemeral Run container with snapshot of root directory, and\n"
193 " remove it after exit\n"
194 " -i --image=PATH File system device or disk image for the container\n"
195 " -b --boot Boot up full system (i.e. invoke init)\n"
196 " -u --user=USER Run the command under specified user or uid\n"
197 " -M --machine=NAME Set the machine name for the container\n"
198 " --uuid=UUID Set a specific machine UUID for the container\n"
199 " -S --slice=SLICE Place the container in the specified slice\n"
200 " --property=NAME=VALUE Set scope unit property\n"
201 " --private-users[=UIDBASE[:NUIDS]]\n"
202 " Run within user namespace\n"
203 " --private-network Disable network in container\n"
204 " --network-interface=INTERFACE\n"
205 " Assign an existing network interface to the\n"
207 " --network-macvlan=INTERFACE\n"
208 " Create a macvlan network interface based on an\n"
209 " existing network interface to the container\n"
210 " --network-ipvlan=INTERFACE\n"
211 " Create a ipvlan network interface based on an\n"
212 " existing network interface to the container\n"
213 " -n --network-veth Add a virtual Ethernet connection between host\n"
215 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
216 " Add an additional virtual Ethernet link between\n"
217 " host and container\n"
218 " --network-bridge=INTERFACE\n"
219 " Add a virtual Ethernet connection between host\n"
220 " and container and add it to an existing bridge on\n"
222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
223 " Expose a container IP port on the host\n"
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
230 " --capability=CAP In addition to the default, retain specified\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
237 " --read-only Mount the root directory read-only\n"
238 " --bind=PATH[:PATH[:OPTIONS]]\n"
239 " Bind mount a file or directory from the host into\n"
241 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
242 " Similar, but creates a read-only bind mount\n"
243 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
244 " --overlay=PATH[:PATH...]:PATH\n"
245 " Create an overlay mount from the host to \n"
247 " --overlay-ro=PATH[:PATH...]:PATH\n"
248 " Similar, but creates a read-only overlay mount\n"
249 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
250 " --share-system Share system namespaces with host\n"
251 " --register=BOOLEAN Register container as machine\n"
252 " --keep-unit Do not register a scope for the machine, reuse\n"
253 " the service unit nspawn is running in\n"
254 " --volatile[=MODE] Run the system in volatile mode\n"
255 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
256 , program_invocation_short_name
);
/* Prepares the arg_custom_mounts array for use.
 *
 * Sorts the array with custom_mount_compare, then visits every entry:
 * - logs an error when user namespacing is enabled, arg_uid_shift is still
 *   UID_INVALID and the entry's destination is "/";
 * - calls tempfn_random() on the entry's source to fill in m->work_dir.
 *
 * NOTE(review): the statement controlled by the CUSTOM_MOUNT_OVERLAY type
 * check, the error-return paths and the final return are not visible in this
 * excerpt; presumably only overlay entries reach tempfn_random() - confirm. */
260 static int custom_mounts_prepare(void) {
264 /* Ensure the mounts are applied prefix first. */
265 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
267 /* Allocate working directories for the overlay file systems that need it */
268 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
269 CustomMount
*m
= &arg_custom_mounts
[i
];
271 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
272 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
276 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
/* The generated name is derived from the mount's source path and stored in
 * the entry itself. */
285 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
287 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Decides the value of arg_unified_cgroup_hierarchy.
 *
 * The environment variable $UNIFIED_CGROUP_HIERARCHY is read and parsed with
 * parse_boolean(); its result is stored in arg_unified_cgroup_hierarchy.
 * Otherwise the value is taken from a host-side query whose call is not
 * visible in this excerpt. Parse and query failures are reported through
 * log_error_errno().
 *
 * NOTE(review): the conditions guarding each branch and the success return
 * are not visible here. */
293 static int detect_unified_cgroup_hierarchy(void) {
297 /* Allow the user to control whether the unified hierarchy is used */
298 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
300 r
= parse_boolean(e
);
302 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
304 arg_unified_cgroup_hierarchy
= r
;
308 /* Otherwise inherit the default from the host system */
311 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
313 arg_unified_cgroup_hierarchy
= r
;
/* Parses the command line into the arg_* globals defined earlier in this file.
 *
 * Options are read with getopt_long() using the table below; most handlers
 * also OR a SETTING_* flag into arg_settings_mask. After the option loop the
 * visible code reports (via log_error) several incompatible option
 * combinations, copies the remaining argv entries into arg_parameters,
 * finalizes arg_retain, calls detect_unified_cgroup_hierarchy() and reads
 * $SYSTEMD_NSPAWN_CONTAINER_SERVICE.
 *
 * NOTE(review): many lines of this function (case labels, returns, closing
 * braces) are not visible in this excerpt, so return values are not
 * documented here. */
317 static int parse_argv(int argc
, char *argv
[]) {
336 ARG_NETWORK_INTERFACE
,
340 ARG_NETWORK_VETH_EXTRA
,
/* Long-option table handed to getopt_long(). Entries whose value is a
 * character literal share their handler with the short option of the same
 * letter in the optstring further down. */
350 static const struct option options
[] = {
351 { "help", no_argument
, NULL
, 'h' },
352 { "version", no_argument
, NULL
, ARG_VERSION
},
353 { "directory", required_argument
, NULL
, 'D' },
354 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
355 { "ephemeral", no_argument
, NULL
, 'x' },
356 { "user", required_argument
, NULL
, 'u' },
357 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
358 { "boot", no_argument
, NULL
, 'b' },
359 { "uuid", required_argument
, NULL
, ARG_UUID
},
360 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
361 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
362 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
363 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
364 { "bind", required_argument
, NULL
, ARG_BIND
},
365 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
366 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
367 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
368 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
369 { "machine", required_argument
, NULL
, 'M' },
370 { "slice", required_argument
, NULL
, 'S' },
371 { "setenv", required_argument
, NULL
, ARG_SETENV
},
372 { "selinux-context", required_argument
, NULL
, 'Z' },
373 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
374 { "quiet", no_argument
, NULL
, 'q' },
375 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
376 { "register", required_argument
, NULL
, ARG_REGISTER
},
377 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
378 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
379 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
380 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
381 { "network-veth", no_argument
, NULL
, 'n' },
382 { "network-veth-extra", required_argument
, NULL
, ARG_NETWORK_VETH_EXTRA
},
383 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
384 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
385 { "image", required_argument
, NULL
, 'i' },
386 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
387 { "port", required_argument
, NULL
, 'p' },
388 { "property", required_argument
, NULL
, ARG_PROPERTY
},
389 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
390 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
391 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
/* plus/minus collect the capability bits to add to / remove from arg_retain;
 * they are applied once, after the option loop. */
397 uint64_t plus
= 0, minus
= 0;
398 bool mask_all_settings
= false, mask_no_settings
= false;
/* The leading '+' in the optstring is the GNU getopt flag that stops option
 * scanning at the first non-option argument. */
403 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
415 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
421 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
427 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
433 arg_ephemeral
= true;
437 r
= free_and_strdup(&arg_user
, optarg
);
441 arg_settings_mask
|= SETTING_USER
;
444 case ARG_NETWORK_BRIDGE
:
445 r
= free_and_strdup(&arg_network_bridge
, optarg
);
452 arg_network_veth
= true;
453 arg_private_network
= true;
454 arg_settings_mask
|= SETTING_NETWORK
;
457 case ARG_NETWORK_VETH_EXTRA
:
458 r
= veth_extra_parse(&arg_network_veth_extra
, optarg
);
460 return log_error_errno(r
, "Failed to parse --network-veth-extra= parameter: %s", optarg
);
462 arg_private_network
= true;
463 arg_settings_mask
|= SETTING_NETWORK
;
466 case ARG_NETWORK_INTERFACE
:
467 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
470 arg_private_network
= true;
471 arg_settings_mask
|= SETTING_NETWORK
;
474 case ARG_NETWORK_MACVLAN
:
475 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
478 arg_private_network
= true;
479 arg_settings_mask
|= SETTING_NETWORK
;
482 case ARG_NETWORK_IPVLAN
:
483 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
488 case ARG_PRIVATE_NETWORK
:
489 arg_private_network
= true;
490 arg_settings_mask
|= SETTING_NETWORK
;
495 arg_settings_mask
|= SETTING_BOOT
;
499 r
= sd_id128_from_string(optarg
, &arg_uuid
);
501 log_error("Invalid UUID: %s", optarg
);
505 arg_settings_mask
|= SETTING_MACHINE_ID
;
514 arg_machine
= mfree(arg_machine
);
516 if (!machine_name_is_valid(optarg
)) {
517 log_error("Invalid machine name: %s", optarg
);
521 r
= free_and_strdup(&arg_machine
, optarg
);
/* The SELinux context options keep a pointer to optarg itself rather than a
 * copy. */
529 arg_selinux_context
= optarg
;
533 arg_selinux_apifs_context
= optarg
;
537 arg_read_only
= true;
538 arg_settings_mask
|= SETTING_READ_ONLY
;
/* Capability handling: words are extracted one at a time with "," as the
 * separator. The word "all" sets every bit of plus or minus; any other word
 * is looked up with capability_from_name() and sets a single bit. Which of
 * plus/minus is updated depends on whether c is ARG_CAPABILITY. */
542 case ARG_DROP_CAPABILITY
: {
545 _cleanup_free_
char *t
= NULL
;
547 r
= extract_first_word(&p
, &t
, ",", 0);
549 return log_error_errno(r
, "Failed to parse capability %s.", t
);
554 if (streq(t
, "all")) {
555 if (c
== ARG_CAPABILITY
)
556 plus
= (uint64_t) -1;
558 minus
= (uint64_t) -1;
562 cap
= capability_from_name(t
);
564 log_error("Failed to parse capability %s.", t
);
568 if (c
== ARG_CAPABILITY
)
569 plus
|= 1ULL << (uint64_t) cap
;
571 minus
|= 1ULL << (uint64_t) cap
;
575 arg_settings_mask
|= SETTING_CAPABILITY
;
580 arg_link_journal
= LINK_GUEST
;
581 arg_link_journal_try
= true;
/* The "try-guest"/"try-host" spellings select the same LinkJournal value as
 * "guest"/"host" but additionally set arg_link_journal_try. */
584 case ARG_LINK_JOURNAL
:
585 if (streq(optarg
, "auto")) {
586 arg_link_journal
= LINK_AUTO
;
587 arg_link_journal_try
= false;
588 } else if (streq(optarg
, "no")) {
589 arg_link_journal
= LINK_NO
;
590 arg_link_journal_try
= false;
591 } else if (streq(optarg
, "guest")) {
592 arg_link_journal
= LINK_GUEST
;
593 arg_link_journal_try
= false;
594 } else if (streq(optarg
, "host")) {
595 arg_link_journal
= LINK_HOST
;
596 arg_link_journal_try
= false;
597 } else if (streq(optarg
, "try-guest")) {
598 arg_link_journal
= LINK_GUEST
;
599 arg_link_journal_try
= true;
600 } else if (streq(optarg
, "try-host")) {
601 arg_link_journal
= LINK_HOST
;
602 arg_link_journal_try
= true;
604 log_error("Failed to parse link journal mode %s", optarg
);
/* The last argument of bind_mount_parse() is true for the read-only variant
 * of the option. */
612 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
614 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
616 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
620 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
622 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
624 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
/* Overlay handling: optarg is split on ":" into the list lower, every element
 * must be an absolute path, and the resulting entry is appended as a
 * CUSTOM_MOUNT_OVERLAY custom mount. */
628 case ARG_OVERLAY_RO
: {
629 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
630 _cleanup_strv_free_
char **lower
= NULL
;
635 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
639 log_error("Invalid overlay specification: %s", optarg
);
643 STRV_FOREACH(i
, lower
) {
644 if (!path_is_absolute(*i
)) {
645 log_error("Overlay path %s is not absolute.", *i
);
653 log_error("--overlay= needs at least two colon-separated directories specified.");
658 /* If two parameters are specified,
659 * the first one is the lower, the
660 * second one the upper directory. And
661 * we'll also define the destination
662 * mount point the same as the upper. */
666 destination
= strdup(upper
);
671 upper
= lower
[n
- 2];
672 destination
= lower
[n
- 1];
676 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
680 m
->destination
= destination
;
683 m
->read_only
= c
== ARG_OVERLAY_RO
;
/* upper and destination are cleared after being stored, presumably so the
 * _cleanup_free_ handlers do not free what the mount entry now references -
 * TODO confirm against the lines not visible here. */
685 upper
= destination
= NULL
;
688 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
695 if (!env_assignment_is_valid(optarg
)) {
696 log_error("Environment variable assignment '%s' is not valid.", optarg
);
700 n
= strv_env_set(arg_setenv
, optarg
);
704 strv_free(arg_setenv
);
707 arg_settings_mask
|= SETTING_ENVIRONMENT
;
715 case ARG_SHARE_SYSTEM
:
716 arg_share_system
= true;
720 r
= parse_boolean(optarg
);
722 log_error("Failed to parse --register= argument: %s", optarg
);
730 arg_keep_unit
= true;
733 case ARG_PERSONALITY
:
735 arg_personality
= personality_from_string(optarg
);
736 if (arg_personality
== PERSONALITY_INVALID
) {
737 log_error("Unknown or unsupported personality '%s'.", optarg
);
741 arg_settings_mask
|= SETTING_PERSONALITY
;
747 arg_volatile_mode
= VOLATILE_YES
;
751 m
= volatile_mode_from_string(optarg
);
753 log_error("Failed to parse --volatile= argument: %s", optarg
);
756 arg_volatile_mode
= m
;
759 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
763 r
= expose_port_parse(&arg_expose_ports
, optarg
);
765 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
767 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
769 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
773 if (strv_extend(&arg_property
, optarg
) < 0)
/* Private-users handling: optarg is searched for ':'; the text before it is
 * copied into buffer, the range part is parsed into arg_uid_range (which must
 * be non-zero) and the shift part into arg_uid_shift.
 * NOTE(review): the lines that advance range past ':' and assign shift are
 * not visible in this excerpt. */
778 case ARG_PRIVATE_USERS
:
780 _cleanup_free_
char *buffer
= NULL
;
781 const char *range
, *shift
;
783 range
= strchr(optarg
, ':');
785 buffer
= strndup(optarg
, range
- optarg
);
791 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
792 log_error("Failed to parse UID range: %s", range
);
798 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
799 log_error("Failed to parse UID: %s", optarg
);
807 case ARG_KILL_SIGNAL
:
808 arg_kill_signal
= signal_from_string_try_harder(optarg
);
809 if (arg_kill_signal
< 0) {
810 log_error("Cannot parse signal: %s", optarg
);
814 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
819 /* no → do not read files
820 * yes → read files, do not override cmdline, trust only subset
821 * override → read files, override cmdline, trust only subset
822 * trusted → read files, do not override cmdline, trust all
825 r
= parse_boolean(optarg
);
827 if (streq(optarg
, "trusted")) {
828 mask_all_settings
= false;
829 mask_no_settings
= false;
830 arg_settings_trusted
= true;
832 } else if (streq(optarg
, "override")) {
833 mask_all_settings
= false;
834 mask_no_settings
= true;
835 arg_settings_trusted
= -1;
837 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
840 mask_all_settings
= false;
841 mask_no_settings
= false;
842 arg_settings_trusted
= -1;
845 mask_all_settings
= true;
846 mask_no_settings
= false;
847 arg_settings_trusted
= false;
856 assert_not_reached("Unhandled option");
859 if (arg_share_system
)
860 arg_register
= false;
862 if (arg_boot
&& arg_share_system
) {
863 log_error("--boot and --share-system may not be combined.");
867 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
868 log_error("--keep-unit may not be used when invoked from a user session.");
872 if (arg_directory
&& arg_image
) {
873 log_error("--directory= and --image= may not be combined.");
877 if (arg_template
&& arg_image
) {
878 log_error("--template= and --image= may not be combined.");
882 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
883 log_error("--template= needs --directory= or --machine=.");
887 if (arg_ephemeral
&& arg_template
) {
888 log_error("--ephemeral and --template= may not be combined.");
892 if (arg_ephemeral
&& arg_image
) {
893 log_error("--ephemeral and --image= may not be combined.");
897 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
898 log_error("--ephemeral and --link-journal= may not be combined.");
902 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
903 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
906 arg_parameters
= strv_copy(argv
+ optind
);
910 arg_settings_mask
|= SETTING_BOOT
;
913 /* Load all settings from .nspawn files */
914 if (mask_no_settings
)
915 arg_settings_mask
= 0;
917 /* Don't load any settings from .nspawn files */
918 if (mask_all_settings
)
919 arg_settings_mask
= _SETTINGS_MASK_ALL
;
/* Final capability set: the defaults, plus the requested additions, plus
 * CAP_NET_ADMIN when private networking is enabled, with every dropped
 * capability masked out last. */
921 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
923 r
= detect_unified_cgroup_hierarchy();
927 e
= getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
929 arg_container_service_name
= e
;
/* Cross-checks the parsed settings.
 *
 * Logs an error when a volatile mode other than VOLATILE_NO is combined with
 * arg_read_only, and when ports are exposed without private networking.
 * When booting with no kill signal configured (arg_kill_signal <= 0), the
 * kill signal defaults to SIGRTMIN+3.
 *
 * NOTE(review): the return statements are not visible in this excerpt. */
934 static int verify_arguments(void) {
936 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
937 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
941 if (arg_expose_ports
&& !arg_private_network
) {
942 log_error("Cannot use --port= without private networking.");
946 if (arg_boot
&& arg_kill_signal
<= 0)
947 arg_kill_signal
= SIGRTMIN
+3;
/* lchown()s path p after translating uid and gid by arg_uid_shift.
 *
 * p:   path to change ownership of
 * uid: UID to assign, or UID_INVALID
 * gid: GID to assign, or GID_INVALID
 *
 * A valid uid/gid has arg_uid_shift added to it and is then checked against
 * the window [arg_uid_shift, arg_uid_shift + arg_uid_range). The same shift
 * value is used for both UIDs and GIDs.
 *
 * NOTE(review): what is returned when both IDs are invalid, when a range
 * check fails, or when lchown() fails is not visible in this excerpt. */
952 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
958 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
961 if (uid
!= UID_INVALID
) {
962 uid
+= arg_uid_shift
;
/* The "uid < arg_uid_shift" half catches wrap-around of the unsigned
 * addition above. */
964 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
968 if (gid
!= GID_INVALID
) {
969 gid
+= (gid_t
) arg_uid_shift
;
971 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
975 if (lchown(p
, uid
, gid
) < 0)
/* Creates directory path below root with the given mode, then hands the
 * resulting path to userns_lchown() with uid/gid and returns its result.
 *
 * NOTE(review): the handling of a failing mkdir() is not visible in this
 * excerpt. */
981 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
984 q
= prefix_roota(root
, path
);
985 if (mkdir(q
, mode
) < 0) {
991 return userns_lchown(q
, uid
, gid
);
/* Points the container's /etc/localtime at the same zone as the host's.
 *
 * dest: root directory of the container tree
 *
 * Visible steps:
 * 1. Read the host's /etc/localtime symlink and take the part after
 *    "../usr/share/zoneinfo/" or "/usr/share/zoneinfo/" as zone name z.
 * 2. Read the container's /etc/localtime the same way into y; nothing more
 *    is needed when y equals z.
 * 3. Check that the zone file exists below dest; warn if it does not.
 * 4. Create the symlink "../usr/share/zoneinfo/<z>" at the container's
 *    /etc/localtime and fix its ownership with userns_lchown(where, 0, 0).
 *
 * NOTE(review): the call that removes an existing file before step 4 and the
 * early-return statements are not visible in this excerpt. */
994 static int setup_timezone(const char *dest
) {
995 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
996 const char *where
, *check
, *what
;
1002 /* Fix the timezone, if possible */
1003 r
= readlink_malloc("/etc/localtime", &p
);
1005 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1009 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1011 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1013 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1017 where
= prefix_roota(dest
, "/etc/localtime");
1018 r
= readlink_malloc(where
, &q
);
1020 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1022 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1024 /* Already pointing to the right place? Then do nothing .. */
1025 if (y
&& streq(y
, z
))
/* check is first the absolute zone path, then the same path below dest. */
1029 check
= strjoina("/usr/share/zoneinfo/", z
);
1030 check
= prefix_roota(dest
, check
);
1031 if (laccess(check
, F_OK
) < 0) {
1032 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
/* ENOENT is tolerated here: a missing file is not an error for the removal
 * step. */
1037 if (r
< 0 && errno
!= ENOENT
) {
1038 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1042 what
= strjoina("../usr/share/zoneinfo/", z
);
1043 if (symlink(what
, where
) < 0) {
1044 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1048 r
= userns_lchown(where
, 0, 0);
1050 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
/* Copies the host's /etc/resolv.conf into the container tree at dest.
 *
 * The copy uses O_TRUNC|O_NOFOLLOW and mode 0644. A copy failure is logged
 * at LOG_DEBUG when the error is -ELOOP or -EROFS and at LOG_WARNING
 * otherwise. The copied file's ownership is then adjusted with
 * userns_lchown(where, 0, 0); a failure there is logged as a warning.
 *
 * NOTE(review): the statement executed when arg_private_network is set, and
 * the return statements, are not visible in this excerpt. */
1055 static int setup_resolv_conf(const char *dest
) {
1056 const char *where
= NULL
;
1061 if (arg_private_network
)
1064 /* Fix resolv.conf, if possible */
1065 where
= prefix_roota(dest
, "/etc/resolv.conf");
1067 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1069 /* If the file already exists as symlink, let's
1070 * suppress the warning, under the assumption that
1071 * resolved or something similar runs inside and the
1072 * symlink points there.
1074 * If the disk image is read-only, there's also no
1075 * point in complaining.
1077 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1078 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1082 r
= userns_lchown(where
, 0, 0);
1084 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
// Renders a 128-bit id in the 8-4-4-4-12 hexadecimal UUID layout given by
// the format string below; s is a caller-supplied buffer declared as 37 bytes.
// NOTE(review): the formatting call that presumably writes into s, and the
// return statement, are not visible in this excerpt.
1089 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1093 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1094 SD_ID128_FORMAT_VAL(id
));
// Gives the container tree at dest its own random boot ID.
//
// A random 128-bit id is generated, formatted as a UUID string, written to
// /run/proc-sys-kernel-random-boot-id below dest and bind mounted over
// /proc/sys/kernel/random/boot_id below dest.
//
// NOTE(review): the statement executed when arg_share_system is set, the
// clean-up after the mounts and the final return are not visible here.
1099 static int setup_boot_id(const char *dest
) {
1100 const char *from
, *to
;
1101 sd_id128_t rnd
= {};
1105 if (arg_share_system
)
1108 /* Generate a new randomized boot ID, so that each boot-up of
1109 * the container gets a new one */
1111 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1112 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1114 r
= sd_id128_randomize(&rnd
);
1116 return log_error_errno(r
, "Failed to generate random boot id: %m");
1118 id128_format_as_uuid(rnd
, as_uuid
);
1120 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1122 return log_error_errno(r
, "Failed to write boot id: %m");
/* A failing bind mount is recorded in r as an error; a failing read-only
 * remount only produces a warning. */
1124 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1125 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1126 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1127 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
/* Recreates a fixed list of host device nodes below dest.
 *
 * dest: root directory of the container tree
 *
 * For every name in devnodes, the host node /dev/<name> is stat()ed. A
 * missing node (ENOENT) is not treated as a stat error; a node that is
 * neither a character nor a block device is reported as an error. Otherwise
 * the node is created below dest with mknod() using the host's mode and
 * device number. The visible error messages show a fallback to a bind mount
 * of the host node when mknod() fails. Ownership is finally adjusted with
 * userns_lchown(to, 0, 0).
 *
 * NOTE(review): the devnodes list itself, the condition selecting the
 * bind-mount fallback and the final return are not visible in this excerpt. */
1133 static int copy_devnodes(const char *dest
) {
1135 static const char devnodes
[] =
1146 _cleanup_umask_ mode_t u
;
1152 /* Create /dev/net, so that we can create /dev/net/tun in it */
/* NOTE(review): the error path below logs r, but the visible code tests the
 * userns_mkdir() result directly without storing it in r - confirm that r
 * holds the failure code at this point. */
1153 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1154 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1156 NULSTR_FOREACH(d
, devnodes
) {
1157 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1160 from
= strappend("/dev/", d
);
1161 to
= prefix_root(dest
, from
);
1163 if (stat(from
, &st
) < 0) {
1165 if (errno
!= ENOENT
)
1166 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1168 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1170 log_error("%s is not a char or block device, cannot copy.", from
);
1174 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1176 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1178 /* Some systems abusively restrict mknod but
1179 * allow bind mounts. */
1182 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1183 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1184 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1187 r
= userns_lchown(to
, 0, 0);
1189 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
/* Sets up a devpts instance and the ptmx symlink below dest.
 *
 * dest: root directory of the container tree
 *
 * The mount option string is always
 * "newinstance,ptmxmode=0666,mode=620,gid=<arg_uid_shift + TTY_GID>", with
 * ",context=\"...\"" appended when arg_selinux_apifs_context is set. The
 * function then creates and mounts /dev/pts, creates the /dev/ptmx ->
 * pts/ptmx symlink, and passes /dev/pts, /dev/ptmx and /dev/pts/ptmx to
 * userns_lchown(p, 0, 0).
 *
 * NOTE(review): the check of the asprintf() result and the final return are
 * not visible in this excerpt. */
1196 static int setup_pts(const char *dest
) {
1197 _cleanup_free_
char *options
= NULL
;
1202 if (arg_selinux_apifs_context
)
1203 (void) asprintf(&options
,
1204 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1205 arg_uid_shift
+ TTY_GID
,
1206 arg_selinux_apifs_context
);
1209 (void) asprintf(&options
,
1210 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1211 arg_uid_shift
+ TTY_GID
);
1216 /* Mount /dev/pts itself */
1217 p
= prefix_roota(dest
, "/dev/pts");
1218 if (mkdir(p
, 0755) < 0)
1219 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1220 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1221 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1222 r
= userns_lchown(p
, 0, 0);
1224 return log_error_errno(r
, "Failed to chown /dev/pts: %m");
1226 /* Create /dev/ptmx symlink */
1227 p
= prefix_roota(dest
, "/dev/ptmx");
1228 if (symlink("pts/ptmx", p
) < 0)
1229 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1230 r
= userns_lchown(p
, 0, 0);
1232 return log_error_errno(r
, "Failed to chown /dev/ptmx: %m");
1234 /* And fix /dev/pts/ptmx ownership */
1235 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1236 r
= userns_lchown(p
, 0, 0);
1238 return log_error_errno(r
, "Failed to chown /dev/pts/ptmx: %m");
/* Makes the TTY at path console available as /dev/console below dest.
 *
 * dest:    root directory of the container tree
 * console: path of the TTY to expose
 *
 * The TTY is first set to mode 0600 with owner and group arg_uid_shift, then
 * bind mounted onto <dest>/dev/console.
 *
 * NOTE(review): the call that creates the mount target (its error message
 * mentions touch()) and the final return are not visible in this excerpt. */
1243 static int setup_dev_console(const char *dest
, const char *console
) {
1244 _cleanup_umask_ mode_t u
;
1253 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1255 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1257 /* We need to bind mount the right tty to /dev/console since
1258 * ptys can only exist on pts file systems. To have something
1259 * to bind mount things on we create an empty regular file. */
1261 to
= prefix_roota(dest
, "/dev/console");
1264 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1266 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1267 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
/* Provides a FIFO as /proc/kmsg inside the container tree.
 *
 * dest:        root directory of the container tree
 * kmsg_socket: socket (must be >= 0) over which the opened FIFO fd is sent
 *
 * A FIFO is created at <dest>/run/kmsg with mode 0600 and bind mounted onto
 * <dest>/proc/kmsg. It is then opened read-write and non-blocking, the fd is
 * passed over kmsg_socket with send_one_fd(), and the /run/kmsg name is
 * unlinked.
 *
 * NOTE(review): the handling of fd after sending and the final return are not
 * visible in this excerpt. */
1272 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1273 const char *from
, *to
;
1274 _cleanup_umask_ mode_t u
;
1277 assert(kmsg_socket
>= 0);
1281 /* We create the kmsg FIFO as /run/kmsg, but immediately
1282 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1283 * on the reading side behave very similar to /proc/kmsg,
1284 * their writing side behaves differently from /dev/kmsg in
1285 * that writing blocks when nothing is reading. In order to
1286 * avoid any problems with containers deadlocking due to this
1287 * we simply make /dev/kmsg unavailable to the container. */
1288 from
= prefix_roota(dest
, "/run/kmsg");
1289 to
= prefix_roota(dest
, "/proc/kmsg");
1291 if (mkfifo(from
, 0600) < 0)
1292 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1293 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1294 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1296 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1298 return log_error_errno(errno
, "Failed to open fifo: %m");
1300 /* Store away the fd in the socket, so that it stays open as
1301 * long as we run the child */
1302 r
= send_one_fd(kmsg_socket
, fd
, 0);
1306 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1308 /* And now make the FIFO unavailable as /run/kmsg... */
1309 (void) unlink(from
);
/* Callback that re-applies the exposed-port configuration.
 *
 * userdata is interpreted as a pointer to a union in_addr_union and passed,
 * together with rtnl and arg_expose_ports, to expose_port_execute().
 * The message m is not used in the visible lines.
 *
 * NOTE(review): judging by its name and signature this is presumably
 * registered as a netlink address-change handler - confirm at the call site.
 * The return statement is not visible in this excerpt. */
1314 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1315 union in_addr_union
*exposed
= userdata
;
1321 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
/* Sets the hostname to arg_machine using sethostname_idempotent().
 *
 * NOTE(review): the statement executed when arg_share_system is set, the
 * error handling for a failing sethostname_idempotent() call and the final
 * return are not visible in this excerpt. */
1325 static int setup_hostname(void) {
1327 if (arg_share_system
)
1330 if (sethostname_idempotent(arg_machine
) < 0)
1336 static int setup_journal(const char *directory
) {
1337 sd_id128_t machine_id
, this_id
;
1338 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1339 const char *etc_machine_id
, *p
, *q
;
1343 /* Don't link journals in ephemeral mode */
1347 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1349 r
= read_one_line_file(etc_machine_id
, &b
);
1350 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1353 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1356 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1359 /* Verify validity */
1360 r
= sd_id128_from_string(id
, &machine_id
);
1362 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1364 r
= sd_id128_get_machine(&this_id
);
1366 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1368 if (sd_id128_equal(machine_id
, this_id
)) {
1369 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1370 "Host and machine ids are equal (%s): refusing to link journals", id
);
1371 if (arg_link_journal
== LINK_AUTO
)
1376 if (arg_link_journal
== LINK_NO
)
1379 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1381 return log_error_errno(r
, "Failed to create /var: %m");
1383 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1385 return log_error_errno(r
, "Failed to create /var/log: %m");
1387 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1389 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1391 p
= strjoina("/var/log/journal/", id
);
1392 q
= prefix_roota(directory
, p
);
1394 if (path_is_mount_point(p
, 0) > 0) {
1395 if (arg_link_journal
!= LINK_AUTO
) {
1396 log_error("%s: already a mount point, refusing to use for journal", p
);
1403 if (path_is_mount_point(q
, 0) > 0) {
1404 if (arg_link_journal
!= LINK_AUTO
) {
1405 log_error("%s: already a mount point, refusing to use for journal", q
);
1412 r
= readlink_and_make_absolute(p
, &d
);
1414 if ((arg_link_journal
== LINK_GUEST
||
1415 arg_link_journal
== LINK_AUTO
) &&
1418 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1420 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1425 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1426 } else if (r
== -EINVAL
) {
1428 if (arg_link_journal
== LINK_GUEST
&&
1431 if (errno
== ENOTDIR
) {
1432 log_error("%s already exists and is neither a symlink nor a directory", p
);
1435 return log_error_errno(errno
, "Failed to remove %s: %m", p
);
1437 } else if (r
!= -ENOENT
)
1438 return log_error_errno(r
, "readlink(%s) failed: %m", p
);
1440 if (arg_link_journal
== LINK_GUEST
) {
1442 if (symlink(q
, p
) < 0) {
1443 if (arg_link_journal_try
) {
1444 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1447 return log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1450 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1452 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1456 if (arg_link_journal
== LINK_HOST
) {
1457 /* don't create parents here -- if the host doesn't have
1458 * permanent journal set up, don't force it here */
1461 if (arg_link_journal_try
) {
1462 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1465 return log_error_errno(errno
, "Failed to create %s: %m", p
);
1468 } else if (access(p
, F_OK
) < 0)
1471 if (dir_is_empty(q
) == 0)
1472 log_warning("%s is not empty, proceeding anyway.", q
);
1474 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1476 return log_error_errno(r
, "Failed to create %s: %m", q
);
1478 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1479 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1484 static int drop_capabilities(void) {
1485 return capability_bounding_set_drop(arg_retain
, false);
1488 static int reset_audit_loginuid(void) {
1489 _cleanup_free_
char *p
= NULL
;
1492 if (arg_share_system
)
1495 r
= read_one_line_file("/proc/self/loginuid", &p
);
1499 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1501 /* Already reset? */
1502 if (streq(p
, "4294967295"))
1505 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1508 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1509 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1510 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1511 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1512 "using systemd-nspawn. Sleeping for 5s... (%m)");
1520 static int setup_seccomp(void) {
1523 static const struct {
1524 uint64_t capability
;
1527 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1528 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1529 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1530 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1531 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1532 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1533 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1534 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1535 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1536 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1539 scmp_filter_ctx seccomp
;
1543 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1547 r
= seccomp_add_secondary_archs(seccomp
);
1549 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
1553 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1554 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1557 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1559 continue; /* unknown syscall */
1561 log_error_errno(r
, "Failed to block syscall: %m");
1568 Audit is broken in containers, much of the userspace audit
1569 hookup will fail if running inside a container. We don't
1570 care and just turn off creation of audit sockets.
1572 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1573 with EAFNOSUPPORT which audit userspace uses as indication
1574 that audit is disabled in the kernel.
1577 r
= seccomp_rule_add(
1579 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1582 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1583 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1585 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1589 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1591 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1595 r
= seccomp_load(seccomp
);
1597 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1602 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1607 seccomp_release(seccomp
);
1615 static int setup_propagate(const char *root
) {
1619 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1620 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1621 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1622 (void) mkdir_p(p
, 0600);
1624 r
= userns_mkdir(root
, "/run/systemd", 0755, 0, 0);
1626 return log_error_errno(r
, "Failed to create /run/systemd: %m");
1628 r
= userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0);
1630 return log_error_errno(r
, "Failed to create /run/systemd/nspawn: %m");
1632 r
= userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1634 return log_error_errno(r
, "Failed to create /run/systemd/nspawn/incoming: %m");
1636 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1637 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1638 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1640 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1641 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1646 static int setup_image(char **device_path
, int *loop_nr
) {
1647 struct loop_info64 info
= {
1648 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1650 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1651 _cleanup_free_
char* loopdev
= NULL
;
1655 assert(device_path
);
1659 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1661 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1663 if (fstat(fd
, &st
) < 0)
1664 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1666 if (S_ISBLK(st
.st_mode
)) {
1669 p
= strdup(arg_image
);
1683 if (!S_ISREG(st
.st_mode
)) {
1684 log_error("%s is not a regular file or block device.", arg_image
);
1688 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1690 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1692 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1694 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1696 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1699 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1701 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1703 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1704 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1707 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1709 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1710 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1712 *device_path
= loopdev
;
1723 #define PARTITION_TABLE_BLURB \
1724 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1725 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1726 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1727 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1728 "to be bootable with systemd-nspawn."
1730 static int dissect_image(
1732 char **root_device
, bool *root_device_rw
,
1733 char **home_device
, bool *home_device_rw
,
1734 char **srv_device
, bool *srv_device_rw
,
1738 int home_nr
= -1, srv_nr
= -1;
1739 #ifdef GPT_ROOT_NATIVE
1742 #ifdef GPT_ROOT_SECONDARY
1743 int secondary_root_nr
= -1;
1745 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1746 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1747 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1748 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1749 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1750 struct udev_list_entry
*first
, *item
;
1751 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1752 bool is_gpt
, is_mbr
, multiple_generic
= false;
1753 const char *pttype
= NULL
;
1760 assert(root_device
);
1761 assert(home_device
);
1766 b
= blkid_new_probe();
1771 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1776 return log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1779 blkid_probe_enable_partitions(b
, 1);
1780 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1783 r
= blkid_do_safeprobe(b
);
1784 if (r
== -2 || r
== 1) {
1785 log_error("Failed to identify any partition table on\n"
1787 PARTITION_TABLE_BLURB
, arg_image
);
1789 } else if (r
!= 0) {
1792 return log_error_errno(errno
, "Failed to probe: %m");
1795 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1797 is_gpt
= streq_ptr(pttype
, "gpt");
1798 is_mbr
= streq_ptr(pttype
, "dos");
1800 if (!is_gpt
&& !is_mbr
) {
1801 log_error("No GPT or MBR partition table discovered on\n"
1803 PARTITION_TABLE_BLURB
, arg_image
);
1808 pl
= blkid_probe_get_partitions(b
);
1813 log_error("Failed to list partitions of %s", arg_image
);
1821 if (fstat(fd
, &st
) < 0)
1822 return log_error_errno(errno
, "Failed to stat block device: %m");
1824 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1832 log_error("Kernel partitions never appeared.");
1836 e
= udev_enumerate_new(udev
);
1840 r
= udev_enumerate_add_match_parent(e
, d
);
1844 r
= udev_enumerate_scan_devices(e
);
1846 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1848 /* Count the partitions enumerated by the kernel */
1850 first
= udev_enumerate_get_list_entry(e
);
1851 udev_list_entry_foreach(item
, first
)
1854 /* Count the partitions enumerated by blkid */
1855 m
= blkid_partlist_numof_partitions(pl
);
1859 log_error("blkid and kernel partition list do not match.");
1865 /* The kernel has probed fewer partitions than
1866 * blkid? Maybe the kernel prober is still
1867 * running or it got EBUSY because udev
1868 * already opened the device. Let's reprobe
1869 * the device, which is a synchronous call
1870 * that waits until probing is complete. */
1872 for (j
= 0; j
< 20; j
++) {
1874 r
= ioctl(fd
, BLKRRPART
, 0);
1877 if (r
>= 0 || r
!= -EBUSY
)
1880 /* If something else has the device
1881 * open, such as an udev rule, the
1882 * ioctl will return EBUSY. Since
1883 * there's no way to wait until it
1884 * isn't busy anymore, let's just wait
1885 * a bit, and try again.
1887 * This is really something they
1888 * should fix in the kernel! */
1890 usleep(50 * USEC_PER_MSEC
);
1894 return log_error_errno(r
, "Failed to reread partition table: %m");
1897 e
= udev_enumerate_unref(e
);
1900 first
= udev_enumerate_get_list_entry(e
);
1901 udev_list_entry_foreach(item
, first
) {
1902 _cleanup_udev_device_unref_
struct udev_device
*q
;
1904 unsigned long long flags
;
1910 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1915 return log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1918 qn
= udev_device_get_devnum(q
);
1922 if (st
.st_rdev
== qn
)
1925 node
= udev_device_get_devnode(q
);
1929 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1933 flags
= blkid_partition_get_flags(pp
);
1935 nr
= blkid_partition_get_partno(pp
);
1943 if (flags
& GPT_FLAG_NO_AUTO
)
1946 stype
= blkid_partition_get_type_string(pp
);
1950 if (sd_id128_from_string(stype
, &type_id
) < 0)
1953 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1955 if (home
&& nr
>= home_nr
)
1959 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1961 r
= free_and_strdup(&home
, node
);
1965 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
1967 if (srv
&& nr
>= srv_nr
)
1971 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1973 r
= free_and_strdup(&srv
, node
);
1977 #ifdef GPT_ROOT_NATIVE
1978 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
1980 if (root
&& nr
>= root_nr
)
1984 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1986 r
= free_and_strdup(&root
, node
);
1991 #ifdef GPT_ROOT_SECONDARY
1992 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
1994 if (secondary_root
&& nr
>= secondary_root_nr
)
1997 secondary_root_nr
= nr
;
1998 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2000 r
= free_and_strdup(&secondary_root
, node
);
2005 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2008 multiple_generic
= true;
2010 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2012 r
= free_and_strdup(&generic
, node
);
2018 } else if (is_mbr
) {
2021 if (flags
!= 0x80) /* Bootable flag */
2024 type
= blkid_partition_get_type(pp
);
2025 if (type
!= 0x83) /* Linux partition */
2029 multiple_generic
= true;
2033 r
= free_and_strdup(&root
, node
);
2041 *root_device
= root
;
2044 *root_device_rw
= root_rw
;
2046 } else if (secondary_root
) {
2047 *root_device
= secondary_root
;
2048 secondary_root
= NULL
;
2050 *root_device_rw
= secondary_root_rw
;
2052 } else if (generic
) {
2054 /* There were no partitions with precise meanings
2055 * around, but we found generic partitions. In this
2056 * case, if there's only one, we can go ahead and boot
2057 * it, otherwise we bail out, because we really cannot
2058 * make any sense of it. */
2060 if (multiple_generic
) {
2061 log_error("Identified multiple bootable Linux partitions on\n"
2063 PARTITION_TABLE_BLURB
, arg_image
);
2067 *root_device
= generic
;
2070 *root_device_rw
= generic_rw
;
2073 log_error("Failed to identify root partition in disk image\n"
2075 PARTITION_TABLE_BLURB
, arg_image
);
2080 *home_device
= home
;
2083 *home_device_rw
= home_rw
;
2090 *srv_device_rw
= srv_rw
;
2095 log_error("--image= is not supported, compiled without blkid support.");
2100 static int mount_device(const char *what
, const char *where
, const char *directory
, bool rw
) {
2102 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
2103 const char *fstype
, *p
;
2113 p
= strjoina(where
, directory
);
2118 b
= blkid_new_probe_from_filename(what
);
2122 return log_error_errno(errno
, "Failed to allocate prober for %s: %m", what
);
2125 blkid_probe_enable_superblocks(b
, 1);
2126 blkid_probe_set_superblocks_flags(b
, BLKID_SUBLKS_TYPE
);
2129 r
= blkid_do_safeprobe(b
);
2130 if (r
== -1 || r
== 1) {
2131 log_error("Cannot determine file system type of %s", what
);
2133 } else if (r
!= 0) {
2136 return log_error_errno(errno
, "Failed to probe %s: %m", what
);
2140 if (blkid_probe_lookup_value(b
, "TYPE", &fstype
, NULL
) < 0) {
2143 log_error("Failed to determine file system type of %s", what
);
2147 if (streq(fstype
, "crypto_LUKS")) {
2148 log_error("nspawn currently does not support LUKS disk images.");
2152 if (mount(what
, p
, fstype
, MS_NODEV
|(rw
? 0 : MS_RDONLY
), NULL
) < 0)
2153 return log_error_errno(errno
, "Failed to mount %s: %m", what
);
2157 log_error("--image= is not supported, compiled without blkid support.");
2162 static int mount_devices(
2164 const char *root_device
, bool root_device_rw
,
2165 const char *home_device
, bool home_device_rw
,
2166 const char *srv_device
, bool srv_device_rw
) {
2172 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2174 return log_error_errno(r
, "Failed to mount root directory: %m");
2178 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2180 return log_error_errno(r
, "Failed to mount home directory: %m");
2184 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2186 return log_error_errno(r
, "Failed to mount server data directory: %m");
2192 static void loop_remove(int nr
, int *image_fd
) {
2193 _cleanup_close_
int control
= -1;
2199 if (image_fd
&& *image_fd
>= 0) {
2200 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2202 log_debug_errno(errno
, "Failed to close loop image: %m");
2203 *image_fd
= safe_close(*image_fd
);
2206 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2208 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2212 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2214 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2219 * < 0 : wait_for_terminate() failed to get the state of the
2220 * container, the container was terminated by a signal, or
2221 * failed for an unknown reason. No change is made to the
2222 * container argument.
2223 * > 0 : The program executed in the container terminated with an
2224 * error. The exit code of the program executed in the
2225 * container is returned. The container argument has been set
2226 * to CONTAINER_TERMINATED.
2227 * 0 : The container is being rebooted, has been shut down or exited
2228 * successfully. The container argument has been set to either
2229 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2231 * That is, success is indicated by a return value of zero, and an
2232 * error is indicated by a non-zero value.
2234 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2238 r
= wait_for_terminate(pid
, &status
);
2240 return log_warning_errno(r
, "Failed to wait for container: %m");
2242 switch (status
.si_code
) {
2245 if (status
.si_status
== 0) {
2246 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2249 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2251 *container
= CONTAINER_TERMINATED
;
2252 return status
.si_status
;
2255 if (status
.si_status
== SIGINT
) {
2257 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2258 *container
= CONTAINER_TERMINATED
;
2261 } else if (status
.si_status
== SIGHUP
) {
2263 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2264 *container
= CONTAINER_REBOOTED
;
2268 /* CLD_KILLED fallthrough */
2271 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2275 log_error("Container %s failed due to unknown reason.", arg_machine
);
2282 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2285 pid
= PTR_TO_PID(userdata
);
2287 if (kill(pid
, arg_kill_signal
) >= 0) {
2288 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2289 sd_event_source_set_userdata(s
, NULL
);
2294 sd_event_exit(sd_event_source_get_event(s
), 0);
2298 static int determine_names(void) {
2301 if (arg_template
&& !arg_directory
&& arg_machine
) {
2303 /* If --template= was specified then we should not
2304 * search for a machine, but instead create a new one
2305 * in /var/lib/machine. */
2307 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2312 if (!arg_image
&& !arg_directory
) {
2314 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2316 r
= image_find(arg_machine
, &i
);
2318 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2320 log_error("No image for machine '%s': %m", arg_machine
);
2324 if (i
->type
== IMAGE_RAW
)
2325 r
= free_and_strdup(&arg_image
, i
->path
);
2327 r
= free_and_strdup(&arg_directory
, i
->path
);
2329 return log_error_errno(r
, "Invalid image directory: %m");
2332 arg_read_only
= arg_read_only
|| i
->read_only
;
2334 arg_directory
= get_current_dir_name();
2336 if (!arg_directory
&& !arg_machine
) {
2337 log_error("Failed to determine path, please use -D or -i.");
2343 if (arg_directory
&& path_equal(arg_directory
, "/"))
2344 arg_machine
= gethostname_malloc();
2346 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2351 hostname_cleanup(arg_machine
);
2352 if (!machine_name_is_valid(arg_machine
)) {
2353 log_error("Failed to determine machine name automatically, please use -M.");
2357 if (arg_ephemeral
) {
2360 /* Add a random suffix when this is an
2361 * ephemeral machine, so that we can run many
2362 * instances at once without manually having
2363 * to specify -M each time. */
2365 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
2376 static int determine_uid_shift(const char *directory
) {
2384 if (arg_uid_shift
== UID_INVALID
) {
2387 r
= stat(directory
, &st
);
2389 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2391 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2393 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2394 log_error("UID and GID base of %s don't match.", directory
);
2398 arg_uid_range
= UINT32_C(0x10000);
2401 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2402 log_error("UID base too high for UID range.");
2406 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
2410 static int inner_child(
2412 const char *directory
,
2418 _cleanup_free_
char *home
= NULL
;
2420 const char *envp
[] = {
2421 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2422 NULL
, /* container */
2427 NULL
, /* container_uuid */
2428 NULL
, /* LISTEN_FDS */
2429 NULL
, /* LISTEN_PID */
2433 _cleanup_strv_free_
char **env_use
= NULL
;
2438 assert(kmsg_socket
>= 0);
2443 /* Tell the parent, that it now can write the UID map. */
2444 (void) barrier_place(barrier
); /* #1 */
2446 /* Wait until the parent wrote the UID map */
2447 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2448 log_error("Parent died too early");
2453 r
= mount_all(NULL
, arg_userns
, true, arg_uid_shift
, arg_private_network
, arg_uid_range
, arg_selinux_apifs_context
);
2457 r
= mount_sysfs(NULL
);
2461 /* Wait until we are cgroup-ified, so that we
2462 * can mount the right cgroup path writable */
2463 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2464 log_error("Parent died too early");
2468 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2472 r
= reset_uid_gid();
2474 return log_error_errno(r
, "Couldn't become new root: %m");
2476 r
= setup_boot_id(NULL
);
2480 r
= setup_kmsg(NULL
, kmsg_socket
);
2483 kmsg_socket
= safe_close(kmsg_socket
);
2488 return log_error_errno(errno
, "setsid() failed: %m");
2490 if (arg_private_network
)
2493 if (arg_expose_ports
) {
2494 r
= expose_port_send_rtnl(rtnl_socket
);
2497 rtnl_socket
= safe_close(rtnl_socket
);
2500 r
= drop_capabilities();
2502 return log_error_errno(r
, "drop_capabilities() failed: %m");
2506 if (arg_personality
!= PERSONALITY_INVALID
) {
2507 if (personality(arg_personality
) < 0)
2508 return log_error_errno(errno
, "personality() failed: %m");
2509 } else if (secondary
) {
2510 if (personality(PER_LINUX32
) < 0)
2511 return log_error_errno(errno
, "personality() failed: %m");
2515 if (arg_selinux_context
)
2516 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2517 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2520 r
= change_uid_gid(arg_user
, &home
);
2524 /* LXC sets container=lxc, so follow the scheme here */
2525 envp
[n_env
++] = strjoina("container=", arg_container_service_name
);
2527 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2531 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2532 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2533 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2536 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2539 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2543 if (fdset_size(fds
) > 0) {
2544 r
= fdset_cloexec(fds
, false);
2546 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2548 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2549 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2553 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2557 /* Let the parent know that we are ready and
2558 * wait until the parent is ready with the
2560 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2561 log_error("Parent died too early");
2565 /* Now, explicitly close the log, so that we
2566 * then can close all remaining fds. Closing
2567 * the log explicitly first has the benefit
2568 * that the logging subsystem knows about it,
2569 * and is thus ready to be reopened should we
2570 * need it again. Note that the other fds
2571 * closed here are at least the locking and
2574 (void) fdset_close_others(fds
);
2580 /* Automatically search for the init system */
2582 m
= 1 + strv_length(arg_parameters
);
2583 a
= newa(char*, m
+ 1);
2584 if (strv_isempty(arg_parameters
))
2587 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
2589 a
[0] = (char*) "/usr/lib/systemd/systemd";
2590 execve(a
[0], a
, env_use
);
2592 a
[0] = (char*) "/lib/systemd/systemd";
2593 execve(a
[0], a
, env_use
);
2595 a
[0] = (char*) "/sbin/init";
2596 execve(a
[0], a
, env_use
);
2597 } else if (!strv_isempty(arg_parameters
))
2598 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2600 chdir(home
?: "/root");
2601 execle("/bin/bash", "-bash", NULL
, env_use
);
2602 execle("/bin/sh", "-sh", NULL
, env_use
);
2607 return log_error_errno(r
, "execv() failed: %m");
2610 static int outer_child(
2612 const char *directory
,
2613 const char *console
,
2614 const char *root_device
, bool root_device_rw
,
2615 const char *home_device
, bool home_device_rw
,
2616 const char *srv_device
, bool srv_device_rw
,
2622 int uid_shift_socket
,
2632 assert(pid_socket
>= 0);
2633 assert(kmsg_socket
>= 0);
2637 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2638 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
2641 close_nointr(STDIN_FILENO
);
2642 close_nointr(STDOUT_FILENO
);
2643 close_nointr(STDERR_FILENO
);
2645 r
= open_terminal(console
, O_RDWR
);
2646 if (r
!= STDIN_FILENO
) {
2652 return log_error_errno(r
, "Failed to open console: %m");
2655 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2656 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2657 return log_error_errno(errno
, "Failed to duplicate console: %m");
2660 r
= reset_audit_loginuid();
2664 /* Mark everything as slave, so that we still
2665 * receive mounts from the real root, but don't
2666 * propagate mounts to the real root. */
2667 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2668 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2670 r
= mount_devices(directory
,
2671 root_device
, root_device_rw
,
2672 home_device
, home_device_rw
,
2673 srv_device
, srv_device_rw
);
2677 r
= determine_uid_shift(directory
);
2682 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2684 return log_error_errno(errno
, "Failed to send UID shift: %m");
2685 if (l
!= sizeof(arg_uid_shift
)) {
2686 log_error("Short write while sending UID shift.");
2691 /* Turn directory into bind mount */
2692 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2693 return log_error_errno(errno
, "Failed to make bind mount: %m");
2695 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2699 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2703 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2707 if (arg_read_only
) {
2708 r
= bind_remount_recursive(directory
, true);
2710 return log_error_errno(r
, "Failed to make tree read-only: %m");
2713 r
= mount_all(directory
, arg_userns
, false, arg_private_network
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2717 r
= copy_devnodes(directory
);
2721 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2723 r
= setup_pts(directory
);
2727 r
= setup_propagate(directory
);
2731 r
= setup_dev_console(directory
, console
);
2735 r
= setup_seccomp();
2739 r
= setup_timezone(directory
);
2743 r
= setup_resolv_conf(directory
);
2747 r
= setup_journal(directory
);
2751 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2755 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2759 r
= mount_move_root(directory
);
2761 return log_error_errno(r
, "Failed to move root directory: %m");
2763 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2764 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2765 (arg_private_network
? CLONE_NEWNET
: 0) |
2766 (arg_userns
? CLONE_NEWUSER
: 0),
2769 return log_error_errno(errno
, "Failed to fork inner child: %m");
2771 pid_socket
= safe_close(pid_socket
);
2772 uid_shift_socket
= safe_close(uid_shift_socket
);
2774 /* The inner child has all namespaces that are
2775 * requested, so that we all are owned by the user if
2776 * user namespaces are turned on. */
2778 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2780 _exit(EXIT_FAILURE
);
2782 _exit(EXIT_SUCCESS
);
2785 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2787 return log_error_errno(errno
, "Failed to send PID: %m");
2788 if (l
!= sizeof(pid
)) {
2789 log_error("Short write while sending PID.");
2793 pid_socket
= safe_close(pid_socket
);
2794 kmsg_socket
= safe_close(kmsg_socket
);
2795 rtnl_socket
= safe_close(rtnl_socket
);
2800 static int setup_uid_map(pid_t pid
) {
2801 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2806 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2807 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2808 r
= write_string_file(uid_map
, line
, 0);
2810 return log_error_errno(r
, "Failed to write UID map: %m");
2812 /* We always assign the same UID and GID ranges */
2813 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2814 r
= write_string_file(uid_map
, line
, 0);
2816 return log_error_errno(r
, "Failed to write GID map: %m");
/* Locates the per-machine settings file and merges it into the arg_* globals.
 *
 * The file name is arg_machine with ".nspawn" appended. It is looked up in
 * /etc/systemd/nspawn and /run/systemd/nspawn first, and after that next to
 * arg_image or arg_directory. While arg_settings_trusted is still negative, a
 * file from /etc or /run sets it to true and a file found next to the
 * image/directory sets it to false.
 *
 * After settings_load(), each group of settings is copied into the matching
 * arg_* variable only if its SETTING_* bit is not set in arg_settings_mask and
 * the settings file actually provides a value. Several groups log a warning
 * ("Ignoring ... file %s is not trusted.") when arg_settings_trusted is false.
 *
 * NOTE(review): a number of lines (error checks, else branches, the return
 * statements) are not visible in this listing; the comments below describe
 * only what is shown. */
2821 static int load_settings(void) {
/* All three locals are released automatically through their _cleanup_ attributes. */
2822 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2823 _cleanup_fclose_
FILE *f
= NULL
;
2824 _cleanup_free_
char *p
= NULL
;
2828 /* If all settings are masked, there's no point in looking for
2829 * the settings file */
2830 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* File name to search for: "<machine>.nspawn". */
2833 fn
= strjoina(arg_machine
, ".nspawn");
2835 /* We first look in the admin's directories in /etc and /run */
2836 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2837 _cleanup_free_
char *j
= NULL
;
2839 j
= strjoin(i
, "/", fn
, NULL
);
2848 /* By default, we trust configuration from /etc and /run */
2849 if (arg_settings_trusted
< 0)
2850 arg_settings_trusted
= true;
/* A missing file (ENOENT) is not an error; anything else is reported. */
2855 if (errno
!= ENOENT
)
2856 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2860 /* After that, let's look for a file next to the
2861 * actual image we shall boot. */
2864 p
= file_in_same_dir(arg_image
, fn
);
2867 } else if (arg_directory
) {
2868 p
= file_in_same_dir(arg_directory
, fn
);
2875 if (!f
&& errno
!= ENOENT
)
2876 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2878 /* By default, we do not trust configuration from /var/lib/machines */
2879 if (arg_settings_trusted
< 0)
2880 arg_settings_trusted
= false;
2887 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
2889 r
= settings_load(f
, p
, &settings
);
2893 /* Copy over bits from the settings, unless they have been
2894 * explicitly masked by command line switches. */
2896 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
2897 settings
->boot
>= 0) {
2898 arg_boot
= settings
->boot
;
/* Pointer hand-over pattern used throughout this function: free the old
 * arg_* value, take the pointer from `settings`, then clear the field in
 * `settings` — presumably so the settings_freep cleanup does not release
 * memory that is now referenced by the global. */
2900 strv_free(arg_parameters
);
2901 arg_parameters
= settings
->parameters
;
2902 settings
->parameters
= NULL
;
2905 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2906 settings
->environment
) {
2907 strv_free(arg_setenv
);
2908 arg_setenv
= settings
->environment
;
2909 settings
->environment
= NULL
;
2912 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2915 arg_user
= settings
->user
;
2916 settings
->user
= NULL
;
2919 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
/* `plus` is the set of capabilities the file asks to add; CAP_NET_ADMIN is
 * included whenever settings_private_network() is true for this file. */
2922 plus
= settings
->capability
;
2923 if (settings_private_network(settings
))
2924 plus
|= (1ULL << CAP_NET_ADMIN
);
/* Only warn when the file itself listed capabilities, not when `plus` is
 * non-zero solely because of the CAP_NET_ADMIN bit added above. */
2926 if (!arg_settings_trusted
&& plus
!= 0) {
2927 if (settings
->capability
!= 0)
2928 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
/* Clear every capability listed in drop_capability from the retained set. */
2932 arg_retain
&= ~settings
->drop_capability
;
2935 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2936 settings
->kill_signal
> 0)
2937 arg_kill_signal
= settings
->kill_signal
;
2939 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2940 settings
->personality
!= PERSONALITY_INVALID
)
2941 arg_personality
= settings
->personality
;
2943 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2944 !sd_id128_is_null(settings
->machine_id
)) {
2946 if (!arg_settings_trusted
)
2947 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
2949 arg_uuid
= settings
->machine_id
;
2952 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
2953 settings
->read_only
>= 0)
2954 arg_read_only
= settings
->read_only
;
2956 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
2957 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
2958 arg_volatile_mode
= settings
->volatile_mode
;
2960 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
2961 settings
->n_custom_mounts
> 0) {
2963 if (!arg_settings_trusted
)
2964 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
/* The mount list from the file replaces the current one wholesale: array
 * pointer and element count are both moved, and both are reset in `settings`. */
2966 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
2967 arg_custom_mounts
= settings
->custom_mounts
;
2968 arg_n_custom_mounts
= settings
->n_custom_mounts
;
2970 settings
->custom_mounts
= NULL
;
2971 settings
->n_custom_mounts
= 0;
/* The network group is applied as a unit as soon as any one of its fields is
 * set in the settings file. */
2975 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
2976 (settings
->private_network
>= 0 ||
2977 settings
->network_veth
>= 0 ||
2978 settings
->network_bridge
||
2979 settings
->network_interfaces
||
2980 settings
->network_macvlan
||
2981 settings
->network_ipvlan
||
2982 settings
->network_veth_extra
)) {
2984 if (!arg_settings_trusted
)
2985 log_warning("Ignoring network settings, file %s is not trusted.", p
);
/* The two booleans come from helper functions rather than from the raw fields. */
2987 arg_network_veth
= settings_network_veth(settings
);
2988 arg_private_network
= settings_private_network(settings
);
2990 strv_free(arg_network_interfaces
);
2991 arg_network_interfaces
= settings
->network_interfaces
;
2992 settings
->network_interfaces
= NULL
;
2994 strv_free(arg_network_macvlan
);
2995 arg_network_macvlan
= settings
->network_macvlan
;
2996 settings
->network_macvlan
= NULL
;
2998 strv_free(arg_network_ipvlan
);
2999 arg_network_ipvlan
= settings
->network_ipvlan
;
3000 settings
->network_ipvlan
= NULL
;
3002 strv_free(arg_network_veth_extra
);
3003 arg_network_veth_extra
= settings
->network_veth_extra
;
3004 settings
->network_veth_extra
= NULL
;
3006 free(arg_network_bridge
);
3007 arg_network_bridge
= settings
->network_bridge
;
3008 settings
->network_bridge
= NULL
;
3012 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3013 settings
->expose_ports
) {
3015 if (!arg_settings_trusted
)
3016 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3018 expose_port_free_all(arg_expose_ports
);
3019 arg_expose_ports
= settings
->expose_ports
;
3020 settings
->expose_ports
= NULL
;
/* Program entry point: prepares a root tree, spawns the container processes and
 * supervises them.
 *
 * Sequence as far as it is visible in this listing:
 *  - parse the command line, insist on an effective UID of 0, then run
 *    determine_names(), load_settings() and verify_arguments();
 *  - collect file descriptors reported by sd_listen_fds() into `fds`;
 *  - prepare the root: either arg_directory (optionally replaced by an
 *    ephemeral snapshot, or populated from arg_template), or a disk image that
 *    is set up with a mkdtemp() directory as arg_directory;
 *  - allocate and unlock a pseudo tty;
 *  - block signals, become a child subreaper, create the barrier and four
 *    socket pairs, and start the outer child with raw_clone();
 *  - in the parent: wait for the outer child, receive the inner child's PID and
 *    the UID shift, write the UID/GID maps, set up networking, register the
 *    machine, adjust cgroups, then run an sd_event loop with pty forwarding;
 *  - finally flush the pty, release the loop device, remove an ephemeral
 *    subvolume and the propagation directory, and free the arg_* globals.
 *
 * Returns EXIT_FAILURE when `r` is negative at the end, otherwise `ret`.
 *
 * NOTE(review): many lines (error checks, goto targets, branch conditions,
 * some declarations) are not visible in this listing — confirm details against
 * the full file. */
3027 int main(int argc
, char *argv
[]) {
3029 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3030 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3031 _cleanup_close_
int master
= -1, image_fd
= -1;
3032 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3033 int r
, n_fd_passed
, loop_nr
= -1;
3034 char veth_name
[IFNAMSIZ
];
3035 bool secondary
= false, remove_subvol
= false;
3038 int ret
= EXIT_SUCCESS
;
3039 union in_addr_union exposed
= {};
3040 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3043 log_parse_environment();
3046 r
= parse_argv(argc
, argv
);
3050 if (geteuid() != 0) {
3051 log_error("Need to be root.");
3055 r
= determine_names();
3059 r
= load_settings();
3063 r
= verify_arguments();
/* Gather the file descriptors reported by sd_listen_fds() into an FDSet. */
3067 n_fd_passed
= sd_listen_fds(false);
3068 if (n_fd_passed
> 0) {
3069 r
= fdset_new_listen_fds(&fds
, false);
3071 log_error_errno(r
, "Failed to collect file descriptors: %m");
3076 if (arg_directory
) {
/* Running directly on "/" is refused; only an ephemeral snapshot of it is allowed. */
3079 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3080 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3085 if (arg_ephemeral
) {
3086 _cleanup_free_
char *np
= NULL
;
3088 /* If the specified path is a mount point we
3089 * generate the new snapshot immediately
3090 * inside it under a random name. However if
3091 * the specified path is not a mount point we
3092 * create the new snapshot in the parent
3093 * directory, just next to it. */
3094 r
= path_is_mount_point(arg_directory
, 0);
3096 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3100 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3102 r
= tempfn_random(arg_directory
, "machine.", &np
);
3104 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* Lock mode follows arg_read_only: LOCK_SH when read-only, LOCK_EX otherwise;
 * LOCK_NB is always added. The same flags are used for every image_path_lock()
 * call in this function. */
3108 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3110 log_error_errno(r
, "Failed to lock %s: %m", np
);
3114 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3116 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3120 free(arg_directory
);
/* Remember that the snapshot has to be removed again during cleanup. */
3124 remove_subvol
= true;
3127 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3129 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3133 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate arg_directory from arg_template by snapshotting the template. */
3138 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3141 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3143 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3147 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity checks on the tree: either it must pass path_is_os_tree(), or (in
 * the other visible branch) it must at least contain a usr/ directory. */
3153 if (path_is_os_tree(arg_directory
) <= 0) {
3154 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3161 p
= strjoina(arg_directory
, "/usr/");
3162 if (laccess(p
, F_OK
) < 0) {
3163 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
/* Disk image case: a directory created from this mkdtemp() template becomes
 * arg_directory. */
3170 char template[] = "/tmp/nspawn-root-XXXXXX";
3173 assert(!arg_template
);
3175 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3177 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3181 r
= log_error_errno(r
, "Failed to create image lock: %m");
3185 if (!mkdtemp(template)) {
3186 log_error_errno(errno
, "Failed to create temporary directory: %m");
3191 arg_directory
= strdup(template);
3192 if (!arg_directory
) {
/* setup_image() yields the image fd plus the device path and loop number;
 * dissect_image() then fills in the root/home/srv devices and their
 * read-write flags. */
3197 image_fd
= setup_image(&device_path
, &loop_nr
);
3203 r
= dissect_image(image_fd
,
3204 &root_device
, &root_device_rw
,
3205 &home_device
, &home_device_rw
,
3206 &srv_device
, &srv_device_rw
,
3212 r
= custom_mounts_prepare();
3217 isatty(STDIN_FILENO
) > 0 &&
3218 isatty(STDOUT_FILENO
) > 0;
/* Allocate the pty master, look up the name of its peer into `console`, and
 * unlock it. */
3220 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3222 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3226 r
= ptsname_malloc(master
, &console
);
3228 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3232 if (unlockpt(master
) < 0) {
3233 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3238 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3239 arg_machine
, arg_image
?: arg_directory
);
/* Block the signals that are handled later through the event loop, and
 * prepare a mask containing only SIGCHLD for the unblock/block steps below. */
3241 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3243 assert_se(sigemptyset(&mask_chld
) == 0);
3244 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3246 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3247 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* Four socket pairs connect this process with its children: kmsg, rtnl, the
 * inner child's PID, and the UID shift. Index 0 stays with the parent, index 1
 * goes to the child (see the close calls further down). */
3252 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 }, uid_shift_socket_pair
[2] = { -1, -1 };
3253 ContainerStatus container_status
;
3254 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3255 static const struct sigaction sa
= {
3256 .sa_handler
= nop_signal_handler
,
3257 .sa_flags
= SA_NOCLDSTOP
,
3261 _cleanup_(sd_event_unrefp
) sd_event
*event
= NULL
;
3262 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3263 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
3266 r
= barrier_create(&barrier
);
3268 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3272 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3273 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3277 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3278 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3282 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3283 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3288 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3289 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3293 /* Child can be killed before execv(), so handle SIGCHLD
3294 * in order to interrupt parent's blocking calls and
3295 * give it a chance to call wait() and terminate. */
3296 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3298 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3302 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3304 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* Only CLONE_NEWNS is requested for the outer child at this point. */
3308 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3310 if (errno
== EINVAL
)
3311 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3313 r
= log_error_errno(errno
, "clone() failed: %m");
3319 /* The outer child only has a file system namespace. */
3320 barrier_set_role(&barrier
, BARRIER_CHILD
);
/* Child side: drop the pty master and the parent's ends (index 0) of all
 * socket pairs, then restore default signal handling before calling
 * outer_child(). */
3322 master
= safe_close(master
);
3324 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3325 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3326 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3327 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3329 (void) reset_all_signal_handlers();
3330 (void) reset_signal_mask();
3332 r
= outer_child(&barrier
,
3335 root_device
, root_device_rw
,
3336 home_device
, home_device_rw
,
3337 srv_device
, srv_device_rw
,
3341 kmsg_socket_pair
[1],
3342 rtnl_socket_pair
[1],
3343 uid_shift_socket_pair
[1],
3346 _exit(EXIT_FAILURE
);
3348 _exit(EXIT_SUCCESS
);
3351 barrier_set_role(&barrier
, BARRIER_PARENT
);
/* Parent side: release the passed-in fd set and close the child's ends
 * (index 1) of all socket pairs. */
3353 fds
= fdset_free(fds
);
3355 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3356 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3357 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3358 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3360 /* Wait for the outer child. */
3361 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3370 /* And now retrieve the PID of the inner child. */
/* From here on `pid` holds the inner child's PID; anything other than a
 * full-sized message is treated as an error. */
3371 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3373 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3376 if (l
!= sizeof(pid
)) {
3377 log_error("Short read while reading inner child PID.");
3382 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3385 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3386 log_error("Child died too early.");
/* Read the UID shift from the uid-shift socket into arg_uid_shift, then write
 * the UID/GID maps for `pid`. */
3391 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3393 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3396 if (l
!= sizeof(arg_uid_shift
)) {
3397 log_error("Short read while reading UID shift.");
3402 r
= setup_uid_map(pid
);
3406 (void) barrier_place(&barrier
); /* #2 */
/* Network setup for the container, all keyed on the inner child's PID. */
3409 if (arg_private_network
) {
3411 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3415 if (arg_network_veth
) {
3416 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3422 if (arg_network_bridge
) {
3423 r
= setup_bridge(veth_name
, arg_network_bridge
);
3431 r
= setup_veth_extra(arg_machine
, pid
, arg_network_veth_extra
);
3435 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3439 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
3445 r
= register_machine(
3452 arg_custom_mounts
, arg_n_custom_mounts
,
3456 arg_container_service_name
);
3461 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3465 if (arg_keep_unit
) {
3466 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3471 r
= chown_cgroup(pid
, arg_uid_shift
);
3475 /* Notify the child that the parent is ready with all
3476 * its setup (including cgroup-ification), and that
3477 * the child can now hand over control to the code to
3478 * run inside the container. */
3479 (void) barrier_place(&barrier
); /* #3 */
3481 /* Block SIGCHLD here, before notifying child.
3482 * process_pty() will handle it with the other signals. */
3483 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3485 /* Reset signal to default */
3486 r
= default_signals(SIGCHLD
, -1);
3488 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3492 /* Let the child know that we are ready and wait that the child is completely ready now. */
3493 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3494 log_error("Child died too early.");
3501 "STATUS=Container running.\n"
3502 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
3504 r
= sd_event_new(&event
);
3506 log_error_errno(r
, "Failed to get default event source: %m");
3510 if (arg_kill_signal
> 0) {
3511 /* Try to kill the init system on SIGINT or SIGTERM */
3512 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3513 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3515 /* Immediately exit */
3516 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3517 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3520 /* simply exit on sigchld */
3521 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3523 if (arg_expose_ports
) {
3524 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3528 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3531 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* The forwarder is read-only unless the session is interactive. */
3533 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
3535 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3539 r
= sd_event_loop(event
);
3541 log_error_errno(r
, "Failed to run event loop: %m");
/* Fetch the last character forwarded before the forwarder is freed; it is
 * compared against '\n' just below. */
3545 pty_forward_get_last_char(forward
, &last_char
);
3547 forward
= pty_forward_free(forward
);
3549 if (!arg_quiet
&& last_char
!= '\n')
3552 /* Kill if it is not dead yet anyway */
3553 if (arg_register
&& !arg_keep_unit
)
3554 terminate_machine(pid
);
3556 /* Normally redundant, but better safe than sorry */
3559 r
= wait_for_container(pid
, &container_status
);
3563 /* We failed to wait for the container, or the
3564 * container exited abnormally */
3566 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3567 /* The container exited with a non-zero
3568 * status, or with zero status and no reboot
3574 /* CONTAINER_REBOOTED, loop again */
3576 if (arg_keep_unit
) {
3577 /* Special handling if we are running as a
3578 * service: instead of simply restarting the
3579 * machine we want to restart the entire
3580 * service, so let's inform systemd about this
3581 * with the special exit code 133. The service
3582 * file uses RestartForceExitStatus=133 so
3583 * that this results in a full nspawn
3584 * restart. This is necessary since we might
3585 * have cgroup parameters set we want to have
3592 expose_port_flush(arg_expose_ports
, &exposed
);
3598 "STATUS=Terminating...");
3603 /* Try to flush whatever is still queued in the pty */
3605 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
3607 loop_remove(loop_nr
, &image_fd
);
/* remove_subvol is only set after an ephemeral snapshot was created above;
 * a failure to remove it is logged as a warning and otherwise ignored. */
3609 if (remove_subvol
&& arg_directory
) {
3612 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
3614 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
/* Remove this machine's directory below /run/systemd/nspawn/propagate/,
 * ignoring errors. */
3620 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3621 (void) rm_rf(p
, REMOVE_ROOT
);
3624 expose_port_flush(arg_expose_ports
, &exposed
);
/* Release the arg_* globals before returning. */
3626 free(arg_directory
);
3631 strv_free(arg_setenv
);
3632 free(arg_network_bridge
);
3633 strv_free(arg_network_interfaces
);
3634 strv_free(arg_network_macvlan
);
3635 strv_free(arg_network_ipvlan
);
3636 strv_free(arg_network_veth_extra
);
3637 strv_free(arg_parameters
);
3638 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3639 expose_port_free_all(arg_expose_ports
);
/* A negative `r` always maps to EXIT_FAILURE; otherwise `ret` is returned. */
3641 return r
< 0 ? EXIT_FAILURE
: ret
;