1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <blkid/blkid.h>
27 #include <linux/loop.h>
33 #include <selinux/selinux.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
46 #include "sd-daemon.h"
49 #include "alloc-util.h"
51 #include "base-filesystem.h"
52 #include "blkid-util.h"
53 #include "btrfs-util.h"
55 #include "capability-util.h"
56 #include "cgroup-util.h"
58 #include "dev-setup.h"
60 #include "event-util.h"
64 #include "formats-util.h"
67 #include "hostname-util.h"
69 #include "loopback-setup.h"
70 #include "machine-image.h"
74 #include "mount-util.h"
75 #include "netlink-util.h"
76 #include "nspawn-cgroup.h"
77 #include "nspawn-expose-ports.h"
78 #include "nspawn-mount.h"
79 #include "nspawn-network.h"
80 #include "nspawn-register.h"
81 #include "nspawn-settings.h"
82 #include "nspawn-setuid.h"
83 #include "parse-util.h"
84 #include "path-util.h"
85 #include "process-util.h"
87 #include "random-util.h"
90 #include "seccomp-util.h"
92 #include "signal-util.h"
93 #include "socket-util.h"
94 #include "stat-util.h"
95 #include "stdio-util.h"
96 #include "string-util.h"
98 #include "terminal-util.h"
99 #include "udev-util.h"
100 #include "umask-util.h"
101 #include "user-util.h"
104 typedef enum ContainerStatus
{
105 CONTAINER_TERMINATED
,
109 typedef enum LinkJournal
{
116 static char *arg_directory
= NULL
;
117 static char *arg_template
= NULL
;
118 static char *arg_user
= NULL
;
119 static sd_id128_t arg_uuid
= {};
120 static char *arg_machine
= NULL
;
121 static const char *arg_selinux_context
= NULL
;
122 static const char *arg_selinux_apifs_context
= NULL
;
123 static const char *arg_slice
= NULL
;
124 static bool arg_private_network
= false;
125 static bool arg_read_only
= false;
126 static bool arg_boot
= false;
127 static bool arg_ephemeral
= false;
128 static LinkJournal arg_link_journal
= LINK_AUTO
;
129 static bool arg_link_journal_try
= false;
130 static uint64_t arg_retain
=
131 (1ULL << CAP_CHOWN
) |
132 (1ULL << CAP_DAC_OVERRIDE
) |
133 (1ULL << CAP_DAC_READ_SEARCH
) |
134 (1ULL << CAP_FOWNER
) |
135 (1ULL << CAP_FSETID
) |
136 (1ULL << CAP_IPC_OWNER
) |
138 (1ULL << CAP_LEASE
) |
139 (1ULL << CAP_LINUX_IMMUTABLE
) |
140 (1ULL << CAP_NET_BIND_SERVICE
) |
141 (1ULL << CAP_NET_BROADCAST
) |
142 (1ULL << CAP_NET_RAW
) |
143 (1ULL << CAP_SETGID
) |
144 (1ULL << CAP_SETFCAP
) |
145 (1ULL << CAP_SETPCAP
) |
146 (1ULL << CAP_SETUID
) |
147 (1ULL << CAP_SYS_ADMIN
) |
148 (1ULL << CAP_SYS_CHROOT
) |
149 (1ULL << CAP_SYS_NICE
) |
150 (1ULL << CAP_SYS_PTRACE
) |
151 (1ULL << CAP_SYS_TTY_CONFIG
) |
152 (1ULL << CAP_SYS_RESOURCE
) |
153 (1ULL << CAP_SYS_BOOT
) |
154 (1ULL << CAP_AUDIT_WRITE
) |
155 (1ULL << CAP_AUDIT_CONTROL
) |
157 static CustomMount
*arg_custom_mounts
= NULL
;
158 static unsigned arg_n_custom_mounts
= 0;
159 static char **arg_setenv
= NULL
;
160 static bool arg_quiet
= false;
161 static bool arg_share_system
= false;
162 static bool arg_register
= true;
163 static bool arg_keep_unit
= false;
164 static char **arg_network_interfaces
= NULL
;
165 static char **arg_network_macvlan
= NULL
;
166 static char **arg_network_ipvlan
= NULL
;
167 static bool arg_network_veth
= false;
168 static char **arg_network_veth_extra
= NULL
;
169 static char *arg_network_bridge
= NULL
;
170 static unsigned long arg_personality
= PERSONALITY_INVALID
;
171 static char *arg_image
= NULL
;
172 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
173 static ExposePort
*arg_expose_ports
= NULL
;
174 static char **arg_property
= NULL
;
175 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
176 static bool arg_userns
= false;
177 static int arg_kill_signal
= 0;
178 static bool arg_unified_cgroup_hierarchy
= false;
179 static SettingsMask arg_settings_mask
= 0;
180 static int arg_settings_trusted
= -1;
181 static char **arg_parameters
= NULL
;
182 static const char *arg_container_service_name
= "systemd-nspawn";
184 static void help(void) {
185 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
186 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
187 " -h --help Show this help\n"
188 " --version Print version string\n"
189 " -q --quiet Do not show status information\n"
190 " -D --directory=PATH Root directory for the container\n"
191 " --template=PATH Initialize root directory from template directory,\n"
193 " -x --ephemeral Run container with snapshot of root directory, and\n"
194 " remove it after exit\n"
195 " -i --image=PATH File system device or disk image for the container\n"
196 " -b --boot Boot up full system (i.e. invoke init)\n"
197 " -u --user=USER Run the command under specified user or uid\n"
198 " -M --machine=NAME Set the machine name for the container\n"
199 " --uuid=UUID Set a specific machine UUID for the container\n"
200 " -S --slice=SLICE Place the container in the specified slice\n"
201 " --property=NAME=VALUE Set scope unit property\n"
202 " --private-users[=UIDBASE[:NUIDS]]\n"
203 " Run within user namespace\n"
204 " --private-network Disable network in container\n"
205 " --network-interface=INTERFACE\n"
206 " Assign an existing network interface to the\n"
208 " --network-macvlan=INTERFACE\n"
209 " Create a macvlan network interface based on an\n"
210 " existing network interface to the container\n"
211 " --network-ipvlan=INTERFACE\n"
212 " Create a ipvlan network interface based on an\n"
213 " existing network interface to the container\n"
214 " -n --network-veth Add a virtual Ethernet connection between host\n"
216 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
217 " Add an additional virtual Ethernet link between\n"
218 " host and container\n"
219 " --network-bridge=INTERFACE\n"
220 " Add a virtual Ethernet connection between host\n"
221 " and container and add it to an existing bridge on\n"
223 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
224 " Expose a container IP port on the host\n"
225 " -Z --selinux-context=SECLABEL\n"
226 " Set the SELinux security context to be used by\n"
227 " processes in the container\n"
228 " -L --selinux-apifs-context=SECLABEL\n"
229 " Set the SELinux security context to be used by\n"
230 " API/tmpfs file systems in the container\n"
231 " --capability=CAP In addition to the default, retain specified\n"
233 " --drop-capability=CAP Drop the specified capability from the default set\n"
234 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
235 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
236 " try-guest, try-host\n"
237 " -j Equivalent to --link-journal=try-guest\n"
238 " --read-only Mount the root directory read-only\n"
239 " --bind=PATH[:PATH[:OPTIONS]]\n"
240 " Bind mount a file or directory from the host into\n"
242 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
243 " Similar, but creates a read-only bind mount\n"
244 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
245 " --overlay=PATH[:PATH...]:PATH\n"
246 " Create an overlay mount from the host to \n"
248 " --overlay-ro=PATH[:PATH...]:PATH\n"
249 " Similar, but creates a read-only overlay mount\n"
250 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
251 " --share-system Share system namespaces with host\n"
252 " --register=BOOLEAN Register container as machine\n"
253 " --keep-unit Do not register a scope for the machine, reuse\n"
254 " the service unit nspawn is running in\n"
255 " --volatile[=MODE] Run the system in volatile mode\n"
256 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
257 , program_invocation_short_name
);
261 static int custom_mounts_prepare(void) {
265 /* Ensure the mounts are applied prefix first. */
266 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
268 /* Allocate working directories for the overlay file systems that need it */
269 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
270 CustomMount
*m
= &arg_custom_mounts
[i
];
272 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
273 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
277 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
286 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
288 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
294 static int detect_unified_cgroup_hierarchy(void) {
298 /* Allow the user to control whether the unified hierarchy is used */
299 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
301 r
= parse_boolean(e
);
303 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
305 arg_unified_cgroup_hierarchy
= r
;
309 /* Otherwise inherit the default from the host system */
312 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
314 arg_unified_cgroup_hierarchy
= r
;
318 static int parse_argv(int argc
, char *argv
[]) {
337 ARG_NETWORK_INTERFACE
,
341 ARG_NETWORK_VETH_EXTRA
,
351 static const struct option options
[] = {
352 { "help", no_argument
, NULL
, 'h' },
353 { "version", no_argument
, NULL
, ARG_VERSION
},
354 { "directory", required_argument
, NULL
, 'D' },
355 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
356 { "ephemeral", no_argument
, NULL
, 'x' },
357 { "user", required_argument
, NULL
, 'u' },
358 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
359 { "boot", no_argument
, NULL
, 'b' },
360 { "uuid", required_argument
, NULL
, ARG_UUID
},
361 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
362 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
363 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
364 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
365 { "bind", required_argument
, NULL
, ARG_BIND
},
366 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
367 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
368 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
369 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
370 { "machine", required_argument
, NULL
, 'M' },
371 { "slice", required_argument
, NULL
, 'S' },
372 { "setenv", required_argument
, NULL
, ARG_SETENV
},
373 { "selinux-context", required_argument
, NULL
, 'Z' },
374 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
375 { "quiet", no_argument
, NULL
, 'q' },
376 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
377 { "register", required_argument
, NULL
, ARG_REGISTER
},
378 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
379 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
380 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
381 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
382 { "network-veth", no_argument
, NULL
, 'n' },
383 { "network-veth-extra", required_argument
, NULL
, ARG_NETWORK_VETH_EXTRA
},
384 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
385 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
386 { "image", required_argument
, NULL
, 'i' },
387 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
388 { "port", required_argument
, NULL
, 'p' },
389 { "property", required_argument
, NULL
, ARG_PROPERTY
},
390 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
391 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
392 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
398 uint64_t plus
= 0, minus
= 0;
399 bool mask_all_settings
= false, mask_no_settings
= false;
404 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
416 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
422 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
428 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
434 arg_ephemeral
= true;
438 r
= free_and_strdup(&arg_user
, optarg
);
442 arg_settings_mask
|= SETTING_USER
;
445 case ARG_NETWORK_BRIDGE
:
446 r
= free_and_strdup(&arg_network_bridge
, optarg
);
453 arg_network_veth
= true;
454 arg_private_network
= true;
455 arg_settings_mask
|= SETTING_NETWORK
;
458 case ARG_NETWORK_VETH_EXTRA
:
459 r
= veth_extra_parse(&arg_network_veth_extra
, optarg
);
461 return log_error_errno(r
, "Failed to parse --network-veth-extra= parameter: %s", optarg
);
463 arg_private_network
= true;
464 arg_settings_mask
|= SETTING_NETWORK
;
467 case ARG_NETWORK_INTERFACE
:
468 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
471 arg_private_network
= true;
472 arg_settings_mask
|= SETTING_NETWORK
;
475 case ARG_NETWORK_MACVLAN
:
476 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
479 arg_private_network
= true;
480 arg_settings_mask
|= SETTING_NETWORK
;
483 case ARG_NETWORK_IPVLAN
:
484 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
489 case ARG_PRIVATE_NETWORK
:
490 arg_private_network
= true;
491 arg_settings_mask
|= SETTING_NETWORK
;
496 arg_settings_mask
|= SETTING_BOOT
;
500 r
= sd_id128_from_string(optarg
, &arg_uuid
);
502 log_error("Invalid UUID: %s", optarg
);
506 arg_settings_mask
|= SETTING_MACHINE_ID
;
515 arg_machine
= mfree(arg_machine
);
517 if (!machine_name_is_valid(optarg
)) {
518 log_error("Invalid machine name: %s", optarg
);
522 r
= free_and_strdup(&arg_machine
, optarg
);
530 arg_selinux_context
= optarg
;
534 arg_selinux_apifs_context
= optarg
;
538 arg_read_only
= true;
539 arg_settings_mask
|= SETTING_READ_ONLY
;
543 case ARG_DROP_CAPABILITY
: {
546 _cleanup_free_
char *t
= NULL
;
548 r
= extract_first_word(&p
, &t
, ",", 0);
550 return log_error_errno(r
, "Failed to parse capability %s.", t
);
555 if (streq(t
, "all")) {
556 if (c
== ARG_CAPABILITY
)
557 plus
= (uint64_t) -1;
559 minus
= (uint64_t) -1;
563 cap
= capability_from_name(t
);
565 log_error("Failed to parse capability %s.", t
);
569 if (c
== ARG_CAPABILITY
)
570 plus
|= 1ULL << (uint64_t) cap
;
572 minus
|= 1ULL << (uint64_t) cap
;
576 arg_settings_mask
|= SETTING_CAPABILITY
;
581 arg_link_journal
= LINK_GUEST
;
582 arg_link_journal_try
= true;
585 case ARG_LINK_JOURNAL
:
586 if (streq(optarg
, "auto")) {
587 arg_link_journal
= LINK_AUTO
;
588 arg_link_journal_try
= false;
589 } else if (streq(optarg
, "no")) {
590 arg_link_journal
= LINK_NO
;
591 arg_link_journal_try
= false;
592 } else if (streq(optarg
, "guest")) {
593 arg_link_journal
= LINK_GUEST
;
594 arg_link_journal_try
= false;
595 } else if (streq(optarg
, "host")) {
596 arg_link_journal
= LINK_HOST
;
597 arg_link_journal_try
= false;
598 } else if (streq(optarg
, "try-guest")) {
599 arg_link_journal
= LINK_GUEST
;
600 arg_link_journal_try
= true;
601 } else if (streq(optarg
, "try-host")) {
602 arg_link_journal
= LINK_HOST
;
603 arg_link_journal_try
= true;
605 log_error("Failed to parse link journal mode %s", optarg
);
613 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
615 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
617 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
621 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
623 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
625 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
629 case ARG_OVERLAY_RO
: {
630 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
631 _cleanup_strv_free_
char **lower
= NULL
;
636 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
640 log_error("Invalid overlay specification: %s", optarg
);
644 STRV_FOREACH(i
, lower
) {
645 if (!path_is_absolute(*i
)) {
646 log_error("Overlay path %s is not absolute.", *i
);
654 log_error("--overlay= needs at least two colon-separated directories specified.");
659 /* If two parameters are specified,
660 * the first one is the lower, the
661 * second one the upper directory. And
662 * we'll also define the destination
663 * mount point the same as the upper. */
667 destination
= strdup(upper
);
672 upper
= lower
[n
- 2];
673 destination
= lower
[n
- 1];
677 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
681 m
->destination
= destination
;
684 m
->read_only
= c
== ARG_OVERLAY_RO
;
686 upper
= destination
= NULL
;
689 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
696 if (!env_assignment_is_valid(optarg
)) {
697 log_error("Environment variable assignment '%s' is not valid.", optarg
);
701 n
= strv_env_set(arg_setenv
, optarg
);
705 strv_free(arg_setenv
);
708 arg_settings_mask
|= SETTING_ENVIRONMENT
;
716 case ARG_SHARE_SYSTEM
:
717 arg_share_system
= true;
721 r
= parse_boolean(optarg
);
723 log_error("Failed to parse --register= argument: %s", optarg
);
731 arg_keep_unit
= true;
734 case ARG_PERSONALITY
:
736 arg_personality
= personality_from_string(optarg
);
737 if (arg_personality
== PERSONALITY_INVALID
) {
738 log_error("Unknown or unsupported personality '%s'.", optarg
);
742 arg_settings_mask
|= SETTING_PERSONALITY
;
748 arg_volatile_mode
= VOLATILE_YES
;
752 m
= volatile_mode_from_string(optarg
);
754 log_error("Failed to parse --volatile= argument: %s", optarg
);
757 arg_volatile_mode
= m
;
760 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
764 r
= expose_port_parse(&arg_expose_ports
, optarg
);
766 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
768 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
770 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
774 if (strv_extend(&arg_property
, optarg
) < 0)
779 case ARG_PRIVATE_USERS
:
781 _cleanup_free_
char *buffer
= NULL
;
782 const char *range
, *shift
;
784 range
= strchr(optarg
, ':');
786 buffer
= strndup(optarg
, range
- optarg
);
792 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
793 log_error("Failed to parse UID range: %s", range
);
799 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
800 log_error("Failed to parse UID: %s", optarg
);
808 case ARG_KILL_SIGNAL
:
809 arg_kill_signal
= signal_from_string_try_harder(optarg
);
810 if (arg_kill_signal
< 0) {
811 log_error("Cannot parse signal: %s", optarg
);
815 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
820 /* no → do not read files
821 * yes → read files, do not override cmdline, trust only subset
822 * override → read files, override cmdline, trust only subset
823 * trusted → read files, do not override cmdline, trust all
826 r
= parse_boolean(optarg
);
828 if (streq(optarg
, "trusted")) {
829 mask_all_settings
= false;
830 mask_no_settings
= false;
831 arg_settings_trusted
= true;
833 } else if (streq(optarg
, "override")) {
834 mask_all_settings
= false;
835 mask_no_settings
= true;
836 arg_settings_trusted
= -1;
838 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
841 mask_all_settings
= false;
842 mask_no_settings
= false;
843 arg_settings_trusted
= -1;
846 mask_all_settings
= true;
847 mask_no_settings
= false;
848 arg_settings_trusted
= false;
857 assert_not_reached("Unhandled option");
860 if (arg_share_system
)
861 arg_register
= false;
863 if (arg_boot
&& arg_share_system
) {
864 log_error("--boot and --share-system may not be combined.");
868 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
869 log_error("--keep-unit may not be used when invoked from a user session.");
873 if (arg_directory
&& arg_image
) {
874 log_error("--directory= and --image= may not be combined.");
878 if (arg_template
&& arg_image
) {
879 log_error("--template= and --image= may not be combined.");
883 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
884 log_error("--template= needs --directory= or --machine=.");
888 if (arg_ephemeral
&& arg_template
) {
889 log_error("--ephemeral and --template= may not be combined.");
893 if (arg_ephemeral
&& arg_image
) {
894 log_error("--ephemeral and --image= may not be combined.");
898 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
899 log_error("--ephemeral and --link-journal= may not be combined.");
903 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
904 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
907 arg_parameters
= strv_copy(argv
+ optind
);
911 arg_settings_mask
|= SETTING_BOOT
;
914 /* Load all settings from .nspawn files */
915 if (mask_no_settings
)
916 arg_settings_mask
= 0;
918 /* Don't load any settings from .nspawn files */
919 if (mask_all_settings
)
920 arg_settings_mask
= _SETTINGS_MASK_ALL
;
922 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
924 r
= detect_unified_cgroup_hierarchy();
928 e
= getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
930 arg_container_service_name
= e
;
935 static int verify_arguments(void) {
937 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
938 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
942 if (arg_expose_ports
&& !arg_private_network
) {
943 log_error("Cannot use --port= without private networking.");
947 if (arg_boot
&& arg_kill_signal
<= 0)
948 arg_kill_signal
= SIGRTMIN
+3;
953 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
959 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
962 if (uid
!= UID_INVALID
) {
963 uid
+= arg_uid_shift
;
965 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
969 if (gid
!= GID_INVALID
) {
970 gid
+= (gid_t
) arg_uid_shift
;
972 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
976 if (lchown(p
, uid
, gid
) < 0)
/* Creates the directory path below root with the given mode and hands it to
 * userns_lchown() for the requested owner.
 *
 * An already existing directory counts as success and is left untouched.
 * Returns 0 on success or a negative errno-style error. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
995 static int setup_timezone(const char *dest
) {
996 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
997 const char *where
, *check
, *what
;
1003 /* Fix the timezone, if possible */
1004 r
= readlink_malloc("/etc/localtime", &p
);
1006 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1010 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1012 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1014 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1018 where
= prefix_roota(dest
, "/etc/localtime");
1019 r
= readlink_malloc(where
, &q
);
1021 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1023 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1025 /* Already pointing to the right place? Then do nothing .. */
1026 if (y
&& streq(y
, z
))
1030 check
= strjoina("/usr/share/zoneinfo/", z
);
1031 check
= prefix_root(dest
, check
);
1032 if (laccess(check
, F_OK
) < 0) {
1033 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1038 if (r
< 0 && errno
!= ENOENT
) {
1039 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1043 what
= strjoina("../usr/share/zoneinfo/", z
);
1044 if (symlink(what
, where
) < 0) {
1045 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1049 r
= userns_lchown(where
, 0, 0);
1051 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1056 static int setup_resolv_conf(const char *dest
) {
1057 const char *where
= NULL
;
1062 if (arg_private_network
)
1065 /* Fix resolv.conf, if possible */
1066 where
= prefix_roota(dest
, "/etc/resolv.conf");
1068 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1070 /* If the file already exists as symlink, let's
1071 * suppress the warning, under the assumption that
1072 * resolved or something similar runs inside and the
1073 * symlink points there.
1075 * If the disk image is read-only, there's also no
1076 * point in complaining.
1078 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1079 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1083 r
= userns_lchown(where
, 0, 0);
1085 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1090 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1094 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1095 SD_ID128_FORMAT_VAL(id
));
1100 static int setup_boot_id(const char *dest
) {
1101 const char *from
, *to
;
1102 sd_id128_t rnd
= {};
1106 if (arg_share_system
)
1109 /* Generate a new randomized boot ID, so that each boot-up of
1110 * the container gets a new one */
1112 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1113 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1115 r
= sd_id128_randomize(&rnd
);
1117 return log_error_errno(r
, "Failed to generate random boot id: %m");
1119 id128_format_as_uuid(rnd
, as_uuid
);
1121 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1123 return log_error_errno(r
, "Failed to write boot id: %m");
1125 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1126 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1127 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1128 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1134 static int copy_devnodes(const char *dest
) {
1136 static const char devnodes
[] =
1147 _cleanup_umask_ mode_t u
;
1153 /* Create /dev/net, so that we can create /dev/net/tun in it */
1154 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1155 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1157 NULSTR_FOREACH(d
, devnodes
) {
1158 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1161 from
= strappend("/dev/", d
);
1162 to
= prefix_root(dest
, from
);
1164 if (stat(from
, &st
) < 0) {
1166 if (errno
!= ENOENT
)
1167 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1169 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1171 log_error("%s is not a char or block device, cannot copy.", from
);
1175 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1177 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1179 /* Some systems abusively restrict mknod but
1180 * allow bind mounts. */
1183 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1184 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1185 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1188 r
= userns_lchown(to
, 0, 0);
1190 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1197 static int setup_pts(const char *dest
) {
1198 _cleanup_free_
char *options
= NULL
;
1203 if (arg_selinux_apifs_context
)
1204 (void) asprintf(&options
,
1205 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1206 arg_uid_shift
+ TTY_GID
,
1207 arg_selinux_apifs_context
);
1210 (void) asprintf(&options
,
1211 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1212 arg_uid_shift
+ TTY_GID
);
1217 /* Mount /dev/pts itself */
1218 p
= prefix_roota(dest
, "/dev/pts");
1219 if (mkdir(p
, 0755) < 0)
1220 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1221 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1222 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1223 r
= userns_lchown(p
, 0, 0);
1225 return log_error_errno(r
, "Failed to chown /dev/pts: %m");
1227 /* Create /dev/ptmx symlink */
1228 p
= prefix_roota(dest
, "/dev/ptmx");
1229 if (symlink("pts/ptmx", p
) < 0)
1230 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1231 r
= userns_lchown(p
, 0, 0);
1233 return log_error_errno(r
, "Failed to chown /dev/ptmx: %m");
1235 /* And fix /dev/pts/ptmx ownership */
1236 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1237 r
= userns_lchown(p
, 0, 0);
1239 return log_error_errno(r
, "Failed to chown /dev/pts/ptmx: %m");
1244 static int setup_dev_console(const char *dest
, const char *console
) {
1245 _cleanup_umask_ mode_t u
;
1254 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1256 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1258 /* We need to bind mount the right tty to /dev/console since
1259 * ptys can only exist on pts file systems. To have something
1260 * to bind mount things on we create a empty regular file. */
1262 to
= prefix_roota(dest
, "/dev/console");
1265 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1267 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1268 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
1273 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1274 const char *from
, *to
;
1275 _cleanup_umask_ mode_t u
;
1278 assert(kmsg_socket
>= 0);
1282 /* We create the kmsg FIFO as /run/kmsg, but immediately
1283 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1284 * on the reading side behave very similar to /proc/kmsg,
1285 * their writing side behaves differently from /dev/kmsg in
1286 * that writing blocks when nothing is reading. In order to
1287 * avoid any problems with containers deadlocking due to this
1288 * we simply make /dev/kmsg unavailable to the container. */
1289 from
= prefix_roota(dest
, "/run/kmsg");
1290 to
= prefix_roota(dest
, "/proc/kmsg");
1292 if (mkfifo(from
, 0600) < 0)
1293 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1294 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1295 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1297 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1299 return log_error_errno(errno
, "Failed to open fifo: %m");
1301 /* Store away the fd in the socket, so that it stays open as
1302 * long as we run the child */
1303 r
= send_one_fd(kmsg_socket
, fd
, 0);
1307 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1309 /* And now make the FIFO unavailable as /run/kmsg... */
1310 (void) unlink(from
);
1315 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1316 union in_addr_union
*exposed
= userdata
;
1322 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
1326 static int setup_hostname(void) {
1328 if (arg_share_system
)
1331 if (sethostname_idempotent(arg_machine
) < 0)
1337 static int setup_journal(const char *directory
) {
1338 sd_id128_t machine_id
, this_id
;
1339 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1340 const char *etc_machine_id
, *p
, *q
;
1344 /* Don't link journals in ephemeral mode */
1348 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1350 r
= read_one_line_file(etc_machine_id
, &b
);
1351 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1354 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1357 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1360 /* Verify validity */
1361 r
= sd_id128_from_string(id
, &machine_id
);
1363 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1365 r
= sd_id128_get_machine(&this_id
);
1367 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1369 if (sd_id128_equal(machine_id
, this_id
)) {
1370 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1371 "Host and machine ids are equal (%s): refusing to link journals", id
);
1372 if (arg_link_journal
== LINK_AUTO
)
1377 if (arg_link_journal
== LINK_NO
)
1380 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1382 return log_error_errno(r
, "Failed to create /var: %m");
1384 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1386 return log_error_errno(r
, "Failed to create /var/log: %m");
1388 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1390 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1392 p
= strjoina("/var/log/journal/", id
);
1393 q
= prefix_roota(directory
, p
);
1395 if (path_is_mount_point(p
, 0) > 0) {
1396 if (arg_link_journal
!= LINK_AUTO
) {
1397 log_error("%s: already a mount point, refusing to use for journal", p
);
1404 if (path_is_mount_point(q
, 0) > 0) {
1405 if (arg_link_journal
!= LINK_AUTO
) {
1406 log_error("%s: already a mount point, refusing to use for journal", q
);
1413 r
= readlink_and_make_absolute(p
, &d
);
1415 if ((arg_link_journal
== LINK_GUEST
||
1416 arg_link_journal
== LINK_AUTO
) &&
1419 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1421 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1426 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1427 } else if (r
== -EINVAL
) {
1429 if (arg_link_journal
== LINK_GUEST
&&
1432 if (errno
== ENOTDIR
) {
1433 log_error("%s already exists and is neither a symlink nor a directory", p
);
1436 return log_error_errno(errno
, "Failed to remove %s: %m", p
);
1438 } else if (r
!= -ENOENT
)
1439 return log_error_errno(r
, "readlink(%s) failed: %m", p
);
1441 if (arg_link_journal
== LINK_GUEST
) {
1443 if (symlink(q
, p
) < 0) {
1444 if (arg_link_journal_try
) {
1445 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1448 return log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1451 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1453 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1457 if (arg_link_journal
== LINK_HOST
) {
1458 /* don't create parents here -- if the host doesn't have
1459 * permanent journal set up, don't force it here */
1462 if (arg_link_journal_try
) {
1463 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1466 return log_error_errno(errno
, "Failed to create %s: %m", p
);
1469 } else if (access(p
, F_OK
) < 0)
1472 if (dir_is_empty(q
) == 0)
1473 log_warning("%s is not empty, proceeding anyway.", q
);
1475 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1477 return log_error_errno(r
, "Failed to create %s: %m", q
);
1479 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1480 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1485 static int drop_capabilities(void) {
1486 return capability_bounding_set_drop(~arg_retain
, false);
1489 static int reset_audit_loginuid(void) {
1490 _cleanup_free_
char *p
= NULL
;
1493 if (arg_share_system
)
1496 r
= read_one_line_file("/proc/self/loginuid", &p
);
1500 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1502 /* Already reset? */
1503 if (streq(p
, "4294967295"))
1506 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1509 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1510 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1511 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1512 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1513 "using systemd-nspawn. Sleeping for 5s... (%m)");
1521 static int setup_seccomp(void) {
1524 static const struct {
1525 uint64_t capability
;
1528 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1529 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1530 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1531 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1532 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1533 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1534 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1535 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1536 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1537 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1540 scmp_filter_ctx seccomp
;
1544 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1548 r
= seccomp_add_secondary_archs(seccomp
);
1550 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
1554 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1555 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1558 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1560 continue; /* unknown syscall */
1562 log_error_errno(r
, "Failed to block syscall: %m");
1569 Audit is broken in containers, much of the userspace audit
1570 hookup will fail if running inside a container. We don't
1571 care and just turn off creation of audit sockets.
1573 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1574 with EAFNOSUPPORT which audit userspace uses as indication
1575 that audit is disabled in the kernel.
1578 r
= seccomp_rule_add(
1580 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1583 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1584 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1586 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1590 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1592 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1596 r
= seccomp_load(seccomp
);
1598 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1603 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1608 seccomp_release(seccomp
);
1616 static int setup_propagate(const char *root
) {
1620 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1621 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1622 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1623 (void) mkdir_p(p
, 0600);
1625 r
= userns_mkdir(root
, "/run/systemd", 0755, 0, 0);
1627 return log_error_errno(r
, "Failed to create /run/systemd: %m");
1629 r
= userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0);
1631 return log_error_errno(r
, "Failed to create /run/systemd/nspawn: %m");
1633 r
= userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1635 return log_error_errno(r
, "Failed to create /run/systemd/nspawn/incoming: %m");
1637 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1638 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1639 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1641 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1642 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1647 static int setup_image(char **device_path
, int *loop_nr
) {
1648 struct loop_info64 info
= {
1649 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1651 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1652 _cleanup_free_
char* loopdev
= NULL
;
1656 assert(device_path
);
1660 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1662 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1664 if (fstat(fd
, &st
) < 0)
1665 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1667 if (S_ISBLK(st
.st_mode
)) {
1670 p
= strdup(arg_image
);
1684 if (!S_ISREG(st
.st_mode
)) {
1685 log_error("%s is not a regular file or block device.", arg_image
);
1689 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1691 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1693 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1695 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1697 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1700 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1702 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1704 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1705 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1708 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1710 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1711 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1713 *device_path
= loopdev
;
1724 #define PARTITION_TABLE_BLURB \
1725 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1726 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1727 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1728 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1729 "to be bootable with systemd-nspawn."
1731 static int dissect_image(
1733 char **root_device
, bool *root_device_rw
,
1734 char **home_device
, bool *home_device_rw
,
1735 char **srv_device
, bool *srv_device_rw
,
1739 int home_nr
= -1, srv_nr
= -1;
1740 #ifdef GPT_ROOT_NATIVE
1743 #ifdef GPT_ROOT_SECONDARY
1744 int secondary_root_nr
= -1;
1746 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1747 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1748 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1749 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1750 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1751 struct udev_list_entry
*first
, *item
;
1752 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1753 bool is_gpt
, is_mbr
, multiple_generic
= false;
1754 const char *pttype
= NULL
;
1761 assert(root_device
);
1762 assert(home_device
);
1767 b
= blkid_new_probe();
1772 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1777 return log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1780 blkid_probe_enable_partitions(b
, 1);
1781 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1784 r
= blkid_do_safeprobe(b
);
1785 if (r
== -2 || r
== 1) {
1786 log_error("Failed to identify any partition table on\n"
1788 PARTITION_TABLE_BLURB
, arg_image
);
1790 } else if (r
!= 0) {
1793 return log_error_errno(errno
, "Failed to probe: %m");
1796 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1798 is_gpt
= streq_ptr(pttype
, "gpt");
1799 is_mbr
= streq_ptr(pttype
, "dos");
1801 if (!is_gpt
&& !is_mbr
) {
1802 log_error("No GPT or MBR partition table discovered on\n"
1804 PARTITION_TABLE_BLURB
, arg_image
);
1809 pl
= blkid_probe_get_partitions(b
);
1814 log_error("Failed to list partitions of %s", arg_image
);
1822 if (fstat(fd
, &st
) < 0)
1823 return log_error_errno(errno
, "Failed to stat block device: %m");
1825 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1833 log_error("Kernel partitions never appeared.");
1837 e
= udev_enumerate_new(udev
);
1841 r
= udev_enumerate_add_match_parent(e
, d
);
1845 r
= udev_enumerate_scan_devices(e
);
1847 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1849 /* Count the partitions enumerated by the kernel */
1851 first
= udev_enumerate_get_list_entry(e
);
1852 udev_list_entry_foreach(item
, first
)
1855 /* Count the partitions enumerated by blkid */
1856 m
= blkid_partlist_numof_partitions(pl
);
1860 log_error("blkid and kernel partition list do not match.");
1866 /* The kernel has probed fewer partitions than
1867 * blkid? Maybe the kernel prober is still
1868 * running or it got EBUSY because udev
1869 * already opened the device. Let's reprobe
1870 * the device, which is a synchronous call
1871 * that waits until probing is complete. */
1873 for (j
= 0; j
< 20; j
++) {
1875 r
= ioctl(fd
, BLKRRPART
, 0);
1878 if (r
>= 0 || r
!= -EBUSY
)
1881 /* If something else has the device
1882 * open, such as an udev rule, the
1883 * ioctl will return EBUSY. Since
1884 * there's no way to wait until it
1885 * isn't busy anymore, let's just wait
1886 * a bit, and try again.
1888 * This is really something they
1889 * should fix in the kernel! */
1891 usleep(50 * USEC_PER_MSEC
);
1895 return log_error_errno(r
, "Failed to reread partition table: %m");
1898 e
= udev_enumerate_unref(e
);
1901 first
= udev_enumerate_get_list_entry(e
);
1902 udev_list_entry_foreach(item
, first
) {
1903 _cleanup_udev_device_unref_
struct udev_device
*q
;
1905 unsigned long long flags
;
1911 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1916 return log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1919 qn
= udev_device_get_devnum(q
);
1923 if (st
.st_rdev
== qn
)
1926 node
= udev_device_get_devnode(q
);
1930 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1934 flags
= blkid_partition_get_flags(pp
);
1936 nr
= blkid_partition_get_partno(pp
);
1944 if (flags
& GPT_FLAG_NO_AUTO
)
1947 stype
= blkid_partition_get_type_string(pp
);
1951 if (sd_id128_from_string(stype
, &type_id
) < 0)
1954 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1956 if (home
&& nr
>= home_nr
)
1960 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1962 r
= free_and_strdup(&home
, node
);
1966 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
1968 if (srv
&& nr
>= srv_nr
)
1972 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1974 r
= free_and_strdup(&srv
, node
);
1978 #ifdef GPT_ROOT_NATIVE
1979 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
1981 if (root
&& nr
>= root_nr
)
1985 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1987 r
= free_and_strdup(&root
, node
);
1992 #ifdef GPT_ROOT_SECONDARY
1993 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
1995 if (secondary_root
&& nr
>= secondary_root_nr
)
1998 secondary_root_nr
= nr
;
1999 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2001 r
= free_and_strdup(&secondary_root
, node
);
2006 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2009 multiple_generic
= true;
2011 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2013 r
= free_and_strdup(&generic
, node
);
2019 } else if (is_mbr
) {
2022 if (flags
!= 0x80) /* Bootable flag */
2025 type
= blkid_partition_get_type(pp
);
2026 if (type
!= 0x83) /* Linux partition */
2030 multiple_generic
= true;
2034 r
= free_and_strdup(&root
, node
);
2042 *root_device
= root
;
2045 *root_device_rw
= root_rw
;
2047 } else if (secondary_root
) {
2048 *root_device
= secondary_root
;
2049 secondary_root
= NULL
;
2051 *root_device_rw
= secondary_root_rw
;
2053 } else if (generic
) {
2055 /* There were no partitions with precise meanings
2056 * around, but we found generic partitions. In this
2057 * case, if there's only one, we can go ahead and boot
2058 * it, otherwise we bail out, because we really cannot
2059 * make any sense of it. */
2061 if (multiple_generic
) {
2062 log_error("Identified multiple bootable Linux partitions on\n"
2064 PARTITION_TABLE_BLURB
, arg_image
);
2068 *root_device
= generic
;
2071 *root_device_rw
= generic_rw
;
2074 log_error("Failed to identify root partition in disk image\n"
2076 PARTITION_TABLE_BLURB
, arg_image
);
2081 *home_device
= home
;
2084 *home_device_rw
= home_rw
;
2091 *srv_device_rw
= srv_rw
;
2096 log_error("--image= is not supported, compiled without blkid support.");
2101 static int mount_device(const char *what
, const char *where
, const char *directory
, bool rw
) {
2103 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
2104 const char *fstype
, *p
;
2114 p
= strjoina(where
, directory
);
2119 b
= blkid_new_probe_from_filename(what
);
2123 return log_error_errno(errno
, "Failed to allocate prober for %s: %m", what
);
2126 blkid_probe_enable_superblocks(b
, 1);
2127 blkid_probe_set_superblocks_flags(b
, BLKID_SUBLKS_TYPE
);
2130 r
= blkid_do_safeprobe(b
);
2131 if (r
== -1 || r
== 1) {
2132 log_error("Cannot determine file system type of %s", what
);
2134 } else if (r
!= 0) {
2137 return log_error_errno(errno
, "Failed to probe %s: %m", what
);
2141 if (blkid_probe_lookup_value(b
, "TYPE", &fstype
, NULL
) < 0) {
2144 log_error("Failed to determine file system type of %s", what
);
2148 if (streq(fstype
, "crypto_LUKS")) {
2149 log_error("nspawn currently does not support LUKS disk images.");
2153 if (mount(what
, p
, fstype
, MS_NODEV
|(rw
? 0 : MS_RDONLY
), NULL
) < 0)
2154 return log_error_errno(errno
, "Failed to mount %s: %m", what
);
2158 log_error("--image= is not supported, compiled without blkid support.");
2163 static int mount_devices(
2165 const char *root_device
, bool root_device_rw
,
2166 const char *home_device
, bool home_device_rw
,
2167 const char *srv_device
, bool srv_device_rw
) {
2173 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2175 return log_error_errno(r
, "Failed to mount root directory: %m");
2179 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2181 return log_error_errno(r
, "Failed to mount home directory: %m");
2185 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2187 return log_error_errno(r
, "Failed to mount server data directory: %m");
2193 static void loop_remove(int nr
, int *image_fd
) {
2194 _cleanup_close_
int control
= -1;
2200 if (image_fd
&& *image_fd
>= 0) {
2201 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2203 log_debug_errno(errno
, "Failed to close loop image: %m");
2204 *image_fd
= safe_close(*image_fd
);
2207 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2209 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2213 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2215 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2220 * < 0 : wait_for_terminate() failed to get the state of the
2221 * container, the container was terminated by a signal, or
2222 * failed for an unknown reason. No change is made to the
2223 * container argument.
2224 * > 0 : The program executed in the container terminated with an
2225 * error. The exit code of the program executed in the
2226 * container is returned. The container argument has been set
2227 * to CONTAINER_TERMINATED.
2228 * 0 : The container is being rebooted, has been shut down or exited
2229 * successfully. The container argument has been set to either
2230 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2232 * That is, success is indicated by a return value of zero, and an
2233 * error is indicated by a non-zero value.
2235 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2239 r
= wait_for_terminate(pid
, &status
);
2241 return log_warning_errno(r
, "Failed to wait for container: %m");
2243 switch (status
.si_code
) {
2246 if (status
.si_status
== 0) {
2247 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2250 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2252 *container
= CONTAINER_TERMINATED
;
2253 return status
.si_status
;
2256 if (status
.si_status
== SIGINT
) {
2258 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2259 *container
= CONTAINER_TERMINATED
;
2262 } else if (status
.si_status
== SIGHUP
) {
2264 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2265 *container
= CONTAINER_REBOOTED
;
2269 /* CLD_KILLED fallthrough */
2272 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2276 log_error("Container %s failed due to unknown reason.", arg_machine
);
2283 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2286 pid
= PTR_TO_UINT32(userdata
);
2288 if (kill(pid
, arg_kill_signal
) >= 0) {
2289 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2290 sd_event_source_set_userdata(s
, NULL
);
2295 sd_event_exit(sd_event_source_get_event(s
), 0);
2299 static int determine_names(void) {
2302 if (arg_template
&& !arg_directory
&& arg_machine
) {
2304 /* If --template= was specified then we should not
2305 * search for a machine, but instead create a new one
2306 * in /var/lib/machines. */
2308 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2313 if (!arg_image
&& !arg_directory
) {
2315 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2317 r
= image_find(arg_machine
, &i
);
2319 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2321 log_error("No image for machine '%s': %m", arg_machine
);
2325 if (i
->type
== IMAGE_RAW
)
2326 r
= free_and_strdup(&arg_image
, i
->path
);
2328 r
= free_and_strdup(&arg_directory
, i
->path
);
2330 return log_error_errno(r
, "Invalid image directory: %m");
2333 arg_read_only
= arg_read_only
|| i
->read_only
;
2335 arg_directory
= get_current_dir_name();
2337 if (!arg_directory
&& !arg_machine
) {
2338 log_error("Failed to determine path, please use -D or -i.");
2344 if (arg_directory
&& path_equal(arg_directory
, "/"))
2345 arg_machine
= gethostname_malloc();
2347 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2352 hostname_cleanup(arg_machine
);
2353 if (!machine_name_is_valid(arg_machine
)) {
2354 log_error("Failed to determine machine name automatically, please use -M.");
2358 if (arg_ephemeral
) {
2361 /* Add a random suffix when this is an
2362 * ephemeral machine, so that we can run many
2363 * instances at once without manually having
2364 * to specify -M each time. */
2366 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
2377 static int determine_uid_shift(const char *directory
) {
2385 if (arg_uid_shift
== UID_INVALID
) {
2388 r
= stat(directory
, &st
);
2390 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2392 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2394 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2395 log_error("UID and GID base of %s don't match.", directory
);
2399 arg_uid_range
= UINT32_C(0x10000);
2402 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2403 log_error("UID base too high for UID range.");
2407 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
2411 static int inner_child(
2413 const char *directory
,
2419 _cleanup_free_
char *home
= NULL
;
2421 const char *envp
[] = {
2422 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2423 NULL
, /* container */
2428 NULL
, /* container_uuid */
2429 NULL
, /* LISTEN_FDS */
2430 NULL
, /* LISTEN_PID */
2434 _cleanup_strv_free_
char **env_use
= NULL
;
2439 assert(kmsg_socket
>= 0);
2444 /* Tell the parent that it can now write the UID map. */
2445 (void) barrier_place(barrier
); /* #1 */
2447 /* Wait until the parent wrote the UID map */
2448 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2449 log_error("Parent died too early");
2454 r
= mount_all(NULL
, arg_userns
, true, arg_uid_shift
, arg_private_network
, arg_uid_range
, arg_selinux_apifs_context
);
2458 r
= mount_sysfs(NULL
);
2462 /* Wait until we are cgroup-ified, so that we
2463 * can mount the right cgroup path writable */
2464 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2465 log_error("Parent died too early");
2469 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2473 r
= reset_uid_gid();
2475 return log_error_errno(r
, "Couldn't become new root: %m");
2477 r
= setup_boot_id(NULL
);
2481 r
= setup_kmsg(NULL
, kmsg_socket
);
2484 kmsg_socket
= safe_close(kmsg_socket
);
2489 return log_error_errno(errno
, "setsid() failed: %m");
2491 if (arg_private_network
)
2494 if (arg_expose_ports
) {
2495 r
= expose_port_send_rtnl(rtnl_socket
);
2498 rtnl_socket
= safe_close(rtnl_socket
);
2501 r
= drop_capabilities();
2503 return log_error_errno(r
, "drop_capabilities() failed: %m");
2507 if (arg_personality
!= PERSONALITY_INVALID
) {
2508 if (personality(arg_personality
) < 0)
2509 return log_error_errno(errno
, "personality() failed: %m");
2510 } else if (secondary
) {
2511 if (personality(PER_LINUX32
) < 0)
2512 return log_error_errno(errno
, "personality() failed: %m");
2516 if (arg_selinux_context
)
2517 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2518 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2521 r
= change_uid_gid(arg_user
, &home
);
2525 /* LXC sets container=lxc, so follow the scheme here */
2526 envp
[n_env
++] = strjoina("container=", arg_container_service_name
);
2528 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2532 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2533 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2534 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2537 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2540 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2544 if (fdset_size(fds
) > 0) {
2545 r
= fdset_cloexec(fds
, false);
2547 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2549 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2550 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2554 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2558 /* Let the parent know that we are ready and
2559 * wait until the parent is ready with the
2561 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2562 log_error("Parent died too early");
2566 /* Now, explicitly close the log, so that we
2567 * then can close all remaining fds. Closing
2568 * the log explicitly first has the benefit
2569 * that the logging subsystem knows about it,
2570 * and is thus ready to be reopened should we
2571 * need it again. Note that the other fds
2572 * closed here are at least the locking and
2575 (void) fdset_close_others(fds
);
2581 /* Automatically search for the init system */
2583 m
= 1 + strv_length(arg_parameters
);
2584 a
= newa(char*, m
+ 1);
2585 if (strv_isempty(arg_parameters
))
2588 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
2590 a
[0] = (char*) "/usr/lib/systemd/systemd";
2591 execve(a
[0], a
, env_use
);
2593 a
[0] = (char*) "/lib/systemd/systemd";
2594 execve(a
[0], a
, env_use
);
2596 a
[0] = (char*) "/sbin/init";
2597 execve(a
[0], a
, env_use
);
2598 } else if (!strv_isempty(arg_parameters
))
2599 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2601 chdir(home
?: "/root");
2602 execle("/bin/bash", "-bash", NULL
, env_use
);
2603 execle("/bin/sh", "-sh", NULL
, env_use
);
2608 return log_error_errno(r
, "execv() failed: %m");
2611 static int outer_child(
2613 const char *directory
,
2614 const char *console
,
2615 const char *root_device
, bool root_device_rw
,
2616 const char *home_device
, bool home_device_rw
,
2617 const char *srv_device
, bool srv_device_rw
,
2623 int uid_shift_socket
,
2633 assert(pid_socket
>= 0);
2634 assert(kmsg_socket
>= 0);
2638 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2639 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
2642 close_nointr(STDIN_FILENO
);
2643 close_nointr(STDOUT_FILENO
);
2644 close_nointr(STDERR_FILENO
);
2646 r
= open_terminal(console
, O_RDWR
);
2647 if (r
!= STDIN_FILENO
) {
2653 return log_error_errno(r
, "Failed to open console: %m");
2656 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2657 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2658 return log_error_errno(errno
, "Failed to duplicate console: %m");
2661 r
= reset_audit_loginuid();
2665 /* Mark everything as slave, so that we still
2666 * receive mounts from the real root, but don't
2667 * propagate mounts to the real root. */
2668 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2669 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2671 r
= mount_devices(directory
,
2672 root_device
, root_device_rw
,
2673 home_device
, home_device_rw
,
2674 srv_device
, srv_device_rw
);
2678 r
= determine_uid_shift(directory
);
2683 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2685 return log_error_errno(errno
, "Failed to send UID shift: %m");
2686 if (l
!= sizeof(arg_uid_shift
)) {
2687 log_error("Short write while sending UID shift.");
2692 /* Turn directory into bind mount */
2693 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2694 return log_error_errno(errno
, "Failed to make bind mount: %m");
2696 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2700 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2704 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2708 if (arg_read_only
) {
2709 r
= bind_remount_recursive(directory
, true);
2711 return log_error_errno(r
, "Failed to make tree read-only: %m");
2714 r
= mount_all(directory
, arg_userns
, false, arg_private_network
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2718 r
= copy_devnodes(directory
);
2722 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2724 r
= setup_pts(directory
);
2728 r
= setup_propagate(directory
);
2732 r
= setup_dev_console(directory
, console
);
2736 r
= setup_seccomp();
2740 r
= setup_timezone(directory
);
2744 r
= setup_resolv_conf(directory
);
2748 r
= setup_journal(directory
);
2752 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2756 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2760 r
= mount_move_root(directory
);
2762 return log_error_errno(r
, "Failed to move root directory: %m");
2764 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2765 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2766 (arg_private_network
? CLONE_NEWNET
: 0) |
2767 (arg_userns
? CLONE_NEWUSER
: 0),
2770 return log_error_errno(errno
, "Failed to fork inner child: %m");
2772 pid_socket
= safe_close(pid_socket
);
2773 uid_shift_socket
= safe_close(uid_shift_socket
);
2775 /* The inner child has all namespaces that are
2776 * requested, so that we all are owned by the user if
2777 * user namespaces are turned on. */
2779 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2781 _exit(EXIT_FAILURE
);
2783 _exit(EXIT_SUCCESS
);
2786 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2788 return log_error_errno(errno
, "Failed to send PID: %m");
2789 if (l
!= sizeof(pid
)) {
2790 log_error("Short write while sending PID.");
2794 pid_socket
= safe_close(pid_socket
);
2795 kmsg_socket
= safe_close(kmsg_socket
);
2796 rtnl_socket
= safe_close(rtnl_socket
);
2801 static int setup_uid_map(pid_t pid
) {
2802 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2807 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2808 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2809 r
= write_string_file(uid_map
, line
, 0);
2811 return log_error_errno(r
, "Failed to write UID map: %m");
2813 /* We always assign the same UID and GID ranges */
2814 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2815 r
= write_string_file(uid_map
, line
, 0);
2817 return log_error_errno(r
, "Failed to write GID map: %m");
/* Looks for a "<arg_machine>.nspawn" settings file, loads it with
 * settings_load() and copies its values into the arg_* globals.
 *
 * Search order visible here: /etc/systemd/nspawn and /run/systemd/nspawn
 * first, then a file next to arg_image or, failing that, next to
 * arg_directory. When arg_settings_trusted is still negative (undecided) it
 * is set to true for the first two locations and to false for the latter.
 *
 * Every group of settings is only taken over when its SETTING_* bit is clear
 * in arg_settings_mask. For Capability=, MachineID=, the mount settings, the
 * network settings and Port= a warning is logged when the file is not
 * trusted.
 *
 * NOTE(review): many original lines are not visible in this excerpt (the
 * declarations of `fn`, `i`, `r` and `plus`, the fopen() calls, allocation
 * checks, `else` branches, closing braces and return statements), so the
 * exact control flow must be confirmed against the complete file. */
2822 static int load_settings(void) {
2823 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2824 _cleanup_fclose_
FILE *f
= NULL
;
2825 _cleanup_free_
char *p
= NULL
;
2829 /* If all settings are masked, there's no point in looking for
2830 * the settings file */
2831 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* File name that is searched for in every candidate directory. */
2834 fn
= strjoina(arg_machine
, ".nspawn");
2836 /* We first look in the admin's directories in /etc and /run */
2837 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2838 _cleanup_free_
char *j
= NULL
;
/* Candidate path: "<directory>/<fn>". */
2840 j
= strjoin(i
, "/", fn
, NULL
);
2849 /* By default, we trust configuration from /etc and /run */
2850 if (arg_settings_trusted
< 0)
2851 arg_settings_trusted
= true;
/* A missing file (ENOENT) is not treated as an error here; any other errno is logged. */
2856 if (errno
!= ENOENT
)
2857 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2861 /* After that, let's look for a file next to the
2862 * actual image we shall boot. */
2865 p
= file_in_same_dir(arg_image
, fn
);
2868 } else if (arg_directory
) {
2869 p
= file_in_same_dir(arg_directory
, fn
);
2876 if (!f
&& errno
!= ENOENT
)
2877 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2879 /* By default, we do not trust configuration from /var/lib/machines */
2880 if (arg_settings_trusted
< 0)
2881 arg_settings_trusted
= false;
2888 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
/* Parse the opened file `f` (path `p`) into `settings`. */
2890 r
= settings_load(f
, p
, &settings
);
2894 /* Copy over bits from the settings, unless they have been
2895 * explicitly masked by command line switches. */
/* Boot: take over the flag and move the parameter list into arg_parameters.
 * The settings field is reset to NULL after the move - presumably so that the
 * settings_freep cleanup does not free the list now referenced by the global. */
2897 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
2898 settings
->boot
>= 0) {
2899 arg_boot
= settings
->boot
;
2901 strv_free(arg_parameters
);
2902 arg_parameters
= settings
->parameters
;
2903 settings
->parameters
= NULL
;
/* Environment: replace arg_setenv with the list from the file (same move pattern). */
2906 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2907 settings
->environment
) {
2908 strv_free(arg_setenv
);
2909 arg_setenv
= settings
->environment
;
2910 settings
->environment
= NULL
;
/* User: move the user name from the settings into arg_user. */
2913 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2916 arg_user
= settings
->user
;
2917 settings
->user
= NULL
;
/* Capabilities: `plus` starts as the capability set from the file and gets
 * CAP_NET_ADMIN added when the file asks for a private network. The file's
 * drop_capability set is cleared from arg_retain below. */
2920 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
2923 plus
= settings
->capability
;
2924 if (settings_private_network(settings
))
2925 plus
|= (1ULL << CAP_NET_ADMIN
);
/* The warning is only printed when the file itself listed capabilities, not
 * when `plus` is non-zero merely because of the private network addition. */
2927 if (!arg_settings_trusted
&& plus
!= 0) {
2928 if (settings
->capability
!= 0)
2929 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
2933 arg_retain
&= ~settings
->drop_capability
;
/* Kill signal: only values greater than zero are taken over. */
2936 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2937 settings
->kill_signal
> 0)
2938 arg_kill_signal
= settings
->kill_signal
;
/* Personality: PERSONALITY_INVALID is skipped. */
2940 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2941 settings
->personality
!= PERSONALITY_INVALID
)
2942 arg_personality
= settings
->personality
;
/* Machine ID: considered only when the ID in the file is not the null ID. */
2944 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2945 !sd_id128_is_null(settings
->machine_id
)) {
2947 if (!arg_settings_trusted
)
2948 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
2950 arg_uuid
= settings
->machine_id
;
/* Read-only flag: negative values are skipped. */
2953 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
2954 settings
->read_only
>= 0)
2955 arg_read_only
= settings
->read_only
;
/* Volatile mode: _VOLATILE_MODE_INVALID is skipped. */
2957 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
2958 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
2959 arg_volatile_mode
= settings
->volatile_mode
;
/* Custom mounts: the existing array is freed and replaced by the one from the
 * file; pointer and counter in the settings are reset after the move. */
2961 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
2962 settings
->n_custom_mounts
> 0) {
2964 if (!arg_settings_trusted
)
2965 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
2967 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
2968 arg_custom_mounts
= settings
->custom_mounts
;
2969 arg_n_custom_mounts
= settings
->n_custom_mounts
;
2971 settings
->custom_mounts
= NULL
;
2972 settings
->n_custom_mounts
= 0;
/* Network: handled as one group as soon as any of the network related fields
 * is set in the file. */
2976 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
2977 (settings
->private_network
>= 0 ||
2978 settings
->network_veth
>= 0 ||
2979 settings
->network_bridge
||
2980 settings
->network_interfaces
||
2981 settings
->network_macvlan
||
2982 settings
->network_ipvlan
||
2983 settings
->network_veth_extra
)) {
2985 if (!arg_settings_trusted
)
2986 log_warning("Ignoring network settings, file %s is not trusted.", p
);
/* The two flags are derived through helper functions rather than copied from the fields. */
2988 arg_network_veth
= settings_network_veth(settings
);
2989 arg_private_network
= settings_private_network(settings
);
/* Each list below is freed, replaced by the one from the file and the settings field is cleared. */
2991 strv_free(arg_network_interfaces
);
2992 arg_network_interfaces
= settings
->network_interfaces
;
2993 settings
->network_interfaces
= NULL
;
2995 strv_free(arg_network_macvlan
);
2996 arg_network_macvlan
= settings
->network_macvlan
;
2997 settings
->network_macvlan
= NULL
;
2999 strv_free(arg_network_ipvlan
);
3000 arg_network_ipvlan
= settings
->network_ipvlan
;
3001 settings
->network_ipvlan
= NULL
;
3003 strv_free(arg_network_veth_extra
);
3004 arg_network_veth_extra
= settings
->network_veth_extra
;
3005 settings
->network_veth_extra
= NULL
;
3007 free(arg_network_bridge
);
3008 arg_network_bridge
= settings
->network_bridge
;
3009 settings
->network_bridge
= NULL
;
/* Exposed ports: the existing list is freed and replaced by the one from the file. */
3013 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3014 settings
->expose_ports
) {
3016 if (!arg_settings_trusted
)
3017 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3019 expose_port_free_all(arg_expose_ports
);
3020 arg_expose_ports
= settings
->expose_ports
;
3021 settings
->expose_ports
= NULL
;
/* Program entry point.
 *
 * Steps visible in this excerpt, in order:
 *  - parse the command line, require an effective UID of 0, then call
 *    determine_names(), load_settings() and verify_arguments();
 *  - collect file descriptors announced by sd_listen_fds() in an FDSet;
 *  - prepare the root: lock the directory or image, optionally snapshot it
 *    (arg_ephemeral / arg_template), or set up and dissect a disk image;
 *  - allocate a pseudo tty, block the signals handled later, become a child
 *    subreaper;
 *  - create the barrier and the socket pairs, clone the outer child, receive
 *    the PID of the inner child and the UID shift from it;
 *  - set up networking, machine registration and cgroups, then run an
 *    sd_event loop with a PTY forwarder until the container ends;
 *  - release all resources and return EXIT_FAILURE when `r` is negative,
 *    `ret` otherwise.
 *
 * NOTE(review): many original lines are not visible in this excerpt
 * (conditions, braces, labels, goto/return statements, several declarations
 * and the closing brace), so the exact control flow has to be confirmed
 * against the complete file. */
3028 int main(int argc
, char *argv
[]) {
3030 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3031 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3032 _cleanup_close_
int master
= -1, image_fd
= -1;
3033 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3034 int r
, n_fd_passed
, loop_nr
= -1;
3035 char veth_name
[IFNAMSIZ
];
3036 bool secondary
= false, remove_subvol
= false;
3039 int ret
= EXIT_SUCCESS
;
3040 union in_addr_union exposed
= {};
3041 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3044 log_parse_environment();
3047 r
= parse_argv(argc
, argv
);
3051 if (geteuid() != 0) {
3052 log_error("Need to be root.");
3056 r
= determine_names();
3060 r
= load_settings();
3064 r
= verify_arguments();
/* Take over file descriptors announced via sd_listen_fds() into `fds`. */
3068 n_fd_passed
= sd_listen_fds(false);
3069 if (n_fd_passed
> 0) {
3070 r
= fdset_new_listen_fds(&fds
, false);
3072 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory based root. */
3077 if (arg_directory
) {
3080 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3081 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3086 if (arg_ephemeral
) {
3087 _cleanup_free_
char *np
= NULL
;
3089 /* If the specified path is a mount point we
3090 * generate the new snapshot immediately
3091 * inside it under a random name. However if
3092 * the specified is not a mount point we
3093 * create the new snapshot in the parent
3094 * directory, just next to it. */
3095 r
= path_is_mount_point(arg_directory
, 0);
3097 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3101 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3103 r
= tempfn_random(arg_directory
, "machine.", &np
);
3105 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* Non-blocking lock on the snapshot path: shared when read-only, exclusive otherwise. */
3109 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3111 log_error_errno(r
, "Failed to lock %s: %m", np
);
3115 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3117 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3121 free(arg_directory
);
/* Remember that a subvolume was created; it is removed again during cleanup. */
3125 remove_subvol
= true;
/* Same locking scheme for a directory that is used in place. */
3128 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3130 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3134 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate the directory from arg_template through a snapshot. */
3139 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3142 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3144 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3148 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity checks on the tree that is about to be used as root. */
3154 if (path_is_os_tree(arg_directory
) <= 0) {
3155 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3162 p
= strjoina(arg_directory
, "/usr/");
3163 if (laccess(p
, F_OK
) < 0) {
3164 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
/* Image based root: lock the image, create a temporary directory from the
 * template below and store its name in arg_directory, then set up and dissect
 * the image. */
3171 char template[] = "/tmp/nspawn-root-XXXXXX";
3174 assert(!arg_template
);
3176 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3178 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3182 r
= log_error_errno(r
, "Failed to create image lock: %m");
3186 if (!mkdtemp(template)) {
3187 log_error_errno(errno
, "Failed to create temporary directory: %m");
3192 arg_directory
= strdup(template);
3193 if (!arg_directory
) {
3198 image_fd
= setup_image(&device_path
, &loop_nr
);
3204 r
= dissect_image(image_fd
,
3205 &root_device
, &root_device_rw
,
3206 &home_device
, &home_device_rw
,
3207 &srv_device
, &srv_device_rw
,
3213 r
= custom_mounts_prepare();
3218 isatty(STDIN_FILENO
) > 0 &&
3219 isatty(STDOUT_FILENO
) > 0;
/* Allocate the pseudo tty master, look up the tty name into `console` and unlock it. */
3221 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3223 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3227 r
= ptsname_malloc(master
, &console
);
3229 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3233 if (unlockpt(master
) < 0) {
3234 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3239 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3240 arg_machine
, arg_image
?: arg_directory
);
/* Block the listed signals; `mask_chld` contains only SIGCHLD and is used
 * further down to unblock and re-block that signal. */
3242 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3244 assert_se(sigemptyset(&mask_chld
) == 0);
3245 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3247 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3248 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* State for one container run: four socket pairs for talking to the children,
 * the container status, the barrier and the SIGCHLD handler description.
 * NOTE(review): a later comment mentions looping again after a reboot, which
 * suggests this is a loop body - the loop statement is not visible here. */
3253 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 }, uid_shift_socket_pair
[2] = { -1, -1 };
3254 ContainerStatus container_status
;
3255 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3256 static const struct sigaction sa
= {
3257 .sa_handler
= nop_signal_handler
,
3258 .sa_flags
= SA_NOCLDSTOP
,
3262 _cleanup_event_unref_ sd_event
*event
= NULL
;
3263 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3264 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3267 r
= barrier_create(&barrier
);
3269 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3273 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3274 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3278 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3279 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3283 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3284 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3289 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3290 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3294 /* Child can be killed before execv(), so handle SIGCHLD
3295 * in order to interrupt parent's blocking calls and
3296 * give it a chance to call wait() and terminate. */
3297 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3299 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3303 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3305 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* Create the outer child; only CLONE_NEWNS is requested at this stage. */
3309 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3311 if (errno
== EINVAL
)
3312 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3314 r
= log_error_errno(errno
, "clone() failed: %m");
3320 /* The outer child only has a file system namespace. */
3321 barrier_set_role(&barrier
, BARRIER_CHILD
);
/* Child side: close the pty master and index 0 of every socket pair, reset
 * signal handling, then run outer_child() with index 1 of the pairs. */
3323 master
= safe_close(master
);
3325 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3326 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3327 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3328 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3330 (void) reset_all_signal_handlers();
3331 (void) reset_signal_mask();
3333 r
= outer_child(&barrier
,
3336 root_device
, root_device_rw
,
3337 home_device
, home_device_rw
,
3338 srv_device
, srv_device_rw
,
3342 kmsg_socket_pair
[1],
3343 rtnl_socket_pair
[1],
3344 uid_shift_socket_pair
[1],
3347 _exit(EXIT_FAILURE
);
3349 _exit(EXIT_SUCCESS
);
/* Parent side: drop the passed fds and close index 1 of every socket pair. */
3352 barrier_set_role(&barrier
, BARRIER_PARENT
);
3354 fds
= fdset_free(fds
);
3356 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3357 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3358 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3359 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3361 /* Wait for the outer child. */
3362 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3371 /* And now retrieve the PID of the inner child. */
/* `pid` is overwritten here: from now on it refers to the inner child. */
3372 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3374 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3377 if (l
!= sizeof(pid
)) {
3378 log_error("Short read while reading inner child PID.");
3383 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3386 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3387 log_error("Child died too early.");
/* Read the UID shift sent by the child and write the UID/GID maps for the
 * inner child.
 * NOTE(review): the condition enclosing this part is not visible here. */
3392 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3394 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3397 if (l
!= sizeof(arg_uid_shift
)) {
3398 log_error("Short read while reading UID shift.");
3403 r
= setup_uid_map(pid
);
3407 (void) barrier_place(&barrier
); /* #2 */
/* Network setup for a private network: move interfaces, create veth links and
 * optionally attach the veth to a bridge, then macvlan and ipvlan links. */
3410 if (arg_private_network
) {
3412 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3416 if (arg_network_veth
) {
3417 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3423 if (arg_network_bridge
) {
3424 r
= setup_bridge(veth_name
, arg_network_bridge
);
3432 r
= setup_veth_extra(arg_machine
, pid
, arg_network_veth_extra
);
3436 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3440 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
/* Machine registration and cgroup setup for the inner child. */
3446 r
= register_machine(
3453 arg_custom_mounts
, arg_n_custom_mounts
,
3457 arg_container_service_name
);
3462 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3466 if (arg_keep_unit
) {
3467 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3472 r
= chown_cgroup(pid
, arg_uid_shift
);
3476 /* Notify the child that the parent is ready with all
3477 * its setup (including cgroup-ification), and that
3478 * the child can now hand over control to the code to
3479 * run inside the container. */
3480 (void) barrier_place(&barrier
); /* #3 */
3482 /* Block SIGCHLD here, before notifying child.
3483 * process_pty() will handle it with the other signals. */
3484 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3486 /* Reset signal to default */
3487 r
= default_signals(SIGCHLD
, -1);
3489 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3493 /* Let the child know that we are ready and wait that the child is completely ready now. */
3494 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3495 log_error("Child died too early.");
3502 "STATUS=Container running.\n"
3503 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
/* Event loop setup: signal sources, optional port exposure, PTY forwarding. */
3505 r
= sd_event_new(&event
);
3507 log_error_errno(r
, "Failed to get default event source: %m");
3511 if (arg_kill_signal
> 0) {
3512 /* Try to kill the init system on SIGINT or SIGTERM */
/* The PID of the inner child is handed to the handler as userdata. */
3513 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3514 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3516 /* Immediately exit */
3517 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3518 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3521 /* simply exit on sigchld */
3522 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3524 if (arg_expose_ports
) {
3525 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3529 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3532 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* Forward the pty; the forwarder is created read-only when not interactive. */
3534 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
3536 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3540 r
= sd_event_loop(event
);
3542 log_error_errno(r
, "Failed to run event loop: %m");
/* Remember the last forwarded character before the forwarder is freed; it is
 * compared with a newline below. */
3546 pty_forward_get_last_char(forward
, &last_char
);
3548 forward
= pty_forward_free(forward
);
3550 if (!arg_quiet
&& last_char
!= '\n')
3553 /* Kill if it is not dead yet anyway */
3554 if (arg_register
&& !arg_keep_unit
)
3555 terminate_machine(pid
);
3557 /* Normally redundant, but better safe than sorry */
3560 r
= wait_for_container(pid
, &container_status
);
3564 /* We failed to wait for the container, or the
3565 * container exited abnormally */
3567 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3568 /* The container exited with a non-zero
3569 * status, or with zero status and no reboot
3575 /* CONTAINER_REBOOTED, loop again */
3577 if (arg_keep_unit
) {
3578 /* Special handling if we are running as a
3579 * service: instead of simply restarting the
3580 * machine we want to restart the entire
3581 * service, so let's inform systemd about this
3582 * with the special exit code 133. The service
3583 * file uses RestartForceExitStatus=133 so
3584 * that this results in a full nspawn
3585 * restart. This is necessary since we might
3586 * have cgroup parameters set we want to have
3593 expose_port_flush(arg_expose_ports
, &exposed
);
3599 "STATUS=Terminating...");
3604 /* Try to flush whatever is still queued in the pty */
3606 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
/* Final cleanup: detach the loop device, remove an ephemeral subvolume, remove
 * the propagation directory, flush exposed ports and free the arg_* globals. */
3608 loop_remove(loop_nr
, &image_fd
);
3610 if (remove_subvol
&& arg_directory
) {
3613 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
3615 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
3621 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3622 (void) rm_rf(p
, REMOVE_ROOT
);
3625 expose_port_flush(arg_expose_ports
, &exposed
);
3627 free(arg_directory
);
3632 strv_free(arg_setenv
);
3633 free(arg_network_bridge
);
3634 strv_free(arg_network_interfaces
);
3635 strv_free(arg_network_macvlan
);
3636 strv_free(arg_network_ipvlan
);
3637 strv_free(arg_network_veth_extra
);
3638 strv_free(arg_parameters
);
3639 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3640 expose_port_free_all(arg_expose_ports
);
/* A negative `r` always maps to EXIT_FAILURE; otherwise `ret` is returned. */
3642 return r
< 0 ? EXIT_FAILURE
: ret
;