1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <blkid/blkid.h>
27 #include <linux/loop.h>
33 #include <selinux/selinux.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
46 #include "sd-daemon.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
54 #include "capability.h"
55 #include "cgroup-util.h"
57 #include "dev-setup.h"
59 #include "event-util.h"
63 #include "formats-util.h"
65 #include "hostname-util.h"
67 #include "loopback-setup.h"
68 #include "machine-image.h"
72 #include "netlink-util.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-expose-ports.h"
75 #include "nspawn-mount.h"
76 #include "nspawn-network.h"
77 #include "nspawn-register.h"
78 #include "nspawn-settings.h"
79 #include "nspawn-setuid.h"
80 #include "path-util.h"
81 #include "process-util.h"
83 #include "random-util.h"
86 #include "seccomp-util.h"
88 #include "signal-util.h"
89 #include "string-util.h"
91 #include "terminal-util.h"
92 #include "udev-util.h"
95 typedef enum ContainerStatus
{
100 typedef enum LinkJournal
{
107 static char *arg_directory
= NULL
;
108 static char *arg_template
= NULL
;
109 static char *arg_user
= NULL
;
110 static sd_id128_t arg_uuid
= {};
111 static char *arg_machine
= NULL
;
112 static const char *arg_selinux_context
= NULL
;
113 static const char *arg_selinux_apifs_context
= NULL
;
114 static const char *arg_slice
= NULL
;
115 static bool arg_private_network
= false;
116 static bool arg_read_only
= false;
117 static bool arg_boot
= false;
118 static bool arg_ephemeral
= false;
119 static LinkJournal arg_link_journal
= LINK_AUTO
;
120 static bool arg_link_journal_try
= false;
121 static uint64_t arg_retain
=
122 (1ULL << CAP_CHOWN
) |
123 (1ULL << CAP_DAC_OVERRIDE
) |
124 (1ULL << CAP_DAC_READ_SEARCH
) |
125 (1ULL << CAP_FOWNER
) |
126 (1ULL << CAP_FSETID
) |
127 (1ULL << CAP_IPC_OWNER
) |
129 (1ULL << CAP_LEASE
) |
130 (1ULL << CAP_LINUX_IMMUTABLE
) |
131 (1ULL << CAP_NET_BIND_SERVICE
) |
132 (1ULL << CAP_NET_BROADCAST
) |
133 (1ULL << CAP_NET_RAW
) |
134 (1ULL << CAP_SETGID
) |
135 (1ULL << CAP_SETFCAP
) |
136 (1ULL << CAP_SETPCAP
) |
137 (1ULL << CAP_SETUID
) |
138 (1ULL << CAP_SYS_ADMIN
) |
139 (1ULL << CAP_SYS_CHROOT
) |
140 (1ULL << CAP_SYS_NICE
) |
141 (1ULL << CAP_SYS_PTRACE
) |
142 (1ULL << CAP_SYS_TTY_CONFIG
) |
143 (1ULL << CAP_SYS_RESOURCE
) |
144 (1ULL << CAP_SYS_BOOT
) |
145 (1ULL << CAP_AUDIT_WRITE
) |
146 (1ULL << CAP_AUDIT_CONTROL
) |
148 static CustomMount
*arg_custom_mounts
= NULL
;
149 static unsigned arg_n_custom_mounts
= 0;
150 static char **arg_setenv
= NULL
;
151 static bool arg_quiet
= false;
152 static bool arg_share_system
= false;
153 static bool arg_register
= true;
154 static bool arg_keep_unit
= false;
155 static char **arg_network_interfaces
= NULL
;
156 static char **arg_network_macvlan
= NULL
;
157 static char **arg_network_ipvlan
= NULL
;
158 static bool arg_network_veth
= false;
159 static char *arg_network_bridge
= NULL
;
160 static unsigned long arg_personality
= PERSONALITY_INVALID
;
161 static char *arg_image
= NULL
;
162 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
163 static ExposePort
*arg_expose_ports
= NULL
;
164 static char **arg_property
= NULL
;
165 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
166 static bool arg_userns
= false;
167 static int arg_kill_signal
= 0;
168 static bool arg_unified_cgroup_hierarchy
= false;
169 static SettingsMask arg_settings_mask
= 0;
170 static int arg_settings_trusted
= -1;
171 static char **arg_parameters
= NULL
;
173 static void help(void) {
174 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
175 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
176 " -h --help Show this help\n"
177 " --version Print version string\n"
178 " -q --quiet Do not show status information\n"
179 " -D --directory=PATH Root directory for the container\n"
180 " --template=PATH Initialize root directory from template directory,\n"
182 " -x --ephemeral Run container with snapshot of root directory, and\n"
183 " remove it after exit\n"
184 " -i --image=PATH File system device or disk image for the container\n"
185 " -b --boot Boot up full system (i.e. invoke init)\n"
186 " -u --user=USER Run the command under specified user or uid\n"
187 " -M --machine=NAME Set the machine name for the container\n"
188 " --uuid=UUID Set a specific machine UUID for the container\n"
189 " -S --slice=SLICE Place the container in the specified slice\n"
190 " --property=NAME=VALUE Set scope unit property\n"
191 " --private-users[=UIDBASE[:NUIDS]]\n"
192 " Run within user namespace\n"
193 " --private-network Disable network in container\n"
194 " --network-interface=INTERFACE\n"
195 " Assign an existing network interface to the\n"
197 " --network-macvlan=INTERFACE\n"
198 " Create a macvlan network interface based on an\n"
199 " existing network interface to the container\n"
200 " --network-ipvlan=INTERFACE\n"
201 " Create a ipvlan network interface based on an\n"
202 " existing network interface to the container\n"
203 " -n --network-veth Add a virtual ethernet connection between host\n"
205 " --network-bridge=INTERFACE\n"
206 " Add a virtual ethernet connection between host\n"
207 " and container and add it to an existing bridge on\n"
209 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
210 " Expose a container IP port on the host\n"
211 " -Z --selinux-context=SECLABEL\n"
212 " Set the SELinux security context to be used by\n"
213 " processes in the container\n"
214 " -L --selinux-apifs-context=SECLABEL\n"
215 " Set the SELinux security context to be used by\n"
216 " API/tmpfs file systems in the container\n"
217 " --capability=CAP In addition to the default, retain specified\n"
219 " --drop-capability=CAP Drop the specified capability from the default set\n"
220 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
221 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
222 " try-guest, try-host\n"
223 " -j Equivalent to --link-journal=try-guest\n"
224 " --read-only Mount the root directory read-only\n"
225 " --bind=PATH[:PATH[:OPTIONS]]\n"
226 " Bind mount a file or directory from the host into\n"
228 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
229 " Similar, but creates a read-only bind mount\n"
230 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
231 " --overlay=PATH[:PATH...]:PATH\n"
232 " Create an overlay mount from the host to \n"
234 " --overlay-ro=PATH[:PATH...]:PATH\n"
235 " Similar, but creates a read-only overlay mount\n"
236 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
237 " --share-system Share system namespaces with host\n"
238 " --register=BOOLEAN Register container as machine\n"
239 " --keep-unit Do not register a scope for the machine, reuse\n"
240 " the service unit nspawn is running in\n"
241 " --volatile[=MODE] Run the system in volatile mode\n"
242 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
243 , program_invocation_short_name
);
247 static int custom_mounts_prepare(void) {
251 /* Ensure the mounts are applied prefix first. */
252 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
254 /* Allocate working directories for the overlay file systems that need it */
255 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
256 CustomMount
*m
= &arg_custom_mounts
[i
];
258 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
259 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
263 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
272 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
274 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
280 static int detect_unified_cgroup_hierarchy(void) {
284 /* Allow the user to control whether the unified hierarchy is used */
285 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
287 r
= parse_boolean(e
);
289 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
291 arg_unified_cgroup_hierarchy
= r
;
295 /* Otherwise inherit the default from the host system */
298 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
300 arg_unified_cgroup_hierarchy
= r
;
304 static int parse_argv(int argc
, char *argv
[]) {
323 ARG_NETWORK_INTERFACE
,
336 static const struct option options
[] = {
337 { "help", no_argument
, NULL
, 'h' },
338 { "version", no_argument
, NULL
, ARG_VERSION
},
339 { "directory", required_argument
, NULL
, 'D' },
340 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
341 { "ephemeral", no_argument
, NULL
, 'x' },
342 { "user", required_argument
, NULL
, 'u' },
343 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
344 { "boot", no_argument
, NULL
, 'b' },
345 { "uuid", required_argument
, NULL
, ARG_UUID
},
346 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
347 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
348 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
349 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
350 { "bind", required_argument
, NULL
, ARG_BIND
},
351 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
352 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
353 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
354 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
355 { "machine", required_argument
, NULL
, 'M' },
356 { "slice", required_argument
, NULL
, 'S' },
357 { "setenv", required_argument
, NULL
, ARG_SETENV
},
358 { "selinux-context", required_argument
, NULL
, 'Z' },
359 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
360 { "quiet", no_argument
, NULL
, 'q' },
361 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
362 { "register", required_argument
, NULL
, ARG_REGISTER
},
363 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
364 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
365 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
366 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
367 { "network-veth", no_argument
, NULL
, 'n' },
368 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
369 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
370 { "image", required_argument
, NULL
, 'i' },
371 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
372 { "port", required_argument
, NULL
, 'p' },
373 { "property", required_argument
, NULL
, ARG_PROPERTY
},
374 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
375 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
376 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
381 uint64_t plus
= 0, minus
= 0;
382 bool mask_all_settings
= false, mask_no_settings
= false;
387 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
399 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
405 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
411 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
417 arg_ephemeral
= true;
421 r
= free_and_strdup(&arg_user
, optarg
);
425 arg_settings_mask
|= SETTING_USER
;
428 case ARG_NETWORK_BRIDGE
:
429 r
= free_and_strdup(&arg_network_bridge
, optarg
);
436 arg_network_veth
= true;
437 arg_private_network
= true;
438 arg_settings_mask
|= SETTING_NETWORK
;
441 case ARG_NETWORK_INTERFACE
:
442 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
445 arg_private_network
= true;
446 arg_settings_mask
|= SETTING_NETWORK
;
449 case ARG_NETWORK_MACVLAN
:
450 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
453 arg_private_network
= true;
454 arg_settings_mask
|= SETTING_NETWORK
;
457 case ARG_NETWORK_IPVLAN
:
458 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
463 case ARG_PRIVATE_NETWORK
:
464 arg_private_network
= true;
465 arg_settings_mask
|= SETTING_NETWORK
;
470 arg_settings_mask
|= SETTING_BOOT
;
474 r
= sd_id128_from_string(optarg
, &arg_uuid
);
476 log_error("Invalid UUID: %s", optarg
);
480 arg_settings_mask
|= SETTING_MACHINE_ID
;
489 arg_machine
= mfree(arg_machine
);
491 if (!machine_name_is_valid(optarg
)) {
492 log_error("Invalid machine name: %s", optarg
);
496 r
= free_and_strdup(&arg_machine
, optarg
);
504 arg_selinux_context
= optarg
;
508 arg_selinux_apifs_context
= optarg
;
512 arg_read_only
= true;
513 arg_settings_mask
|= SETTING_READ_ONLY
;
517 case ARG_DROP_CAPABILITY
: {
518 const char *state
, *word
;
521 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
522 _cleanup_free_
char *t
;
524 t
= strndup(word
, length
);
528 if (streq(t
, "all")) {
529 if (c
== ARG_CAPABILITY
)
530 plus
= (uint64_t) -1;
532 minus
= (uint64_t) -1;
536 cap
= capability_from_name(t
);
538 log_error("Failed to parse capability %s.", t
);
542 if (c
== ARG_CAPABILITY
)
543 plus
|= 1ULL << (uint64_t) cap
;
545 minus
|= 1ULL << (uint64_t) cap
;
549 arg_settings_mask
|= SETTING_CAPABILITY
;
554 arg_link_journal
= LINK_GUEST
;
555 arg_link_journal_try
= true;
558 case ARG_LINK_JOURNAL
:
559 if (streq(optarg
, "auto")) {
560 arg_link_journal
= LINK_AUTO
;
561 arg_link_journal_try
= false;
562 } else if (streq(optarg
, "no")) {
563 arg_link_journal
= LINK_NO
;
564 arg_link_journal_try
= false;
565 } else if (streq(optarg
, "guest")) {
566 arg_link_journal
= LINK_GUEST
;
567 arg_link_journal_try
= false;
568 } else if (streq(optarg
, "host")) {
569 arg_link_journal
= LINK_HOST
;
570 arg_link_journal_try
= false;
571 } else if (streq(optarg
, "try-guest")) {
572 arg_link_journal
= LINK_GUEST
;
573 arg_link_journal_try
= true;
574 } else if (streq(optarg
, "try-host")) {
575 arg_link_journal
= LINK_HOST
;
576 arg_link_journal_try
= true;
578 log_error("Failed to parse link journal mode %s", optarg
);
586 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
588 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
590 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
594 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
596 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
598 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
602 case ARG_OVERLAY_RO
: {
603 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
604 _cleanup_strv_free_
char **lower
= NULL
;
609 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
613 log_error("Invalid overlay specification: %s", optarg
);
617 STRV_FOREACH(i
, lower
) {
618 if (!path_is_absolute(*i
)) {
619 log_error("Overlay path %s is not absolute.", *i
);
627 log_error("--overlay= needs at least two colon-separated directories specified.");
632 /* If two parameters are specified,
633 * the first one is the lower, the
634 * second one the upper directory. And
635 * we'll also define the destination
636 * mount point the same as the upper. */
640 destination
= strdup(upper
);
645 upper
= lower
[n
- 2];
646 destination
= lower
[n
- 1];
650 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
654 m
->destination
= destination
;
657 m
->read_only
= c
== ARG_OVERLAY_RO
;
659 upper
= destination
= NULL
;
662 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
669 if (!env_assignment_is_valid(optarg
)) {
670 log_error("Environment variable assignment '%s' is not valid.", optarg
);
674 n
= strv_env_set(arg_setenv
, optarg
);
678 strv_free(arg_setenv
);
681 arg_settings_mask
|= SETTING_ENVIRONMENT
;
689 case ARG_SHARE_SYSTEM
:
690 arg_share_system
= true;
694 r
= parse_boolean(optarg
);
696 log_error("Failed to parse --register= argument: %s", optarg
);
704 arg_keep_unit
= true;
707 case ARG_PERSONALITY
:
709 arg_personality
= personality_from_string(optarg
);
710 if (arg_personality
== PERSONALITY_INVALID
) {
711 log_error("Unknown or unsupported personality '%s'.", optarg
);
715 arg_settings_mask
|= SETTING_PERSONALITY
;
721 arg_volatile_mode
= VOLATILE_YES
;
725 m
= volatile_mode_from_string(optarg
);
727 log_error("Failed to parse --volatile= argument: %s", optarg
);
730 arg_volatile_mode
= m
;
733 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
737 r
= expose_port_parse(&arg_expose_ports
, optarg
);
739 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
741 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
743 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
747 if (strv_extend(&arg_property
, optarg
) < 0)
752 case ARG_PRIVATE_USERS
:
754 _cleanup_free_
char *buffer
= NULL
;
755 const char *range
, *shift
;
757 range
= strchr(optarg
, ':');
759 buffer
= strndup(optarg
, range
- optarg
);
765 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
766 log_error("Failed to parse UID range: %s", range
);
772 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
773 log_error("Failed to parse UID: %s", optarg
);
781 case ARG_KILL_SIGNAL
:
782 arg_kill_signal
= signal_from_string_try_harder(optarg
);
783 if (arg_kill_signal
< 0) {
784 log_error("Cannot parse signal: %s", optarg
);
788 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
793 /* no → do not read files
794 * yes → read files, do not override cmdline, trust only subset
795 * override → read files, override cmdline, trust only subset
796 * trusted → read files, do not override cmdline, trust all
799 r
= parse_boolean(optarg
);
801 if (streq(optarg
, "trusted")) {
802 mask_all_settings
= false;
803 mask_no_settings
= false;
804 arg_settings_trusted
= true;
806 } else if (streq(optarg
, "override")) {
807 mask_all_settings
= false;
808 mask_no_settings
= true;
809 arg_settings_trusted
= -1;
811 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
814 mask_all_settings
= false;
815 mask_no_settings
= false;
816 arg_settings_trusted
= -1;
819 mask_all_settings
= true;
820 mask_no_settings
= false;
821 arg_settings_trusted
= false;
830 assert_not_reached("Unhandled option");
833 if (arg_share_system
)
834 arg_register
= false;
836 if (arg_boot
&& arg_share_system
) {
837 log_error("--boot and --share-system may not be combined.");
841 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
842 log_error("--keep-unit may not be used when invoked from a user session.");
846 if (arg_directory
&& arg_image
) {
847 log_error("--directory= and --image= may not be combined.");
851 if (arg_template
&& arg_image
) {
852 log_error("--template= and --image= may not be combined.");
856 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
857 log_error("--template= needs --directory= or --machine=.");
861 if (arg_ephemeral
&& arg_template
) {
862 log_error("--ephemeral and --template= may not be combined.");
866 if (arg_ephemeral
&& arg_image
) {
867 log_error("--ephemeral and --image= may not be combined.");
871 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
872 log_error("--ephemeral and --link-journal= may not be combined.");
876 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
877 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
880 arg_parameters
= strv_copy(argv
+ optind
);
884 arg_settings_mask
|= SETTING_BOOT
;
887 /* Load all settings from .nspawn files */
888 if (mask_no_settings
)
889 arg_settings_mask
= 0;
891 /* Don't load any settings from .nspawn files */
892 if (mask_all_settings
)
893 arg_settings_mask
= _SETTINGS_MASK_ALL
;
895 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
897 r
= detect_unified_cgroup_hierarchy();
904 static int verify_arguments(void) {
906 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
907 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
911 if (arg_expose_ports
&& !arg_private_network
) {
912 log_error("Cannot use --port= without private networking.");
916 if (arg_boot
&& arg_kill_signal
<= 0)
917 arg_kill_signal
= SIGRTMIN
+3;
922 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
928 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
931 if (uid
!= UID_INVALID
) {
932 uid
+= arg_uid_shift
;
934 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
938 if (gid
!= GID_INVALID
) {
939 gid
+= (gid_t
) arg_uid_shift
;
941 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
945 if (lchown(p
, uid
, gid
) < 0)
951 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
954 q
= prefix_roota(root
, path
);
955 if (mkdir(q
, mode
) < 0) {
961 return userns_lchown(q
, uid
, gid
);
964 static int setup_timezone(const char *dest
) {
965 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
966 const char *where
, *check
, *what
;
972 /* Fix the timezone, if possible */
973 r
= readlink_malloc("/etc/localtime", &p
);
975 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
979 z
= path_startswith(p
, "../usr/share/zoneinfo/");
981 z
= path_startswith(p
, "/usr/share/zoneinfo/");
983 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
987 where
= prefix_roota(dest
, "/etc/localtime");
988 r
= readlink_malloc(where
, &q
);
990 y
= path_startswith(q
, "../usr/share/zoneinfo/");
992 y
= path_startswith(q
, "/usr/share/zoneinfo/");
994 /* Already pointing to the right place? Then do nothing .. */
995 if (y
&& streq(y
, z
))
999 check
= strjoina("/usr/share/zoneinfo/", z
);
1000 check
= prefix_root(dest
, check
);
1001 if (laccess(check
, F_OK
) < 0) {
1002 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1007 if (r
< 0 && errno
!= ENOENT
) {
1008 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1012 what
= strjoina("../usr/share/zoneinfo/", z
);
1013 if (symlink(what
, where
) < 0) {
1014 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1018 r
= userns_lchown(where
, 0, 0);
1020 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1025 static int setup_resolv_conf(const char *dest
) {
1026 const char *where
= NULL
;
1031 if (arg_private_network
)
1034 /* Fix resolv.conf, if possible */
1035 where
= prefix_roota(dest
, "/etc/resolv.conf");
1037 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1039 /* If the file already exists as symlink, let's
1040 * suppress the warning, under the assumption that
1041 * resolved or something similar runs inside and the
1042 * symlink points there.
1044 * If the disk image is read-only, there's also no
1045 * point in complaining.
1047 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1048 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1052 r
= userns_lchown(where
, 0, 0);
1054 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1059 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1063 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1064 SD_ID128_FORMAT_VAL(id
));
1069 static int setup_boot_id(const char *dest
) {
1070 const char *from
, *to
;
1071 sd_id128_t rnd
= {};
1075 if (arg_share_system
)
1078 /* Generate a new randomized boot ID, so that each boot-up of
1079 * the container gets a new one */
1081 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1082 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1084 r
= sd_id128_randomize(&rnd
);
1086 return log_error_errno(r
, "Failed to generate random boot id: %m");
1088 id128_format_as_uuid(rnd
, as_uuid
);
1090 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1092 return log_error_errno(r
, "Failed to write boot id: %m");
1094 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1095 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1096 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1097 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1103 static int copy_devnodes(const char *dest
) {
1105 static const char devnodes
[] =
1116 _cleanup_umask_ mode_t u
;
1122 /* Create /dev/net, so that we can create /dev/net/tun in it */
1123 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1124 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1126 NULSTR_FOREACH(d
, devnodes
) {
1127 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1130 from
= strappend("/dev/", d
);
1131 to
= prefix_root(dest
, from
);
1133 if (stat(from
, &st
) < 0) {
1135 if (errno
!= ENOENT
)
1136 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1138 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1140 log_error("%s is not a char or block device, cannot copy.", from
);
1144 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1146 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1148 /* Some systems abusively restrict mknod but
1149 * allow bind mounts. */
1152 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1153 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1154 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1157 r
= userns_lchown(to
, 0, 0);
1159 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1166 static int setup_pts(const char *dest
) {
1167 _cleanup_free_
char *options
= NULL
;
1171 if (arg_selinux_apifs_context
)
1172 (void) asprintf(&options
,
1173 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1174 arg_uid_shift
+ TTY_GID
,
1175 arg_selinux_apifs_context
);
1178 (void) asprintf(&options
,
1179 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1180 arg_uid_shift
+ TTY_GID
);
1185 /* Mount /dev/pts itself */
1186 p
= prefix_roota(dest
, "/dev/pts");
1187 if (mkdir(p
, 0755) < 0)
1188 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1189 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1190 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1191 if (userns_lchown(p
, 0, 0) < 0)
1192 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1194 /* Create /dev/ptmx symlink */
1195 p
= prefix_roota(dest
, "/dev/ptmx");
1196 if (symlink("pts/ptmx", p
) < 0)
1197 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1198 if (userns_lchown(p
, 0, 0) < 0)
1199 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1201 /* And fix /dev/pts/ptmx ownership */
1202 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1203 if (userns_lchown(p
, 0, 0) < 0)
1204 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
1209 static int setup_dev_console(const char *dest
, const char *console
) {
1210 _cleanup_umask_ mode_t u
;
1219 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1221 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1223 /* We need to bind mount the right tty to /dev/console since
1224 * ptys can only exist on pts file systems. To have something
1225 * to bind mount things on we create an empty regular file. */
1227 to
= prefix_roota(dest
, "/dev/console");
1230 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1232 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1233 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
1238 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1239 const char *from
, *to
;
1240 _cleanup_umask_ mode_t u
;
1243 assert(kmsg_socket
>= 0);
1247 /* We create the kmsg FIFO as /run/kmsg, but immediately
1248 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1249 * on the reading side behave very similarly to /proc/kmsg,
1250 * their writing side behaves differently from /dev/kmsg in
1251 * that writing blocks when nothing is reading. In order to
1252 * avoid any problems with containers deadlocking due to this
1253 * we simply make /dev/kmsg unavailable to the container. */
1254 from
= prefix_roota(dest
, "/run/kmsg");
1255 to
= prefix_roota(dest
, "/proc/kmsg");
1257 if (mkfifo(from
, 0600) < 0)
1258 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1259 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1260 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1262 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1264 return log_error_errno(errno
, "Failed to open fifo: %m");
1266 /* Store away the fd in the socket, so that it stays open as
1267 * long as we run the child */
1268 r
= send_one_fd(kmsg_socket
, fd
, 0);
1272 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1274 /* And now make the FIFO unavailable as /run/kmsg... */
1275 (void) unlink(from
);
1280 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1281 union in_addr_union
*exposed
= userdata
;
1287 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
1291 static int setup_hostname(void) {
1293 if (arg_share_system
)
1296 if (sethostname_idempotent(arg_machine
) < 0)
1302 static int setup_journal(const char *directory
) {
1303 sd_id128_t machine_id
, this_id
;
1304 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1305 const char *etc_machine_id
, *p
, *q
;
1309 /* Don't link journals in ephemeral mode */
1313 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1315 r
= read_one_line_file(etc_machine_id
, &b
);
1316 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1319 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1322 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1325 /* Verify validity */
1326 r
= sd_id128_from_string(id
, &machine_id
);
1328 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1330 r
= sd_id128_get_machine(&this_id
);
1332 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1334 if (sd_id128_equal(machine_id
, this_id
)) {
1335 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1336 "Host and machine ids are equal (%s): refusing to link journals", id
);
1337 if (arg_link_journal
== LINK_AUTO
)
1342 if (arg_link_journal
== LINK_NO
)
1345 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1347 return log_error_errno(r
, "Failed to create /var: %m");
1349 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1351 return log_error_errno(r
, "Failed to create /var/log: %m");
1353 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1355 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1357 p
= strjoina("/var/log/journal/", id
);
1358 q
= prefix_roota(directory
, p
);
1360 if (path_is_mount_point(p
, 0) > 0) {
1361 if (arg_link_journal
!= LINK_AUTO
) {
1362 log_error("%s: already a mount point, refusing to use for journal", p
);
1369 if (path_is_mount_point(q
, 0) > 0) {
1370 if (arg_link_journal
!= LINK_AUTO
) {
1371 log_error("%s: already a mount point, refusing to use for journal", q
);
1378 r
= readlink_and_make_absolute(p
, &d
);
1380 if ((arg_link_journal
== LINK_GUEST
||
1381 arg_link_journal
== LINK_AUTO
) &&
1384 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1386 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1391 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1392 } else if (r
== -EINVAL
) {
1394 if (arg_link_journal
== LINK_GUEST
&&
1397 if (errno
== ENOTDIR
) {
1398 log_error("%s already exists and is neither a symlink nor a directory", p
);
1401 log_error_errno(errno
, "Failed to remove %s: %m", p
);
1405 } else if (r
!= -ENOENT
) {
1406 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
1410 if (arg_link_journal
== LINK_GUEST
) {
1412 if (symlink(q
, p
) < 0) {
1413 if (arg_link_journal_try
) {
1414 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1417 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1422 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1424 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1428 if (arg_link_journal
== LINK_HOST
) {
1429 /* don't create parents here -- if the host doesn't have
1430 * permanent journal set up, don't force it here */
1433 if (arg_link_journal_try
) {
1434 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1437 log_error_errno(errno
, "Failed to create %s: %m", p
);
1442 } else if (access(p
, F_OK
) < 0)
1445 if (dir_is_empty(q
) == 0)
1446 log_warning("%s is not empty, proceeding anyway.", q
);
1448 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1450 log_error_errno(errno
, "Failed to create %s: %m", q
);
1454 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1455 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1460 static int drop_capabilities(void) {
1461 return capability_bounding_set_drop(~arg_retain
, false);
1464 static int reset_audit_loginuid(void) {
1465 _cleanup_free_
char *p
= NULL
;
1468 if (arg_share_system
)
1471 r
= read_one_line_file("/proc/self/loginuid", &p
);
1475 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1477 /* Already reset? */
1478 if (streq(p
, "4294967295"))
1481 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1484 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1485 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1486 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1487 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1488 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Builds and loads the seccomp filter for the container payload.
 *
 * Every syscall in the table below is made to fail with EPERM unless the
 * capability listed next to it is set in arg_retain. In addition, creating
 * NETLINK_AUDIT sockets is made to fail with EAFNOSUPPORT.
 *
 * Returns 0 on success, when the kernel rejects the filter with EINVAL, and
 * when built without seccomp support; a negative errno-style value
 * otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx seccomp;
        unsigned i;
        int r;

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return log_oom();

        r = seccomp_add_secondary_archs(seccomp);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (i = 0; i < ELEMENTSOF(blacklist); i++) {
                /* The capability is retained, hence leave the syscall alone. */
                if (arg_retain & (1ULL << blacklist[i].capability))
                        continue;

                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        seccomp,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(seccomp);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
                goto finish;
        }
        if (r < 0) {
                log_error_errno(r, "Failed to install seccomp audit filter: %m");
                goto finish;
        }

finish:
        seccomp_release(seccomp);
        return r;
#else
        return 0;
#endif

}
1591 static int setup_propagate(const char *root
) {
1594 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1595 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1596 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1597 (void) mkdir_p(p
, 0600);
1599 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
1600 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
1602 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1603 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
1605 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1606 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
1608 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1609 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1610 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1612 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1613 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1618 static int setup_image(char **device_path
, int *loop_nr
) {
1619 struct loop_info64 info
= {
1620 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1622 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1623 _cleanup_free_
char* loopdev
= NULL
;
1627 assert(device_path
);
1631 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1633 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1635 if (fstat(fd
, &st
) < 0)
1636 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1638 if (S_ISBLK(st
.st_mode
)) {
1641 p
= strdup(arg_image
);
1655 if (!S_ISREG(st
.st_mode
)) {
1656 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
1660 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1662 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1664 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1666 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1668 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1671 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1673 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1675 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1676 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1679 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1681 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1682 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1684 *device_path
= loopdev
;
/* Explanatory text appended to the error messages that are logged when no
 * usable partition can be identified in a disk image. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
1702 static int dissect_image(
1704 char **root_device
, bool *root_device_rw
,
1705 char **home_device
, bool *home_device_rw
,
1706 char **srv_device
, bool *srv_device_rw
,
1710 int home_nr
= -1, srv_nr
= -1;
1711 #ifdef GPT_ROOT_NATIVE
1714 #ifdef GPT_ROOT_SECONDARY
1715 int secondary_root_nr
= -1;
1717 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1718 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1719 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1720 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1721 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1722 struct udev_list_entry
*first
, *item
;
1723 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1724 bool is_gpt
, is_mbr
, multiple_generic
= false;
1725 const char *pttype
= NULL
;
1732 assert(root_device
);
1733 assert(home_device
);
1738 b
= blkid_new_probe();
1743 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1748 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1752 blkid_probe_enable_partitions(b
, 1);
1753 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1756 r
= blkid_do_safeprobe(b
);
1757 if (r
== -2 || r
== 1) {
1758 log_error("Failed to identify any partition table on\n"
1760 PARTITION_TABLE_BLURB
, arg_image
);
1762 } else if (r
!= 0) {
1765 log_error_errno(errno
, "Failed to probe: %m");
1769 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1771 is_gpt
= streq_ptr(pttype
, "gpt");
1772 is_mbr
= streq_ptr(pttype
, "dos");
1774 if (!is_gpt
&& !is_mbr
) {
1775 log_error("No GPT or MBR partition table discovered on\n"
1777 PARTITION_TABLE_BLURB
, arg_image
);
1782 pl
= blkid_probe_get_partitions(b
);
1787 log_error("Failed to list partitions of %s", arg_image
);
1795 if (fstat(fd
, &st
) < 0)
1796 return log_error_errno(errno
, "Failed to stat block device: %m");
1798 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1806 log_error("Kernel partitions never appeared.");
1810 e
= udev_enumerate_new(udev
);
1814 r
= udev_enumerate_add_match_parent(e
, d
);
1818 r
= udev_enumerate_scan_devices(e
);
1820 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1822 /* Count the partitions enumerated by the kernel */
1824 first
= udev_enumerate_get_list_entry(e
);
1825 udev_list_entry_foreach(item
, first
)
1828 /* Count the partitions enumerated by blkid */
1829 m
= blkid_partlist_numof_partitions(pl
);
1833 log_error("blkid and kernel partition list do not match.");
1839 /* The kernel has probed fewer partitions than
1840 * blkid? Maybe the kernel prober is still
1841 * running or it got EBUSY because udev
1842 * already opened the device. Let's reprobe
1843 * the device, which is a synchronous call
1844 * that waits until probing is complete. */
1846 for (j
= 0; j
< 20; j
++) {
1848 r
= ioctl(fd
, BLKRRPART
, 0);
1851 if (r
>= 0 || r
!= -EBUSY
)
1854 /* If something else has the device
1855 * open, such as an udev rule, the
1856 * ioctl will return EBUSY. Since
1857 * there's no way to wait until it
1858 * isn't busy anymore, let's just wait
1859 * a bit, and try again.
1861 * This is really something they
1862 * should fix in the kernel! */
1864 usleep(50 * USEC_PER_MSEC
);
1868 return log_error_errno(r
, "Failed to reread partition table: %m");
1871 e
= udev_enumerate_unref(e
);
1874 first
= udev_enumerate_get_list_entry(e
);
1875 udev_list_entry_foreach(item
, first
) {
1876 _cleanup_udev_device_unref_
struct udev_device
*q
;
1878 unsigned long long flags
;
1884 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1889 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1893 qn
= udev_device_get_devnum(q
);
1897 if (st
.st_rdev
== qn
)
1900 node
= udev_device_get_devnode(q
);
1904 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1908 flags
= blkid_partition_get_flags(pp
);
1910 nr
= blkid_partition_get_partno(pp
);
1918 if (flags
& GPT_FLAG_NO_AUTO
)
1921 stype
= blkid_partition_get_type_string(pp
);
1925 if (sd_id128_from_string(stype
, &type_id
) < 0)
1928 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1930 if (home
&& nr
>= home_nr
)
1934 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1936 r
= free_and_strdup(&home
, node
);
1940 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
1942 if (srv
&& nr
>= srv_nr
)
1946 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1948 r
= free_and_strdup(&srv
, node
);
1952 #ifdef GPT_ROOT_NATIVE
1953 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
1955 if (root
&& nr
>= root_nr
)
1959 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1961 r
= free_and_strdup(&root
, node
);
1966 #ifdef GPT_ROOT_SECONDARY
1967 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
1969 if (secondary_root
&& nr
>= secondary_root_nr
)
1972 secondary_root_nr
= nr
;
1973 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1975 r
= free_and_strdup(&secondary_root
, node
);
1980 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
1983 multiple_generic
= true;
1985 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1987 r
= free_and_strdup(&generic
, node
);
1993 } else if (is_mbr
) {
1996 if (flags
!= 0x80) /* Bootable flag */
1999 type
= blkid_partition_get_type(pp
);
2000 if (type
!= 0x83) /* Linux partition */
2004 multiple_generic
= true;
2008 r
= free_and_strdup(&root
, node
);
2016 *root_device
= root
;
2019 *root_device_rw
= root_rw
;
2021 } else if (secondary_root
) {
2022 *root_device
= secondary_root
;
2023 secondary_root
= NULL
;
2025 *root_device_rw
= secondary_root_rw
;
2027 } else if (generic
) {
2029 /* There were no partitions with precise meanings
2030 * around, but we found generic partitions. In this
2031 * case, if there's only one, we can go ahead and boot
2032 * it, otherwise we bail out, because we really cannot
2033 * make any sense of it. */
2035 if (multiple_generic
) {
2036 log_error("Identified multiple bootable Linux partitions on\n"
2038 PARTITION_TABLE_BLURB
, arg_image
);
2042 *root_device
= generic
;
2045 *root_device_rw
= generic_rw
;
2048 log_error("Failed to identify root partition in disk image\n"
2050 PARTITION_TABLE_BLURB
, arg_image
);
2055 *home_device
= home
;
2058 *home_device_rw
= home_rw
;
2065 *srv_device_rw
= srv_rw
;
2070 log_error("--image= is not supported, compiled without blkid support.");
/* Probes the file system type of block device 'what' with libblkid and mounts
 * it with MS_NODEV.
 *
 * what:      device node to mount
 * where:     base directory of the mount
 * directory: optional path appended to 'where'; NULL mounts on 'where' itself
 * rw:        mount writable; forced to false when arg_read_only is set
 *
 * LUKS volumes are refused. Returns 0 on success, a negative errno-style
 * value after logging otherwise (always -EOPNOTSUPP when built without blkid
 * support). */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence clear it first so that
         * a stale value is not mistaken for the cause. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -1 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the return value before logging, as logging may
                 * change errno. */
                r = errno == 0 ? -EINVAL : -errno;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2139 static int mount_devices(
2141 const char *root_device
, bool root_device_rw
,
2142 const char *home_device
, bool home_device_rw
,
2143 const char *srv_device
, bool srv_device_rw
) {
2149 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2151 return log_error_errno(r
, "Failed to mount root directory: %m");
2155 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2157 return log_error_errno(r
, "Failed to mount home directory: %m");
2161 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2163 return log_error_errno(r
, "Failed to mount server data directory: %m");
2169 static void loop_remove(int nr
, int *image_fd
) {
2170 _cleanup_close_
int control
= -1;
2176 if (image_fd
&& *image_fd
>= 0) {
2177 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2179 log_debug_errno(errno
, "Failed to close loop image: %m");
2180 *image_fd
= safe_close(*image_fd
);
2183 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2185 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2189 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2191 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2196 * < 0 : wait_for_terminate() failed to get the state of the
2197 * container, the container was terminated by a signal, or
2198 * failed for an unknown reason. No change is made to the
2199 * container argument.
2200 * > 0 : The program executed in the container terminated with an
2201 * error. The exit code of the program executed in the
2202 * container is returned. The container argument has been set
2203 * to CONTAINER_TERMINATED.
2204 * 0 : The container is being rebooted, has been shut down or exited
2205 * successfully. The container argument has been set to either
2206 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2208 * That is, success is indicated by a return value of zero, and an
2209 * error is indicated by a non-zero value.
2211 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2215 r
= wait_for_terminate(pid
, &status
);
2217 return log_warning_errno(r
, "Failed to wait for container: %m");
2219 switch (status
.si_code
) {
2222 if (status
.si_status
== 0) {
2223 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2226 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2228 *container
= CONTAINER_TERMINATED
;
2229 return status
.si_status
;
2232 if (status
.si_status
== SIGINT
) {
2234 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2235 *container
= CONTAINER_TERMINATED
;
2238 } else if (status
.si_status
== SIGHUP
) {
2240 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2241 *container
= CONTAINER_REBOOTED
;
2245 /* CLD_KILLED fallthrough */
2248 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2252 log_error("Container %s failed due to unknown reason.", arg_machine
);
2259 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2262 pid
= PTR_TO_UINT32(userdata
);
2264 if (kill(pid
, arg_kill_signal
) >= 0) {
2265 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2266 sd_event_source_set_userdata(s
, NULL
);
2271 sd_event_exit(sd_event_source_get_event(s
), 0);
2275 static int determine_names(void) {
2278 if (arg_template
&& !arg_directory
&& arg_machine
) {
2280 /* If --template= was specified then we should not
2281 * search for a machine, but instead create a new one
2282 * in /var/lib/machine. */
2284 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2289 if (!arg_image
&& !arg_directory
) {
2291 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2293 r
= image_find(arg_machine
, &i
);
2295 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2297 log_error("No image for machine '%s': %m", arg_machine
);
2301 if (i
->type
== IMAGE_RAW
)
2302 r
= free_and_strdup(&arg_image
, i
->path
);
2304 r
= free_and_strdup(&arg_directory
, i
->path
);
2306 return log_error_errno(r
, "Invalid image directory: %m");
2309 arg_read_only
= arg_read_only
|| i
->read_only
;
2311 arg_directory
= get_current_dir_name();
2313 if (!arg_directory
&& !arg_machine
) {
2314 log_error("Failed to determine path, please use -D or -i.");
2320 if (arg_directory
&& path_equal(arg_directory
, "/"))
2321 arg_machine
= gethostname_malloc();
2323 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2328 hostname_cleanup(arg_machine
);
2329 if (!machine_name_is_valid(arg_machine
)) {
2330 log_error("Failed to determine machine name automatically, please use -M.");
2334 if (arg_ephemeral
) {
2337 /* Add a random suffix when this is an
2338 * ephemeral machine, so that we can run many
2339 * instances at once without manually having
2340 * to specify -M each time. */
2342 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
2353 static int determine_uid_shift(const char *directory
) {
2361 if (arg_uid_shift
== UID_INVALID
) {
2364 r
= stat(directory
, &st
);
2366 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2368 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2370 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2371 log_error("UID and GID base of %s don't match.", directory
);
2375 arg_uid_range
= UINT32_C(0x10000);
2378 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2379 log_error("UID base too high for UID range.");
2383 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
2387 static int inner_child(
2389 const char *directory
,
2395 _cleanup_free_
char *home
= NULL
;
2397 const char *envp
[] = {
2398 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2399 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2404 NULL
, /* container_uuid */
2405 NULL
, /* LISTEN_FDS */
2406 NULL
, /* LISTEN_PID */
2410 _cleanup_strv_free_
char **env_use
= NULL
;
2415 assert(kmsg_socket
>= 0);
2420 /* Tell the parent, that it now can write the UID map. */
2421 (void) barrier_place(barrier
); /* #1 */
2423 /* Wait until the parent wrote the UID map */
2424 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2425 log_error("Parent died too early");
2430 r
= mount_all(NULL
, arg_userns
, true, arg_uid_shift
, arg_private_network
, arg_uid_range
, arg_selinux_apifs_context
);
2434 r
= mount_sysfs(NULL
);
2438 /* Wait until we are cgroup-ified, so that we
2439 * can mount the right cgroup path writable */
2440 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2441 log_error("Parent died too early");
2445 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2449 r
= reset_uid_gid();
2451 return log_error_errno(r
, "Couldn't become new root: %m");
2453 r
= setup_boot_id(NULL
);
2457 r
= setup_kmsg(NULL
, kmsg_socket
);
2460 kmsg_socket
= safe_close(kmsg_socket
);
2465 return log_error_errno(errno
, "setsid() failed: %m");
2467 if (arg_private_network
)
2470 if (arg_expose_ports
) {
2471 r
= expose_port_send_rtnl(rtnl_socket
);
2474 rtnl_socket
= safe_close(rtnl_socket
);
2477 if (drop_capabilities() < 0)
2478 return log_error_errno(errno
, "drop_capabilities() failed: %m");
2482 if (arg_personality
!= PERSONALITY_INVALID
) {
2483 if (personality(arg_personality
) < 0)
2484 return log_error_errno(errno
, "personality() failed: %m");
2485 } else if (secondary
) {
2486 if (personality(PER_LINUX32
) < 0)
2487 return log_error_errno(errno
, "personality() failed: %m");
2491 if (arg_selinux_context
)
2492 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2493 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2496 r
= change_uid_gid(arg_user
, &home
);
2500 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2504 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2505 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2506 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2509 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2512 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2516 if (fdset_size(fds
) > 0) {
2517 r
= fdset_cloexec(fds
, false);
2519 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2521 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2522 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2526 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2530 /* Let the parent know that we are ready and
2531 * wait until the parent is ready with the
2533 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2534 log_error("Parent died too early");
2538 /* Now, explicitly close the log, so that we
2539 * then can close all remaining fds. Closing
2540 * the log explicitly first has the benefit
2541 * that the logging subsystem knows about it,
2542 * and is thus ready to be reopened should we
2543 * need it again. Note that the other fds
2544 * closed here are at least the locking and
2547 (void) fdset_close_others(fds
);
2553 /* Automatically search for the init system */
2555 m
= 1 + strv_length(arg_parameters
);
2556 a
= newa(char*, m
+ 1);
2557 if (strv_isempty(arg_parameters
))
2560 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
2562 a
[0] = (char*) "/usr/lib/systemd/systemd";
2563 execve(a
[0], a
, env_use
);
2565 a
[0] = (char*) "/lib/systemd/systemd";
2566 execve(a
[0], a
, env_use
);
2568 a
[0] = (char*) "/sbin/init";
2569 execve(a
[0], a
, env_use
);
2570 } else if (!strv_isempty(arg_parameters
))
2571 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2573 chdir(home
?: "/root");
2574 execle("/bin/bash", "-bash", NULL
, env_use
);
2575 execle("/bin/sh", "-sh", NULL
, env_use
);
2579 return log_error_errno(errno
, "execv() failed: %m");
2582 static int outer_child(
2584 const char *directory
,
2585 const char *console
,
2586 const char *root_device
, bool root_device_rw
,
2587 const char *home_device
, bool home_device_rw
,
2588 const char *srv_device
, bool srv_device_rw
,
2594 int uid_shift_socket
,
2604 assert(pid_socket
>= 0);
2605 assert(kmsg_socket
>= 0);
2609 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2610 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
2613 close_nointr(STDIN_FILENO
);
2614 close_nointr(STDOUT_FILENO
);
2615 close_nointr(STDERR_FILENO
);
2617 r
= open_terminal(console
, O_RDWR
);
2618 if (r
!= STDIN_FILENO
) {
2624 return log_error_errno(r
, "Failed to open console: %m");
2627 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2628 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2629 return log_error_errno(errno
, "Failed to duplicate console: %m");
2632 r
= reset_audit_loginuid();
2636 /* Mark everything as slave, so that we still
2637 * receive mounts from the real root, but don't
2638 * propagate mounts to the real root. */
2639 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2640 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2642 r
= mount_devices(directory
,
2643 root_device
, root_device_rw
,
2644 home_device
, home_device_rw
,
2645 srv_device
, srv_device_rw
);
2649 r
= determine_uid_shift(directory
);
2654 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2656 return log_error_errno(errno
, "Failed to send UID shift: %m");
2657 if (l
!= sizeof(arg_uid_shift
)) {
2658 log_error("Short write while sending UID shift.");
2663 /* Turn directory into bind mount */
2664 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2665 return log_error_errno(errno
, "Failed to make bind mount: %m");
2667 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2671 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2675 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2679 if (arg_read_only
) {
2680 r
= bind_remount_recursive(directory
, true);
2682 return log_error_errno(r
, "Failed to make tree read-only: %m");
2685 r
= mount_all(directory
, arg_userns
, false, arg_private_network
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2689 r
= copy_devnodes(directory
);
2693 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2695 r
= setup_pts(directory
);
2699 r
= setup_propagate(directory
);
2703 r
= setup_dev_console(directory
, console
);
2707 r
= setup_seccomp();
2711 r
= setup_timezone(directory
);
2715 r
= setup_resolv_conf(directory
);
2719 r
= setup_journal(directory
);
2723 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2727 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2731 r
= mount_move_root(directory
);
2733 return log_error_errno(r
, "Failed to move root directory: %m");
2735 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2736 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2737 (arg_private_network
? CLONE_NEWNET
: 0) |
2738 (arg_userns
? CLONE_NEWUSER
: 0),
2741 return log_error_errno(errno
, "Failed to fork inner child: %m");
2743 pid_socket
= safe_close(pid_socket
);
2744 uid_shift_socket
= safe_close(uid_shift_socket
);
2746 /* The inner child has all namespaces that are
2747 * requested, so that we all are owned by the user if
2748 * user namespaces are turned on. */
2750 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2752 _exit(EXIT_FAILURE
);
2754 _exit(EXIT_SUCCESS
);
2757 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2759 return log_error_errno(errno
, "Failed to send PID: %m");
2760 if (l
!= sizeof(pid
)) {
2761 log_error("Short write while sending PID.");
2765 pid_socket
= safe_close(pid_socket
);
2766 kmsg_socket
= safe_close(kmsg_socket
);
2767 rtnl_socket
= safe_close(rtnl_socket
);
2772 static int setup_uid_map(pid_t pid
) {
2773 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2778 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2779 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2780 r
= write_string_file(uid_map
, line
, 0);
2782 return log_error_errno(r
, "Failed to write UID map: %m");
2784 /* We always assign the same UID and GID ranges */
2785 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2786 r
= write_string_file(uid_map
, line
, 0);
2788 return log_error_errno(r
, "Failed to write GID map: %m");
2793 static int load_settings(void) {
2794 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2795 _cleanup_fclose_
FILE *f
= NULL
;
2796 _cleanup_free_
char *p
= NULL
;
2800 /* If all settings are masked, there's no point in looking for
2801 * the settings file */
2802 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
2805 fn
= strjoina(arg_machine
, ".nspawn");
2807 /* We first look in the admin's directories in /etc and /run */
2808 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2809 _cleanup_free_
char *j
= NULL
;
2811 j
= strjoin(i
, "/", fn
, NULL
);
2820 /* By default we trust configuration from /etc and /run */
2821 if (arg_settings_trusted
< 0)
2822 arg_settings_trusted
= true;
2827 if (errno
!= ENOENT
)
2828 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2832 /* After that, let's look for a file next to the
2833 * actual image we shall boot. */
2836 p
= file_in_same_dir(arg_image
, fn
);
2839 } else if (arg_directory
) {
2840 p
= file_in_same_dir(arg_directory
, fn
);
2847 if (!f
&& errno
!= ENOENT
)
2848 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2850 /* By default we do not trust configuration from /var/lib/machines */
2851 if (arg_settings_trusted
< 0)
2852 arg_settings_trusted
= false;
2859 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
2861 r
= settings_load(f
, p
, &settings
);
2865 /* Copy over bits from the settings, unless they have been
2866 * explicitly masked by command line switches. */
2868 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
2869 settings
->boot
>= 0) {
2870 arg_boot
= settings
->boot
;
2872 strv_free(arg_parameters
);
2873 arg_parameters
= settings
->parameters
;
2874 settings
->parameters
= NULL
;
2877 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2878 settings
->environment
) {
2879 strv_free(arg_setenv
);
2880 arg_setenv
= settings
->environment
;
2881 settings
->environment
= NULL
;
2884 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2887 arg_user
= settings
->user
;
2888 settings
->user
= NULL
;
2891 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
2894 plus
= settings
->capability
;
2895 if (settings_private_network(settings
))
2896 plus
|= (1ULL << CAP_NET_ADMIN
);
2898 if (!arg_settings_trusted
&& plus
!= 0) {
2899 if (settings
->capability
!= 0)
2900 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
2904 arg_retain
&= ~settings
->drop_capability
;
2907 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2908 settings
->kill_signal
> 0)
2909 arg_kill_signal
= settings
->kill_signal
;
2911 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2912 settings
->personality
!= PERSONALITY_INVALID
)
2913 arg_personality
= settings
->personality
;
2915 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2916 !sd_id128_is_null(settings
->machine_id
)) {
2918 if (!arg_settings_trusted
)
2919 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
2921 arg_uuid
= settings
->machine_id
;
2924 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
2925 settings
->read_only
>= 0)
2926 arg_read_only
= settings
->read_only
;
2928 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
2929 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
2930 arg_volatile_mode
= settings
->volatile_mode
;
2932 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
2933 settings
->n_custom_mounts
> 0) {
2935 if (!arg_settings_trusted
)
2936 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
2938 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
2939 arg_custom_mounts
= settings
->custom_mounts
;
2940 arg_n_custom_mounts
= settings
->n_custom_mounts
;
2942 settings
->custom_mounts
= NULL
;
2943 settings
->n_custom_mounts
= 0;
2947 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
2948 (settings
->private_network
>= 0 ||
2949 settings
->network_veth
>= 0 ||
2950 settings
->network_bridge
||
2951 settings
->network_interfaces
||
2952 settings
->network_macvlan
||
2953 settings
->network_ipvlan
)) {
2955 if (!arg_settings_trusted
)
2956 log_warning("Ignoring network settings, file %s is not trusted.", p
);
2958 arg_network_veth
= settings_private_network(settings
);
2959 arg_private_network
= settings_private_network(settings
);
2961 strv_free(arg_network_interfaces
);
2962 arg_network_interfaces
= settings
->network_interfaces
;
2963 settings
->network_interfaces
= NULL
;
2965 strv_free(arg_network_macvlan
);
2966 arg_network_macvlan
= settings
->network_macvlan
;
2967 settings
->network_macvlan
= NULL
;
2969 strv_free(arg_network_ipvlan
);
2970 arg_network_ipvlan
= settings
->network_ipvlan
;
2971 settings
->network_ipvlan
= NULL
;
2973 free(arg_network_bridge
);
2974 arg_network_bridge
= settings
->network_bridge
;
2975 settings
->network_bridge
= NULL
;
2979 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
2980 settings
->expose_ports
) {
2982 if (!arg_settings_trusted
)
2983 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
2985 expose_port_free_all(arg_expose_ports
);
2986 arg_expose_ports
= settings
->expose_ports
;
2987 settings
->expose_ports
= NULL
;
/* Program entry point. NOTE(review): this view of the function is lossy;
 * many statements, including most error checks and the jumps between the
 * visible statements, are not shown here.
 *
 * Visible flow: parse the command line, insist on running as root,
 * determine names, load the .nspawn settings and verify the arguments;
 * collect passed-in file descriptors; prepare the tree to boot (an
 * ephemeral snapshot, a snapshot of a template, or a disk image that is
 * set up and dissected), taking an image lock in each case; allocate a
 * pseudo TTY; clone an outer child that runs outer_child(); receive the
 * inner child's PID over a socket pair; set up UID maps, networking,
 * machine registration and cgroups in step with the child via a barrier;
 * forward the PTY from an sd-event loop; finally wait for the container
 * and release everything.
 *
 * Returns EXIT_FAILURE if r is negative at the end, otherwise ret. */
2994 int main(int argc
, char *argv
[]) {
2996 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
2997 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
2998 _cleanup_close_
int master
= -1, image_fd
= -1;
2999 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3000 int r
, n_fd_passed
, loop_nr
= -1;
3001 char veth_name
[IFNAMSIZ
];
3002 bool secondary
= false, remove_subvol
= false;
3005 int ret
= EXIT_SUCCESS
;
3006 union in_addr_union exposed
= {};
3007 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3010 log_parse_environment();
3013 r
= parse_argv(argc
, argv
);
3017 if (geteuid() != 0) {
3018 log_error("Need to be root.");
3022 r
= determine_names();
3026 r
= load_settings();
3030 r
= verify_arguments();
/* If file descriptors were passed to us (sd_listen_fds() reports more than
 * zero), gather them in the fds set; it is handed to the child later. */
3034 n_fd_passed
= sd_listen_fds(false);
3035 if (n_fd_passed
> 0) {
3036 r
= fdset_new_listen_fds(&fds
, false);
3038 log_error_errno(r
, "Failed to collect file descriptors: %m");
3043 if (arg_directory
) {
3046 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3047 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3052 if (arg_ephemeral
) {
3053 _cleanup_free_
char *np
= NULL
;
3055 /* If the specified path is a mount point we
3056 * generate the new snapshot immediately
3057 * inside it under a random name. However if
3058 * the specified is not a mount point we
3059 * create the new snapshot in the parent
3060 * directory, just next to it. */
3061 r
= path_is_mount_point(arg_directory
, 0);
3063 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3067 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3069 r
= tempfn_random(arg_directory
, "machine.", &np
);
3071 log_error_errno(r
, "Failed to generate name for snapshot: %m");
3075 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3077 log_error_errno(r
, "Failed to lock %s: %m", np
);
3081 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3083 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3087 free(arg_directory
);
3091 remove_subvol
= true;
3094 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3096 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3100 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
3105 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3108 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3110 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3114 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
3120 if (path_is_os_tree(arg_directory
) <= 0) {
3121 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3128 p
= strjoina(arg_directory
, "/usr/");
3129 if (laccess(p
, F_OK
) < 0) {
3130 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
/* Disk image case: lock the image, make arg_directory a freshly created
 * temporary directory, then set up the image and dissect it into the
 * root, home and srv devices. */
3137 char template[] = "/tmp/nspawn-root-XXXXXX";
3140 assert(!arg_template
);
3142 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3144 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3148 r
= log_error_errno(r
, "Failed to create image lock: %m");
3152 if (!mkdtemp(template)) {
3153 log_error_errno(errno
, "Failed to create temporary directory: %m");
3158 arg_directory
= strdup(template);
3159 if (!arg_directory
) {
3164 image_fd
= setup_image(&device_path
, &loop_nr
);
3170 r
= dissect_image(image_fd
,
3171 &root_device
, &root_device_rw
,
3172 &home_device
, &home_device_rw
,
3173 &srv_device
, &srv_device_rw
,
3179 r
= custom_mounts_prepare();
3184 isatty(STDIN_FILENO
) > 0 &&
3185 isatty(STDOUT_FILENO
) > 0;
/* Allocate the pseudo TTY master, look up the name of its slave side into
 * "console", and unlock it. */
3187 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3189 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3193 r
= ptsname_malloc(master
, &console
);
3195 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3199 if (unlockpt(master
) < 0) {
3200 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3205 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3206 arg_machine
, arg_image
?: arg_directory
);
3208 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3210 assert_se(sigemptyset(&mask_chld
) == 0);
3211 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3213 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3214 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* State for one run of the container. NOTE(review): presumably this is the
 * body of a loop that is repeated when the container reboots (see the
 * "CONTAINER_REBOOTED, loop again" remark further down) — confirm. */
3219 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
3220 uid_shift_socket_pair
[2] = { -1, -1 };
3221 ContainerStatus container_status
;
3222 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3223 static const struct sigaction sa
= {
3224 .sa_handler
= nop_signal_handler
,
3225 .sa_flags
= SA_NOCLDSTOP
,
3229 _cleanup_event_unref_ sd_event
*event
= NULL
;
3230 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3231 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3234 r
= barrier_create(&barrier
);
3236 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3240 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3241 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3245 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3246 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3250 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3251 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3256 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3257 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3261 /* Child can be killed before execv(), so handle SIGCHLD
3262 * in order to interrupt parent's blocking calls and
3263 * give it a chance to call wait() and terminate. */
3264 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3266 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3270 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3272 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* Clone the outer child. Only CLONE_NEWNS is requested here; the remaining
 * namespaces are requested by the clone call made for the inner child. */
3276 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3278 if (errno
== EINVAL
)
3279 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3281 r
= log_error_errno(errno
, "clone() failed: %m");
3287 /* The outer child only has a file system namespace. */
3288 barrier_set_role(&barrier
, BARRIER_CHILD
);
3290 master
= safe_close(master
);
3292 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3293 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3294 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3295 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3297 (void) reset_all_signal_handlers();
3298 (void) reset_signal_mask();
3300 r
= outer_child(&barrier
,
3303 root_device
, root_device_rw
,
3304 home_device
, home_device_rw
,
3305 srv_device
, srv_device_rw
,
3309 kmsg_socket_pair
[1],
3310 rtnl_socket_pair
[1],
3311 uid_shift_socket_pair
[1],
3314 _exit(EXIT_FAILURE
);
3316 _exit(EXIT_SUCCESS
);
/* Parent side from here on: release the passed-in fds and the child's ends
 * ([1]) of all socket pairs, keeping only our own ends ([0]). */
3319 barrier_set_role(&barrier
, BARRIER_PARENT
);
3321 fds
= fdset_free(fds
);
3323 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3324 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3325 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3326 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3328 /* Wait for the outer child. */
3329 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3338 /* And now retrieve the PID of the inner child. */
3339 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3341 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3344 if (l
!= sizeof(pid
)) {
3345 log_error("Short read while reading inner child PID.");
3350 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3353 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3354 log_error("Child died too early.");
/* Read the UID shift from its socket pair into arg_uid_shift, then write
 * the UID/GID maps of the inner child. */
3359 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3361 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3364 if (l
!= sizeof(arg_uid_shift
)) {
3365 log_error("Short read while reading UID shift.");
3370 r
= setup_uid_map(pid
);
3374 (void) barrier_place(&barrier
); /* #2 */
3377 if (arg_private_network
) {
3379 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3383 if (arg_network_veth
) {
3384 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3390 if (arg_network_bridge
) {
3391 r
= setup_bridge(veth_name
, arg_network_bridge
);
3399 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3403 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
3409 r
= register_machine(
3416 arg_custom_mounts
, arg_n_custom_mounts
,
3424 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3428 if (arg_keep_unit
) {
3429 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3434 r
= chown_cgroup(pid
, arg_uid_shift
);
3438 /* Notify the child that the parent is ready with all
3439 * its setup (including cgroup-ification), and that
3440 * the child can now hand over control to the code to
3441 * run inside the container. */
3442 (void) barrier_place(&barrier
); /* #3 */
3444 /* Block SIGCHLD here, before notifying child.
3445 * process_pty() will handle it with the other signals. */
3446 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3448 /* Reset signal to default */
3449 r
= default_signals(SIGCHLD
, -1);
3451 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3455 /* Let the child know that we are ready and wait that the child is completely ready now. */
3456 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3457 log_error("Child died too early.");
3464 "STATUS=Container running.\n"
3465 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
/* Create the event loop, register the SIGINT/SIGTERM/SIGCHLD sources on it,
 * optionally watch rtnl for exposed ports, attach the PTY forwarder and
 * run the loop. */
3467 r
= sd_event_new(&event
);
3469 log_error_errno(r
, "Failed to get default event source: %m");
3473 if (arg_kill_signal
> 0) {
3474 /* Try to kill the init system on SIGINT or SIGTERM */
3475 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3476 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3478 /* Immediately exit */
3479 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3480 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3483 /* simply exit on sigchld */
3484 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3486 if (arg_expose_ports
) {
3487 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3491 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3494 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3496 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
3498 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3502 r
= sd_event_loop(event
);
3504 log_error_errno(r
, "Failed to run event loop: %m");
3508 pty_forward_get_last_char(forward
, &last_char
);
3510 forward
= pty_forward_free(forward
);
3512 if (!arg_quiet
&& last_char
!= '\n')
3515 /* Kill if it is not dead yet anyway */
3516 if (arg_register
&& !arg_keep_unit
)
3517 terminate_machine(pid
);
3519 /* Normally redundant, but better safe than sorry */
3522 r
= wait_for_container(pid
, &container_status
);
3526 /* We failed to wait for the container, or the
3527 * container exited abnormally */
3529 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3530 /* The container exited with a non-zero
3531 * status, or with zero status and no reboot
3537 /* CONTAINER_REBOOTED, loop again */
3539 if (arg_keep_unit
) {
3540 /* Special handling if we are running as a
3541 * service: instead of simply restarting the
3542 * machine we want to restart the entire
3543 * service, so let's inform systemd about this
3544 * with the special exit code 133. The service
3545 * file uses RestartForceExitStatus=133 so
3546 * that this results in a full nspawn
3547 * restart. This is necessary since we might
3548 * have cgroup parameters set we want to have
3555 expose_port_flush(arg_expose_ports
, &exposed
);
3561 "STATUS=Terminating...");
3566 /* Try to flush whatever is still queued in the pty */
3568 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
3570 loop_remove(loop_nr
, &image_fd
);
3572 if (remove_subvol
&& arg_directory
) {
3575 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
3577 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
3583 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3584 (void) rm_rf(p
, REMOVE_ROOT
);
3587 expose_port_flush(arg_expose_ports
, &exposed
);
3589 free(arg_directory
);
3594 strv_free(arg_setenv
);
3595 free(arg_network_bridge
);
3596 strv_free(arg_network_interfaces
);
3597 strv_free(arg_network_macvlan
);
3598 strv_free(arg_network_ipvlan
);
3599 strv_free(arg_parameters
);
3600 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3601 expose_port_free_all(arg_expose_ports
);
3603 return r
< 0 ? EXIT_FAILURE
: ret
;