2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <linux/loop.h>
30 #include <selinux/selinux.h>
37 #include <sys/mount.h>
38 #include <sys/personality.h>
39 #include <sys/prctl.h>
40 #include <sys/types.h>
45 #include "sd-daemon.h"
48 #include "alloc-util.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
55 #include "capability-util.h"
56 #include "cgroup-util.h"
58 #include "dev-setup.h"
59 #include "dissect-image.h"
64 #include "format-util.h"
67 #include "hexdecoct.h"
68 #include "hostname-util.h"
69 #include "id128-util.h"
71 #include "loop-util.h"
72 #include "loopback-setup.h"
73 #include "machine-image.h"
77 #include "mount-util.h"
78 #include "netlink-util.h"
79 #include "nspawn-cgroup.h"
80 #include "nspawn-expose-ports.h"
81 #include "nspawn-mount.h"
82 #include "nspawn-network.h"
83 #include "nspawn-patch-uid.h"
84 #include "nspawn-register.h"
85 #include "nspawn-seccomp.h"
86 #include "nspawn-settings.h"
87 #include "nspawn-setuid.h"
88 #include "nspawn-stub-pid1.h"
89 #include "parse-util.h"
90 #include "path-util.h"
91 #include "process-util.h"
93 #include "random-util.h"
94 #include "raw-clone.h"
96 #include "selinux-util.h"
97 #include "signal-util.h"
98 #include "socket-util.h"
99 #include "stat-util.h"
100 #include "stdio-util.h"
101 #include "string-util.h"
103 #include "terminal-util.h"
104 #include "udev-util.h"
105 #include "umask-util.h"
106 #include "user-util.h"
109 /* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
110 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
111 * may have their own allocation ranges too. */
112 #define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
113 #define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
115 /* nspawn listens on the socket whose path is given by the constant NSPAWN_NOTIFY_SOCKET_PATH.
116 * NSPAWN_NOTIFY_SOCKET_PATH is interpreted relative to the container.
117 * The init process in the container can send messages to nspawn following the sd_notify(3) protocol. */
118 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
120 #define EXIT_FORCE_RESTART 133
122 typedef enum ContainerStatus
{
123 CONTAINER_TERMINATED
,
127 typedef enum LinkJournal
{
/* File-scope settings. The initializers below are the built-in defaults; parse_argv() overwrites them from
 * command line options and from a few environment variables. */
134 static char *arg_directory
= NULL
;
135 static char *arg_template
= NULL
;
136 static char *arg_chdir
= NULL
;
137 static char *arg_pivot_root_new
= NULL
;
138 static char *arg_pivot_root_old
= NULL
;
139 static char *arg_user
= NULL
;
140 static sd_id128_t arg_uuid
= {};
141 static char *arg_machine
= NULL
;
/* The two SELinux contexts are assigned directly from optarg in parse_argv(), i.e. they are not copied. */
142 static const char *arg_selinux_context
= NULL
;
143 static const char *arg_selinux_apifs_context
= NULL
;
144 static const char *arg_slice
= NULL
;
145 static bool arg_private_network
= false;
146 static bool arg_read_only
= false;
147 static StartMode arg_start_mode
= START_PID1
;
148 static bool arg_ephemeral
= false;
149 static LinkJournal arg_link_journal
= LINK_AUTO
;
150 static bool arg_link_journal_try
= false;
/* Default capability mask. parse_argv() ORs in the bits requested with --capability= (plus CAP_NET_ADMIN when
 * arg_private_network is set) and then clears the bits requested with --drop-capability=. */
151 static uint64_t arg_caps_retain
=
152 (1ULL << CAP_AUDIT_CONTROL
) |
153 (1ULL << CAP_AUDIT_WRITE
) |
154 (1ULL << CAP_CHOWN
) |
155 (1ULL << CAP_DAC_OVERRIDE
) |
156 (1ULL << CAP_DAC_READ_SEARCH
) |
157 (1ULL << CAP_FOWNER
) |
158 (1ULL << CAP_FSETID
) |
159 (1ULL << CAP_IPC_OWNER
) |
161 (1ULL << CAP_LEASE
) |
162 (1ULL << CAP_LINUX_IMMUTABLE
) |
163 (1ULL << CAP_MKNOD
) |
164 (1ULL << CAP_NET_BIND_SERVICE
) |
165 (1ULL << CAP_NET_BROADCAST
) |
166 (1ULL << CAP_NET_RAW
) |
167 (1ULL << CAP_SETFCAP
) |
168 (1ULL << CAP_SETGID
) |
169 (1ULL << CAP_SETPCAP
) |
170 (1ULL << CAP_SETUID
) |
171 (1ULL << CAP_SYS_ADMIN
) |
172 (1ULL << CAP_SYS_BOOT
) |
173 (1ULL << CAP_SYS_CHROOT
) |
174 (1ULL << CAP_SYS_NICE
) |
175 (1ULL << CAP_SYS_PTRACE
) |
176 (1ULL << CAP_SYS_RESOURCE
) |
177 (1ULL << CAP_SYS_TTY_CONFIG
);
/* Array of custom mounts and its element count; both are filled by the --bind=, --tmpfs= and --overlay=
 * parsers called from parse_argv(). */
178 static CustomMount
*arg_custom_mounts
= NULL
;
179 static unsigned arg_n_custom_mounts
= 0;
180 static char **arg_setenv
= NULL
;
181 static bool arg_quiet
= false;
182 static bool arg_register
= true;
183 static bool arg_keep_unit
= false;
184 static char **arg_network_interfaces
= NULL
;
185 static char **arg_network_macvlan
= NULL
;
186 static char **arg_network_ipvlan
= NULL
;
187 static bool arg_network_veth
= false;
188 static char **arg_network_veth_extra
= NULL
;
189 static char *arg_network_bridge
= NULL
;
190 static char *arg_network_zone
= NULL
;
191 static unsigned long arg_personality
= PERSONALITY_INVALID
;
192 static char *arg_image
= NULL
;
193 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
194 static ExposePort
*arg_expose_ports
= NULL
;
195 static char **arg_property
= NULL
;
196 static UserNamespaceMode arg_userns_mode
= USER_NAMESPACE_NO
;
/* arg_uid_shift stays UID_INVALID unless --private-users= supplies an explicit base UID; the default range is
 * 0x10000 IDs. */
197 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
198 static bool arg_userns_chown
= false;
199 static int arg_kill_signal
= 0;
200 static CGroupUnified arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_UNKNOWN
;
/* Bit mask of SETTING_* flags; parse_argv() sets a bit for every setting that was given on the command line. */
201 static SettingsMask arg_settings_mask
= 0;
/* Tri-state: parse_argv() stores true, false or -1 depending on the --settings= argument. */
202 static int arg_settings_trusted
= -1;
203 static char **arg_parameters
= NULL
;
204 static const char *arg_container_service_name
= "systemd-nspawn";
205 static bool arg_notify_ready
= false;
206 static bool arg_use_cgns
= true;
/* CLONE_NEW* namespace flags; individual bits are cleared by parse_share_ns_env() when the matching
 * $SYSTEMD_NSPAWN_SHARE_* variable is true, and --share-system resets the whole mask to 0. */
207 static unsigned long arg_clone_ns_flags
= CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
;
208 static MountSettingsMask arg_mount_settings
= MOUNT_APPLY_APIVFS_RO
;
/* Decoded --root-hash= value and its length in bytes. */
209 static void *arg_root_hash
= NULL
;
210 static size_t arg_root_hash_size
= 0;
/* Print the usage summary to stdout. The only substitution is the program name, taken from
 * program_invocation_short_name. Takes no arguments and returns nothing. */
212 static void help(void) {
213 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
214 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
215 " -h --help Show this help\n"
216 " --version Print version string\n"
217 " -q --quiet Do not show status information\n"
218 " -D --directory=PATH Root directory for the container\n"
219 " --template=PATH Initialize root directory from template directory,\n"
221 " -x --ephemeral Run container with snapshot of root directory, and\n"
222 " remove it after exit\n"
223 " -i --image=PATH File system device or disk image for the container\n"
224 " --root-hash=HASH Specify verity root hash\n"
225 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
226 " -b --boot Boot up full system (i.e. invoke init)\n"
227 " --chdir=PATH Set working directory in the container\n"
228 " --pivot-root=PATH[:PATH]\n"
229 " Pivot root to given directory in the container\n"
230 " -u --user=USER Run the command under specified user or uid\n"
231 " -M --machine=NAME Set the machine name for the container\n"
232 " --uuid=UUID Set a specific machine UUID for the container\n"
233 " -S --slice=SLICE Place the container in the specified slice\n"
234 " --property=NAME=VALUE Set scope unit property\n"
235 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
236 " --private-users[=UIDBASE[:NUIDS]]\n"
237 " Similar, but with user configured UID/GID range\n"
238 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
239 " --private-network Disable network in container\n"
240 " --network-interface=INTERFACE\n"
241 " Assign an existing network interface to the\n"
243 " --network-macvlan=INTERFACE\n"
244 " Create a macvlan network interface based on an\n"
245 " existing network interface to the container\n"
246 " --network-ipvlan=INTERFACE\n"
247 " Create an ipvlan network interface based on an\n"
248 " existing network interface to the container\n"
249 " -n --network-veth Add a virtual Ethernet connection between host\n"
251 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
252 " Add an additional virtual Ethernet link between\n"
253 " host and container\n"
254 " --network-bridge=INTERFACE\n"
255 " Add a virtual Ethernet connection to the container\n"
256 " and attach it to an existing bridge on the host\n"
257 " --network-zone=NAME Similar, but attach the new interface to an\n"
258 " automatically managed bridge interface\n"
259 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
260 " Expose a container IP port on the host\n"
261 " -Z --selinux-context=SECLABEL\n"
262 " Set the SELinux security context to be used by\n"
263 " processes in the container\n"
264 " -L --selinux-apifs-context=SECLABEL\n"
265 " Set the SELinux security context to be used by\n"
266 " API/tmpfs file systems in the container\n"
267 " --capability=CAP In addition to the default, retain specified\n"
269 " --drop-capability=CAP Drop the specified capability from the default set\n"
270 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
271 " --link-journal=MODE Link up guest journal, one of no, auto, guest,\n"
272 " host, try-guest, try-host\n"
273 " -j Equivalent to --link-journal=try-guest\n"
274 " --read-only Mount the root directory read-only\n"
275 " --bind=PATH[:PATH[:OPTIONS]]\n"
276 " Bind mount a file or directory from the host into\n"
278 " --bind-ro=PATH[:PATH[:OPTIONS]]\n"
279 " Similar, but creates a read-only bind mount\n"
280 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
281 " --overlay=PATH[:PATH...]:PATH\n"
282 " Create an overlay mount from the host to\n"
284 " --overlay-ro=PATH[:PATH...]:PATH\n"
285 " Similar, but creates a read-only overlay mount\n"
286 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
287 " --register=BOOLEAN Register container as machine\n"
288 " --keep-unit Do not register a scope for the machine, reuse\n"
289 " the service unit nspawn is running in\n"
290 " --volatile[=MODE] Run the system in volatile mode\n"
291 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
292 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
293 , program_invocation_short_name
);
/* Validate the custom mounts collected in arg_custom_mounts against the user namespace settings.
 *
 * For every entry whose destination is "/" while user namespacing is enabled, an error is logged if
 * --private-users-chown is in effect, or otherwise if no explicit UID shift was configured
 * (arg_uid_shift == UID_INVALID). The return statements are not part of this excerpt. */
296 static int custom_mount_check_all(void) {
299 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
300 CustomMount
*m
= &arg_custom_mounts
[i
];
/* Only a mount that replaces the container root is affected by the ownership checks below. */
302 if (path_equal(m
->destination
, "/") && arg_userns_mode
!= USER_NAMESPACE_NO
) {
304 if (arg_userns_chown
) {
305 log_error("--private-users-chown may not be combined with custom root mounts.");
307 } else if (arg_uid_shift
== UID_INVALID
) {
308 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
/* Decide which cgroup hierarchy layout to use and store it in arg_unified_cgroup_hierarchy.
 *
 * directory: passed to systemd_installation_has_version() to check the systemd version in the container tree.
 *
 * $UNIFIED_CGROUP_HIERARCHY, parsed as a boolean, selects CGROUP_UNIFIED_ALL or CGROUP_UNIFIED_NONE.
 * Otherwise the host setup is queried with cg_all_unified() and cg_unified_controller(), and the container's
 * systemd version (230 for the fully unified layout, 233 for the mixed one) decides whether the unified
 * variant is used. Failures of the helper calls are logged via log_error_errno() and returned. */
317 static int detect_unified_cgroup_hierarchy(const char *directory
) {
321 /* Allow the user to control whether the unified hierarchy is used */
322 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
324 r
= parse_boolean(e
);
326 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
328 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_ALL
;
330 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
335 /* Otherwise inherit the default from the host system */
336 r
= cg_all_unified();
338 return log_error_errno(r
, "Failed to determine whether we are in all unified mode.");
340 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
341 * routine only detects 231, so we'll have a false negative here for 230. */
342 r
= systemd_installation_has_version(directory
, 230);
344 return log_error_errno(r
, "Failed to determine systemd version in container: %m");
346 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_ALL
;
348 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
349 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
) > 0) {
350 /* Mixed cgroup hierarchy support was added in 233 */
351 r
= systemd_installation_has_version(directory
, 233);
353 return log_error_errno(r
, "Failed to determine systemd version in container: %m");
355 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_SYSTEMD
;
357 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
359 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
/* Apply one "share namespace" environment variable to arg_clone_ns_flags.
 *
 * name:    environment variable, read as a boolean with getenv_bool().
 * ns_flag: the CLONE_NEW* bit(s) controlled by that variable.
 *
 * A value that cannot be parsed is reported with a warning and treated as false. */
364 static void parse_share_ns_env(const char *name
, unsigned long ns_flag
) {
367 r
= getenv_bool(name
);
371 log_warning_errno(r
, "Failed to parse %s from environment, defaulting to false.", name
);
/* A true value (r > 0) clears ns_flag from the mask; any other value sets it. */
372 arg_clone_ns_flags
= (arg_clone_ns_flags
& ~ns_flag
) | (r
> 0 ? 0 : ns_flag
);
/* Adjust arg_mount_settings according to $SYSTEMD_NSPAWN_API_VFS_WRITABLE.
 *
 * The literal value "network" turns on both MOUNT_APPLY_APIVFS_RO and MOUNT_APPLY_APIVFS_NETNS. Any other
 * value is parsed as a boolean; a parse failure is logged as a warning and ignored. */
375 static void parse_mount_settings_env(void) {
379 e
= getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
383 if (streq(e
, "network")) {
384 arg_mount_settings
|= MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_APIVFS_NETNS
;
388 r
= parse_boolean(e
);
390 log_warning_errno(r
, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
/* "Writable" false (r == 0) keeps the read-only flag set, anything else clears it; the NETNS flag is
 * always cleared on this path. */
394 SET_FLAG(arg_mount_settings
, MOUNT_APPLY_APIVFS_RO
, r
== 0);
395 SET_FLAG(arg_mount_settings
, MOUNT_APPLY_APIVFS_NETNS
, false);
/* Parse the command line into the file-scope arg_* settings.
 *
 * argc, argv: the program's argument vector as handed to getopt_long().
 *
 * Each option handler stores its value in the matching arg_* variable and, for settings that can also come
 * from a .nspawn file, sets the corresponding SETTING_* bit in arg_settings_mask. After the option loop the
 * function applies the $SYSTEMD_NSPAWN_SHARE_* and $SYSTEMD_NSPAWN_API_VFS_WRITABLE environment variables,
 * rejects incompatible option combinations with an error message, copies any remaining arguments into
 * arg_parameters, computes the final capability mask and finally calls custom_mount_check_all().
 * Parse failures of helper functions are reported through log_error_errno().
 *
 * NOTE(review): "private-users-chown" is declared with optional_argument in the table below, but its handler
 * never looks at optarg and help() documents it without an argument -- confirm whether no_argument was meant.
 * NOTE(review): the --notify-ready= error message lists "ready" as a valid mode although the value is parsed
 * with parse_boolean() -- confirm. */
398 static int parse_argv(int argc
, char *argv
[]) {
416 ARG_NETWORK_INTERFACE
,
421 ARG_NETWORK_VETH_EXTRA
,
431 ARG_PRIVATE_USERS_CHOWN
,
/* Long option table; options without a short form are identified by ARG_* values. */
436 static const struct option options
[] = {
437 { "help", no_argument
, NULL
, 'h' },
438 { "version", no_argument
, NULL
, ARG_VERSION
},
439 { "directory", required_argument
, NULL
, 'D' },
440 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
441 { "ephemeral", no_argument
, NULL
, 'x' },
442 { "user", required_argument
, NULL
, 'u' },
443 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
444 { "as-pid2", no_argument
, NULL
, 'a' },
445 { "boot", no_argument
, NULL
, 'b' },
446 { "uuid", required_argument
, NULL
, ARG_UUID
},
447 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
448 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
449 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
450 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
451 { "bind", required_argument
, NULL
, ARG_BIND
},
452 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
453 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
454 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
455 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
456 { "machine", required_argument
, NULL
, 'M' },
457 { "slice", required_argument
, NULL
, 'S' },
458 { "setenv", required_argument
, NULL
, 'E' },
459 { "selinux-context", required_argument
, NULL
, 'Z' },
460 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
461 { "quiet", no_argument
, NULL
, 'q' },
462 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
}, /* not documented */
463 { "register", required_argument
, NULL
, ARG_REGISTER
},
464 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
465 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
466 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
467 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
468 { "network-veth", no_argument
, NULL
, 'n' },
469 { "network-veth-extra", required_argument
, NULL
, ARG_NETWORK_VETH_EXTRA
},
470 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
471 { "network-zone", required_argument
, NULL
, ARG_NETWORK_ZONE
},
472 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
473 { "image", required_argument
, NULL
, 'i' },
474 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
475 { "port", required_argument
, NULL
, 'p' },
476 { "property", required_argument
, NULL
, ARG_PROPERTY
},
477 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
478 { "private-users-chown", optional_argument
, NULL
, ARG_PRIVATE_USERS_CHOWN
},
479 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
480 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
481 { "chdir", required_argument
, NULL
, ARG_CHDIR
},
482 { "pivot-root", required_argument
, NULL
, ARG_PIVOT_ROOT
},
483 { "notify-ready", required_argument
, NULL
, ARG_NOTIFY_READY
},
484 { "root-hash", required_argument
, NULL
, ARG_ROOT_HASH
},
/* plus/minus collect the capability bits from --capability= and --drop-capability=; they are merged into
 * arg_caps_retain after the option loop. */
490 uint64_t plus
= 0, minus
= 0;
491 bool mask_all_settings
= false, mask_no_settings
= false;
/* With GNU getopt a leading '+' in the option string stops option parsing at the first non-option
 * argument. */
496 while ((c
= getopt_long(argc
, argv
, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options
, NULL
)) >= 0)
508 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
514 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
520 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
526 arg_ephemeral
= true;
530 r
= free_and_strdup(&arg_user
, optarg
);
534 arg_settings_mask
|= SETTING_USER
;
537 case ARG_NETWORK_ZONE
: {
/* The zone name is prefixed with "vz-" and the result must be a valid interface name. */
540 j
= strappend("vz-", optarg
);
544 if (!ifname_valid(j
)) {
545 log_error("Network zone name not valid: %s", j
);
550 free(arg_network_zone
);
551 arg_network_zone
= j
;
553 arg_network_veth
= true;
554 arg_private_network
= true;
555 arg_settings_mask
|= SETTING_NETWORK
;
559 case ARG_NETWORK_BRIDGE
:
561 if (!ifname_valid(optarg
)) {
562 log_error("Bridge interface name not valid: %s", optarg
);
566 r
= free_and_strdup(&arg_network_bridge
, optarg
);
573 arg_network_veth
= true;
574 arg_private_network
= true;
575 arg_settings_mask
|= SETTING_NETWORK
;
578 case ARG_NETWORK_VETH_EXTRA
:
579 r
= veth_extra_parse(&arg_network_veth_extra
, optarg
);
581 return log_error_errno(r
, "Failed to parse --network-veth-extra= parameter: %s", optarg
);
583 arg_private_network
= true;
584 arg_settings_mask
|= SETTING_NETWORK
;
587 case ARG_NETWORK_INTERFACE
:
589 if (!ifname_valid(optarg
)) {
590 log_error("Network interface name not valid: %s", optarg
);
594 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
597 arg_private_network
= true;
598 arg_settings_mask
|= SETTING_NETWORK
;
601 case ARG_NETWORK_MACVLAN
:
603 if (!ifname_valid(optarg
)) {
604 log_error("MACVLAN network interface name not valid: %s", optarg
);
608 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
611 arg_private_network
= true;
612 arg_settings_mask
|= SETTING_NETWORK
;
615 case ARG_NETWORK_IPVLAN
:
617 if (!ifname_valid(optarg
)) {
618 log_error("IPVLAN network interface name not valid: %s", optarg
);
622 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
627 case ARG_PRIVATE_NETWORK
:
628 arg_private_network
= true;
629 arg_settings_mask
|= SETTING_NETWORK
;
633 if (arg_start_mode
== START_PID2
) {
634 log_error("--boot and --as-pid2 may not be combined.");
638 arg_start_mode
= START_BOOT
;
639 arg_settings_mask
|= SETTING_START_MODE
;
643 if (arg_start_mode
== START_BOOT
) {
644 log_error("--boot and --as-pid2 may not be combined.");
648 arg_start_mode
= START_PID2
;
649 arg_settings_mask
|= SETTING_START_MODE
;
653 r
= sd_id128_from_string(optarg
, &arg_uuid
);
655 return log_error_errno(r
, "Invalid UUID: %s", optarg
);
657 if (sd_id128_is_null(arg_uuid
)) {
658 log_error("Machine UUID may not be all zeroes.");
662 arg_settings_mask
|= SETTING_MACHINE_ID
;
671 arg_machine
= mfree(arg_machine
);
673 if (!machine_name_is_valid(optarg
)) {
674 log_error("Invalid machine name: %s", optarg
);
678 r
= free_and_strdup(&arg_machine
, optarg
);
685 arg_selinux_context
= optarg
;
689 arg_selinux_apifs_context
= optarg
;
693 arg_read_only
= true;
694 arg_settings_mask
|= SETTING_READ_ONLY
;
698 case ARG_DROP_CAPABILITY
: {
701 _cleanup_free_
char *t
= NULL
;
703 r
= extract_first_word(&p
, &t
, ",", 0);
705 return log_error_errno(r
, "Failed to parse capability %s.", t
);
/* "all" selects every capability bit, any other word is looked up by name. c tells whether the bits go
 * to plus (ARG_CAPABILITY) or to minus. */
710 if (streq(t
, "all")) {
711 if (c
== ARG_CAPABILITY
)
712 plus
= (uint64_t) -1;
714 minus
= (uint64_t) -1;
718 cap
= capability_from_name(t
);
720 log_error("Failed to parse capability %s.", t
);
724 if (c
== ARG_CAPABILITY
)
725 plus
|= 1ULL << (uint64_t) cap
;
727 minus
|= 1ULL << (uint64_t) cap
;
731 arg_settings_mask
|= SETTING_CAPABILITY
;
736 arg_link_journal
= LINK_GUEST
;
737 arg_link_journal_try
= true;
740 case ARG_LINK_JOURNAL
:
741 if (streq(optarg
, "auto")) {
742 arg_link_journal
= LINK_AUTO
;
743 arg_link_journal_try
= false;
744 } else if (streq(optarg
, "no")) {
745 arg_link_journal
= LINK_NO
;
746 arg_link_journal_try
= false;
747 } else if (streq(optarg
, "guest")) {
748 arg_link_journal
= LINK_GUEST
;
749 arg_link_journal_try
= false;
750 } else if (streq(optarg
, "host")) {
751 arg_link_journal
= LINK_HOST
;
752 arg_link_journal_try
= false;
753 } else if (streq(optarg
, "try-guest")) {
754 arg_link_journal
= LINK_GUEST
;
755 arg_link_journal_try
= true;
756 } else if (streq(optarg
, "try-host")) {
757 arg_link_journal
= LINK_HOST
;
758 arg_link_journal_try
= true;
760 log_error("Failed to parse link journal mode %s", optarg
);
768 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
770 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
772 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
776 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
778 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
780 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
785 r
= overlay_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_OVERLAY_RO
);
786 if (r
== -EADDRNOTAVAIL
)
787 return log_error_errno(r
, "--overlay(-ro)= needs at least two colon-separated directories specified.");
789 return log_error_errno(r
, "Failed to parse --overlay(-ro)= argument %s: %m", optarg
);
791 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
797 if (!env_assignment_is_valid(optarg
)) {
798 log_error("Environment variable assignment '%s' is not valid.", optarg
);
802 n
= strv_env_set(arg_setenv
, optarg
);
806 strv_free(arg_setenv
);
809 arg_settings_mask
|= SETTING_ENVIRONMENT
;
817 case ARG_SHARE_SYSTEM
:
818 /* We don't officially support this anymore, except for compat reasons. People should use the
819 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
820 arg_clone_ns_flags
= 0;
824 r
= parse_boolean(optarg
);
826 log_error("Failed to parse --register= argument: %s", optarg
);
834 arg_keep_unit
= true;
837 case ARG_PERSONALITY
:
839 arg_personality
= personality_from_string(optarg
);
840 if (arg_personality
== PERSONALITY_INVALID
) {
841 log_error("Unknown or unsupported personality '%s'.", optarg
);
845 arg_settings_mask
|= SETTING_PERSONALITY
;
851 arg_volatile_mode
= VOLATILE_YES
;
855 m
= volatile_mode_from_string(optarg
);
857 log_error("Failed to parse --volatile= argument: %s", optarg
);
860 arg_volatile_mode
= m
;
863 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
867 r
= expose_port_parse(&arg_expose_ports
, optarg
);
869 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
871 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
873 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
877 if (strv_extend(&arg_property
, optarg
) < 0)
882 case ARG_PRIVATE_USERS
: {
887 else if (!in_charset(optarg
, DIGITS
))
888 /* do *not* parse numbers as booleans */
889 boolean
= parse_boolean(optarg
);
891 if (boolean
== false) {
892 /* no: User namespacing off */
893 arg_userns_mode
= USER_NAMESPACE_NO
;
894 arg_uid_shift
= UID_INVALID
;
895 arg_uid_range
= UINT32_C(0x10000);
896 } else if (boolean
== true) {
897 /* yes: User namespacing on, UID range is read from root dir */
898 arg_userns_mode
= USER_NAMESPACE_FIXED
;
899 arg_uid_shift
= UID_INVALID
;
900 arg_uid_range
= UINT32_C(0x10000);
901 } else if (streq(optarg
, "pick")) {
902 /* pick: User namespacing on, UID range is picked randomly */
903 arg_userns_mode
= USER_NAMESPACE_PICK
;
904 arg_uid_shift
= UID_INVALID
;
905 arg_uid_range
= UINT32_C(0x10000);
907 _cleanup_free_
char *buffer
= NULL
;
908 const char *range
, *shift
;
910 /* anything else: User namespacing on, UID range is explicitly configured */
912 range
= strchr(optarg
, ':');
914 buffer
= strndup(optarg
, range
- optarg
);
920 r
= safe_atou32(range
, &arg_uid_range
);
922 return log_error_errno(r
, "Failed to parse UID range \"%s\": %m", range
);
926 r
= parse_uid(shift
, &arg_uid_shift
);
928 return log_error_errno(r
, "Failed to parse UID \"%s\": %m", optarg
);
930 arg_userns_mode
= USER_NAMESPACE_FIXED
;
933 if (arg_uid_range
<= 0) {
934 log_error("UID range cannot be 0.");
938 arg_settings_mask
|= SETTING_USERNS
;
943 if (userns_supported()) {
944 arg_userns_mode
= USER_NAMESPACE_PICK
;
945 arg_uid_shift
= UID_INVALID
;
946 arg_uid_range
= UINT32_C(0x10000);
948 arg_settings_mask
|= SETTING_USERNS
;
953 case ARG_PRIVATE_USERS_CHOWN
:
954 arg_userns_chown
= true;
956 arg_settings_mask
|= SETTING_USERNS
;
959 case ARG_KILL_SIGNAL
:
960 arg_kill_signal
= signal_from_string_try_harder(optarg
);
961 if (arg_kill_signal
< 0) {
962 log_error("Cannot parse signal: %s", optarg
);
966 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
971 /* no → do not read files
972 * yes → read files, do not override cmdline, trust only subset
973 * override → read files, override cmdline, trust only subset
974 * trusted → read files, do not override cmdline, trust all
977 r
= parse_boolean(optarg
);
979 if (streq(optarg
, "trusted")) {
980 mask_all_settings
= false;
981 mask_no_settings
= false;
982 arg_settings_trusted
= true;
984 } else if (streq(optarg
, "override")) {
985 mask_all_settings
= false;
986 mask_no_settings
= true;
987 arg_settings_trusted
= -1;
989 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
992 mask_all_settings
= false;
993 mask_no_settings
= false;
994 arg_settings_trusted
= -1;
997 mask_all_settings
= true;
998 mask_no_settings
= false;
999 arg_settings_trusted
= false;
1005 if (!path_is_absolute(optarg
)) {
1006 log_error("Working directory %s is not an absolute path.", optarg
);
1010 r
= free_and_strdup(&arg_chdir
, optarg
);
1014 arg_settings_mask
|= SETTING_WORKING_DIRECTORY
;
1017 case ARG_PIVOT_ROOT
:
1018 r
= pivot_root_parse(&arg_pivot_root_new
, &arg_pivot_root_old
, optarg
);
1020 return log_error_errno(r
, "Failed to parse --pivot-root= argument %s: %m", optarg
);
1022 arg_settings_mask
|= SETTING_PIVOT_ROOT
;
1025 case ARG_NOTIFY_READY
:
1026 r
= parse_boolean(optarg
);
1028 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg
);
1031 arg_notify_ready
= r
;
1032 arg_settings_mask
|= SETTING_NOTIFY_READY
;
1035 case ARG_ROOT_HASH
: {
1039 r
= unhexmem(optarg
, strlen(optarg
), &k
, &l
);
1041 return log_error_errno(r
, "Failed to parse root hash: %s", optarg
);
1042 if (l
< sizeof(sd_id128_t
)) {
1043 log_error("Root hash must be at least 128bit long: %s", optarg
);
1048 free(arg_root_hash
);
1050 arg_root_hash_size
= l
;
1058 assert_not_reached("Unhandled option");
1061 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC
);
1062 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID
);
1063 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS
);
1064 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
);
1066 if (arg_userns_mode
!= USER_NAMESPACE_NO
)
1067 arg_mount_settings
|= MOUNT_USE_USERNS
;
1069 if (arg_private_network
)
1070 arg_mount_settings
|= MOUNT_APPLY_APIVFS_NETNS
;
1072 parse_mount_settings_env();
1074 if (!(arg_clone_ns_flags
& CLONE_NEWPID
) ||
1075 !(arg_clone_ns_flags
& CLONE_NEWUTS
)) {
1076 arg_register
= false;
1077 if (arg_start_mode
!= START_PID1
) {
1078 log_error("--boot cannot be used without namespacing.");
1083 if (arg_userns_mode
== USER_NAMESPACE_PICK
)
1084 arg_userns_chown
= true;
1086 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
1087 log_error("--keep-unit may not be used when invoked from a user session.");
1091 if (arg_directory
&& arg_image
) {
1092 log_error("--directory= and --image= may not be combined.");
1096 if (arg_template
&& arg_image
) {
1097 log_error("--template= and --image= may not be combined.");
1101 if (arg_ephemeral
&& arg_template
&& !arg_directory
) {
1102 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1103 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1104 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1107 arg_directory
= arg_template
;
1108 arg_template
= NULL
;
1111 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
1112 log_error("--template= needs --directory= or --machine=.");
1116 if (arg_ephemeral
&& arg_template
) {
1117 log_error("--ephemeral and --template= may not be combined.");
1121 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
1122 log_error("--ephemeral and --link-journal= may not be combined.");
1126 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& !userns_supported()) {
1127 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1131 if (arg_userns_chown
&& arg_read_only
) {
1132 log_error("--read-only and --private-users-chown may not be combined.");
1136 if (arg_network_bridge
&& arg_network_zone
) {
1137 log_error("--network-bridge= and --network-zone= may not be combined.");
1141 if (argc
> optind
) {
1142 arg_parameters
= strv_copy(argv
+ optind
);
1143 if (!arg_parameters
)
1146 arg_settings_mask
|= SETTING_START_MODE
;
1149 /* Load all settings from .nspawn files */
1150 if (mask_no_settings
)
1151 arg_settings_mask
= 0;
1153 /* Don't load any settings from .nspawn files */
1154 if (mask_all_settings
)
1155 arg_settings_mask
= _SETTINGS_MASK_ALL
;
/* Final capability set: defaults plus requested bits, plus CAP_NET_ADMIN with private networking, minus
 * the bits requested to be dropped. */
1157 arg_caps_retain
= (arg_caps_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
1159 r
= cg_unified_flush();
1161 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
1163 e
= getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1165 arg_container_service_name
= e
;
1167 r
= getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1169 arg_use_cgns
= cg_ns_supported();
1173 r
= custom_mount_check_all();
/* Cross-check settings that cannot be validated option by option.
 *
 * Logs an error for each of these combinations: user namespacing with MOUNT_APPLY_APIVFS_NETNS but without
 * private networking; user namespacing without MOUNT_APPLY_APIVFS_RO; --volatile together with --read-only;
 * --port= without private networking; and --port= in a build without libiptc. As a side effect it fills in
 * the default kill signal for booted containers. The return statements are not part of this excerpt. */
1180 static int verify_arguments(void) {
1181 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& (arg_mount_settings
& MOUNT_APPLY_APIVFS_NETNS
) && !arg_private_network
) {
1182 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1186 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& !(arg_mount_settings
& MOUNT_APPLY_APIVFS_RO
)) {
1187 log_error("Cannot combine --private-users with read-write mounts.");
1191 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
1192 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1196 if (arg_expose_ports
&& !arg_private_network
) {
1197 log_error("Cannot use --port= without private networking.");
/* This check is compiled in only when HAVE_LIBIPTC is not defined. */
1201 #ifndef HAVE_LIBIPTC
1202 if (arg_expose_ports
) {
1203 log_error("--port= is not supported, compiled without libiptc support.");
/* When booting a full system and no kill signal was configured, fall back to SIGRTMIN+3. */
1208 if (arg_start_mode
== START_BOOT
&& arg_kill_signal
<= 0)
1209 arg_kill_signal
= SIGRTMIN
+3;
/* lchown() a path to a UID/GID that is first shifted by arg_uid_shift.
 *
 * p:   path to change; lchown() does not follow a trailing symlink.
 * uid: container-side UID, or UID_INVALID to leave the owner unchanged.
 * gid: container-side GID, or GID_INVALID to leave the group unchanged.
 *
 * The function special-cases disabled user namespacing and the case where both IDs are invalid, and it checks
 * that each shifted ID lies inside [arg_uid_shift, arg_uid_shift + arg_uid_range). What is returned in those
 * cases is not visible in this excerpt. */
1214 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
1217 if (arg_userns_mode
== USER_NAMESPACE_NO
)
1220 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1223 if (uid
!= UID_INVALID
) {
1224 uid
+= arg_uid_shift
;
/* A result below the shift means the addition wrapped around. */
1226 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1230 if (gid
!= GID_INVALID
) {
1231 gid
+= (gid_t
) arg_uid_shift
;
/* Same range check as for the UID; the GID range reuses arg_uid_shift and arg_uid_range. */
1233 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1237 if (lchown(p
, uid
, gid
) < 0)
/* Create a directory below the container root and set its ownership via userns_lchown().
 *
 * root: container root that path is prefixed with.
 * path: directory to create, relative to root.
 * mode: access mode passed to mkdir().
 * uid, gid: container-side owner and group, passed on to userns_lchown().
 *
 * An mkdir() failure with EEXIST is handled separately from other errors; how is not visible in this
 * excerpt. Returns the result of userns_lchown() on the final path. */
1243 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
1246 q
= prefix_roota(root
, path
);
1247 if (mkdir(q
, mode
) < 0) {
1248 if (errno
== EEXIST
)
1253 return userns_lchown(q
, uid
, gid
);
/* Make the container's /etc/localtime point at the same zone as the host's.
 *
 * dest: root directory of the container.
 *
 * The host's /etc/localtime must be a symlink into /usr/share/zoneinfo/ (absolute or "../"-relative);
 * otherwise a warning is logged and nothing is updated. If the container's link already names the same zone,
 * nothing is changed. Otherwise the zone file must exist in the container, the old entry is unlinked and a
 * relative symlink is created, then chowned with userns_lchown(). Failures to unlink or create the link are
 * logged at debug level for EROFS, EACCES and EPERM, and as warnings otherwise. */
1256 static int setup_timezone(const char *dest
) {
1257 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1258 const char *where
, *check
, *what
;
1264 /* Fix the timezone, if possible */
1265 r
= readlink_malloc("/etc/localtime", &p
);
1267 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1268 /* to handle warning, delete /etc/localtime and replace it
1269 * with a symbolic link to a time zone data file.
1272 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1277 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1279 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1281 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1285 where
= prefix_roota(dest
, "/etc/localtime");
1286 r
= readlink_malloc(where
, &q
);
1288 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1290 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1292 /* Already pointing to the right place? Then do nothing .. */
1293 if (y
&& streq(y
, z
))
/* Check that the zone file exists inside the container before touching the existing link. */
1297 check
= strjoina("/usr/share/zoneinfo/", z
);
1298 check
= prefix_roota(dest
, check
);
1299 if (laccess(check
, F_OK
) < 0) {
1300 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1304 if (unlink(where
) < 0 && errno
!= ENOENT
) {
1305 log_full_errno(IN_SET(errno
, EROFS
, EACCES
, EPERM
) ? LOG_DEBUG
: LOG_WARNING
, /* Don't complain on read-only images */
1307 "Failed to remove existing timezone info %s in container, ignoring: %m", where
);
/* The new link target is written relative ("../usr/share/zoneinfo/<zone>"). */
1311 what
= strjoina("../usr/share/zoneinfo/", z
);
1312 if (symlink(what
, where
) < 0) {
1313 log_full_errno(IN_SET(errno
, EROFS
, EACCES
, EPERM
) ? LOG_DEBUG
: LOG_WARNING
,
1315 "Failed to correct timezone of container, ignoring: %m");
/* Hand the new symlink to the container's root user (uid 0 and gid 0 before shifting). */
1319 r
= userns_lchown(where
, 0, 0);
1321 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1326 static int resolved_listening(void) {
1327 _cleanup_(sd_bus_flush_close_unrefp
) sd_bus
*bus
= NULL
;
1328 _cleanup_free_
char *dns_stub_listener_mode
= NULL
;
1331 /* Check if resolved is listening */
1333 r
= sd_bus_open_system(&bus
);
1337 r
= bus_name_has_owner(bus
, "org.freedesktop.resolve1", NULL
);
1341 r
= sd_bus_get_property_string(bus
,
1342 "org.freedesktop.resolve1",
1343 "/org/freedesktop/resolve1",
1344 "org.freedesktop.resolve1.Manager",
1347 &dns_stub_listener_mode
);
1351 return STR_IN_SET(dns_stub_listener_mode
, "udp", "yes");
1354 static int setup_resolv_conf(const char *dest
) {
1355 _cleanup_free_
char *resolved
= NULL
, *etc
= NULL
;
1361 if (arg_private_network
)
1364 r
= chase_symlinks("/etc", dest
, CHASE_PREFIX_ROOT
, &etc
);
1366 log_warning_errno(r
, "Failed to resolve /etc path in container, ignoring: %m");
1370 where
= strjoina(etc
, "/resolv.conf");
1371 found
= chase_symlinks(where
, dest
, CHASE_NONEXISTENT
, &resolved
);
1373 log_warning_errno(found
, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1377 if (access("/usr/lib/systemd/resolv.conf", F_OK
) >= 0 &&
1378 resolved_listening() > 0) {
1380 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1381 * container, so that the container can use the host's resolver. Given that network namespacing is
1382 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1383 * advantage that the container will be able to follow the host's DNS server configuration changes
1386 if (found
== 0) /* missing? */
1387 (void) touch(resolved
);
1389 r
= mount_verbose(LOG_DEBUG
, "/usr/lib/systemd/resolv.conf", resolved
, NULL
, MS_BIND
, NULL
);
1391 return mount_verbose(LOG_ERR
, NULL
, resolved
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
);
1394 /* If that didn't work, let's copy the file */
1395 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0, COPY_REFLINK
);
1397 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1398 * resolved or something similar runs inside and the symlink points there.
1400 * If the disk image is read-only, there's also no point in complaining.
1402 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
, -EACCES
, -EPERM
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1403 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where
);
1407 r
= userns_lchown(where
, 0, 0);
1409 log_warning_errno(r
, "Failed to chown /etc/resolv.conf, ignoring: %m");
1414 static int setup_boot_id(const char *dest
) {
1415 sd_id128_t rnd
= SD_ID128_NULL
;
1416 const char *from
, *to
;
1419 /* Generate a new randomized boot ID, so that each boot-up of
1420 * the container gets a new one */
1422 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1423 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1425 r
= sd_id128_randomize(&rnd
);
1427 return log_error_errno(r
, "Failed to generate random boot id: %m");
1429 r
= id128_write(from
, ID128_UUID
, rnd
, false);
1431 return log_error_errno(r
, "Failed to write boot id: %m");
1433 r
= mount_verbose(LOG_ERR
, from
, to
, NULL
, MS_BIND
, NULL
);
1435 r
= mount_verbose(LOG_ERR
, NULL
, to
, NULL
,
1436 MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
);
1438 (void) unlink(from
);
1442 static int copy_devnodes(const char *dest
) {
1444 static const char devnodes
[] =
1455 _cleanup_umask_ mode_t u
;
1461 /* Create /dev/net, so that we can create /dev/net/tun in it */
1462 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1463 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1465 NULSTR_FOREACH(d
, devnodes
) {
1466 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1469 from
= strappend("/dev/", d
);
1470 to
= prefix_root(dest
, from
);
1472 if (stat(from
, &st
) < 0) {
1474 if (errno
!= ENOENT
)
1475 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1477 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1479 log_error("%s is not a char or block device, cannot copy.", from
);
1483 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1484 /* Explicitly warn the user when /dev is already populated. */
1485 if (errno
== EEXIST
)
1486 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest
);
1488 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1490 /* Some systems abusively restrict mknod but
1491 * allow bind mounts. */
1494 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1495 r
= mount_verbose(LOG_DEBUG
, from
, to
, NULL
, MS_BIND
, NULL
);
1497 return log_error_errno(r
, "Both mknod and bind mount (%s) failed: %m", to
);
1500 r
= userns_lchown(to
, 0, 0);
1502 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1509 static int setup_pts(const char *dest
) {
1510 _cleanup_free_
char *options
= NULL
;
1515 if (arg_selinux_apifs_context
)
1516 (void) asprintf(&options
,
1517 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1518 arg_uid_shift
+ TTY_GID
,
1519 arg_selinux_apifs_context
);
1522 (void) asprintf(&options
,
1523 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1524 arg_uid_shift
+ TTY_GID
);
1529 /* Mount /dev/pts itself */
1530 p
= prefix_roota(dest
, "/dev/pts");
1531 if (mkdir(p
, 0755) < 0)
1532 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1533 r
= mount_verbose(LOG_ERR
, "devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
);
1536 r
= userns_lchown(p
, 0, 0);
1538 return log_error_errno(r
, "Failed to chown /dev/pts: %m");
1540 /* Create /dev/ptmx symlink */
1541 p
= prefix_roota(dest
, "/dev/ptmx");
1542 if (symlink("pts/ptmx", p
) < 0)
1543 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1544 r
= userns_lchown(p
, 0, 0);
1546 return log_error_errno(r
, "Failed to chown /dev/ptmx: %m");
1548 /* And fix /dev/pts/ptmx ownership */
1549 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1550 r
= userns_lchown(p
, 0, 0);
1552 return log_error_errno(r
, "Failed to chown /dev/pts/ptmx: %m");
1557 static int setup_dev_console(const char *dest
, const char *console
) {
1558 _cleanup_umask_ mode_t u
;
1567 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1569 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1571 /* We need to bind mount the right tty to /dev/console since
1572 * ptys can only exist on pts file systems. To have something
1573 * to bind mount things on we create a empty regular file. */
1575 to
= prefix_roota(dest
, "/dev/console");
1578 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1580 return mount_verbose(LOG_ERR
, console
, to
, NULL
, MS_BIND
, NULL
);
1583 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1584 const char *from
, *to
;
1585 _cleanup_umask_ mode_t u
;
1588 assert(kmsg_socket
>= 0);
1592 /* We create the kmsg FIFO as /run/kmsg, but immediately
1593 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1594 * on the reading side behave very similar to /proc/kmsg,
1595 * their writing side behaves differently from /dev/kmsg in
1596 * that writing blocks when nothing is reading. In order to
1597 * avoid any problems with containers deadlocking due to this
1598 * we simply make /dev/kmsg unavailable to the container. */
1599 from
= prefix_roota(dest
, "/run/kmsg");
1600 to
= prefix_roota(dest
, "/proc/kmsg");
1602 if (mkfifo(from
, 0600) < 0)
1603 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1604 r
= mount_verbose(LOG_ERR
, from
, to
, NULL
, MS_BIND
, NULL
);
1608 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1610 return log_error_errno(errno
, "Failed to open fifo: %m");
1612 /* Store away the fd in the socket, so that it stays open as
1613 * long as we run the child */
1614 r
= send_one_fd(kmsg_socket
, fd
, 0);
1618 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1620 /* And now make the FIFO unavailable as /run/kmsg... */
1621 (void) unlink(from
);
1626 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1627 union in_addr_union
*exposed
= userdata
;
1633 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
1637 static int setup_hostname(void) {
1639 if ((arg_clone_ns_flags
& CLONE_NEWUTS
) == 0)
1642 if (sethostname_idempotent(arg_machine
) < 0)
1648 static int setup_journal(const char *directory
) {
1650 _cleanup_free_
char *d
= NULL
;
1656 /* Don't link journals in ephemeral mode */
1660 if (arg_link_journal
== LINK_NO
)
1663 try = arg_link_journal_try
|| arg_link_journal
== LINK_AUTO
;
1665 r
= sd_id128_get_machine(&this_id
);
1667 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1669 if (sd_id128_equal(arg_uuid
, this_id
)) {
1670 log_full(try ? LOG_WARNING
: LOG_ERR
,
1671 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid
, id
));
1677 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1679 return log_error_errno(r
, "Failed to create /var: %m");
1681 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1683 return log_error_errno(r
, "Failed to create /var/log: %m");
1685 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1687 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1689 (void) sd_id128_to_string(arg_uuid
, id
);
1691 p
= strjoina("/var/log/journal/", id
);
1692 q
= prefix_roota(directory
, p
);
1694 if (path_is_mount_point(p
, NULL
, 0) > 0) {
1698 log_error("%s: already a mount point, refusing to use for journal", p
);
1702 if (path_is_mount_point(q
, NULL
, 0) > 0) {
1706 log_error("%s: already a mount point, refusing to use for journal", q
);
1710 r
= readlink_and_make_absolute(p
, &d
);
1712 if ((arg_link_journal
== LINK_GUEST
||
1713 arg_link_journal
== LINK_AUTO
) &&
1716 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1718 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1723 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1724 } else if (r
== -EINVAL
) {
1726 if (arg_link_journal
== LINK_GUEST
&&
1729 if (errno
== ENOTDIR
) {
1730 log_error("%s already exists and is neither a symlink nor a directory", p
);
1733 return log_error_errno(errno
, "Failed to remove %s: %m", p
);
1735 } else if (r
!= -ENOENT
)
1736 return log_error_errno(r
, "readlink(%s) failed: %m", p
);
1738 if (arg_link_journal
== LINK_GUEST
) {
1740 if (symlink(q
, p
) < 0) {
1742 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1745 return log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1748 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1750 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1754 if (arg_link_journal
== LINK_HOST
) {
1755 /* don't create parents here — if the host doesn't have
1756 * permanent journal set up, don't force it here */
1758 if (mkdir(p
, 0755) < 0 && errno
!= EEXIST
) {
1760 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1763 return log_error_errno(errno
, "Failed to create %s: %m", p
);
1766 } else if (access(p
, F_OK
) < 0)
1769 if (dir_is_empty(q
) == 0)
1770 log_warning("%s is not empty, proceeding anyway.", q
);
1772 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1774 return log_error_errno(r
, "Failed to create %s: %m", q
);
1776 r
= mount_verbose(LOG_DEBUG
, p
, q
, NULL
, MS_BIND
, NULL
);
1778 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1783 static int drop_capabilities(void) {
1784 return capability_bounding_set_drop(arg_caps_retain
, false);
1787 static int reset_audit_loginuid(void) {
1788 _cleanup_free_
char *p
= NULL
;
1791 if ((arg_clone_ns_flags
& CLONE_NEWPID
) == 0)
1794 r
= read_one_line_file("/proc/self/loginuid", &p
);
1798 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1800 /* Already reset? */
1801 if (streq(p
, "4294967295"))
1804 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1807 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1808 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1809 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1810 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1811 "using systemd-nspawn. Sleeping for 5s... (%m)");
1820 static int setup_propagate(const char *root
) {
1824 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1825 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1826 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1827 (void) mkdir_p(p
, 0600);
1829 r
= userns_mkdir(root
, "/run/systemd", 0755, 0, 0);
1831 return log_error_errno(r
, "Failed to create /run/systemd: %m");
1833 r
= userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0);
1835 return log_error_errno(r
, "Failed to create /run/systemd/nspawn: %m");
1837 r
= userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1839 return log_error_errno(r
, "Failed to create /run/systemd/nspawn/incoming: %m");
1841 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1842 r
= mount_verbose(LOG_ERR
, p
, q
, NULL
, MS_BIND
, NULL
);
1846 r
= mount_verbose(LOG_ERR
, NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
);
1850 /* machined will MS_MOVE into that directory, and that's only
1851 * supported for non-shared mounts. */
1852 return mount_verbose(LOG_ERR
, NULL
, q
, NULL
, MS_SLAVE
, NULL
);
1855 static int setup_machine_id(const char *directory
) {
1856 const char *etc_machine_id
;
1860 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
1861 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
1862 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
1863 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
1864 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
1865 * container behaves nicely). */
1867 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1869 r
= id128_read(etc_machine_id
, ID128_PLAIN
, &id
);
1871 if (!IN_SET(r
, -ENOENT
, -ENOMEDIUM
)) /* If the file is missing or empty, we don't mind */
1872 return log_error_errno(r
, "Failed to read machine ID from container image: %m");
1874 if (sd_id128_is_null(arg_uuid
)) {
1875 r
= sd_id128_randomize(&arg_uuid
);
1877 return log_error_errno(r
, "Failed to acquire randomized machine UUID: %m");
1880 if (sd_id128_is_null(id
)) {
1881 log_error("Machine ID in container image is zero, refusing.");
1891 static int recursive_chown(const char *directory
, uid_t shift
, uid_t range
) {
1896 if (arg_userns_mode
== USER_NAMESPACE_NO
|| !arg_userns_chown
)
1899 r
= path_patch_uid(directory
, arg_uid_shift
, arg_uid_range
);
1900 if (r
== -EOPNOTSUPP
)
1901 return log_error_errno(r
, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
1903 return log_error_errno(r
, "Upper 16 bits of root directory UID and GID do not match.");
1905 return log_error_errno(r
, "Failed to adjust UID/GID shift of OS tree: %m");
1907 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
1909 log_debug("Patched directory tree to match UID/GID range.");
1916 * < 0 : wait_for_terminate() failed to get the state of the
1917 * container, the container was terminated by a signal, or
1918 * failed for an unknown reason. No change is made to the
1919 * container argument.
1920 * > 0 : The program executed in the container terminated with an
1921 * error. The exit code of the program executed in the
1922 * container is returned. The container argument has been set
1923 * to CONTAINER_TERMINATED.
1924 * 0 : The container is being rebooted, has been shut down or exited
1925 * successfully. The container argument has been set to either
1926 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
1928 * That is, success is indicated by a return value of zero, and an
1929 * error is indicated by a non-zero value.
1931 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
1935 r
= wait_for_terminate(pid
, &status
);
1937 return log_warning_errno(r
, "Failed to wait for container: %m");
1939 switch (status
.si_code
) {
1942 if (status
.si_status
== 0)
1943 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
1945 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
1947 *container
= CONTAINER_TERMINATED
;
1948 return status
.si_status
;
1951 if (status
.si_status
== SIGINT
) {
1952 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
1953 *container
= CONTAINER_TERMINATED
;
1956 } else if (status
.si_status
== SIGHUP
) {
1957 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
1958 *container
= CONTAINER_REBOOTED
;
1965 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
1969 log_error("Container %s failed due to unknown reason.", arg_machine
);
1974 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
1977 pid
= PTR_TO_PID(userdata
);
1979 if (kill(pid
, arg_kill_signal
) >= 0) {
1980 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1981 sd_event_source_set_userdata(s
, NULL
);
1986 sd_event_exit(sd_event_source_get_event(s
), 0);
1990 static int on_sigchld(sd_event_source
*s
, const struct signalfd_siginfo
*ssi
, void *userdata
) {
1993 if (waitid(P_ALL
, 0, &si
, WNOHANG
|WNOWAIT
|WEXITED
) < 0)
1994 return log_error_errno(errno
, "Failed to waitid(): %m");
1995 if (si
.si_pid
== 0) /* No pending children. */
1997 if (si
.si_pid
== PTR_TO_PID(userdata
)) {
1998 /* The main process we care for has exited. Return from
1999 * signal handler but leave the zombie. */
2000 sd_event_exit(sd_event_source_get_event(s
), 0);
2003 /* Reap all other children. */
2004 (void) waitid(P_PID
, si
.si_pid
, &si
, WNOHANG
|WEXITED
);
2010 static int determine_names(void) {
2013 if (arg_template
&& !arg_directory
&& arg_machine
) {
2015 /* If --template= was specified then we should not
2016 * search for a machine, but instead create a new one
2017 * in /var/lib/machine. */
2019 arg_directory
= strjoin("/var/lib/machines/", arg_machine
);
2024 if (!arg_image
&& !arg_directory
) {
2026 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2028 r
= image_find(arg_machine
, &i
);
2030 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2032 log_error("No image for machine '%s': %m", arg_machine
);
2036 if (i
->type
== IMAGE_RAW
)
2037 r
= free_and_strdup(&arg_image
, i
->path
);
2039 r
= free_and_strdup(&arg_directory
, i
->path
);
2044 arg_read_only
= arg_read_only
|| i
->read_only
;
2046 arg_directory
= get_current_dir_name();
2048 if (!arg_directory
&& !arg_image
) {
2049 log_error("Failed to determine path, please use -D or -i.");
2056 if (arg_directory
&& path_equal(arg_directory
, "/"))
2057 arg_machine
= gethostname_malloc();
2062 arg_machine
= strdup(basename(arg_image
));
2064 /* Truncate suffix if there is one */
2065 e
= endswith(arg_machine
, ".raw");
2069 arg_machine
= strdup(basename(arg_directory
));
2074 hostname_cleanup(arg_machine
);
2075 if (!machine_name_is_valid(arg_machine
)) {
2076 log_error("Failed to determine machine name automatically, please use -M.");
2080 if (arg_ephemeral
) {
2083 /* Add a random suffix when this is an
2084 * ephemeral machine, so that we can run many
2085 * instances at once without manually having
2086 * to specify -M each time. */
2088 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
/* Resolves the path stored in *p via chase_symlinks() and replaces it with the resolved path, freeing the
 * old string. A NULL *p is left alone. Returns 0 on success, a negative errno-style value otherwise (in
 * which case *p is unchanged). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *chased;
        int r;

        assert(p);

        if (!*p)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &chased);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        free(*p);
        *p = chased;

        return 0;
}
2118 static int determine_uid_shift(const char *directory
) {
2121 if (arg_userns_mode
== USER_NAMESPACE_NO
) {
2126 if (arg_uid_shift
== UID_INVALID
) {
2129 r
= stat(directory
, &st
);
2131 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2133 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2135 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2136 log_error("UID and GID base of %s don't match.", directory
);
2140 arg_uid_range
= UINT32_C(0x10000);
2143 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2144 log_error("UID base too high for UID range.");
2151 static int inner_child(
2153 const char *directory
,
2159 _cleanup_free_
char *home
= NULL
;
2162 const char *envp
[] = {
2163 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2164 NULL
, /* container */
2169 NULL
, /* container_uuid */
2170 NULL
, /* LISTEN_FDS */
2171 NULL
, /* LISTEN_PID */
2172 NULL
, /* NOTIFY_SOCKET */
2175 const char *exec_target
;
2177 _cleanup_strv_free_
char **env_use
= NULL
;
2182 assert(kmsg_socket
>= 0);
2184 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
2185 /* Tell the parent, that it now can write the UID map. */
2186 (void) barrier_place(barrier
); /* #1 */
2188 /* Wait until the parent wrote the UID map */
2189 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2190 log_error("Parent died too early");
2195 r
= reset_uid_gid();
2197 return log_error_errno(r
, "Couldn't become new root: %m");
2200 arg_mount_settings
| MOUNT_IN_USERNS
,
2203 arg_selinux_apifs_context
);
2208 r
= mount_sysfs(NULL
, arg_mount_settings
);
2212 /* Wait until we are cgroup-ified, so that we
2213 * can mount the right cgroup path writable */
2214 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2215 log_error("Parent died too early");
2219 if (arg_use_cgns
&& cg_ns_supported()) {
2220 r
= unshare(CLONE_NEWCGROUP
);
2222 return log_error_errno(errno
, "Failed to unshare cgroup namespace");
2225 arg_unified_cgroup_hierarchy
,
2226 arg_userns_mode
!= USER_NAMESPACE_NO
,
2229 arg_selinux_apifs_context
,
2234 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2239 r
= setup_boot_id(NULL
);
2243 r
= setup_kmsg(NULL
, kmsg_socket
);
2246 kmsg_socket
= safe_close(kmsg_socket
);
2251 return log_error_errno(errno
, "setsid() failed: %m");
2253 if (arg_private_network
)
2256 if (arg_expose_ports
) {
2257 r
= expose_port_send_rtnl(rtnl_socket
);
2260 rtnl_socket
= safe_close(rtnl_socket
);
2263 r
= drop_capabilities();
2265 return log_error_errno(r
, "drop_capabilities() failed: %m");
2269 if (arg_personality
!= PERSONALITY_INVALID
) {
2270 if (personality(arg_personality
) < 0)
2271 return log_error_errno(errno
, "personality() failed: %m");
2272 } else if (secondary
) {
2273 if (personality(PER_LINUX32
) < 0)
2274 return log_error_errno(errno
, "personality() failed: %m");
2278 if (arg_selinux_context
)
2279 if (setexeccon(arg_selinux_context
) < 0)
2280 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2283 r
= change_uid_gid(arg_user
, &home
);
2287 /* LXC sets container=lxc, so follow the scheme here */
2288 envp
[n_env
++] = strjoina("container=", arg_container_service_name
);
2290 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2294 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2295 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2296 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2299 assert(!sd_id128_is_null(arg_uuid
));
2301 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_to_uuid_string(arg_uuid
, as_uuid
)) < 0)
2304 if (fdset_size(fds
) > 0) {
2305 r
= fdset_cloexec(fds
, false);
2307 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2309 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2310 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2313 if (asprintf((char **)(envp
+ n_env
++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH
) < 0)
2316 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2320 /* Let the parent know that we are ready and
2321 * wait until the parent is ready with the
2323 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2324 log_error("Parent died too early");
2329 if (chdir(arg_chdir
) < 0)
2330 return log_error_errno(errno
, "Failed to change to specified working directory %s: %m", arg_chdir
);
2332 if (arg_start_mode
== START_PID2
) {
2333 r
= stub_pid1(arg_uuid
);
2338 /* Now, explicitly close the log, so that we
2339 * then can close all remaining fds. Closing
2340 * the log explicitly first has the benefit
2341 * that the logging subsystem knows about it,
2342 * and is thus ready to be reopened should we
2343 * need it again. Note that the other fds
2344 * closed here are at least the locking and
2347 (void) fdset_close_others(fds
);
2349 if (arg_start_mode
== START_BOOT
) {
2353 /* Automatically search for the init system */
2355 m
= strv_length(arg_parameters
);
2356 a
= newa(char*, m
+ 2);
2357 memcpy_safe(a
+ 1, arg_parameters
, m
* sizeof(char*));
2360 a
[0] = (char*) "/usr/lib/systemd/systemd";
2361 execve(a
[0], a
, env_use
);
2363 a
[0] = (char*) "/lib/systemd/systemd";
2364 execve(a
[0], a
, env_use
);
2366 a
[0] = (char*) "/sbin/init";
2367 execve(a
[0], a
, env_use
);
2369 exec_target
= "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
2370 } else if (!strv_isempty(arg_parameters
)) {
2371 exec_target
= arg_parameters
[0];
2372 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2375 /* If we cannot change the directory, we'll end up in /, that is expected. */
2376 (void) chdir(home
?: "/root");
2378 execle("/bin/bash", "-bash", NULL
, env_use
);
2379 execle("/bin/sh", "-sh", NULL
, env_use
);
2381 exec_target
= "/bin/bash, /bin/sh";
2386 return log_error_errno(r
, "execv(%s) failed: %m", exec_target
);
2389 static int setup_sd_notify_child(void) {
2390 static const int one
= 1;
2392 union sockaddr_union sa
= {
2393 .sa
.sa_family
= AF_UNIX
,
2397 fd
= socket(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
|SOCK_NONBLOCK
, 0);
2399 return log_error_errno(errno
, "Failed to allocate notification socket: %m");
2401 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH
, 0755);
2402 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH
);
2404 strncpy(sa
.un
.sun_path
, NSPAWN_NOTIFY_SOCKET_PATH
, sizeof(sa
.un
.sun_path
)-1);
2405 r
= bind(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
));
2408 return log_error_errno(errno
, "bind(%s) failed: %m", sa
.un
.sun_path
);
2411 r
= userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH
, 0, 0);
2414 return log_error_errno(r
, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH
": %m");
2417 r
= setsockopt(fd
, SOL_SOCKET
, SO_PASSCRED
, &one
, sizeof(one
));
2420 return log_error_errno(errno
, "SO_PASSCRED failed: %m");
/* Body of the outer child process. As far as this listing shows, it
 *   - asks to be killed with SIGKILL when its parent dies and re-opens
 *     stdin, stdout and stderr on the console,
 *   - marks all mounts as slave, mounts the dissected image (if one was
 *     passed) and determines the UID shift, exchanging it with the parent
 *     over uid_shift_socket when user namespacing is enabled,
 *   - prepares the container tree below directory (bind mount, pivot root,
 *     volatile state, shared propagation, chown, base filesystem, read-only
 *     remount, API mounts, device nodes, pts, console, seccomp, timezone,
 *     resolv.conf, machine id, journal, ...),
 *   - moves the root, creates the notify socket and clones the inner child
 *     with the requested namespaces,
 *   - sends the PID of the inner child, the machine ID and the notify fd to
 *     the parent and closes its sockets.
 * Only part of the parameter list is visible; the body additionally uses
 * barrier, secondary, pid_socket, uuid_socket, notify_socket, kmsg_socket,
 * rtnl_socket and fds.
 * NOTE(review): the leading numbers are line numbers of the original file;
 * gaps between them are lines missing from this listing (error guards,
 * braces, call arguments) -- confirm against the complete file before
 * changing code here. */
2426 static int outer_child(
2428 const char *directory
,
2429 const char *console
,
2430 DissectedImage
*dissected_image
,
2438 int uid_shift_socket
,
/* Receives the result of setup_sd_notify_child() further down. */
2444 _cleanup_close_
int fd
= -1;
2449 assert(pid_socket
>= 0);
2450 assert(uuid_socket
>= 0);
2451 assert(notify_socket
>= 0);
2452 assert(kmsg_socket
>= 0);
/* Request SIGKILL for this process when the parent dies. */
2454 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2455 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
/* Drop the inherited stdio descriptors ... */
2458 close_nointr(STDIN_FILENO
);
2459 close_nointr(STDOUT_FILENO
);
2460 close_nointr(STDERR_FILENO
);
/* ... and open the console instead. With fds 0-2 just closed it is expected
 * to come back as STDIN_FILENO; any other result takes the error path. */
2462 r
= open_terminal(console
, O_RDWR
);
2463 if (r
!= STDIN_FILENO
) {
2469 return log_error_errno(r
, "Failed to open console: %m");
/* stdout and stderr become duplicates of the console descriptor. */
2472 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2473 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2474 return log_error_errno(errno
, "Failed to duplicate console: %m");
2477 r
= reset_audit_loginuid();
2481 /* Mark everything as slave, so that we still
2482 * receive mounts from the real root, but don't
2483 * propagate mounts to the real root. */
2484 r
= mount_verbose(LOG_ERR
, NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
);
/* Image mode only: mount the dissected image onto directory, read-only if
 * arg_read_only is set. */
2488 if (dissected_image
) {
2489 r
= dissected_image_mount(dissected_image
, directory
, DISSECT_IMAGE_DISCARD_ON_LOOP
|(arg_read_only
? DISSECT_IMAGE_READ_ONLY
: 0));
2494 r
= determine_uid_shift(directory
);
2498 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
2499 /* Let the parent know which UID shift we read from the image */
2500 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2502 return log_error_errno(errno
, "Failed to send UID shift: %m");
/* A transfer of any size other than the full value is treated as an error. */
2503 if (l
!= sizeof(arg_uid_shift
)) {
2504 log_error("Short write while sending UID shift.");
2508 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
2509 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2510 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2511 * not it will pick a different one, and send it back to us. */
2513 l
= recv(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
2515 return log_error_errno(errno
, "Failed to recv UID shift: %m");
2516 if (l
!= sizeof(arg_uid_shift
)) {
2517 log_error("Short read while receiving UID shift.");
2522 log_info("Selected user namespace base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
2525 /* Turn directory into bind mount */
2526 r
= mount_verbose(LOG_ERR
, directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
);
2530 r
= setup_pivot_root(
2533 arg_pivot_root_old
);
/* NOTE(review): the name of the call these two argument lines belong to is
 * on a line missing from this listing. */
2540 arg_userns_mode
!= USER_NAMESPACE_NO
,
2543 arg_selinux_context
);
2547 r
= setup_volatile_state(
2550 arg_userns_mode
!= USER_NAMESPACE_NO
,
2553 arg_selinux_context
);
2557 /* Mark everything as shared so our mounts get propagated down. This is
2558 * required to make new bind mounts available in systemd services
2559 * inside the container that create a new mount namespace.
2560 * See https://github.com/systemd/systemd/issues/3860
2561 * Further submounts (such as /dev) done after this will inherit the
2562 * shared propagation mode. */
2563 r
= mount_verbose(LOG_ERR
, NULL
, directory
, NULL
, MS_SHARED
|MS_REC
, NULL
);
2567 r
= recursive_chown(directory
, arg_uid_shift
, arg_uid_range
);
/* The UID shift is passed for both the uid and the gid argument. */
2571 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2575 if (arg_read_only
) {
2576 r
= bind_remount_recursive(directory
, true, NULL
);
2578 return log_error_errno(r
, "Failed to make tree read-only: %m");
2581 r
= mount_all(directory
,
2585 arg_selinux_apifs_context
);
2589 r
= copy_devnodes(directory
);
/* The return value of dev_setup() is not used. */
2593 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2595 r
= setup_pts(directory
);
2599 r
= setup_propagate(directory
);
2603 r
= setup_dev_console(directory
, console
);
2607 r
= setup_seccomp(arg_caps_retain
);
2611 r
= setup_timezone(directory
);
2615 r
= setup_resolv_conf(directory
);
2619 r
= setup_machine_id(directory
);
2623 r
= setup_journal(directory
);
2630 arg_n_custom_mounts
,
2631 arg_userns_mode
!= USER_NAMESPACE_NO
,
2634 arg_selinux_apifs_context
);
/* Taken when cgroup namespaces are not requested or not supported; the call
 * made inside this branch is on lines missing from this listing. */
2638 if (!arg_use_cgns
|| !cg_ns_supported()) {
2641 arg_unified_cgroup_hierarchy
,
2642 arg_userns_mode
!= USER_NAMESPACE_NO
,
2645 arg_selinux_apifs_context
,
2651 r
= mount_move_root(directory
);
2653 return log_error_errno(r
, "Failed to move root directory: %m");
/* Create the notify socket now; its descriptor is handed to the parent via
 * send_one_fd() below. */
2655 fd
= setup_sd_notify_child();
/* Clone the inner child into a new mount namespace plus whatever
 * arg_clone_ns_flags requests, and optionally a network and a user
 * namespace. */
2659 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2660 arg_clone_ns_flags
|
2661 (arg_private_network
? CLONE_NEWNET
: 0) |
2662 (arg_userns_mode
!= USER_NAMESPACE_NO
? CLONE_NEWUSER
: 0));
2664 return log_error_errno(errno
, "Failed to fork inner child: %m");
/* Presumably the inner child's branch of the clone (the guarding condition is
 * not visible): the sockets the inner child does not use are closed before
 * inner_child() runs. */
2666 pid_socket
= safe_close(pid_socket
);
2667 uuid_socket
= safe_close(uuid_socket
);
2668 notify_socket
= safe_close(notify_socket
);
2669 uid_shift_socket
= safe_close(uid_shift_socket
);
2671 /* The inner child has all namespaces that are
2672 * requested, so that we all are owned by the user if
2673 * user namespaces are turned on. */
2675 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2677 _exit(EXIT_FAILURE
);
2679 _exit(EXIT_SUCCESS
);
/* Report the PID of the inner child to the parent. */
2682 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2684 return log_error_errno(errno
, "Failed to send PID: %m");
2685 if (l
!= sizeof(pid
)) {
2686 log_error("Short write while sending PID.");
/* Report the machine ID held in arg_uuid to the parent. */
2690 l
= send(uuid_socket
, &arg_uuid
, sizeof(arg_uuid
), MSG_NOSIGNAL
);
2692 return log_error_errno(errno
, "Failed to send machine ID: %m");
2693 if (l
!= sizeof(arg_uuid
)) {
2694 log_error("Short write while sending machine ID.");
/* Pass the notify socket descriptor to the parent.
 * NOTE(review): the failure path below logs errno although the result of
 * send_one_fd() is stored in l -- check whether send_one_fd() reports errors
 * through its return value, in which case l should be logged instead. */
2698 l
= send_one_fd(notify_socket
, fd
, 0);
2700 return log_error_errno(errno
, "Failed to send notify fd: %m");
/* Everything has been handed over: close our ends of the sockets. */
2702 pid_socket
= safe_close(pid_socket
);
2703 uuid_socket
= safe_close(uuid_socket
);
2704 notify_socket
= safe_close(notify_socket
);
2705 kmsg_socket
= safe_close(kmsg_socket
);
2706 rtnl_socket
= safe_close(rtnl_socket
);
/* Picks a UID/GID base for the container and takes a lock file for it below
 * /run/systemd/nspawn-uid/.
 *   shift          the UID base; how it is read and written is on lines
 *                  missing from this listing (presumably the first candidate
 *                  on entry and the chosen base on return -- TODO confirm)
 *   ret_lock_file  receives the lock file of the chosen range
 * A candidate is checked against these visible conditions: it must lie within
 * UID_SHIFT_PICK_MIN..UID_SHIFT_PICK_MAX, its low 16 bits must be zero, its
 * lock file must not be held already (-EBUSY), and neither the candidate nor
 * candidate+0xFFFE may resolve through getpwuid() or getgrgid().
 * NOTE(review): the leading numbers are line numbers of the original file;
 * the loop construct, the reaction to each failed check and the return
 * statements are on lines missing from this listing. */
2711 static int uid_shift_pick(uid_t
*shift
, LockFile
*ret_lock_file
) {
/* Attempt budget; the code consuming it is not visible here. */
2712 unsigned n_tries
= 100;
2717 assert(ret_lock_file
);
2718 assert(arg_userns_mode
== USER_NAMESPACE_PICK
);
/* Only a range of exactly 0x10000 ids is supported here. */
2719 assert(arg_uid_range
== 0x10000U
);
/* Best effort: make sure the lock directory exists. */
2723 (void) mkdir("/run/systemd/nspawn-uid", 0755);
/* Sized for the directory prefix plus the longest decimal uid_t. */
2726 char lock_path
[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t
) + 1];
2727 _cleanup_release_lock_file_ LockFile lf
= LOCK_FILE_INIT
;
2732 if (candidate
< UID_SHIFT_PICK_MIN
|| candidate
> UID_SHIFT_PICK_MAX
)
/* The base must be a multiple of 0x10000. */
2734 if ((candidate
& UINT32_C(0xFFFF)) != 0)
/* One lock file per candidate base, taken exclusively and without blocking. */
2737 xsprintf(lock_path
, "/run/systemd/nspawn-uid/" UID_FMT
, candidate
);
2738 r
= make_lock_file(lock_path
, LOCK_EX
|LOCK_NB
, &lf
);
2739 if (r
== -EBUSY
) /* Range already taken by another nspawn instance */
2744 /* Make some superficial checks whether the range is currently known in the user database */
2745 if (getpwuid(candidate
))
2747 if (getpwuid(candidate
+ UINT32_C(0xFFFE)))
2749 if (getgrgid(candidate
))
2751 if (getgrgid(candidate
+ UINT32_C(0xFFFE)))
/* Hand the lock to the caller and reset the local copy, presumably so that
 * the cleanup attribute on lf has nothing left to release. */
2754 *ret_lock_file
= lf
;
2755 lf
= (struct LockFile
) LOCK_FILE_INIT
;
/* New random candidate: reduce it into the pick range, then clear the low
 * 16 bits so that it is a multiple of 0x10000. */
2760 random_bytes(&candidate
, sizeof(candidate
));
2761 candidate
= (candidate
% (UID_SHIFT_PICK_MAX
- UID_SHIFT_PICK_MIN
)) + UID_SHIFT_PICK_MIN
;
2762 candidate
&= (uid_t
) UINT32_C(0xFFFF0000);
/* Writes the UID and GID maps of process pid. The same line, consisting of
 * 0, arg_uid_shift and arg_uid_range, is written first to /proc/PID/uid_map
 * and then to /proc/PID/gid_map.
 *   pid  process whose maps are written
 * A failing write_string_file() is logged and its error code propagated via
 * log_error_errno().
 * NOTE(review): the leading numbers are line numbers of the original file;
 * the gaps between them are lines missing from this listing (presumably a
 * declaration, the error guards and the final return). */
2766 static int setup_uid_map(pid_t pid
) {
/* uid_map holds the path and is reused for gid_map, which has the same
 * length; line holds three decimal numbers, separators and a newline.
 * NOTE(review): the path buffer is sized with DECIMAL_STR_MAX(uid_t) although
 * a pid_t is formatted into it -- presumably both have the same width. */
2767 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2772 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
/* Container id 0 maps to arg_uid_shift, for arg_uid_range consecutive ids. */
2773 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2774 r
= write_string_file(uid_map
, line
, 0);
2776 return log_error_errno(r
, "Failed to write UID map: %m");
2778 /* We always assign the same UID and GID ranges */
2779 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2780 r
= write_string_file(uid_map
, line
, 0);
2782 return log_error_errno(r
, "Failed to write GID map: %m");
/* I/O event callback for the notify socket. It receives one message with
 * recvmsg(), accepts it only when it carries an SCM_CREDENTIALS record whose
 * pid equals the PID of the inner child, splits the payload into lines and
 * relays a READY=1 tag and the first STATUS= tag through sd_notifyf().
 *   source    event source that fired (not used in the visible lines)
 *   fd        notify socket to read from
 *   revents   poll events; anything other than exactly EPOLLIN is warned about
 *   userdata  PID of the inner child, decoded with PTR_TO_PID()
 * NOTE(review): the leading numbers are line numbers of the original file;
 * the gaps between them are lines missing from this listing (some
 * declarations, error guards, and every return except the one after
 * recvmsg()). */
2787 static int nspawn_dispatch_notify_fd(sd_event_source
*source
, int fd
, uint32_t revents
, void *userdata
) {
/* Payload buffer; iov_len below leaves its last byte unused, presumably as
 * room for a terminating NUL. */
2788 char buf
[NOTIFY_BUFFER_MAX
+1];
2790 struct iovec iovec
= {
2792 .iov_len
= sizeof(buf
)-1,
/* Control buffer with room for one struct ucred plus NOTIFY_FD_MAX file
 * descriptors. */
2795 struct cmsghdr cmsghdr
;
2796 uint8_t buf
[CMSG_SPACE(sizeof(struct ucred
)) +
2797 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX
)];
2799 struct msghdr msghdr
= {
2802 .msg_control
= &control
,
2803 .msg_controllen
= sizeof(control
),
2805 struct cmsghdr
*cmsg
;
2806 struct ucred
*ucred
= NULL
;
2808 pid_t inner_child_pid
;
2809 _cleanup_strv_free_
char **tags
= NULL
;
/* The pointer value itself encodes the PID. */
2813 inner_child_pid
= PTR_TO_PID(userdata
);
2815 if (revents
!= EPOLLIN
) {
2816 log_warning("Got unexpected poll event for notify fd.");
/* Non-blocking receive; descriptors passed along are marked close-on-exec. */
2820 n
= recvmsg(fd
, &msghdr
, MSG_DONTWAIT
|MSG_CMSG_CLOEXEC
);
/* EAGAIN and EINTR are singled out from the other errors; what is done for
 * them is on a line missing from this listing. */
2822 if (errno
== EAGAIN
|| errno
== EINTR
)
2825 return log_warning_errno(errno
, "Couldn't read notification socket: %m");
/* Passed file descriptors are not used by this handler; presumably this call
 * closes them. */
2827 cmsg_close_all(&msghdr
);
/* Look for a credentials record of exactly the expected size. */
2829 CMSG_FOREACH(cmsg
, &msghdr
) {
2830 if (cmsg
->cmsg_level
== SOL_SOCKET
&&
2831 cmsg
->cmsg_type
== SCM_CREDENTIALS
&&
2832 cmsg
->cmsg_len
== CMSG_LEN(sizeof(struct ucred
))) {
2834 ucred
= (struct ucred
*) CMSG_DATA(cmsg
);
/* Messages without credentials, or from any process other than the inner
 * child, are warned about and ignored. */
2838 if (!ucred
|| ucred
->pid
!= inner_child_pid
) {
2839 log_warning("Received notify message without valid credentials. Ignoring.");
/* A length that does not fit into buf is warned about and ignored. */
2843 if ((size_t) n
>= sizeof(buf
)) {
2844 log_warning("Received notify message exceeded maximum size. Ignoring.");
/* Split the payload at newline and carriage return characters. */
2849 tags
= strv_split(buf
, "\n\r");
/* Relay readiness unchanged ... */
2853 if (strv_find(tags
, "READY=1"))
2854 sd_notifyf(false, "READY=1\n");
/* ... and relay the status text with a fixed prefix. */
2856 p
= strv_find_startswith(tags
, "STATUS=");
2858 sd_notifyf(false, "STATUS=Container running: %s", p
);
/* Registers fd with event as an I/O source watching for EPOLLIN, dispatched
 * to nspawn_dispatch_notify_fd(), and sets the description of the new source
 * to nspawn-notify.
 *   event                event loop to attach to
 *   fd                   notify socket
 *   inner_child_pid      passed through unchanged as the callback userdata
 *   notify_event_source  receives the new event source
 * NOTE(review): inner_child_pid is declared as a pid_t pointer, but the
 * callback decodes its userdata with PTR_TO_PID() and the caller in run()
 * passes PID_TO_PTR(*pid), so the pointer value encodes the PID itself and
 * must not be dereferenced.
 * NOTE(review): the leading numbers are line numbers of the original file;
 * the error guard and the final return are on lines missing from this
 * listing. */
2863 static int setup_sd_notify_parent(sd_event
*event
, int fd
, pid_t
*inner_child_pid
, sd_event_source
**notify_event_source
) {
2866 r
= sd_event_add_io(event
, notify_event_source
, fd
, EPOLLIN
, nspawn_dispatch_notify_fd
, inner_child_pid
);
2868 return log_error_errno(r
, "Failed to allocate notify event source: %m");
/* The result of setting the description is deliberately ignored. */
2870 (void) sd_event_source_set_description(*notify_event_source
, "nspawn-notify");
/* Locates the settings file of the machine (arg_machine plus the suffix
 * .nspawn), loads it with settings_load() and merges it into the arg_*
 * globals.
 * Visible search order: /etc/systemd/nspawn and /run/systemd/nspawn first
 * (trusted by default), then a file next to arg_image or arg_directory (not
 * trusted by default). The defaults only apply while arg_settings_trusted is
 * still negative.
 * Each group of settings is applied only when its bit in arg_settings_mask
 * is clear and the file actually sets it. Several groups log a warning that
 * they are being ignored when the file is not trusted.
 * Pointers taken over from the settings object are set to NULL there
 * afterwards.
 * NOTE(review): the leading numbers are line numbers of the original file;
 * gaps between them are lines missing from this listing. In particular the
 * line separating each Ignoring warning from the assignments that follow it
 * (presumably an else) is not visible, nor are the return statements. */
2875 static int load_settings(void) {
2876 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2877 _cleanup_fclose_
FILE *f
= NULL
;
2878 _cleanup_free_
char *p
= NULL
;
2882 /* If all settings are masked, there's no point in looking for
2883 * the settings file */
2884 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* File name to look for: machine name plus the .nspawn suffix. */
2887 fn
= strjoina(arg_machine
, ".nspawn");
2889 /* We first look in the admin's directories in /etc and /run */
2890 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2891 _cleanup_free_
char *j
= NULL
;
2893 j
= strjoin(i
, "/", fn
);
2902 /* By default, we trust configuration from /etc and /run */
2903 if (arg_settings_trusted
< 0)
2904 arg_settings_trusted
= true;
/* A missing file is not an error; any other open failure is. */
2909 if (errno
!= ENOENT
)
2910 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2914 /* After that, let's look for a file next to the
2915 * actual image we shall boot. */
2918 p
= file_in_same_dir(arg_image
, fn
);
2921 } else if (arg_directory
) {
2922 p
= file_in_same_dir(arg_directory
, fn
);
2929 if (!f
&& errno
!= ENOENT
)
2930 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2932 /* By default, we do not trust configuration from /var/lib/machines */
2933 if (arg_settings_trusted
< 0)
2934 arg_settings_trusted
= false;
2941 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
2943 r
= settings_load(f
, p
, &settings
);
2947 /* Copy over bits from the settings, unless they have been
2948 * explicitly masked by command line switches. */
/* Start mode and parameters: the parameter vector moves to arg_parameters. */
2950 if ((arg_settings_mask
& SETTING_START_MODE
) == 0 &&
2951 settings
->start_mode
>= 0) {
2952 arg_start_mode
= settings
->start_mode
;
2954 strv_free(arg_parameters
);
2955 arg_parameters
= settings
->parameters
;
2956 settings
->parameters
= NULL
;
/* Pivot root: both paths are replaced together, keyed on pivot_root_new. */
2959 if ((arg_settings_mask
& SETTING_PIVOT_ROOT
) == 0 &&
2960 settings
->pivot_root_new
) {
2961 free_and_replace(arg_pivot_root_new
, settings
->pivot_root_new
);
2962 free_and_replace(arg_pivot_root_old
, settings
->pivot_root_old
);
/* Working directory. */
2965 if ((arg_settings_mask
& SETTING_WORKING_DIRECTORY
) == 0 &&
2966 settings
->working_directory
) {
2968 arg_chdir
= settings
->working_directory
;
2969 settings
->working_directory
= NULL
;
/* Environment: replaces arg_setenv as a whole. */
2972 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2973 settings
->environment
) {
2974 strv_free(arg_setenv
);
2975 arg_setenv
= settings
->environment
;
2976 settings
->environment
= NULL
;
/* User. */
2979 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2982 arg_user
= settings
->user
;
2983 settings
->user
= NULL
;
/* Capabilities: plus collects what the file asks to add, including
 * CAP_NET_ADMIN when the settings request a private network. The warning is
 * only printed when the file itself listed capabilities. drop_capability is
 * removed from arg_caps_retain. */
2986 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
2989 plus
= settings
->capability
;
2990 if (settings_private_network(settings
))
2991 plus
|= (1ULL << CAP_NET_ADMIN
);
2993 if (!arg_settings_trusted
&& plus
!= 0) {
2994 if (settings
->capability
!= 0)
2995 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
2997 arg_caps_retain
|= plus
;
2999 arg_caps_retain
&= ~settings
->drop_capability
;
/* Kill signal: only values greater than zero are taken over. */
3002 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
3003 settings
->kill_signal
> 0)
3004 arg_kill_signal
= settings
->kill_signal
;
/* Personality. */
3006 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
3007 settings
->personality
!= PERSONALITY_INVALID
)
3008 arg_personality
= settings
->personality
;
/* Machine ID: only a non-null ID is considered. */
3010 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
3011 !sd_id128_is_null(settings
->machine_id
)) {
3013 if (!arg_settings_trusted
)
3014 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
3016 arg_uuid
= settings
->machine_id
;
/* Read-only flag: negative means not set in the file. */
3019 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
3020 settings
->read_only
>= 0)
3021 arg_read_only
= settings
->read_only
;
/* Volatile mode. */
3023 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
3024 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
3025 arg_volatile_mode
= settings
->volatile_mode
;
/* Custom mounts: the existing list is freed and replaced as a whole. */
3027 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
3028 settings
->n_custom_mounts
> 0) {
3030 if (!arg_settings_trusted
)
3031 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
3033 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3034 arg_custom_mounts
= settings
->custom_mounts
;
3035 arg_n_custom_mounts
= settings
->n_custom_mounts
;
3037 settings
->custom_mounts
= NULL
;
3038 settings
->n_custom_mounts
= 0;
/* Network: if the file sets any network option, all network related arg_*
 * values are replaced together. */
3042 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
3043 (settings
->private_network
>= 0 ||
3044 settings
->network_veth
>= 0 ||
3045 settings
->network_bridge
||
3046 settings
->network_zone
||
3047 settings
->network_interfaces
||
3048 settings
->network_macvlan
||
3049 settings
->network_ipvlan
||
3050 settings
->network_veth_extra
)) {
3052 if (!arg_settings_trusted
)
3053 log_warning("Ignoring network settings, file %s is not trusted.", p
);
3055 arg_network_veth
= settings_network_veth(settings
);
3056 arg_private_network
= settings_private_network(settings
);
3058 strv_free(arg_network_interfaces
);
3059 arg_network_interfaces
= settings
->network_interfaces
;
3060 settings
->network_interfaces
= NULL
;
3062 strv_free(arg_network_macvlan
);
3063 arg_network_macvlan
= settings
->network_macvlan
;
3064 settings
->network_macvlan
= NULL
;
3066 strv_free(arg_network_ipvlan
);
3067 arg_network_ipvlan
= settings
->network_ipvlan
;
3068 settings
->network_ipvlan
= NULL
;
3070 strv_free(arg_network_veth_extra
);
3071 arg_network_veth_extra
= settings
->network_veth_extra
;
3072 settings
->network_veth_extra
= NULL
;
3074 free(arg_network_bridge
);
3075 arg_network_bridge
= settings
->network_bridge
;
3076 settings
->network_bridge
= NULL
;
3078 free(arg_network_zone
);
3079 arg_network_zone
= settings
->network_zone
;
3080 settings
->network_zone
= NULL
;
/* Exposed ports. */
3084 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3085 settings
->expose_ports
) {
3087 if (!arg_settings_trusted
)
3088 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3090 expose_port_free_all(arg_expose_ports
);
3091 arg_expose_ports
= settings
->expose_ports
;
3092 settings
->expose_ports
= NULL
;
/* User namespace: mode, shift, range and the chown flag are copied together. */
3096 if ((arg_settings_mask
& SETTING_USERNS
) == 0 &&
3097 settings
->userns_mode
!= _USER_NAMESPACE_MODE_INVALID
) {
3099 if (!arg_settings_trusted
)
3100 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p
);
3102 arg_userns_mode
= settings
->userns_mode
;
3103 arg_uid_shift
= settings
->uid_shift
;
3104 arg_uid_range
= settings
->uid_range
;
3105 arg_userns_chown
= settings
->userns_chown
;
/* Notify-ready flag: copied whenever it is not masked. */
3109 if ((arg_settings_mask
& SETTING_NOTIFY_READY
) == 0)
3110 arg_notify_ready
= settings
->notify_ready
;
/* Runs one container instance from the parent side. As far as this listing
 * shows, it
 *   - optionally takes the /etc/passwd lock (USER_NAMESPACE_PICK only),
 *     creates the barrier and the socket pairs used to talk to the children,
 *     and installs a no-op SIGCHLD handler,
 *   - clones the outer child into a new mount namespace and runs
 *     outer_child() there,
 *   - in the parent: negotiates the UID shift, waits for the outer child,
 *     then reads the PID of the inner child, the machine ID and the notify
 *     socket,
 *   - writes the UID map, sets up networking, registers the machine and
 *     prepares the cgroup, synchronising with the child through the barrier
 *     (steps #1 to #4),
 *   - runs an sd-event loop with the notify socket, signal handlers, port
 *     exposure and the PTY forwarder until the container ends,
 *   - kills and reaps the container and decides from its status whether to
 *     finish (return 0) or start over (return 1), as the finito and loop
 *     again comments at the end indicate.
 * Visible parameters: master (PTY master fd), console, dissected_image,
 * veth_name, veth_created (set to true once the veth links exist and back to
 * false after remove_veth_links()), exposed, pid (receives first the outer
 * and then the inner child PID) and ret (set to EXIT_FORCE_RESTART in the
 * arg_keep_unit reboot case). The body also uses interactive, secondary and
 * fds, whose declarations are not visible.
 * NOTE(review): the leading numbers are line numbers of the original file;
 * gaps between them are lines missing from this listing. The terminators of
 * the comments starting at original lines 3155 and 3499 are among the
 * missing lines, so in this listing those comments run on into the code that
 * follows them. */
3115 static int run(int master
,
3116 const char* console
,
3117 DissectedImage
*dissected_image
,
3121 char veth_name
[IFNAMSIZ
], bool *veth_created
,
3122 union in_addr_union
*exposed
,
3123 pid_t
*pid
, int *ret
) {
/* SIGCHLD disposition installed via sigaction() before the clone: a handler
 * that does nothing, restarting interrupted calls and ignoring stop events. */
3125 static const struct sigaction sa
= {
3126 .sa_handler
= nop_signal_handler
,
3127 .sa_flags
= SA_NOCLDSTOP
|SA_RESTART
,
3130 _cleanup_release_lock_file_ LockFile uid_shift_lock
= LOCK_FILE_INIT
;
3131 _cleanup_close_
int etc_passwd_lock
= -1;
/* One socket pair per kind of data exchanged with the children. The child
 * side closes index 0 and the parent side closes index 1 after the clone. */
3132 _cleanup_close_pair_
int
3133 kmsg_socket_pair
[2] = { -1, -1 },
3134 rtnl_socket_pair
[2] = { -1, -1 },
3135 pid_socket_pair
[2] = { -1, -1 },
3136 uuid_socket_pair
[2] = { -1, -1 },
3137 notify_socket_pair
[2] = { -1, -1 },
3138 uid_shift_socket_pair
[2] = { -1, -1 };
3139 _cleanup_close_
int notify_socket
= -1;
3140 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3141 _cleanup_(sd_event_source_unrefp
) sd_event_source
*notify_event_source
= NULL
;
3142 _cleanup_(sd_event_unrefp
) sd_event
*event
= NULL
;
3143 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3144 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
3145 ContainerStatus container_status
= 0;
/* Signal set containing only SIGCHLD, used for unblocking and blocking below. */
3151 assert_se(sigemptyset(&mask_chld
) == 0);
3152 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3154 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
3155 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3156 * check with getpwuid() if the specific user already exists. Note that /etc might be
3157 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3158 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3159 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3162 etc_passwd_lock
= take_etc_passwd_lock(NULL
);
3163 if (etc_passwd_lock
< 0 && etc_passwd_lock
!= -EROFS
)
3164 return log_error_errno(etc_passwd_lock
, "Failed to take /etc/passwd lock: %m");
3167 r
= barrier_create(&barrier
);
3169 return log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3171 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0)
3172 return log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3174 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0)
3175 return log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3177 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0)
3178 return log_error_errno(errno
, "Failed to create pid socket pair: %m");
3180 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uuid_socket_pair
) < 0)
3181 return log_error_errno(errno
, "Failed to create id socket pair: %m");
3183 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, notify_socket_pair
) < 0)
3184 return log_error_errno(errno
, "Failed to create notify socket pair: %m");
3186 if (arg_userns_mode
!= USER_NAMESPACE_NO
)
3187 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0)
3188 return log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3190 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3191 * parent's blocking calls and give it a chance to call wait() and terminate. */
3192 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3194 return log_error_errno(errno
, "Failed to change the signal mask: %m");
3196 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3198 return log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* Clone the outer child; only a new mount namespace is requested here. */
3200 *pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
);
3202 return log_error_errno(errno
, "clone() failed%s: %m",
3204 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3207 /* The outer child only has a file system namespace. */
3208 barrier_set_role(&barrier
, BARRIER_CHILD
);
/* Child side: drop the PTY master and the index 0 ends of all socket pairs,
 * then reset signal handling before running outer_child(). */
3210 master
= safe_close(master
);
3212 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3213 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3214 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3215 uuid_socket_pair
[0] = safe_close(uuid_socket_pair
[0]);
3216 notify_socket_pair
[0] = safe_close(notify_socket_pair
[0]);
3217 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3219 (void) reset_all_signal_handlers();
3220 (void) reset_signal_mask();
3222 r
= outer_child(&barrier
,
3229 uuid_socket_pair
[1],
3230 notify_socket_pair
[1],
3231 kmsg_socket_pair
[1],
3232 rtnl_socket_pair
[1],
3233 uid_shift_socket_pair
[1],
/* The child never returns from here: it exits with failure or success. */
3236 _exit(EXIT_FAILURE
);
3238 _exit(EXIT_SUCCESS
);
/* Parent side from here on: release the passed fd set and close the index 1
 * ends of all socket pairs. */
3241 barrier_set_role(&barrier
, BARRIER_PARENT
);
3243 fds
= fdset_free(fds
);
3245 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3246 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3247 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3248 uuid_socket_pair
[1] = safe_close(uuid_socket_pair
[1]);
3249 notify_socket_pair
[1] = safe_close(notify_socket_pair
[1]);
3250 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3252 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
3253 /* The child just let us know the UID shift it might have read from the image. */
3254 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof arg_uid_shift
, 0);
3256 return log_error_errno(errno
, "Failed to read UID shift: %m");
3257 if (l
!= sizeof arg_uid_shift
) {
3258 log_error("Short read while reading UID shift.");
3262 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
3263 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3264 * image, but if that's already in use, pick a new one, and report back to the child,
3265 * which one we now picked. */
3267 r
= uid_shift_pick(&arg_uid_shift
, &uid_shift_lock
);
3269 return log_error_errno(r
, "Failed to pick suitable UID/GID range: %m");
3271 l
= send(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof arg_uid_shift
, MSG_NOSIGNAL
);
3273 return log_error_errno(errno
, "Failed to send UID shift: %m");
3274 if (l
!= sizeof arg_uid_shift
) {
3275 log_error("Short write while writing UID shift.");
3281 /* Wait for the outer child. */
3282 r
= wait_for_terminate_and_warn("namespace helper", *pid
, NULL
);
/* A negative r is passed through; any other value reaching this return is
 * mapped to -EIO. The guarding condition is not visible. */
3284 return r
< 0 ? r
: -EIO
;
3286 /* And now retrieve the PID of the inner child. */
3287 l
= recv(pid_socket_pair
[0], pid
, sizeof *pid
, 0);
3289 return log_error_errno(errno
, "Failed to read inner child PID: %m");
3290 if (l
!= sizeof *pid
) {
3291 log_error("Short read while reading inner child PID.");
3295 /* We also retrieve container UUID in case it was generated by outer child */
3296 l
= recv(uuid_socket_pair
[0], &arg_uuid
, sizeof arg_uuid
, 0);
3298 return log_error_errno(errno
, "Failed to read container machine ID: %m");
/* NOTE(review): the message below reads machined ID while the message above
 * reads machine ID -- probably a typo in the log text; left untouched here. */
3299 if (l
!= sizeof(arg_uuid
)) {
3300 log_error("Short read while reading container machined ID.");
3304 /* We also retrieve the socket used for notifications generated by outer child */
3305 notify_socket
= receive_one_fd(notify_socket_pair
[0], 0);
3306 if (notify_socket
< 0)
3307 return log_error_errno(notify_socket
,
3308 "Failed to receive notification socket from the outer child: %m");
3310 log_debug("Init process invoked as PID "PID_FMT
, *pid
);
/* With user namespaces: wait for the child at barrier #1, write its UID and
 * GID maps, then signal barrier #2. */
3312 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
3313 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3314 log_error("Child died too early.");
3318 r
= setup_uid_map(*pid
);
3322 (void) barrier_place(&barrier
); /* #2 */
/* Network setup for containers with a private network. */
3325 if (arg_private_network
) {
3327 r
= move_network_interfaces(*pid
, arg_network_interfaces
);
3331 if (arg_network_veth
) {
3332 r
= setup_veth(arg_machine
, *pid
, veth_name
,
3333 arg_network_bridge
|| arg_network_zone
);
3339 if (arg_network_bridge
) {
3340 /* Add the interface to a bridge */
3341 r
= setup_bridge(veth_name
, arg_network_bridge
, false);
3346 } else if (arg_network_zone
) {
3347 /* Add the interface to a bridge, possibly creating it */
3348 r
= setup_bridge(veth_name
, arg_network_zone
, true);
3356 r
= setup_veth_extra(arg_machine
, *pid
, arg_network_veth_extra
);
3360 /* We created the primary and extra veth links now; let's remember this, so that we know to
3361 remove them later on. Note that we don't bother with removing veth links that were created
3362 here when their setup failed half-way, because in that case the kernel should be able to
3363 remove them on its own, since they cannot be referenced by anything yet. */
3364 *veth_created
= true;
3366 r
= setup_macvlan(arg_machine
, *pid
, arg_network_macvlan
);
3370 r
= setup_ipvlan(arg_machine
, *pid
, arg_network_ipvlan
);
/* Machine registration and cgroup preparation. The conditions guarding these
 * calls are on lines missing from this listing. */
3376 r
= register_machine(
3383 arg_custom_mounts
, arg_n_custom_mounts
,
3387 arg_container_service_name
);
3392 r
= sync_cgroup(*pid
, arg_unified_cgroup_hierarchy
, arg_uid_shift
);
3396 if (arg_keep_unit
) {
3397 r
= create_subcgroup(*pid
, arg_unified_cgroup_hierarchy
);
3402 r
= chown_cgroup(*pid
, arg_uid_shift
);
3406 /* Notify the child that the parent is ready with all
3407 * its setup (including cgroup-ification), and that
3408 * the child can now hand over control to the code to
3409 * run inside the container. */
3410 (void) barrier_place(&barrier
); /* #3 */
3412 /* Block SIGCHLD here, before notifying child.
3413 * process_pty() will handle it with the other signals. */
3414 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3416 /* Reset signal to default */
3417 r
= default_signals(SIGCHLD
, -1);
3419 return log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3421 r
= sd_event_new(&event
);
3423 return log_error_errno(r
, "Failed to get default event source: %m");
/* Watch the notify socket; the PID of the inner child travels to the callback
 * encoded in the userdata pointer.
 * NOTE(review): the last argument below is garbled in this listing -- it
 * looks like a mis-decoded address-of notify_event_source (an ampersand
 * followed by not, turned into a single character); confirm against the
 * original file. */
3425 r
= setup_sd_notify_parent(event
, notify_socket
, PID_TO_PTR(*pid
), ¬ify_event_source
);
3429 /* Let the child know that we are ready and wait that the child is completely ready now. */
3430 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3431 log_error("Child died too early.");
3435 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3436 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3437 etc_passwd_lock
= safe_close(etc_passwd_lock
);
/* Status report; the call these arguments belong to starts on a line missing
 * from this listing. */
3440 "STATUS=Container running.\n"
3441 "X_NSPAWN_LEADER_PID=" PID_FMT
, *pid
);
/* READY=1 is sent right away only when arg_notify_ready is off; otherwise
 * nspawn_dispatch_notify_fd() relays the READY=1 it receives on the notify
 * socket. */
3442 if (!arg_notify_ready
)
3443 sd_notify(false, "READY=1\n");
/* The results of the sd_event_add_signal() calls below are not checked. */
3445 if (arg_kill_signal
> 0) {
3446 /* Try to kill the init system on SIGINT or SIGTERM */
3447 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, PID_TO_PTR(*pid
));
3448 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, PID_TO_PTR(*pid
));
3450 /* Immediately exit */
3451 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3452 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3455 /* Exit when the child exits */
3456 sd_event_add_signal(event
, NULL
, SIGCHLD
, on_sigchld
, PID_TO_PTR(*pid
));
3458 if (arg_expose_ports
) {
3459 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, exposed
, &rtnl
);
3463 (void) expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
3466 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* Forward the PTY; PTY_FORWARD_READ_ONLY is added when not interactive. */
3468 r
= pty_forward_new(event
, master
,
3469 PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
),
3472 return log_error_errno(r
, "Failed to create PTY forwarder: %m");
/* Main loop of the parent; it runs until one of the handlers above ends it. */
3474 r
= sd_event_loop(event
);
3476 return log_error_errno(r
, "Failed to run event loop: %m");
3478 pty_forward_get_last_char(forward
, &last_char
);
3480 forward
= pty_forward_free(forward
);
/* Taken when output did not end in a newline and quiet mode is off; the
 * statement it guards is on a line missing from this listing. */
3482 if (!arg_quiet
&& last_char
!= '\n')
3485 /* Kill if it is not dead yet anyway */
3486 if (arg_register
&& !arg_keep_unit
)
3487 terminate_machine(*pid
);
3489 /* Normally redundant, but better safe than sorry */
3490 (void) kill(*pid
, SIGKILL
);
3492 r
= wait_for_container(*pid
, &container_status
);
3496 /* We failed to wait for the container, or the container exited abnormally. */
3498 if (r
> 0 || container_status
== CONTAINER_TERMINATED
) {
3499 /* r > 0 → The container exited with a non-zero status.
3500 * As a special case, we need to replace 133 with a different value,
3501 * because 133 is special-cased in the service file to reboot the container.
3502 * otherwise → The container exited with zero status and a reboot was not requested.
3504 if (r
== EXIT_FORCE_RESTART
)
3505 r
= EXIT_FAILURE
; /* replace 133 with the general failure code */
3507 return 0; /* finito */
3510 /* CONTAINER_REBOOTED, loop again */
3512 if (arg_keep_unit
) {
3513 /* Special handling if we are running as a service: instead of simply
3514 * restarting the machine we want to restart the entire service, so let's
3515 * inform systemd about this with the special exit code 133. The service
3516 * file uses RestartForceExitStatus=133 so that this results in a full
3517 * nspawn restart. This is necessary since we might have cgroup parameters
3518 * set we want to have flushed out. */
3519 *ret
= EXIT_FORCE_RESTART
;
3520 return 0; /* finito */
/* Reboot requested and not running as a unit: undo port exposure and veth
 * links so that the caller can start over. */
3523 expose_port_flush(arg_expose_ports
, exposed
);
3525 (void) remove_veth_links(veth_name
, arg_network_veth_extra
);
3526 *veth_created
= false;
3527 return 1; /* loop again */
3530 int main(int argc
, char *argv
[]) {
3532 _cleanup_free_
char *console
= NULL
;
3533 _cleanup_close_
int master
= -1;
3534 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3535 int r
, n_fd_passed
, ret
= EXIT_SUCCESS
;
3536 char veth_name
[IFNAMSIZ
] = "";
3537 bool secondary
= false, remove_directory
= false, remove_image
= false;
3539 union in_addr_union exposed
= {};
3540 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3541 bool interactive
, veth_created
= false, remove_tmprootdir
= false;
3542 char tmprootdir
[] = "/tmp/nspawn-root-XXXXXX";
3543 _cleanup_(loop_device_unrefp
) LoopDevice
*loop
= NULL
;
3544 _cleanup_(decrypted_image_unrefp
) DecryptedImage
*decrypted_image
= NULL
;
3545 _cleanup_(dissected_image_unrefp
) DissectedImage
*dissected_image
= NULL
;
3547 log_parse_environment();
3550 /* Make sure rename_process() in the stub init process can work */
3554 r
= parse_argv(argc
, argv
);
3558 if (geteuid() != 0) {
3559 log_error("Need to be root.");
3563 r
= determine_names();
3567 r
= load_settings();
3571 r
= verify_arguments();
3575 n_fd_passed
= sd_listen_fds(false);
3576 if (n_fd_passed
> 0) {
3577 r
= fdset_new_listen_fds(&fds
, false);
3579 log_error_errno(r
, "Failed to collect file descriptors: %m");
3584 if (arg_directory
) {
3587 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3588 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3593 if (arg_ephemeral
) {
3594 _cleanup_free_
char *np
= NULL
;
3596 r
= chase_symlinks_and_update(&arg_directory
, 0);
3600 /* If the specified path is a mount point we
3601 * generate the new snapshot immediately
3602 * inside it under a random name. However if
3603 * the specified path is not a mount point we
3604 * create the new snapshot in the parent
3605 * directory, just next to it. */
3606 r
= path_is_mount_point(arg_directory
, NULL
, 0);
3608 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3612 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3614 r
= tempfn_random(arg_directory
, "machine.", &np
);
3616 log_error_errno(r
, "Failed to generate name for directory snapshot: %m");
3620 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3622 log_error_errno(r
, "Failed to lock %s: %m", np
);
3626 r
= btrfs_subvol_snapshot(arg_directory
, np
,
3627 (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) |
3628 BTRFS_SNAPSHOT_FALLBACK_COPY
|
3629 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
3630 BTRFS_SNAPSHOT_RECURSIVE
|
3631 BTRFS_SNAPSHOT_QUOTA
);
3633 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3637 free(arg_directory
);
3641 remove_directory
= true;
3644 r
= chase_symlinks_and_update(&arg_directory
, arg_template
? CHASE_NONEXISTENT
: 0);
3648 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3650 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3654 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
3659 r
= chase_symlinks_and_update(&arg_template
, 0);
3663 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
,
3664 (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) |
3665 BTRFS_SNAPSHOT_FALLBACK_COPY
|
3666 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
3667 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE
|
3668 BTRFS_SNAPSHOT_RECURSIVE
|
3669 BTRFS_SNAPSHOT_QUOTA
);
3672 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3674 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3678 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
3683 if (arg_start_mode
== START_BOOT
) {
3684 if (path_is_os_tree(arg_directory
) <= 0) {
3685 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3692 p
= strjoina(arg_directory
, "/usr/");
3693 if (laccess(p
, F_OK
) < 0) {
3694 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
3702 assert(!arg_template
);
3704 r
= chase_symlinks_and_update(&arg_image
, 0);
3708 if (arg_ephemeral
) {
3709 _cleanup_free_
char *np
= NULL
;
3711 r
= tempfn_random(arg_image
, "machine.", &np
);
3713 log_error_errno(r
, "Failed to generate name for image snapshot: %m");
3717 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3719 r
= log_error_errno(r
, "Failed to create image lock: %m");
3723 r
= copy_file(arg_image
, np
, O_EXCL
, arg_read_only
? 0400 : 0600, FS_NOCOW_FL
, COPY_REFLINK
);
3725 r
= log_error_errno(r
, "Failed to copy image file: %m");
3733 remove_image
= true;
3735 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3737 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3741 r
= log_error_errno(r
, "Failed to create image lock: %m");
3745 if (!arg_root_hash
) {
3746 r
= root_hash_load(arg_image
, &arg_root_hash
, &arg_root_hash_size
);
3748 log_error_errno(r
, "Failed to load root hash file for %s: %m", arg_image
);
3754 if (!mkdtemp(tmprootdir
)) {
3755 r
= log_error_errno(errno
, "Failed to create temporary directory: %m");
3759 remove_tmprootdir
= true;
3761 arg_directory
= strdup(tmprootdir
);
3762 if (!arg_directory
) {
3767 r
= loop_device_make_by_path(arg_image
, arg_read_only
? O_RDONLY
: O_RDWR
, &loop
);
3769 log_error_errno(r
, "Failed to set up loopback block device: %m");
3775 arg_root_hash
, arg_root_hash_size
,
3776 DISSECT_IMAGE_REQUIRE_ROOT
,
3779 log_error_errno(r
, "Could not find a suitable file system or partition table in image: %s", arg_image
);
3781 log_notice("Note that the disk image needs to\n"
3782 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
3783 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
3784 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
3785 " d) or contain a file system without a partition table\n"
3786 "in order to be bootable with systemd-nspawn.");
3789 if (r
== -EADDRNOTAVAIL
) {
3790 log_error_errno(r
, "No root partition for specified root hash found.");
3793 if (r
== -EOPNOTSUPP
) {
3794 log_error_errno(r
, "--image= is not supported, compiled without blkid support.");
3798 log_error_errno(r
, "Failed to dissect image: %m");
3802 if (!arg_root_hash
&& dissected_image
->can_verity
)
3803 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image
);
3805 r
= dissected_image_decrypt_interactively(dissected_image
, NULL
, arg_root_hash
, arg_root_hash_size
, 0, &decrypted_image
);
3809 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
3810 if (remove_image
&& unlink(arg_image
) >= 0)
3811 remove_image
= false;
3814 r
= custom_mount_prepare_all(arg_directory
, arg_custom_mounts
, arg_n_custom_mounts
);
3818 r
= detect_unified_cgroup_hierarchy(arg_directory
);
3823 isatty(STDIN_FILENO
) > 0 &&
3824 isatty(STDOUT_FILENO
) > 0;
3826 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3828 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3832 r
= ptsname_malloc(master
, &console
);
3834 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3838 if (arg_selinux_apifs_context
) {
3839 r
= mac_selinux_apply(console
, arg_selinux_apifs_context
);
3844 if (unlockpt(master
) < 0) {
3845 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3850 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3851 arg_machine
, arg_image
?: arg_directory
);
3853 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3855 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3856 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
3864 interactive
, secondary
,
3866 veth_name
, &veth_created
,
3875 r
== 0 && ret
== EXIT_FORCE_RESTART
? "STOPPING=1\nSTATUS=Restarting..." :
3876 "STOPPING=1\nSTATUS=Terminating...");
3879 (void) kill(pid
, SIGKILL
);
3881 /* Try to flush whatever is still queued in the pty */
3883 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, 0);
3884 master
= safe_close(master
);
3888 (void) wait_for_terminate(pid
, NULL
);
3890 if (remove_directory
&& arg_directory
) {
3893 k
= rm_rf(arg_directory
, REMOVE_ROOT
|REMOVE_PHYSICAL
|REMOVE_SUBVOLUME
);
3895 log_warning_errno(k
, "Cannot remove '%s', ignoring: %m", arg_directory
);
3898 if (remove_image
&& arg_image
) {
3899 if (unlink(arg_image
) < 0)
3900 log_warning_errno(errno
, "Can't remove image file '%s', ignoring: %m", arg_image
);
3903 if (remove_tmprootdir
) {
3904 if (rmdir(tmprootdir
) < 0)
3905 log_debug_errno(errno
, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir
);
3911 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3912 (void) rm_rf(p
, REMOVE_ROOT
);
3915 expose_port_flush(arg_expose_ports
, &exposed
);
3918 (void) remove_veth_links(veth_name
, arg_network_veth_extra
);
3919 (void) remove_bridge(arg_network_zone
);
3921 free(arg_directory
);
3926 free(arg_pivot_root_new
);
3927 free(arg_pivot_root_old
);
3929 strv_free(arg_setenv
);
3930 free(arg_network_bridge
);
3931 strv_free(arg_network_interfaces
);
3932 strv_free(arg_network_macvlan
);
3933 strv_free(arg_network_ipvlan
);
3934 strv_free(arg_network_veth_extra
);
3935 strv_free(arg_parameters
);
3936 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3937 expose_port_free_all(arg_expose_ports
);
3938 free(arg_root_hash
);
3940 return r
< 0 ? EXIT_FAILURE
: ret
;