1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/mount.h>
31 #include <sys/prctl.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <sys/personality.h>
38 #include <linux/loop.h>
42 #include <selinux/selinux.h>
50 #include <blkid/blkid.h>
53 #include "sd-daemon.h"
56 #include "random-util.h"
63 #include "cgroup-util.h"
65 #include "path-util.h"
66 #include "loopback-setup.h"
67 #include "dev-setup.h"
72 #include "bus-error.h"
75 #include "netlink-util.h"
76 #include "udev-util.h"
77 #include "blkid-util.h"
80 #include "base-filesystem.h"
82 #include "event-util.h"
83 #include "capability.h"
85 #include "btrfs-util.h"
86 #include "machine-image.h"
88 #include "in-addr-util.h"
89 #include "formats-util.h"
90 #include "process-util.h"
91 #include "terminal-util.h"
92 #include "hostname-util.h"
93 #include "signal-util.h"
96 #include "seccomp-util.h"
100 #include "nspawn-settings.h"
101 #include "nspawn-mount.h"
102 #include "nspawn-network.h"
103 #include "nspawn-expose-ports.h"
104 #include "nspawn-cgroup.h"
105 #include "nspawn-register.h"
106 #include "nspawn-setuid.h"
108 typedef enum ContainerStatus
{
109 CONTAINER_TERMINATED
,
113 typedef enum LinkJournal
{
/* File-scope option state. Each arg_* variable below carries its built-in
 * default; the option parser in parse_argv() assigns the ones it handles. */
120 static char *arg_directory
= NULL
;
121 static char *arg_template
= NULL
;
122 static char *arg_user
= NULL
;
123 static sd_id128_t arg_uuid
= {};
124 static char *arg_machine
= NULL
;
125 static const char *arg_selinux_context
= NULL
;
126 static const char *arg_selinux_apifs_context
= NULL
;
127 static const char *arg_slice
= NULL
;
128 static bool arg_private_network
= false;
129 static bool arg_read_only
= false;
130 static bool arg_boot
= false;
131 static bool arg_ephemeral
= false;
132 static LinkJournal arg_link_journal
= LINK_AUTO
;
133 static bool arg_link_journal_try
= false;
/* Capability bitmask, one bit per CAP_* number. parse_argv() later ORs in the
 * bits collected for --capability=, adds CAP_NET_ADMIN when
 * arg_private_network is set, and clears the bits collected for
 * --drop-capability=. */
134 static uint64_t arg_retain
=
135 (1ULL << CAP_CHOWN
) |
136 (1ULL << CAP_DAC_OVERRIDE
) |
137 (1ULL << CAP_DAC_READ_SEARCH
) |
138 (1ULL << CAP_FOWNER
) |
139 (1ULL << CAP_FSETID
) |
140 (1ULL << CAP_IPC_OWNER
) |
142 (1ULL << CAP_LEASE
) |
143 (1ULL << CAP_LINUX_IMMUTABLE
) |
144 (1ULL << CAP_NET_BIND_SERVICE
) |
145 (1ULL << CAP_NET_BROADCAST
) |
146 (1ULL << CAP_NET_RAW
) |
147 (1ULL << CAP_SETGID
) |
148 (1ULL << CAP_SETFCAP
) |
149 (1ULL << CAP_SETPCAP
) |
150 (1ULL << CAP_SETUID
) |
151 (1ULL << CAP_SYS_ADMIN
) |
152 (1ULL << CAP_SYS_CHROOT
) |
153 (1ULL << CAP_SYS_NICE
) |
154 (1ULL << CAP_SYS_PTRACE
) |
155 (1ULL << CAP_SYS_TTY_CONFIG
) |
156 (1ULL << CAP_SYS_RESOURCE
) |
157 (1ULL << CAP_SYS_BOOT
) |
158 (1ULL << CAP_AUDIT_WRITE
) |
159 (1ULL << CAP_AUDIT_CONTROL
) |
/* NOTE(review): the end of the arg_retain initializer is not visible in this
 * excerpt; the expression above stops at a trailing OR operator. */
/* Table of custom mounts and its element count; sorted and walked by
 * custom_mounts_prepare(). */
161 static CustomMount
*arg_custom_mounts
= NULL
;
162 static unsigned arg_n_custom_mounts
= 0;
163 static char **arg_setenv
= NULL
;
164 static bool arg_quiet
= false;
165 static bool arg_share_system
= false;
/* parse_argv() forces arg_register to false when arg_share_system is set. */
166 static bool arg_register
= true;
167 static bool arg_keep_unit
= false;
168 static char **arg_network_interfaces
= NULL
;
169 static char **arg_network_macvlan
= NULL
;
170 static char **arg_network_ipvlan
= NULL
;
171 static bool arg_network_veth
= false;
172 static char *arg_network_bridge
= NULL
;
173 static unsigned long arg_personality
= PERSONALITY_INVALID
;
174 static char *arg_image
= NULL
;
175 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
176 static ExposePort
*arg_expose_ports
= NULL
;
177 static char **arg_property
= NULL
;
/* Base and size of the UID/GID window that userns_lchown() adds to and
 * range-checks against. */
178 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
179 static bool arg_userns
= false;
180 static int arg_kill_signal
= 0;
181 static bool arg_unified_cgroup_hierarchy
= false;
182 static SettingsMask arg_settings_mask
= 0;
/* Starts at -1; the --settings= handling in parse_argv() stores true, false
 * or -1 here. */
183 static int arg_settings_trusted
= -1;
184 static char **arg_parameters
= NULL
;
/* Prints the usage summary with one printf() call. The only format argument
 * is program_invocation_short_name, substituted for the leading %s. */
186 static void help(void) {
187 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
188 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
189 " -h --help Show this help\n"
190 " --version Print version string\n"
191 " -q --quiet Do not show status information\n"
192 " -D --directory=PATH Root directory for the container\n"
193 " --template=PATH Initialize root directory from template directory,\n"
195 " -x --ephemeral Run container with snapshot of root directory, and\n"
196 " remove it after exit\n"
197 " -i --image=PATH File system device or disk image for the container\n"
198 " -b --boot Boot up full system (i.e. invoke init)\n"
199 " -u --user=USER Run the command under specified user or uid\n"
200 " -M --machine=NAME Set the machine name for the container\n"
201 " --uuid=UUID Set a specific machine UUID for the container\n"
202 " -S --slice=SLICE Place the container in the specified slice\n"
203 " --property=NAME=VALUE Set scope unit property\n"
204 " --private-users[=UIDBASE[:NUIDS]]\n"
205 " Run within user namespace\n"
206 " --private-network Disable network in container\n"
207 " --network-interface=INTERFACE\n"
208 " Assign an existing network interface to the\n"
210 " --network-macvlan=INTERFACE\n"
211 " Create a macvlan network interface based on an\n"
212 " existing network interface to the container\n"
213 " --network-ipvlan=INTERFACE\n"
/* NOTE(review): the next usage string reads a ipvlan; grammatically it should
 * be an ipvlan. Left untouched here because it is runtime output. */
214 " Create a ipvlan network interface based on an\n"
215 " existing network interface to the container\n"
216 " -n --network-veth Add a virtual ethernet connection between host\n"
218 " --network-bridge=INTERFACE\n"
219 " Add a virtual ethernet connection between host\n"
220 " and container and add it to an existing bridge on\n"
222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
223 " Expose a container IP port on the host\n"
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
230 " --capability=CAP In addition to the default, retain specified\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
237 " --read-only Mount the root directory read-only\n"
238 " --bind=PATH[:PATH[:OPTIONS]]\n"
239 " Bind mount a file or directory from the host into\n"
/* NOTE(review): the --bind-ro= synopsis below has unbalanced brackets; it
 * opens two optional groups but closes only one, unlike the --bind= synopsis
 * three strings above. Left untouched here because it is runtime output. */
241 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
242 " Similar, but creates a read-only bind mount\n"
243 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
244 " --overlay=PATH[:PATH...]:PATH\n"
/* NOTE(review): the next string carries a stray blank before its newline
 * escape, so the printed line ends in trailing whitespace. */
245 " Create an overlay mount from the host to \n"
247 " --overlay-ro=PATH[:PATH...]:PATH\n"
248 " Similar, but creates a read-only overlay mount\n"
249 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
250 " --share-system Share system namespaces with host\n"
251 " --register=BOOLEAN Register container as machine\n"
252 " --keep-unit Do not register a scope for the machine, reuse\n"
253 " the service unit nspawn is running in\n"
254 " --volatile[=MODE] Run the system in volatile mode\n"
255 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
256 , program_invocation_short_name
);
/* Sorts the arg_custom_mounts table with custom_mount_compare and then walks
 * it. Entries whose type is CUSTOM_MOUNT_OVERLAY are of interest to the second
 * half of the loop, which derives m->work_dir from m->source via
 * tempfn_random(). The statements between the type check and that call are
 * not visible in this excerpt. */
260 static int custom_mounts_prepare(void) {
264 /* Ensure the mounts are applied prefix first. */
265 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
267 /* Allocate working directories for the overlay file systems that need it */
268 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
269 CustomMount
*m
= &arg_custom_mounts
[i
];
/* A custom mount whose destination is the root directory is refused when user
 * namespacing is enabled but no explicit UID shift was supplied
 * (arg_uid_shift still UID_INVALID). */
271 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
272 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
276 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
285 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
287 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Stores a normalized form of path in *b. Two candidates are computed for p,
 * canonicalize_file_name() and path_make_absolute_cwd(); the condition that
 * chooses between them is not visible in this excerpt (presumably the second
 * is a fallback when the first fails, to be confirmed). The chosen string is
 * passed through path_kill_slashes() before being stored. */
293 static int set_sanitized_path(char **b
, const char *path
) {
299 p
= canonicalize_file_name(path
);
304 p
= path_make_absolute_cwd(path
);
310 *b
= path_kill_slashes(p
);
/* Decides the value of arg_unified_cgroup_hierarchy. The environment variable
 * UNIFIED_CGROUP_HIERARCHY, parsed as a boolean, takes precedence; otherwise
 * the value comes from a host-side probe whose call is not visible in this
 * excerpt. Both paths store their result r in the global. */
314 static int detect_unified_cgroup_hierarchy(void) {
318 /* Allow the user to control whether the unified hierarchy is used */
319 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
321 r
= parse_boolean(e
);
323 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
325 arg_unified_cgroup_hierarchy
= r
;
329 /* Otherwise inherit the default from the host system */
332 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
334 arg_unified_cgroup_hierarchy
= r
;
338 static int parse_argv(int argc
, char *argv
[]) {
357 ARG_NETWORK_INTERFACE
,
370 static const struct option options
[] = {
371 { "help", no_argument
, NULL
, 'h' },
372 { "version", no_argument
, NULL
, ARG_VERSION
},
373 { "directory", required_argument
, NULL
, 'D' },
374 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
375 { "ephemeral", no_argument
, NULL
, 'x' },
376 { "user", required_argument
, NULL
, 'u' },
377 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
378 { "boot", no_argument
, NULL
, 'b' },
379 { "uuid", required_argument
, NULL
, ARG_UUID
},
380 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
381 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
382 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
383 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
384 { "bind", required_argument
, NULL
, ARG_BIND
},
385 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
386 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
387 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
388 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
389 { "machine", required_argument
, NULL
, 'M' },
390 { "slice", required_argument
, NULL
, 'S' },
391 { "setenv", required_argument
, NULL
, ARG_SETENV
},
392 { "selinux-context", required_argument
, NULL
, 'Z' },
393 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
394 { "quiet", no_argument
, NULL
, 'q' },
395 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
396 { "register", required_argument
, NULL
, ARG_REGISTER
},
397 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
398 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
399 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
400 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
401 { "network-veth", no_argument
, NULL
, 'n' },
402 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
403 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
404 { "image", required_argument
, NULL
, 'i' },
405 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
406 { "port", required_argument
, NULL
, 'p' },
407 { "property", required_argument
, NULL
, ARG_PROPERTY
},
408 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
409 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
410 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
415 uint64_t plus
= 0, minus
= 0;
416 bool mask_all_settings
= false, mask_no_settings
= false;
421 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
430 puts(PACKAGE_STRING
);
431 puts(SYSTEMD_FEATURES
);
435 r
= set_sanitized_path(&arg_directory
, optarg
);
437 return log_error_errno(r
, "Invalid root directory: %m");
442 r
= set_sanitized_path(&arg_template
, optarg
);
444 return log_error_errno(r
, "Invalid template directory: %m");
449 r
= set_sanitized_path(&arg_image
, optarg
);
451 return log_error_errno(r
, "Invalid image path: %m");
456 arg_ephemeral
= true;
460 r
= free_and_strdup(&arg_user
, optarg
);
464 arg_settings_mask
|= SETTING_USER
;
467 case ARG_NETWORK_BRIDGE
:
468 r
= free_and_strdup(&arg_network_bridge
, optarg
);
475 arg_network_veth
= true;
476 arg_private_network
= true;
477 arg_settings_mask
|= SETTING_NETWORK
;
480 case ARG_NETWORK_INTERFACE
:
481 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
484 arg_private_network
= true;
485 arg_settings_mask
|= SETTING_NETWORK
;
488 case ARG_NETWORK_MACVLAN
:
489 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
492 arg_private_network
= true;
493 arg_settings_mask
|= SETTING_NETWORK
;
496 case ARG_NETWORK_IPVLAN
:
497 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
502 case ARG_PRIVATE_NETWORK
:
503 arg_private_network
= true;
504 arg_settings_mask
|= SETTING_NETWORK
;
509 arg_settings_mask
|= SETTING_BOOT
;
513 r
= sd_id128_from_string(optarg
, &arg_uuid
);
515 log_error("Invalid UUID: %s", optarg
);
519 arg_settings_mask
|= SETTING_MACHINE_ID
;
528 arg_machine
= mfree(arg_machine
);
530 if (!machine_name_is_valid(optarg
)) {
531 log_error("Invalid machine name: %s", optarg
);
535 r
= free_and_strdup(&arg_machine
, optarg
);
543 arg_selinux_context
= optarg
;
547 arg_selinux_apifs_context
= optarg
;
551 arg_read_only
= true;
552 arg_settings_mask
|= SETTING_READ_ONLY
;
556 case ARG_DROP_CAPABILITY
: {
557 const char *state
, *word
;
560 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
561 _cleanup_free_
char *t
;
563 t
= strndup(word
, length
);
567 if (streq(t
, "all")) {
568 if (c
== ARG_CAPABILITY
)
569 plus
= (uint64_t) -1;
571 minus
= (uint64_t) -1;
575 cap
= capability_from_name(t
);
577 log_error("Failed to parse capability %s.", t
);
581 if (c
== ARG_CAPABILITY
)
582 plus
|= 1ULL << (uint64_t) cap
;
584 minus
|= 1ULL << (uint64_t) cap
;
588 arg_settings_mask
|= SETTING_CAPABILITY
;
593 arg_link_journal
= LINK_GUEST
;
594 arg_link_journal_try
= true;
597 case ARG_LINK_JOURNAL
:
598 if (streq(optarg
, "auto")) {
599 arg_link_journal
= LINK_AUTO
;
600 arg_link_journal_try
= false;
601 } else if (streq(optarg
, "no")) {
602 arg_link_journal
= LINK_NO
;
603 arg_link_journal_try
= false;
604 } else if (streq(optarg
, "guest")) {
605 arg_link_journal
= LINK_GUEST
;
606 arg_link_journal_try
= false;
607 } else if (streq(optarg
, "host")) {
608 arg_link_journal
= LINK_HOST
;
609 arg_link_journal_try
= false;
610 } else if (streq(optarg
, "try-guest")) {
611 arg_link_journal
= LINK_GUEST
;
612 arg_link_journal_try
= true;
613 } else if (streq(optarg
, "try-host")) {
614 arg_link_journal
= LINK_HOST
;
615 arg_link_journal_try
= true;
617 log_error("Failed to parse link journal mode %s", optarg
);
625 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
627 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
629 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
633 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
635 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
637 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
641 case ARG_OVERLAY_RO
: {
642 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
643 _cleanup_strv_free_
char **lower
= NULL
;
648 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
652 log_error("Invalid overlay specification: %s", optarg
);
656 STRV_FOREACH(i
, lower
) {
657 if (!path_is_absolute(*i
)) {
658 log_error("Overlay path %s is not absolute.", *i
);
666 log_error("--overlay= needs at least two colon-separated directories specified.");
671 /* If two parameters are specified,
672 * the first one is the lower, the
673 * second one the upper directory. And
674 * we'll also define the destination
675 * mount point the same as the upper. */
679 destination
= strdup(upper
);
684 upper
= lower
[n
- 2];
685 destination
= lower
[n
- 1];
689 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
693 m
->destination
= destination
;
696 m
->read_only
= c
== ARG_OVERLAY_RO
;
698 upper
= destination
= NULL
;
701 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
708 if (!env_assignment_is_valid(optarg
)) {
709 log_error("Environment variable assignment '%s' is not valid.", optarg
);
713 n
= strv_env_set(arg_setenv
, optarg
);
717 strv_free(arg_setenv
);
720 arg_settings_mask
|= SETTING_ENVIRONMENT
;
728 case ARG_SHARE_SYSTEM
:
729 arg_share_system
= true;
733 r
= parse_boolean(optarg
);
735 log_error("Failed to parse --register= argument: %s", optarg
);
743 arg_keep_unit
= true;
746 case ARG_PERSONALITY
:
748 arg_personality
= personality_from_string(optarg
);
749 if (arg_personality
== PERSONALITY_INVALID
) {
750 log_error("Unknown or unsupported personality '%s'.", optarg
);
754 arg_settings_mask
|= SETTING_PERSONALITY
;
760 arg_volatile_mode
= VOLATILE_YES
;
764 m
= volatile_mode_from_string(optarg
);
766 log_error("Failed to parse --volatile= argument: %s", optarg
);
769 arg_volatile_mode
= m
;
772 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
776 r
= expose_port_parse(&arg_expose_ports
, optarg
);
778 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
780 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
782 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
786 if (strv_extend(&arg_property
, optarg
) < 0)
791 case ARG_PRIVATE_USERS
:
793 _cleanup_free_
char *buffer
= NULL
;
794 const char *range
, *shift
;
796 range
= strchr(optarg
, ':');
798 buffer
= strndup(optarg
, range
- optarg
);
804 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
805 log_error("Failed to parse UID range: %s", range
);
811 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
812 log_error("Failed to parse UID: %s", optarg
);
820 case ARG_KILL_SIGNAL
:
821 arg_kill_signal
= signal_from_string_try_harder(optarg
);
822 if (arg_kill_signal
< 0) {
823 log_error("Cannot parse signal: %s", optarg
);
827 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
832 /* no → do not read files
833 * yes → read files, do not override cmdline, trust only subset
834 * override → read files, override cmdline, trust only subset
835 * trusted → read files, do not override cmdline, trust all
838 r
= parse_boolean(optarg
);
840 if (streq(optarg
, "trusted")) {
841 mask_all_settings
= false;
842 mask_no_settings
= false;
843 arg_settings_trusted
= true;
845 } else if (streq(optarg
, "override")) {
846 mask_all_settings
= false;
847 mask_no_settings
= true;
848 arg_settings_trusted
= -1;
850 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
853 mask_all_settings
= false;
854 mask_no_settings
= false;
855 arg_settings_trusted
= -1;
858 mask_all_settings
= true;
859 mask_no_settings
= false;
860 arg_settings_trusted
= false;
869 assert_not_reached("Unhandled option");
872 if (arg_share_system
)
873 arg_register
= false;
875 if (arg_boot
&& arg_share_system
) {
876 log_error("--boot and --share-system may not be combined.");
880 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
881 log_error("--keep-unit may not be used when invoked from a user session.");
885 if (arg_directory
&& arg_image
) {
886 log_error("--directory= and --image= may not be combined.");
890 if (arg_template
&& arg_image
) {
891 log_error("--template= and --image= may not be combined.");
895 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
896 log_error("--template= needs --directory= or --machine=.");
900 if (arg_ephemeral
&& arg_template
) {
901 log_error("--ephemeral and --template= may not be combined.");
905 if (arg_ephemeral
&& arg_image
) {
906 log_error("--ephemeral and --image= may not be combined.");
910 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
911 log_error("--ephemeral and --link-journal= may not be combined.");
915 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
916 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
919 arg_parameters
= strv_copy(argv
+ optind
);
923 arg_settings_mask
|= SETTING_BOOT
;
926 /* Load all settings from .nspawn files */
927 if (mask_no_settings
)
928 arg_settings_mask
= 0;
930 /* Don't load any settings from .nspawn files */
931 if (mask_all_settings
)
932 arg_settings_mask
= _SETTINGS_MASK_ALL
;
934 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
936 r
= detect_unified_cgroup_hierarchy();
/* Cross-checks option combinations: logs an error when a volatile mode is
 * combined with arg_read_only, and when ports are exposed without
 * arg_private_network. Finally, when booting with no usable kill signal
 * (arg_kill_signal <= 0), it defaults the signal to SIGRTMIN+3. The return
 * statements are not visible in this excerpt. */
943 static int verify_arguments(void) {
945 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
946 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
950 if (arg_expose_ports
&& !arg_private_network
) {
951 log_error("Cannot use --port= without private networking.");
955 if (arg_boot
&& arg_kill_signal
<= 0)
956 arg_kill_signal
= SIGRTMIN
+3;
/* Changes ownership of p with lchown() after translating uid and gid into the
 * container window: each id that is not its INVALID sentinel has arg_uid_shift
 * added and must then lie in [arg_uid_shift, arg_uid_shift + arg_uid_range).
 * The values returned on each branch are not visible in this excerpt. */
961 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
/* Nothing to translate when both ids are the INVALID sentinels. */
967 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
970 if (uid
!= UID_INVALID
) {
971 uid
+= arg_uid_shift
;
/* The lower-bound comparison presumably catches wrap-around of the addition
 * above; confirm that uid_t is unsigned on the target. */
973 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
977 if (gid
!= GID_INVALID
) {
/* The same shift and window are reused for group ids, cast to gid_t. */
978 gid
+= (gid_t
) arg_uid_shift
;
980 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
984 if (lchown(p
, uid
, gid
) < 0)
/* Creates the directory path below root with the given mode and then
 * delegates ownership adjustment to userns_lchown(), whose result is returned.
 * What happens when mkdir() fails is not visible in this excerpt. */
990 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
993 q
= prefix_roota(root
, path
);
994 if (mkdir(q
, mode
) < 0) {
1000 return userns_lchown(q
, uid
, gid
);
/* Makes /etc/localtime below dest a symlink to the same zoneinfo entry the
 * host /etc/localtime symlink points at. The zone name z is the part of the
 * host link target after the zoneinfo prefix; y is the same for the link
 * already present in the container. Nothing is changed when both agree or
 * when the zone file is absent below dest. Several branch conditions and
 * return statements are not visible in this excerpt. */
1003 static int setup_timezone(const char *dest
) {
1004 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1005 const char *where
, *check
, *what
;
1011 /* Fix the timezone, if possible */
1012 r
= readlink_malloc("/etc/localtime", &p
);
1014 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* Both the relative and the absolute spelling of the zoneinfo prefix are
 * tried on the host link target. */
1018 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1020 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1022 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1026 where
= prefix_roota(dest
, "/etc/localtime");
1027 r
= readlink_malloc(where
, &q
);
1029 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1031 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1033 /* Already pointing to the right place? Then do nothing .. */
1034 if (y
&& streq(y
, z
))
1038 check
= strjoina("/usr/share/zoneinfo/", z
);
/* NOTE(review): probable memory leak. check is a plain const char pointer
 * with no cleanup attribute, yet copy_devnodes() keeps the result of
 * prefix_root() in a _cleanup_free_ pointer, which suggests prefix_root()
 * returns heap memory. The rest of this function uses prefix_roota();
 * confirm whether that variant was intended here as well. */
1039 check
= prefix_root(dest
, check
);
1040 if (laccess(check
, F_OK
) < 0) {
1041 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
/* The call that assigns r before this test is not visible in this excerpt;
 * judging by the message it removes the existing file at where. */
1046 if (r
< 0 && errno
!= ENOENT
) {
1047 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
/* The new link always uses the relative spelling of the zoneinfo path. */
1051 what
= strjoina("../usr/share/zoneinfo/", z
);
1052 if (symlink(what
, where
) < 0) {
1053 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1057 r
= userns_lchown(where
, 0, 0);
1059 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1064 static int setup_resolv_conf(const char *dest
) {
1065 const char *where
= NULL
;
1070 if (arg_private_network
)
1073 /* Fix resolv.conf, if possible */
1074 where
= prefix_roota(dest
, "/etc/resolv.conf");
1076 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1078 /* If the file already exists as symlink, let's
1079 * suppress the warning, under the assumption that
1080 * resolved or something similar runs inside and the
1081 * symlink points there.
1083 * If the disk image is read-only, there's also no
1084 * point in complaining.
1086 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1087 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1091 r
= userns_lchown(where
, 0, 0);
1093 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
/* Renders id into s in dashed UUID layout (hex groups of 8-4-4-4-12 digits).
 * That is 36 characters, which together with the terminator matches the
 * 37-byte buffer in the signature. The formatting call that consumes the
 * format string below is not visible in this excerpt. */
1098 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1102 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1103 SD_ID128_FORMAT_VAL(id
));
/* Gives the container its own boot ID: a random 128-bit ID is written in UUID
 * form to /run/proc-sys-kernel-random-boot-id below dest, and that file is
 * bind mounted over /proc/sys/kernel/random/boot_id below dest. */
1108 static int setup_boot_id(const char *dest
) {
1109 const char *from
, *to
;
1110 sd_id128_t rnd
= {};
/* The statement guarded by this test is not visible in this excerpt;
 * presumably the function returns early when system namespaces are shared. */
1114 if (arg_share_system
)
1117 /* Generate a new randomized boot ID, so that each boot-up of
1118 * the container gets a new one */
1120 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1121 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1123 r
= sd_id128_randomize(&rnd
);
1125 return log_error_errno(r
, "Failed to generate random boot id: %m");
1127 id128_format_as_uuid(rnd
, as_uuid
);
1129 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1131 return log_error_errno(r
, "Failed to write boot id: %m");
/* A failed bind mount is recorded in r as an error. A failed read-only
 * remount is only logged as a warning and does not change r. */
1133 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1134 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1135 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1136 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
/* Recreates a fixed list of host device nodes (the NUL-separated devnodes
 * string, whose contents are not visible in this excerpt) below dest. Each
 * entry is stat()ed under /dev on the host, must be a character or block
 * device, and is recreated with mknod() using the host mode and device number.
 * When mknod() fails, a bind mount of the host node is tried instead. Each
 * node is then chowned through userns_lchown(). */
1142 static int copy_devnodes(const char *dest
) {
1144 static const char devnodes
[] =
1155 _cleanup_umask_ mode_t u
;
1161 /* Create /dev/net, so that we can create /dev/net/tun in it */
/* NOTE(review): the result of userns_mkdir() is only compared against zero
 * and never stored, yet the error path below logs r. Unless r is assigned
 * from this call on a line not visible here, the logged and returned code is
 * not the mkdir failure. Suggested shape: assign r from userns_mkdir(), then
 * test r. */
1162 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1163 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1165 NULSTR_FOREACH(d
, devnodes
) {
1166 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
/* from is the host path, to the same path below dest; both are released by
 * the _cleanup_free_ attribute on their declarations. */
1169 from
= strappend("/dev/", d
);
1170 to
= prefix_root(dest
, from
);
1172 if (stat(from
, &st
) < 0) {
/* Only errors other than ENOENT are reported here, so a device missing on
 * the host is not treated as a failure by this branch. */
1174 if (errno
!= ENOENT
)
1175 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1177 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1179 log_error("%s is not a char or block device, cannot copy.", from
);
1183 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1185 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1187 /* Some systems abusively restrict mknod but
1188 * allow bind mounts. */
1191 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1192 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1193 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1196 r
= userns_lchown(to
, 0, 0);
1198 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
/* Sets up the pseudo-terminal file system below dest: builds the devpts mount
 * option string (gid is arg_uid_shift + TTY_GID, plus an SELinux context when
 * arg_selinux_apifs_context is set), creates and mounts /dev/pts, creates the
 * /dev/ptmx symlink to pts/ptmx, and chowns all three paths through
 * userns_lchown(). */
1205 static int setup_pts(const char *dest
) {
1206 _cleanup_free_
char *options
= NULL
;
/* NOTE(review): both asprintf() results are discarded with a void cast.
 * Whether options is validated afterwards is not visible in this excerpt;
 * checking the asprintf() return value would be the more robust test. */
1210 if (arg_selinux_apifs_context
)
1211 (void) asprintf(&options
,
1212 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1213 arg_uid_shift
+ TTY_GID
,
1214 arg_selinux_apifs_context
);
1217 (void) asprintf(&options
,
1218 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1219 arg_uid_shift
+ TTY_GID
);
1224 /* Mount /dev/pts itself */
1225 p
= prefix_roota(dest
, "/dev/pts");
1226 if (mkdir(p
, 0755) < 0)
1227 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1228 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1229 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
/* NOTE(review): the three userns_lchown() failure paths in this function log
 * errno, whereas other callers in this file (setup_timezone, copy_devnodes)
 * log the value returned by userns_lchown(). If that helper reports failures
 * that do not originate from a system call, errno may be stale here; confirm
 * and align with the other callers. */
1230 if (userns_lchown(p
, 0, 0) < 0)
1231 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1233 /* Create /dev/ptmx symlink */
1234 p
= prefix_roota(dest
, "/dev/ptmx");
1235 if (symlink("pts/ptmx", p
) < 0)
1236 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1237 if (userns_lchown(p
, 0, 0) < 0)
1238 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1240 /* And fix /dev/pts/ptmx ownership */
1241 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1242 if (userns_lchown(p
, 0, 0) < 0)
1243 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
/* Exposes the terminal named by console as /dev/console below dest. The TTY
 * is first set to mode 0600 with arg_uid_shift passed as both owner and group
 * argument of chmod_and_chown(); it is then bind mounted onto a placeholder
 * at /dev/console whose creation call is not visible in this excerpt. */
1248 static int setup_dev_console(const char *dest
, const char *console
) {
1249 _cleanup_umask_ mode_t u
;
1258 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1260 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1262 /* We need to bind mount the right tty to /dev/console since
1263 * ptys can only exist on pts file systems. To have something
1264 * to bind mount things on we create an empty regular file. */
1266 to
= prefix_roota(dest
, "/dev/console");
1269 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1271 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1272 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
/* Provides a stand-in for /proc/kmsg inside the container: a FIFO is created
 * at /run/kmsg below dest, bind mounted over /proc/kmsg below dest, opened,
 * and its descriptor is sent over kmsg_socket as SCM_RIGHTS ancillary data.
 * The FIFO path is unlinked afterwards. kmsg_socket must be a valid
 * descriptor (asserted). */
1277 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1278 const char *from
, *to
;
1279 _cleanup_umask_ mode_t u
;
/* Control-message storage sized for exactly one int, i.e. one descriptor.
 * The enclosing declaration of control is only partly visible here. */
1282 struct cmsghdr cmsghdr
;
1283 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1285 struct msghdr mh
= {
1286 .msg_control
= &control
,
1287 .msg_controllen
= sizeof(control
),
1289 struct cmsghdr
*cmsg
;
1291 assert(kmsg_socket
>= 0);
1295 /* We create the kmsg FIFO as /run/kmsg, but immediately
1296 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1297 * on the reading side behave very similar to /proc/kmsg,
1298 * their writing side behaves differently from /dev/kmsg in
1299 * that writing blocks when nothing is reading. In order to
1300 * avoid any problems with containers deadlocking due to this
1301 * we simply make /dev/kmsg unavailable to the container. */
1302 from
= prefix_roota(dest
, "/run/kmsg");
1303 to
= prefix_roota(dest
, "/proc/kmsg");
1305 if (mkfifo(from
, 0600) < 0)
1306 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1307 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1308 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1310 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1312 return log_error_errno(errno
, "Failed to open fifo: %m");
/* Fill the single control message with the FIFO descriptor and shrink
 * msg_controllen to the length actually used. */
1314 cmsg
= CMSG_FIRSTHDR(&mh
);
1315 cmsg
->cmsg_level
= SOL_SOCKET
;
1316 cmsg
->cmsg_type
= SCM_RIGHTS
;
1317 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1318 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1320 mh
.msg_controllen
= cmsg
->cmsg_len
;
1322 /* Store away the fd in the socket, so that it stays open as
1323 * long as we run the child */
1324 k
= sendmsg(kmsg_socket
, &mh
, MSG_NOSIGNAL
);
1328 return log_error_errno(errno
, "Failed to send FIFO fd: %m");
1330 /* And now make the FIFO unavailable as /run/kmsg... */
/* The unlink result is deliberately ignored via the void cast. */
1331 (void) unlink(from
);
/* Netlink message callback. userdata is interpreted as a pointer to a union
 * in_addr_union (exposed) and handed, together with the global
 * arg_expose_ports list, to expose_port_execute(). The message m is not used
 * on the lines visible in this excerpt. */
1336 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1337 union in_addr_union
*exposed
= userdata
;
1343 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
/* Sets the hostname to arg_machine through sethostname_idempotent(). The
 * statements guarded by the two tests below are not visible in this excerpt;
 * presumably the function returns early when arg_share_system is set. */
1347 static int setup_hostname(void) {
1349 if (arg_share_system
)
1352 if (sethostname_idempotent(arg_machine
) < 0)
/* Connects the journal directory of the tree below directory with the one of
 * the host, as selected by arg_link_journal (LINK_NO, LINK_AUTO, LINK_GUEST,
 * LINK_HOST) and arg_link_journal_try.
 *
 * Visible steps:
 *   - read the machine ID from <directory>/etc/machine-id and validate it with
 *     sd_id128_from_string();
 *   - refuse to link when it equals the ID returned by sd_id128_get_machine();
 *   - create /var, /var/log and /var/log/journal below directory;
 *   - refuse paths that already are mount points (unless LINK_AUTO);
 *   - for LINK_GUEST create a symlink at p pointing to q, for LINK_HOST bind
 *     mount p onto q.
 *
 * Errors are logged; the visible return statements pass on the value of
 * log_error_errno(). */
1358 static int setup_journal(const char *directory
) {
1359 sd_id128_t machine_id
, this_id
;
1360 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1361 const char *etc_machine_id
, *p
, *q
;
1365 /* Don't link journals in ephemeral mode */
1369 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1371 r
= read_one_line_file(etc_machine_id
, &b
);
1372 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1375 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1378 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1381 /* Verify validity */
1382 r
= sd_id128_from_string(id
, &machine_id
);
1384 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1386 r
= sd_id128_get_machine(&this_id
);
1388 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1390 if (sd_id128_equal(machine_id
, this_id
)) {
1391 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1392 "Host and machine ids are equal (%s): refusing to link journals", id
);
1393 if (arg_link_journal
== LINK_AUTO
)
1398 if (arg_link_journal
== LINK_NO
)
1401 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1403 return log_error_errno(r
, "Failed to create /var: %m");
1405 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1407 return log_error_errno(r
, "Failed to create /var/log: %m");
1409 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1411 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
/* p is /var/log/journal/<id> without any prefix; q is the same path prefixed
 * with directory. */
1413 p
= strjoina("/var/log/journal/", id
);
1414 q
= prefix_roota(directory
, p
);
1416 if (path_is_mount_point(p
, 0) > 0) {
1417 if (arg_link_journal
!= LINK_AUTO
) {
1418 log_error("%s: already a mount point, refusing to use for journal", p
);
1425 if (path_is_mount_point(q
, 0) > 0) {
1426 if (arg_link_journal
!= LINK_AUTO
) {
1427 log_error("%s: already a mount point, refusing to use for journal", q
);
1434 r
= readlink_and_make_absolute(p
, &d
);
1436 if ((arg_link_journal
== LINK_GUEST
||
1437 arg_link_journal
== LINK_AUTO
) &&
1440 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
/* NOTE(review): userns_mkdir() reports its failure through the return value r
 * (the /var, /var/log and /var/log/journal calls above log r); the errno passed
 * to the log call below may be stale. Confirm and consider logging r. */
1442 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1447 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1448 } else if (r
== -EINVAL
) {
1450 if (arg_link_journal
== LINK_GUEST
&&
1453 if (errno
== ENOTDIR
) {
1454 log_error("%s already exists and is neither a symlink nor a directory", p
);
1457 log_error_errno(errno
, "Failed to remove %s: %m", p
);
1461 } else if (r
!= -ENOENT
) {
/* NOTE(review): this branch is selected by the value of r returned from
 * readlink_and_make_absolute(), yet the message below is formatted from errno. */
1462 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
1466 if (arg_link_journal
== LINK_GUEST
) {
1468 if (symlink(q
, p
) < 0) {
1469 if (arg_link_journal_try
) {
1470 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1473 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1478 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
/* NOTE(review): errno is logged although the failure code is in r. */
1480 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1484 if (arg_link_journal
== LINK_HOST
) {
1485 /* don't create parents here -- if the host doesn't have
1486 * permanent journal set up, don't force it here */
1489 if (arg_link_journal_try
) {
1490 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1493 log_error_errno(errno
, "Failed to create %s: %m", p
);
1498 } else if (access(p
, F_OK
) < 0)
1501 if (dir_is_empty(q
) == 0)
1502 log_warning("%s is not empty, proceeding anyway.", q
);
1504 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
/* NOTE(review): errno is logged although the failure code is in r. */
1506 log_error_errno(errno
, "Failed to create %s: %m", q
);
/* Bind mount the unprefixed journal directory p onto q below directory. */
1510 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1511 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1516 static int drop_capabilities(void) {
1517 return capability_bounding_set_drop(~arg_retain
, false);
/* Resets the audit login UID of the calling process by writing 4294967295 to
 * /proc/self/loginuid, unless the file already contains that value. A check of
 * arg_share_system precedes the work (its branch body is not visible here).
 * When the write fails, a long explanatory message about old kernels with
 * audit enabled is logged; per that message the code then sleeps for 5s.
 * NOTE(review): the message text reads "or to off auditing"; it probably
 * lacks the word "turn". It is runtime text and is left unchanged here. */
1520 static int reset_audit_loginuid(void) {
1521 _cleanup_free_
char *p
= NULL
;
1524 if (arg_share_system
)
1527 r
= read_one_line_file("/proc/self/loginuid", &p
);
1531 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1533 /* Already reset? */
1534 if (streq(p
, "4294967295"))
1537 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1540 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1541 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1542 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1543 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1544 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Builds and loads a seccomp filter whose default action is SCMP_ACT_ALLOW.
 *
 * Visible steps:
 *   - a static table pairs capabilities with system calls (iopl, ioperm,
 *     kexec_load, swapon, swapoff, open_by_handle_at, init_module,
 *     finit_module, delete_module, syslog);
 *   - for table entries a rule returning EPERM is added; the bit test against
 *     arg_retain presumably exempts system calls whose capability is retained
 *     (the statement following the test is not visible);
 *   - a rule is added that fails with EAFNOSUPPORT when argument 0 equals
 *     AF_NETLINK and argument 2 equals NETLINK_AUDIT;
 *   - SCMP_FLTATR_CTL_NNP is set to 0, the filter is loaded and then released.
 *
 * NOTE(review): the listing dropped the delimiters of the explanatory comment
 * about audit sockets further down; its text is kept exactly as found. */
1552 static int setup_seccomp(void) {
1555 static const struct {
1556 uint64_t capability
;
1559 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1560 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1561 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1562 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1563 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1564 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1565 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1566 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1567 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1568 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1571 scmp_filter_ctx seccomp
;
1575 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1579 r
= seccomp_add_secondary_archs(seccomp
);
1581 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
/* Walk the capability/system call table. */
1585 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1586 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1589 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1591 continue; /* unknown syscall */
1593 log_error_errno(r
, "Failed to block syscall: %m");
1600 Audit is broken in containers, much of the userspace audit
1601 hookup will fail if running inside a container. We don't
1602 care and just turn off creation of audit sockets.
1604 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1605 with EAFNOSUPPORT which audit userspace uses as indication
1606 that audit is disabled in the kernel.
1609 r
= seccomp_rule_add(
1611 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1614 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1615 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1617 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1621 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1623 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1627 r
= seccomp_load(seccomp
);
1629 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1634 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1639 seccomp_release(seccomp
);
1647 static int setup_propagate(const char *root
) {
1650 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1651 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1652 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1653 (void) mkdir_p(p
, 0600);
1655 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
1656 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
1658 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1659 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
1661 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1662 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
1664 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1665 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1666 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1668 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1669 return log_error_errno(errno
, "Failed to make propagation mount read-only");
/* Makes the image named by arg_image available as a block device and stores
 * the device path in *device_path.
 *
 * Visible steps:
 *   - open arg_image (read-only or read-write depending on arg_read_only) and
 *     fstat() it;
 *   - if it already is a block device, duplicate its path;
 *   - otherwise require a regular file, obtain a free loop device number from
 *     /dev/loop-control via LOOP_CTL_GET_FREE, open /dev/loop<nr>, attach the
 *     image with LOOP_SET_FD and apply LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
 *     (plus LO_FLAGS_READ_ONLY in one branch) with LOOP_SET_STATUS64.
 *
 * NOTE(review): how loop_nr is filled in is not visible in this listing. */
1674 static int setup_image(char **device_path
, int *loop_nr
) {
1675 struct loop_info64 info
= {
1676 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1678 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1679 _cleanup_free_
char* loopdev
= NULL
;
1683 assert(device_path
);
1687 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1689 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1691 if (fstat(fd
, &st
) < 0)
1692 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1694 if (S_ISBLK(st
.st_mode
)) {
1697 p
= strdup(arg_image
);
1711 if (!S_ISREG(st
.st_mode
)) {
/* NOTE(review): no call has failed at this point (fstat() succeeded above), so
 * errno and the %m in the message below do not describe this condition. */
1712 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
1716 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1718 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1720 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1722 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1724 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1727 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1729 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1731 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1732 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1735 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1737 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1738 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1740 *device_path
= loopdev
;
/* Explanatory text appended to the partition table error messages of
 * dissect_image(): it describes which image layouts are accepted. The macro
 * expands to adjacent string literals so that it can be concatenated with a
 * preceding format string. */
1751 #define PARTITION_TABLE_BLURB \
1752 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1753 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1754 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1755 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1756 "to be bootable with systemd-nspawn."
/* Inspects the partition table of the image open on fd and reports the device
 * nodes to use for the root, home and srv file systems, together with a flag
 * per device telling whether it may be used read-write.
 *
 * Visible steps:
 *   - probe the device with libblkid and require a GPT (gpt) or MBR (dos)
 *     partition table;
 *   - enumerate the partition devices known to udev and compare their number
 *     with the number of partitions blkid found, re-reading the partition
 *     table with BLKRRPART (retrying on EBUSY) when they differ;
 *   - on GPT, classify partitions by type UUID (GPT_HOME, GPT_SRV,
 *     GPT_ROOT_NATIVE, GPT_ROOT_SECONDARY, GPT_LINUX_GENERIC), skipping those
 *     flagged GPT_FLAG_NO_AUTO and honouring GPT_FLAG_READ_ONLY;
 *   - on MBR, consider partitions with flags 0x80 and type 0x83;
 *   - choose the root device in the order root, secondary_root, generic and
 *     fail when several generic candidates exist.
 *
 * The final log_error() belongs to the build variant without blkid support.
 * NOTE(review): many lines of this function are absent from the listing, so
 * the description above covers only what is visible. */
1758 static int dissect_image(
1760 char **root_device
, bool *root_device_rw
,
1761 char **home_device
, bool *home_device_rw
,
1762 char **srv_device
, bool *srv_device_rw
,
1766 int home_nr
= -1, srv_nr
= -1;
1767 #ifdef GPT_ROOT_NATIVE
1770 #ifdef GPT_ROOT_SECONDARY
1771 int secondary_root_nr
= -1;
1773 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1774 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1775 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1776 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1777 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1778 struct udev_list_entry
*first
, *item
;
1779 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1780 bool is_gpt
, is_mbr
, multiple_generic
= false;
1781 const char *pttype
= NULL
;
1788 assert(root_device
);
1789 assert(home_device
);
1794 b
= blkid_new_probe();
1799 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1804 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1808 blkid_probe_enable_partitions(b
, 1);
1809 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1812 r
= blkid_do_safeprobe(b
);
1813 if (r
== -2 || r
== 1) {
1814 log_error("Failed to identify any partition table on\n"
1816 PARTITION_TABLE_BLURB
, arg_image
);
1818 } else if (r
!= 0) {
1821 log_error_errno(errno
, "Failed to probe: %m");
1825 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1827 is_gpt
= streq_ptr(pttype
, "gpt");
1828 is_mbr
= streq_ptr(pttype
, "dos");
1830 if (!is_gpt
&& !is_mbr
) {
1831 log_error("No GPT or MBR partition table discovered on\n"
1833 PARTITION_TABLE_BLURB
, arg_image
);
1838 pl
= blkid_probe_get_partitions(b
);
1843 log_error("Failed to list partitions of %s", arg_image
);
1851 if (fstat(fd
, &st
) < 0)
1852 return log_error_errno(errno
, "Failed to stat block device: %m");
1854 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1862 log_error("Kernel partitions never appeared.");
1866 e
= udev_enumerate_new(udev
);
1870 r
= udev_enumerate_add_match_parent(e
, d
);
1874 r
= udev_enumerate_scan_devices(e
);
1876 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1878 /* Count the partitions enumerated by the kernel */
1880 first
= udev_enumerate_get_list_entry(e
);
1881 udev_list_entry_foreach(item
, first
)
1884 /* Count the partitions enumerated by blkid */
1885 m
= blkid_partlist_numof_partitions(pl
);
1889 log_error("blkid and kernel partition list do not match.");
1895 /* The kernel has probed fewer partitions than
1896 * blkid? Maybe the kernel prober is still
1897 * running or it got EBUSY because udev
1898 * already opened the device. Let's reprobe
1899 * the device, which is a synchronous call
1900 * that waits until probing is complete. */
1902 for (j
= 0; j
< 20; j
++) {
1904 r
= ioctl(fd
, BLKRRPART
, 0);
/* NOTE(review): as written the first clause is redundant, since every r >= 0
 * also satisfies r != -EBUSY; the test reduces to r != -EBUSY. */
1907 if (r
>= 0 || r
!= -EBUSY
)
1910 /* If something else has the device
1911 * open, such as an udev rule, the
1912 * ioctl will return EBUSY. Since
1913 * there's no way to wait until it
1914 * isn't busy anymore, let's just wait
1915 * a bit, and try again.
1917 * This is really something they
1918 * should fix in the kernel! */
1920 usleep(50 * USEC_PER_MSEC
);
1924 return log_error_errno(r
, "Failed to reread partition table: %m");
1927 e
= udev_enumerate_unref(e
);
/* Walk the enumerated partition devices and classify each of them. */
1930 first
= udev_enumerate_get_list_entry(e
);
1931 udev_list_entry_foreach(item
, first
) {
1932 _cleanup_udev_device_unref_
struct udev_device
*q
;
1934 unsigned long long flags
;
1940 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1945 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1949 qn
= udev_device_get_devnum(q
);
1953 if (st
.st_rdev
== qn
)
1956 node
= udev_device_get_devnode(q
);
1960 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1964 flags
= blkid_partition_get_flags(pp
);
1966 nr
= blkid_partition_get_partno(pp
);
1974 if (flags
& GPT_FLAG_NO_AUTO
)
1977 stype
= blkid_partition_get_type_string(pp
);
1981 if (sd_id128_from_string(stype
, &type_id
) < 0)
1984 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1986 if (home
&& nr
>= home_nr
)
1990 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1992 r
= free_and_strdup(&home
, node
);
1996 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
1998 if (srv
&& nr
>= srv_nr
)
2002 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2004 r
= free_and_strdup(&srv
, node
);
2008 #ifdef GPT_ROOT_NATIVE
2009 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
2011 if (root
&& nr
>= root_nr
)
2015 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2017 r
= free_and_strdup(&root
, node
);
2022 #ifdef GPT_ROOT_SECONDARY
2023 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
2025 if (secondary_root
&& nr
>= secondary_root_nr
)
2028 secondary_root_nr
= nr
;
2029 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2031 r
= free_and_strdup(&secondary_root
, node
);
2036 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2039 multiple_generic
= true;
2041 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2043 r
= free_and_strdup(&generic
, node
);
2049 } else if (is_mbr
) {
2052 if (flags
!= 0x80) /* Bootable flag */
2055 type
= blkid_partition_get_type(pp
);
2056 if (type
!= 0x83) /* Linux partition */
2060 multiple_generic
= true;
2064 r
= free_and_strdup(&root
, node
);
/* Hand the selected devices to the caller. */
2072 *root_device
= root
;
2075 *root_device_rw
= root_rw
;
2077 } else if (secondary_root
) {
2078 *root_device
= secondary_root
;
2079 secondary_root
= NULL
;
2081 *root_device_rw
= secondary_root_rw
;
2083 } else if (generic
) {
2085 /* There were no partitions with precise meanings
2086 * around, but we found generic partitions. In this
2087 * case, if there's only one, we can go ahead and boot
2088 * it, otherwise we bail out, because we really cannot
2089 * make any sense of it. */
2091 if (multiple_generic
) {
2092 log_error("Identified multiple bootable Linux partitions on\n"
2094 PARTITION_TABLE_BLURB
, arg_image
);
2098 *root_device
= generic
;
2101 *root_device_rw
= generic_rw
;
2104 log_error("Failed to identify root partition in disk image\n"
2106 PARTITION_TABLE_BLURB
, arg_image
);
2111 *home_device
= home
;
2114 *home_device_rw
= home_rw
;
2121 *srv_device_rw
= srv_rw
;
2126 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device what at the path formed by concatenating where and
 * directory.
 *
 * The file system type is determined with a libblkid superblock probe (TYPE).
 * Devices whose type is crypto_LUKS are rejected. The mount uses MS_NODEV and,
 * when rw is false, additionally MS_RDONLY.
 *
 * The final log_error() belongs to the build variant without blkid support. */
2131 static int mount_device(const char *what
, const char *where
, const char *directory
, bool rw
) {
2133 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
2134 const char *fstype
, *p
;
/* Mount point: where followed by directory. */
2144 p
= strjoina(where
, directory
);
2149 b
= blkid_new_probe_from_filename(what
);
2153 log_error_errno(errno
, "Failed to allocate prober for %s: %m", what
);
2157 blkid_probe_enable_superblocks(b
, 1);
2158 blkid_probe_set_superblocks_flags(b
, BLKID_SUBLKS_TYPE
);
2161 r
= blkid_do_safeprobe(b
);
2162 if (r
== -1 || r
== 1) {
2163 log_error("Cannot determine file system type of %s", what
);
2165 } else if (r
!= 0) {
2168 log_error_errno(errno
, "Failed to probe %s: %m", what
);
2173 if (blkid_probe_lookup_value(b
, "TYPE", &fstype
, NULL
) < 0) {
2176 log_error("Failed to determine file system type of %s", what
);
2180 if (streq(fstype
, "crypto_LUKS")) {
2181 log_error("nspawn currently does not support LUKS disk images.");
2185 if (mount(what
, p
, fstype
, MS_NODEV
|(rw
? 0 : MS_RDONLY
), NULL
) < 0)
2186 return log_error_errno(errno
, "Failed to mount %s: %m", what
);
2190 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the given root, home and srv devices below arg_directory by calling
 * mount_device() with the sub-paths NULL, /home and /srv respectively, passing
 * along the matching read-write flag. A failure is logged and returned.
 * NOTE(review): part of the parameter list and the conditions guarding each
 * call are not visible in this listing. */
2195 static int mount_devices(
2197 const char *root_device
, bool root_device_rw
,
2198 const char *home_device
, bool home_device_rw
,
2199 const char *srv_device
, bool srv_device_rw
) {
2205 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2207 return log_error_errno(r
, "Failed to mount root directory: %m");
2211 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2213 return log_error_errno(r
, "Failed to mount home directory: %m");
2217 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2219 return log_error_errno(r
, "Failed to mount server data directory: %m");
/* Tears down the loop device with number nr.
 *
 * If image_fd points to a valid descriptor, LOOP_CLR_FD is issued on it and the
 * descriptor is closed; the value returned by safe_close() is stored back into
 * *image_fd. Afterwards /dev/loop-control is opened and LOOP_CTL_REMOVE is
 * issued for nr. Failures are only logged (debug or warning level). */
2225 static void loop_remove(int nr
, int *image_fd
) {
2226 _cleanup_close_
int control
= -1;
2232 if (image_fd
&& *image_fd
>= 0) {
2233 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2235 log_debug_errno(errno
, "Failed to close loop image: %m");
2236 *image_fd
= safe_close(*image_fd
);
2239 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2241 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2245 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2247 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2252 * < 0 : wait_for_terminate() failed to get the state of the
2253 * container, the container was terminated by a signal, or
2254 * failed for an unknown reason. No change is made to the
2255 * container argument.
2256 * > 0 : The program executed in the container terminated with an
2257 * error. The exit code of the program executed in the
2258 * container is returned. The container argument has been set
2259 * to CONTAINER_TERMINATED.
2260 * 0 : The container is being rebooted, has been shut down or exited
2261 * successfully. The container argument has been set to either
2262 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2264 * That is, success is indicated by a return value of zero, and an
2265 * error is indicated by a non-zero value.
/* Waits for the process pid with wait_for_terminate() and translates the
 * resulting siginfo into a log message, a value for *container and a return
 * code. Visible handling: an exit status is returned as is with *container set
 * to CONTAINER_TERMINATED; termination by SIGINT is logged as a shutdown
 * (CONTAINER_TERMINATED) and by SIGHUP as a reboot (CONTAINER_REBOOTED); other
 * signals and unknown codes are logged as errors.
 * NOTE(review): the case labels and several return statements are absent from
 * this listing. */
2267 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2271 r
= wait_for_terminate(pid
, &status
);
2273 return log_warning_errno(r
, "Failed to wait for container: %m");
2275 switch (status
.si_code
) {
2278 if (status
.si_status
== 0) {
2279 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2282 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2284 *container
= CONTAINER_TERMINATED
;
2285 return status
.si_status
;
2288 if (status
.si_status
== SIGINT
) {
2290 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2291 *container
= CONTAINER_TERMINATED
;
2294 } else if (status
.si_status
== SIGHUP
) {
2296 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2297 *container
= CONTAINER_REBOOTED
;
2301 /* CLD_KILLED fallthrough */
2304 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2308 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Signal handler that intentionally performs no work at all. */
static void nop_handler(int sig) {
        (void) sig;
}
/* sd-event signal callback. userdata carries a PID (decoded with
 * PTR_TO_UINT32). The PID is sent arg_kill_signal; when that succeeds a hint is
 * logged and the event source userdata is reset to NULL. The visible code also
 * requests event loop exit with code 0 through sd_event_exit().
 * NOTE(review): the conditions and return statements that separate these two
 * paths are not visible in this listing. */
2317 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2320 pid
= PTR_TO_UINT32(userdata
);
2322 if (kill(pid
, arg_kill_signal
) >= 0) {
2323 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2324 sd_event_source_set_userdata(s
, NULL
);
2329 sd_event_exit(sd_event_source_get_event(s
), 0);
/* Fills in whichever of arg_directory, arg_image and arg_machine have not been
 * set, deriving them from each other.
 *
 * Visible rules:
 *   - with a template and a machine name but no directory, the directory
 *     becomes /var/lib/machines/<arg_machine>;
 *   - with neither image nor directory, the image registered for arg_machine
 *     is looked up with image_find(); a raw image sets arg_image, anything
 *     else sets arg_directory, and a read-only image forces arg_read_only;
 *     another visible branch uses the current working directory;
 *   - the machine name is taken from the host name when the directory is /,
 *     otherwise from the basename of the image or directory; it is cleaned
 *     with hostname_cleanup() and must pass machine_name_is_valid();
 *   - in ephemeral mode a random 64 bit hex suffix is appended. */
2333 static int determine_names(void) {
2336 if (arg_template
&& !arg_directory
&& arg_machine
) {
2338 /* If --template= was specified then we should not
2339 * search for a machine, but instead create a new one
2340 * in /var/lib/machines. */
2342 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2347 if (!arg_image
&& !arg_directory
) {
2349 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2351 r
= image_find(arg_machine
, &i
);
2353 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
/* NOTE(review): the message below contains %m although log_error() is given no
 * error code; the text printed for %m then depends on whatever errno holds. */
2355 log_error("No image for machine '%s': %m", arg_machine
);
2359 if (i
->type
== IMAGE_RAW
)
2360 r
= set_sanitized_path(&arg_image
, i
->path
);
2362 r
= set_sanitized_path(&arg_directory
, i
->path
);
2364 return log_error_errno(r
, "Invalid image directory: %m");
2367 arg_read_only
= arg_read_only
|| i
->read_only
;
2369 arg_directory
= get_current_dir_name();
2371 if (!arg_directory
&& !arg_machine
) {
2372 log_error("Failed to determine path, please use -D or -i.");
2378 if (arg_directory
&& path_equal(arg_directory
, "/"))
2379 arg_machine
= gethostname_malloc();
2381 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2386 hostname_cleanup(arg_machine
);
2387 if (!machine_name_is_valid(arg_machine
)) {
2388 log_error("Failed to determine machine name automatically, please use -M.");
2392 if (arg_ephemeral
) {
2395 /* Add a random suffix when this is an
2396 * ephemeral machine, so that we can run many
2397 * instances at once without manually having
2398 * to specify -M each time. */
2400 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
/* Determines arg_uid_shift and arg_uid_range for user namespacing.
 *
 * When arg_uid_shift is still UID_INVALID it is derived from the owner of
 * directory: the upper 16 bits of st_uid, which must match the upper 16 bits
 * of st_gid; arg_uid_range is then set to 0x10000. The shift is rejected when
 * shift plus range would exceed the largest uid_t value. The result is logged
 * at info level. */
2411 static int determine_uid_shift(const char *directory
) {
2419 if (arg_uid_shift
== UID_INVALID
) {
2422 r
= stat(directory
, &st
);
2424 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
/* Keep only the upper 16 bits of the owning UID as the base. */
2426 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2428 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2429 log_error("UID and GID base of %s don't match.", directory
);
2433 arg_uid_range
= UINT32_C(0x10000);
2436 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2437 log_error("UID base too high for UID range.");
2441 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
/* Body of the innermost child process, ending in an exec of the payload.
 *
 * Visible sequence:
 *   - synchronise with the parent over barrier (steps #1 to #4 as numbered in
 *     the existing comments);
 *   - mount_all(), mount_systemd_cgroup_writable(), reset_uid_gid(),
 *     setup_boot_id(), setup_kmsg();
 *   - setsid (per the error message), optional expose_port_send_rtnl(),
 *     drop_capabilities(), personality(), setexeccon(), change_uid_gid();
 *   - build the environment (PATH, container, TERM, HOME, USER, LOGNAME,
 *     container_uuid, LISTEN_FDS, LISTEN_PID) and merge it with arg_setenv;
 *   - exec one of /usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init,
 *     or arg_parameters, or fall back to /bin/bash and /bin/sh.
 *
 * Reaching the final return means every exec attempt failed.
 * NOTE(review): several parameters and many statements are absent from this
 * listing. */
2445 static int inner_child(
2447 const char *directory
,
2453 _cleanup_free_
char *home
= NULL
;
2455 const char *envp
[] = {
2456 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2457 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2462 NULL
, /* container_uuid */
2463 NULL
, /* LISTEN_FDS */
2464 NULL
, /* LISTEN_PID */
2468 _cleanup_strv_free_
char **env_use
= NULL
;
2473 assert(kmsg_socket
>= 0);
2478 /* Tell the parent, that it now can write the UID map. */
2479 (void) barrier_place(barrier
); /* #1 */
2481 /* Wait until the parent wrote the UID map */
2482 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2483 log_error("Parent died too early");
2488 r
= mount_all(NULL
, true, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2492 /* Wait until we are cgroup-ified, so that we
2493 * can mount the right cgroup path writable */
2494 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2495 log_error("Parent died too early");
2499 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2503 r
= reset_uid_gid();
2505 return log_error_errno(r
, "Couldn't become new root: %m");
2507 r
= setup_boot_id(NULL
);
2511 r
= setup_kmsg(NULL
, kmsg_socket
);
2514 kmsg_socket
= safe_close(kmsg_socket
);
2519 return log_error_errno(errno
, "setsid() failed: %m");
2521 if (arg_private_network
)
2524 if (arg_expose_ports
) {
2525 r
= expose_port_send_rtnl(rtnl_socket
);
2528 rtnl_socket
= safe_close(rtnl_socket
);
/* NOTE(review): drop_capabilities() passes on the return value of
 * capability_bounding_set_drop(); the message below is built from errno
 * instead of that value. Confirm which of the two carries the error. */
2531 if (drop_capabilities() < 0)
2532 return log_error_errno(errno
, "drop_capabilities() failed: %m");
2536 if (arg_personality
!= PERSONALITY_INVALID
) {
2537 if (personality(arg_personality
) < 0)
2538 return log_error_errno(errno
, "personality() failed: %m");
2539 } else if (secondary
) {
2540 if (personality(PER_LINUX32
) < 0)
2541 return log_error_errno(errno
, "personality() failed: %m");
2545 if (arg_selinux_context
)
2546 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2547 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2550 r
= change_uid_gid(arg_user
, &home
);
/* Fill the NULL slots reserved in envp; n_env indexes the next free slot. */
2554 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2558 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2559 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2560 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2563 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2566 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2570 if (fdset_size(fds
) > 0) {
2571 r
= fdset_cloexec(fds
, false);
2573 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2575 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2576 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2580 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2584 /* Let the parent know that we are ready and
2585 * wait until the parent is ready with the
2587 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2588 log_error("Parent died too early");
2592 /* Now, explicitly close the log, so that we
2593 * then can close all remaining fds. Closing
2594 * the log explicitly first has the benefit
2595 * that the logging subsystem knows about it,
2596 * and is thus ready to be reopened should we
2597 * need it again. Note that the other fds
2598 * closed here are at least the locking and
2601 (void) fdset_close_others(fds
);
2607 /* Automatically search for the init system */
2609 m
= 1 + strv_length(arg_parameters
);
2610 a
= newa(char*, m
+ 1);
2611 if (strv_isempty(arg_parameters
))
2614 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
2616 a
[0] = (char*) "/usr/lib/systemd/systemd";
2617 execve(a
[0], a
, env_use
);
2619 a
[0] = (char*) "/lib/systemd/systemd";
2620 execve(a
[0], a
, env_use
);
2622 a
[0] = (char*) "/sbin/init";
2623 execve(a
[0], a
, env_use
);
2624 } else if (!strv_isempty(arg_parameters
))
2625 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2627 chdir(home
?: "/root");
2628 execle("/bin/bash", "-bash", NULL
, env_use
);
2629 execle("/bin/sh", "-sh", NULL
, env_use
);
2633 return log_error_errno(errno
, "execv() failed: %m");
/* Body of the outer child process: prepares the container tree below directory
 * and then clones the inner child.
 *
 * Visible sequence:
 *   - request SIGKILL on parent death (PR_SET_PDEATHSIG);
 *   - replace stdin, stdout and stderr with the terminal named by console;
 *   - reset_audit_loginuid(), mark all mounts MS_SLAVE|MS_REC;
 *   - mount_devices(), determine_uid_shift(), send arg_uid_shift over
 *     uid_shift_socket;
 *   - turn directory into a bind mount, then setup_volatile(),
 *     setup_volatile_state(), base_filesystem_create(), optional read-only
 *     remount, mount_all(), copy_devnodes(), dev_setup(), setup_pts(),
 *     setup_propagate(), setup_dev_console(), setup_seccomp(),
 *     setup_timezone(), setup_resolv_conf(), setup_journal(), mount_custom(),
 *     mount_cgroups(), mount_move_root();
 *   - raw_clone() with the namespace flags selected by arg_share_system,
 *     arg_private_network and arg_userns; the clone runs inner_child() and
 *     exits, while this process sends the new PID over pid_socket.
 *
 * NOTE(review): several parameters and most error checks are absent from this
 * listing. */
2636 static int outer_child(
2638 const char *directory
,
2639 const char *console
,
2640 const char *root_device
, bool root_device_rw
,
2641 const char *home_device
, bool home_device_rw
,
2642 const char *srv_device
, bool srv_device_rw
,
2648 int uid_shift_socket
,
2658 assert(pid_socket
>= 0);
2659 assert(kmsg_socket
>= 0);
2663 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2664 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
2667 close_nointr(STDIN_FILENO
);
2668 close_nointr(STDOUT_FILENO
);
2669 close_nointr(STDERR_FILENO
);
/* All three standard descriptors were just closed, so the terminal is expected
 * to be opened as STDIN_FILENO. */
2671 r
= open_terminal(console
, O_RDWR
);
2672 if (r
!= STDIN_FILENO
) {
2678 return log_error_errno(r
, "Failed to open console: %m");
2681 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2682 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2683 return log_error_errno(errno
, "Failed to duplicate console: %m");
2686 r
= reset_audit_loginuid();
2690 /* Mark everything as slave, so that we still
2691 * receive mounts from the real root, but don't
2692 * propagate mounts to the real root. */
2693 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2694 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2696 r
= mount_devices(directory
,
2697 root_device
, root_device_rw
,
2698 home_device
, home_device_rw
,
2699 srv_device
, srv_device_rw
);
2703 r
= determine_uid_shift(directory
);
2708 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2710 return log_error_errno(errno
, "Failed to send UID shift: %m");
2711 if (l
!= sizeof(arg_uid_shift
)) {
2712 log_error("Short write while sending UID shift.");
2717 /* Turn directory into bind mount */
2718 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2719 return log_error_errno(errno
, "Failed to make bind mount: %m");
2721 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2725 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2729 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2733 if (arg_read_only
) {
2734 r
= bind_remount_recursive(directory
, true);
2736 return log_error_errno(r
, "Failed to make tree read-only: %m");
2739 r
= mount_all(directory
, false, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2743 if (copy_devnodes(directory
) < 0)
2746 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2748 if (setup_pts(directory
) < 0)
2751 r
= setup_propagate(directory
);
2755 r
= setup_dev_console(directory
, console
);
2759 r
= setup_seccomp();
2763 r
= setup_timezone(directory
);
2767 r
= setup_resolv_conf(directory
);
2771 r
= setup_journal(directory
);
2775 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2779 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2783 r
= mount_move_root(directory
);
2785 return log_error_errno(r
, "Failed to move root directory: %m");
/* Create the inner child with the namespaces requested on the command line. */
2787 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2788 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2789 (arg_private_network
? CLONE_NEWNET
: 0) |
2790 (arg_userns
? CLONE_NEWUSER
: 0),
2793 return log_error_errno(errno
, "Failed to fork inner child: %m");
2795 pid_socket
= safe_close(pid_socket
);
2796 uid_shift_socket
= safe_close(uid_shift_socket
);
2798 /* The inner child has all namespaces that are
2799 * requested, so that we all are owned by the user if
2800 * user namespaces are turned on. */
2802 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2804 _exit(EXIT_FAILURE
);
2806 _exit(EXIT_SUCCESS
);
/* Report the PID of the inner child over pid_socket. */
2809 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2811 return log_error_errno(errno
, "Failed to send PID: %m");
2812 if (l
!= sizeof(pid
)) {
2813 log_error("Short write while sending PID.");
2817 pid_socket
= safe_close(pid_socket
);
/* Writes the user and group ID mappings of process pid.
 *
 * One line of the form 0 <arg_uid_shift> <arg_uid_range> is formatted once and
 * written first to /proc/<pid>/uid_map and then to /proc/<pid>/gid_map, so both
 * mappings are identical. Failures of either write are logged and returned. */
2822 static int setup_uid_map(pid_t pid
) {
/* Buffers are sized from the fixed path text plus the decimal width of the
 * numbers inserted into them. */
2823 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2828 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2829 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2830 r
= write_string_file(uid_map
, line
, 0);
2832 return log_error_errno(r
, "Failed to write UID map: %m");
2834 /* We always assign the same UID and GID ranges */
2835 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2836 r
= write_string_file(uid_map
, line
, 0);
2838 return log_error_errno(r
, "Failed to write GID map: %m");
2843 static int load_settings(void) {
2844 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2845 _cleanup_fclose_
FILE *f
= NULL
;
2846 _cleanup_free_
char *p
= NULL
;
2850 /* If all settings are masked, there's no point in looking for
2851 * the settings file */
2852 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
2855 fn
= strjoina(arg_machine
, ".nspawn");
2857 /* We first look in the admin's directories in /etc and /run */
2858 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2859 _cleanup_free_
char *j
= NULL
;
2861 j
= strjoin(i
, "/", fn
, NULL
);
2870 /* By default we trust configuration from /etc and /run */
2871 if (arg_settings_trusted
< 0)
2872 arg_settings_trusted
= true;
2877 if (errno
!= ENOENT
)
2878 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2882 /* After that, let's look for a file next to the
2883 * actual image we shall boot. */
2886 p
= file_in_same_dir(arg_image
, fn
);
2889 } else if (arg_directory
) {
2890 p
= file_in_same_dir(arg_directory
, fn
);
2897 if (!f
&& errno
!= ENOENT
)
2898 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2900 /* By default we do not trust configuration from /var/lib/machines */
2901 if (arg_settings_trusted
< 0)
2902 arg_settings_trusted
= false;
2909 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
2911 r
= settings_load(f
, p
, &settings
);
2915 /* Copy over bits from the settings, unless they have been
2916 * explicitly masked by command line switches. */
2918 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
2919 settings
->boot
>= 0) {
2920 arg_boot
= settings
->boot
;
2922 strv_free(arg_parameters
);
2923 arg_parameters
= settings
->parameters
;
2924 settings
->parameters
= NULL
;
2927 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2928 settings
->environment
) {
2929 strv_free(arg_setenv
);
2930 arg_setenv
= settings
->environment
;
2931 settings
->environment
= NULL
;
2934 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2937 arg_user
= settings
->user
;
2938 settings
->user
= NULL
;
2941 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
2943 if (!arg_settings_trusted
&& settings
->capability
!= 0)
2944 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
2946 arg_retain
|= settings
->capability
;
2948 arg_retain
&= ~settings
->drop_capability
;
2951 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2952 settings
->kill_signal
> 0)
2953 arg_kill_signal
= settings
->kill_signal
;
2955 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2956 settings
->personality
!= PERSONALITY_INVALID
)
2957 arg_personality
= settings
->personality
;
2959 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2960 !sd_id128_is_null(settings
->machine_id
)) {
2962 if (!arg_settings_trusted
)
2963 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
2965 arg_uuid
= settings
->machine_id
;
2968 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
2969 settings
->read_only
>= 0)
2970 arg_read_only
= settings
->read_only
;
2972 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
2973 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
2974 arg_volatile_mode
= settings
->volatile_mode
;
2976 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
2977 settings
->n_custom_mounts
> 0) {
2979 if (!arg_settings_trusted
)
2980 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
2982 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
2983 arg_custom_mounts
= settings
->custom_mounts
;
2984 arg_n_custom_mounts
= settings
->n_custom_mounts
;
2986 settings
->custom_mounts
= NULL
;
2987 settings
->n_custom_mounts
= 0;
2991 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
2992 (settings
->private_network
>= 0 ||
2993 settings
->network_veth
>= 0 ||
2994 settings
->network_bridge
||
2995 settings
->network_interfaces
||
2996 settings
->network_macvlan
||
2997 settings
->network_ipvlan
)) {
2999 if (!arg_settings_trusted
)
3000 log_warning("Ignoring network settings, file %s is not trusted.", p
);
3002 strv_free(arg_network_interfaces
);
3003 arg_network_interfaces
= settings
->network_interfaces
;
3004 settings
->network_interfaces
= NULL
;
3006 strv_free(arg_network_macvlan
);
3007 arg_network_macvlan
= settings
->network_macvlan
;
3008 settings
->network_macvlan
= NULL
;
3010 strv_free(arg_network_ipvlan
);
3011 arg_network_ipvlan
= settings
->network_ipvlan
;
3012 settings
->network_ipvlan
= NULL
;
3014 free(arg_network_bridge
);
3015 arg_network_bridge
= settings
->network_bridge
;
3016 settings
->network_bridge
= NULL
;
3018 arg_network_veth
= settings
->network_veth
> 0 || settings
->network_bridge
;
3020 arg_private_network
= true; /* all these settings imply private networking */
3024 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3025 settings
->expose_ports
) {
3027 if (!arg_settings_trusted
)
3028 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3030 expose_port_free_all(arg_expose_ports
);
3031 arg_expose_ports
= settings
->expose_ports
;
3032 settings
->expose_ports
= NULL
;
/* Program entry point.
 *
 * As far as this listing shows, the function parses the command line, requires an effective
 * UID of 0, determines names, loads the settings file and verifies the arguments. It then
 * prepares the tree to boot: for a directory either an ephemeral snapshot or a copy of a
 * template, for a disk image a temporary mount directory plus image setup and dissection.
 * Afterwards it acquires a pseudo tty, blocks signals, becomes a subreaper, creates the
 * barrier and the socket pairs, and clones the outer child. In the parent it receives the
 * inner child PID and the UID shift, writes the UID map, configures networking, registers
 * the machine, adjusts cgroups, and forwards the pty through an event loop until the
 * container ends. The tail releases the loop device, an ephemeral subvolume, the propagate
 * directory, exposed ports and the arg_* allocations.
 *
 * The result is EXIT_FAILURE when r is negative, otherwise ret. */
3039 int main(int argc
, char *argv
[]) {
3041 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3042 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3043 _cleanup_close_
int master
= -1, image_fd
= -1;
3044 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3045 int r
, n_fd_passed
, loop_nr
= -1;
3046 char veth_name
[IFNAMSIZ
];
3047 bool secondary
= false, remove_subvol
= false;
3050 int ret
= EXIT_SUCCESS
;
3051 union in_addr_union exposed
= {};
3052 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3055 log_parse_environment();
3058 r
= parse_argv(argc
, argv
);
3062 if (geteuid() != 0) {
3063 log_error("Need to be root.");
3067 r
= determine_names();
3071 r
= load_settings();
3075 r
= verify_arguments();
3079 n_fd_passed
= sd_listen_fds(false);
3080 if (n_fd_passed
> 0) {
3081 r
= fdset_new_listen_fds(&fds
, false);
3083 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory based container: refuse the host root unless ephemeral, then snapshot or populate the tree as requested. */
3088 if (arg_directory
) {
3091 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3092 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3097 if (arg_ephemeral
) {
3098 _cleanup_free_
char *np
= NULL
;
3100 /* If the specified path is a mount point we
3101 * generate the new snapshot immediately
3102 * inside it under a random name. However if
3103 * the specified is not a mount point we
3104 * create the new snapshot in the parent
3105 * directory, just next to it. */
3106 r
= path_is_mount_point(arg_directory
, 0);
3108 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3112 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3114 r
= tempfn_random(arg_directory
, "machine.", &np
);
3116 log_error_errno(r
, "Failed to generate name for snapshot: %m");
3120 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3122 log_error_errno(r
, "Failed to lock %s: %m", np
);
3126 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
3128 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3132 free(arg_directory
);
3136 remove_subvol
= true;
3139 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3141 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3145 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
3150 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
3153 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3155 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3159 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
3165 if (path_is_os_tree(arg_directory
) <= 0) {
3166 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3173 p
= strjoina(arg_directory
,
3174 argc
> optind
&& path_is_absolute(argv
[optind
]) ? argv
[optind
] : "/usr/bin/");
3175 if (access(p
, F_OK
) < 0) {
3176 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory
);
/* Disk image based container: lock the image, create a temporary directory as arg_directory, then set up and dissect the image. */
3183 char template[] = "/tmp/nspawn-root-XXXXXX";
3186 assert(!arg_template
);
3188 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3190 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3194 r
= log_error_errno(r
, "Failed to create image lock: %m");
3198 if (!mkdtemp(template)) {
3199 log_error_errno(errno
, "Failed to create temporary directory: %m");
3204 arg_directory
= strdup(template);
3205 if (!arg_directory
) {
3210 image_fd
= setup_image(&device_path
, &loop_nr
);
3216 r
= dissect_image(image_fd
,
3217 &root_device
, &root_device_rw
,
3218 &home_device
, &home_device_rw
,
3219 &srv_device
, &srv_device_rw
,
3225 r
= custom_mounts_prepare();
3230 isatty(STDIN_FILENO
) > 0 &&
3231 isatty(STDOUT_FILENO
) > 0;
3233 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3235 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3239 r
= ptsname_malloc(master
, &console
);
3241 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3245 if (unlockpt(master
) < 0) {
3246 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3251 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3252 arg_machine
, arg_image
?: arg_directory
);
3254 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3256 assert_se(sigemptyset(&mask_chld
) == 0);
3257 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3259 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3260 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* Per-run state: one socket pair each for kmsg, rtnl, the inner child PID and the UID shift, plus the barrier shared with the child. */
3265 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
3266 uid_shift_socket_pair
[2] = { -1, -1 };
3267 ContainerStatus container_status
;
3268 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3269 static const struct sigaction sa
= {
3270 .sa_handler
= nop_handler
,
3271 .sa_flags
= SA_NOCLDSTOP
,
3275 _cleanup_event_unref_ sd_event
*event
= NULL
;
3276 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3277 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3280 r
= barrier_create(&barrier
);
3282 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3286 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3287 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3291 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3292 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3296 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3297 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3302 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3303 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3307 /* Child can be killed before execv(), so handle SIGCHLD
3308 * in order to interrupt parent's blocking calls and
3309 * give it a chance to call wait() and terminate. */
3310 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3312 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3316 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3318 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
3322 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3324 if (errno
== EINVAL
)
3325 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3327 r
= log_error_errno(errno
, "clone() failed: %m");
3333 /* The outer child only has a file system namespace. */
3334 barrier_set_role(&barrier
, BARRIER_CHILD
);
3336 master
= safe_close(master
);
3338 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3339 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3340 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3341 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3343 (void) reset_all_signal_handlers();
3344 (void) reset_signal_mask();
3346 r
= outer_child(&barrier
,
3349 root_device
, root_device_rw
,
3350 home_device
, home_device_rw
,
3351 srv_device
, srv_device_rw
,
3355 kmsg_socket_pair
[1],
3356 rtnl_socket_pair
[1],
3357 uid_shift_socket_pair
[1],
3360 _exit(EXIT_FAILURE
);
3362 _exit(EXIT_SUCCESS
);
3365 barrier_set_role(&barrier
, BARRIER_PARENT
);
3370 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3371 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3372 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3374 /* Wait for the outer child. */
3375 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3384 /* And now retrieve the PID of the inner child. */
3385 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3387 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
/* NOTE(review): a short read is not an errno failure, so the trailing %m in the message below looks unintended; confirm and consider dropping it. */
3390 if (l
!= sizeof(pid
)) {
3391 log_error("Short read while reading inner child PID: %m");
3396 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3399 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3400 log_error("Child died too early.");
3405 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3407 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
/* NOTE(review): same pattern as for the PID above, the %m on the short read path has no errno to report; confirm and consider dropping it. */
3410 if (l
!= sizeof(arg_uid_shift
)) {
3411 log_error("Short read while reading UID shift: %m");
3416 r
= setup_uid_map(pid
);
3420 (void) barrier_place(&barrier
); /* #2 */
3423 if (arg_private_network
) {
3425 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3429 if (arg_network_veth
) {
3430 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3436 if (arg_network_bridge
) {
3437 r
= setup_bridge(veth_name
, arg_network_bridge
);
3445 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3449 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
3455 r
= register_machine(
3462 arg_custom_mounts
, arg_n_custom_mounts
,
3470 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3474 if (arg_keep_unit
) {
3475 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3480 r
= chown_cgroup(pid
, arg_uid_shift
);
3484 /* Notify the child that the parent is ready with all
3485 * its setup (including cgroup-ification), and that
3486 * the child can now hand over control to the code to
3487 * run inside the container. */
3488 (void) barrier_place(&barrier
); /* #3 */
3490 /* Block SIGCHLD here, before notifying child.
3491 * process_pty() will handle it with the other signals. */
3492 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3494 /* Reset signal to default */
3495 r
= default_signals(SIGCHLD
, -1);
3497 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3501 /* Let the child know that we are ready and wait that the child is completely ready now. */
3502 if (!barrier_place_and_sync(&barrier
)) { /* #5 */
3503 log_error("Client died too early.");
3510 "STATUS=Container running.\n"
3511 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
3513 r
= sd_event_new(&event
);
3515 log_error_errno(r
, "Failed to get default event source: %m");
3519 if (arg_kill_signal
> 0) {
3520 /* Try to kill the init system on SIGINT or SIGTERM */
3521 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3522 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3524 /* Immediately exit */
3525 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3526 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3529 /* simply exit on sigchld */
3530 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3532 if (arg_expose_ports
) {
3533 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3537 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3540 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3542 r
= pty_forward_new(event
, master
, true, !interactive
, &forward
);
3544 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3548 r
= sd_event_loop(event
);
3550 log_error_errno(r
, "Failed to run event loop: %m");
3554 pty_forward_get_last_char(forward
, &last_char
);
3556 forward
= pty_forward_free(forward
);
3558 if (!arg_quiet
&& last_char
!= '\n')
3561 /* Kill if it is not dead yet anyway */
3562 if (arg_register
&& !arg_keep_unit
)
3563 terminate_machine(pid
);
3565 /* Normally redundant, but better safe than sorry */
3568 r
= wait_for_container(pid
, &container_status
);
3572 /* We failed to wait for the container, or the
3573 * container exited abnormally */
3575 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3576 /* The container exited with a non-zero
3577 * status, or with zero status and no reboot
3583 /* CONTAINER_REBOOTED, loop again */
3585 if (arg_keep_unit
) {
3586 /* Special handling if we are running as a
3587 * service: instead of simply restarting the
3588 * machine we want to restart the entire
3589 * service, so let's inform systemd about this
3590 * with the special exit code 133. The service
3591 * file uses RestartForceExitStatus=133 so
3592 * that this results in a full nspawn
3593 * restart. This is necessary since we might
3594 * have cgroup parameters set we want to have
3601 expose_port_flush(arg_expose_ports
, &exposed
);
3607 "STATUS=Terminating...");
3612 /* Try to flush whatever is still queued in the pty */
3614 (void) copy_bytes(master
, STDOUT_FILENO
, (off_t
) -1, false);
3616 loop_remove(loop_nr
, &image_fd
);
3618 if (remove_subvol
&& arg_directory
) {
3621 k
= btrfs_subvol_remove(arg_directory
, true);
3623 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
3629 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3630 (void) rm_rf(p
, REMOVE_ROOT
);
3633 expose_port_flush(arg_expose_ports
, &exposed
);
3635 free(arg_directory
);
3640 strv_free(arg_setenv
);
3641 free(arg_network_bridge
);
3642 strv_free(arg_network_interfaces
);
3643 strv_free(arg_network_macvlan
);
3644 strv_free(arg_network_ipvlan
);
3645 strv_free(arg_parameters
);
3646 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3647 expose_port_free_all(arg_expose_ports
);
3649 return r
< 0 ? EXIT_FAILURE
: ret
;