1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <blkid/blkid.h>
27 #include <linux/loop.h>
33 #include <selinux/selinux.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
46 #include "sd-daemon.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
55 #include "capability.h"
56 #include "cgroup-util.h"
58 #include "dev-setup.h"
60 #include "event-util.h"
63 #include "formats-util.h"
65 #include "hostname-util.h"
67 #include "loopback-setup.h"
68 #include "machine-image.h"
72 #include "netlink-util.h"
73 #include "path-util.h"
74 #include "process-util.h"
76 #include "random-util.h"
79 #include "seccomp-util.h"
81 #include "signal-util.h"
83 #include "terminal-util.h"
84 #include "udev-util.h"
87 #include "nspawn-settings.h"
88 #include "nspawn-mount.h"
89 #include "nspawn-network.h"
90 #include "nspawn-expose-ports.h"
91 #include "nspawn-cgroup.h"
92 #include "nspawn-register.h"
93 #include "nspawn-setuid.h"
95 typedef enum ContainerStatus
{
100 typedef enum LinkJournal
{
107 static char *arg_directory
= NULL
;
108 static char *arg_template
= NULL
;
109 static char *arg_user
= NULL
;
110 static sd_id128_t arg_uuid
= {};
111 static char *arg_machine
= NULL
;
112 static const char *arg_selinux_context
= NULL
;
113 static const char *arg_selinux_apifs_context
= NULL
;
114 static const char *arg_slice
= NULL
;
115 static bool arg_private_network
= false;
116 static bool arg_read_only
= false;
117 static bool arg_boot
= false;
118 static bool arg_ephemeral
= false;
119 static LinkJournal arg_link_journal
= LINK_AUTO
;
120 static bool arg_link_journal_try
= false;
121 static uint64_t arg_retain
=
122 (1ULL << CAP_CHOWN
) |
123 (1ULL << CAP_DAC_OVERRIDE
) |
124 (1ULL << CAP_DAC_READ_SEARCH
) |
125 (1ULL << CAP_FOWNER
) |
126 (1ULL << CAP_FSETID
) |
127 (1ULL << CAP_IPC_OWNER
) |
129 (1ULL << CAP_LEASE
) |
130 (1ULL << CAP_LINUX_IMMUTABLE
) |
131 (1ULL << CAP_NET_BIND_SERVICE
) |
132 (1ULL << CAP_NET_BROADCAST
) |
133 (1ULL << CAP_NET_RAW
) |
134 (1ULL << CAP_SETGID
) |
135 (1ULL << CAP_SETFCAP
) |
136 (1ULL << CAP_SETPCAP
) |
137 (1ULL << CAP_SETUID
) |
138 (1ULL << CAP_SYS_ADMIN
) |
139 (1ULL << CAP_SYS_CHROOT
) |
140 (1ULL << CAP_SYS_NICE
) |
141 (1ULL << CAP_SYS_PTRACE
) |
142 (1ULL << CAP_SYS_TTY_CONFIG
) |
143 (1ULL << CAP_SYS_RESOURCE
) |
144 (1ULL << CAP_SYS_BOOT
) |
145 (1ULL << CAP_AUDIT_WRITE
) |
146 (1ULL << CAP_AUDIT_CONTROL
) |
148 static CustomMount
*arg_custom_mounts
= NULL
;
149 static unsigned arg_n_custom_mounts
= 0;
150 static char **arg_setenv
= NULL
;
151 static bool arg_quiet
= false;
152 static bool arg_share_system
= false;
153 static bool arg_register
= true;
154 static bool arg_keep_unit
= false;
155 static char **arg_network_interfaces
= NULL
;
156 static char **arg_network_macvlan
= NULL
;
157 static char **arg_network_ipvlan
= NULL
;
158 static bool arg_network_veth
= false;
159 static char *arg_network_bridge
= NULL
;
160 static unsigned long arg_personality
= PERSONALITY_INVALID
;
161 static char *arg_image
= NULL
;
162 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
163 static ExposePort
*arg_expose_ports
= NULL
;
164 static char **arg_property
= NULL
;
165 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
166 static bool arg_userns
= false;
167 static int arg_kill_signal
= 0;
168 static bool arg_unified_cgroup_hierarchy
= false;
169 static SettingsMask arg_settings_mask
= 0;
170 static int arg_settings_trusted
= -1;
171 static char **arg_parameters
= NULL
;
173 static void help(void) {
174 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
175 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
176 " -h --help Show this help\n"
177 " --version Print version string\n"
178 " -q --quiet Do not show status information\n"
179 " -D --directory=PATH Root directory for the container\n"
180 " --template=PATH Initialize root directory from template directory,\n"
182 " -x --ephemeral Run container with snapshot of root directory, and\n"
183 " remove it after exit\n"
184 " -i --image=PATH File system device or disk image for the container\n"
185 " -b --boot Boot up full system (i.e. invoke init)\n"
186 " -u --user=USER Run the command under specified user or uid\n"
187 " -M --machine=NAME Set the machine name for the container\n"
188 " --uuid=UUID Set a specific machine UUID for the container\n"
189 " -S --slice=SLICE Place the container in the specified slice\n"
190 " --property=NAME=VALUE Set scope unit property\n"
191 " --private-users[=UIDBASE[:NUIDS]]\n"
192 " Run within user namespace\n"
193 " --private-network Disable network in container\n"
194 " --network-interface=INTERFACE\n"
195 " Assign an existing network interface to the\n"
197 " --network-macvlan=INTERFACE\n"
198 " Create a macvlan network interface based on an\n"
199 " existing network interface to the container\n"
200 " --network-ipvlan=INTERFACE\n"
201 " Create a ipvlan network interface based on an\n"
202 " existing network interface to the container\n"
203 " -n --network-veth Add a virtual ethernet connection between host\n"
205 " --network-bridge=INTERFACE\n"
206 " Add a virtual ethernet connection between host\n"
207 " and container and add it to an existing bridge on\n"
209 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
210 " Expose a container IP port on the host\n"
211 " -Z --selinux-context=SECLABEL\n"
212 " Set the SELinux security context to be used by\n"
213 " processes in the container\n"
214 " -L --selinux-apifs-context=SECLABEL\n"
215 " Set the SELinux security context to be used by\n"
216 " API/tmpfs file systems in the container\n"
217 " --capability=CAP In addition to the default, retain specified\n"
219 " --drop-capability=CAP Drop the specified capability from the default set\n"
220 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
221 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
222 " try-guest, try-host\n"
223 " -j Equivalent to --link-journal=try-guest\n"
224 " --read-only Mount the root directory read-only\n"
225 " --bind=PATH[:PATH[:OPTIONS]]\n"
226 " Bind mount a file or directory from the host into\n"
228 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
229 " Similar, but creates a read-only bind mount\n"
230 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
231 " --overlay=PATH[:PATH...]:PATH\n"
232 " Create an overlay mount from the host to \n"
234 " --overlay-ro=PATH[:PATH...]:PATH\n"
235 " Similar, but creates a read-only overlay mount\n"
236 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
237 " --share-system Share system namespaces with host\n"
238 " --register=BOOLEAN Register container as machine\n"
239 " --keep-unit Do not register a scope for the machine, reuse\n"
240 " the service unit nspawn is running in\n"
241 " --volatile[=MODE] Run the system in volatile mode\n"
242 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
243 , program_invocation_short_name
);
247 static int custom_mounts_prepare(void) {
251 /* Ensure the mounts are applied prefix first. */
252 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
254 /* Allocate working directories for the overlay file systems that need it */
255 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
256 CustomMount
*m
= &arg_custom_mounts
[i
];
258 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
259 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
263 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
272 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
274 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
280 static int set_sanitized_path(char **b
, const char *path
) {
286 p
= canonicalize_file_name(path
);
291 p
= path_make_absolute_cwd(path
);
297 *b
= path_kill_slashes(p
);
301 static int detect_unified_cgroup_hierarchy(void) {
305 /* Allow the user to control whether the unified hierarchy is used */
306 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
308 r
= parse_boolean(e
);
310 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
312 arg_unified_cgroup_hierarchy
= r
;
316 /* Otherwise inherit the default from the host system */
319 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
321 arg_unified_cgroup_hierarchy
= r
;
325 static int parse_argv(int argc
, char *argv
[]) {
344 ARG_NETWORK_INTERFACE
,
357 static const struct option options
[] = {
358 { "help", no_argument
, NULL
, 'h' },
359 { "version", no_argument
, NULL
, ARG_VERSION
},
360 { "directory", required_argument
, NULL
, 'D' },
361 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
362 { "ephemeral", no_argument
, NULL
, 'x' },
363 { "user", required_argument
, NULL
, 'u' },
364 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
365 { "boot", no_argument
, NULL
, 'b' },
366 { "uuid", required_argument
, NULL
, ARG_UUID
},
367 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
368 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
369 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
370 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
371 { "bind", required_argument
, NULL
, ARG_BIND
},
372 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
373 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
374 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
375 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
376 { "machine", required_argument
, NULL
, 'M' },
377 { "slice", required_argument
, NULL
, 'S' },
378 { "setenv", required_argument
, NULL
, ARG_SETENV
},
379 { "selinux-context", required_argument
, NULL
, 'Z' },
380 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
381 { "quiet", no_argument
, NULL
, 'q' },
382 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
383 { "register", required_argument
, NULL
, ARG_REGISTER
},
384 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
385 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
386 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
387 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
388 { "network-veth", no_argument
, NULL
, 'n' },
389 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
390 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
391 { "image", required_argument
, NULL
, 'i' },
392 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
393 { "port", required_argument
, NULL
, 'p' },
394 { "property", required_argument
, NULL
, ARG_PROPERTY
},
395 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
396 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
397 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
402 uint64_t plus
= 0, minus
= 0;
403 bool mask_all_settings
= false, mask_no_settings
= false;
408 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
417 puts(PACKAGE_STRING
);
418 puts(SYSTEMD_FEATURES
);
422 r
= set_sanitized_path(&arg_directory
, optarg
);
424 return log_error_errno(r
, "Invalid root directory: %m");
429 r
= set_sanitized_path(&arg_template
, optarg
);
431 return log_error_errno(r
, "Invalid template directory: %m");
436 r
= set_sanitized_path(&arg_image
, optarg
);
438 return log_error_errno(r
, "Invalid image path: %m");
443 arg_ephemeral
= true;
447 r
= free_and_strdup(&arg_user
, optarg
);
451 arg_settings_mask
|= SETTING_USER
;
454 case ARG_NETWORK_BRIDGE
:
455 r
= free_and_strdup(&arg_network_bridge
, optarg
);
462 arg_network_veth
= true;
463 arg_private_network
= true;
464 arg_settings_mask
|= SETTING_NETWORK
;
467 case ARG_NETWORK_INTERFACE
:
468 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
471 arg_private_network
= true;
472 arg_settings_mask
|= SETTING_NETWORK
;
475 case ARG_NETWORK_MACVLAN
:
476 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
479 arg_private_network
= true;
480 arg_settings_mask
|= SETTING_NETWORK
;
483 case ARG_NETWORK_IPVLAN
:
484 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
489 case ARG_PRIVATE_NETWORK
:
490 arg_private_network
= true;
491 arg_settings_mask
|= SETTING_NETWORK
;
496 arg_settings_mask
|= SETTING_BOOT
;
500 r
= sd_id128_from_string(optarg
, &arg_uuid
);
502 log_error("Invalid UUID: %s", optarg
);
506 arg_settings_mask
|= SETTING_MACHINE_ID
;
515 arg_machine
= mfree(arg_machine
);
517 if (!machine_name_is_valid(optarg
)) {
518 log_error("Invalid machine name: %s", optarg
);
522 r
= free_and_strdup(&arg_machine
, optarg
);
530 arg_selinux_context
= optarg
;
534 arg_selinux_apifs_context
= optarg
;
538 arg_read_only
= true;
539 arg_settings_mask
|= SETTING_READ_ONLY
;
543 case ARG_DROP_CAPABILITY
: {
544 const char *state
, *word
;
547 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
548 _cleanup_free_
char *t
;
550 t
= strndup(word
, length
);
554 if (streq(t
, "all")) {
555 if (c
== ARG_CAPABILITY
)
556 plus
= (uint64_t) -1;
558 minus
= (uint64_t) -1;
562 cap
= capability_from_name(t
);
564 log_error("Failed to parse capability %s.", t
);
568 if (c
== ARG_CAPABILITY
)
569 plus
|= 1ULL << (uint64_t) cap
;
571 minus
|= 1ULL << (uint64_t) cap
;
575 arg_settings_mask
|= SETTING_CAPABILITY
;
580 arg_link_journal
= LINK_GUEST
;
581 arg_link_journal_try
= true;
584 case ARG_LINK_JOURNAL
:
585 if (streq(optarg
, "auto")) {
586 arg_link_journal
= LINK_AUTO
;
587 arg_link_journal_try
= false;
588 } else if (streq(optarg
, "no")) {
589 arg_link_journal
= LINK_NO
;
590 arg_link_journal_try
= false;
591 } else if (streq(optarg
, "guest")) {
592 arg_link_journal
= LINK_GUEST
;
593 arg_link_journal_try
= false;
594 } else if (streq(optarg
, "host")) {
595 arg_link_journal
= LINK_HOST
;
596 arg_link_journal_try
= false;
597 } else if (streq(optarg
, "try-guest")) {
598 arg_link_journal
= LINK_GUEST
;
599 arg_link_journal_try
= true;
600 } else if (streq(optarg
, "try-host")) {
601 arg_link_journal
= LINK_HOST
;
602 arg_link_journal_try
= true;
604 log_error("Failed to parse link journal mode %s", optarg
);
612 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
614 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
616 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
620 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
622 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
624 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
628 case ARG_OVERLAY_RO
: {
629 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
630 _cleanup_strv_free_
char **lower
= NULL
;
635 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
639 log_error("Invalid overlay specification: %s", optarg
);
643 STRV_FOREACH(i
, lower
) {
644 if (!path_is_absolute(*i
)) {
645 log_error("Overlay path %s is not absolute.", *i
);
653 log_error("--overlay= needs at least two colon-separated directories specified.");
658 /* If two parameters are specified,
659 * the first one is the lower, the
660 * second one the upper directory. And
661 * we'll also define the destination
662 * mount point the same as the upper. */
666 destination
= strdup(upper
);
671 upper
= lower
[n
- 2];
672 destination
= lower
[n
- 1];
676 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
680 m
->destination
= destination
;
683 m
->read_only
= c
== ARG_OVERLAY_RO
;
685 upper
= destination
= NULL
;
688 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
695 if (!env_assignment_is_valid(optarg
)) {
696 log_error("Environment variable assignment '%s' is not valid.", optarg
);
700 n
= strv_env_set(arg_setenv
, optarg
);
704 strv_free(arg_setenv
);
707 arg_settings_mask
|= SETTING_ENVIRONMENT
;
715 case ARG_SHARE_SYSTEM
:
716 arg_share_system
= true;
720 r
= parse_boolean(optarg
);
722 log_error("Failed to parse --register= argument: %s", optarg
);
730 arg_keep_unit
= true;
733 case ARG_PERSONALITY
:
735 arg_personality
= personality_from_string(optarg
);
736 if (arg_personality
== PERSONALITY_INVALID
) {
737 log_error("Unknown or unsupported personality '%s'.", optarg
);
741 arg_settings_mask
|= SETTING_PERSONALITY
;
747 arg_volatile_mode
= VOLATILE_YES
;
751 m
= volatile_mode_from_string(optarg
);
753 log_error("Failed to parse --volatile= argument: %s", optarg
);
756 arg_volatile_mode
= m
;
759 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
763 r
= expose_port_parse(&arg_expose_ports
, optarg
);
765 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
767 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
769 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
773 if (strv_extend(&arg_property
, optarg
) < 0)
778 case ARG_PRIVATE_USERS
:
780 _cleanup_free_
char *buffer
= NULL
;
781 const char *range
, *shift
;
783 range
= strchr(optarg
, ':');
785 buffer
= strndup(optarg
, range
- optarg
);
791 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
792 log_error("Failed to parse UID range: %s", range
);
798 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
799 log_error("Failed to parse UID: %s", optarg
);
807 case ARG_KILL_SIGNAL
:
808 arg_kill_signal
= signal_from_string_try_harder(optarg
);
809 if (arg_kill_signal
< 0) {
810 log_error("Cannot parse signal: %s", optarg
);
814 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
819 /* no → do not read files
820 * yes → read files, do not override cmdline, trust only subset
821 * override → read files, override cmdline, trust only subset
822 * trusted → read files, do not override cmdline, trust all
825 r
= parse_boolean(optarg
);
827 if (streq(optarg
, "trusted")) {
828 mask_all_settings
= false;
829 mask_no_settings
= false;
830 arg_settings_trusted
= true;
832 } else if (streq(optarg
, "override")) {
833 mask_all_settings
= false;
834 mask_no_settings
= true;
835 arg_settings_trusted
= -1;
837 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
840 mask_all_settings
= false;
841 mask_no_settings
= false;
842 arg_settings_trusted
= -1;
845 mask_all_settings
= true;
846 mask_no_settings
= false;
847 arg_settings_trusted
= false;
856 assert_not_reached("Unhandled option");
859 if (arg_share_system
)
860 arg_register
= false;
862 if (arg_boot
&& arg_share_system
) {
863 log_error("--boot and --share-system may not be combined.");
867 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
868 log_error("--keep-unit may not be used when invoked from a user session.");
872 if (arg_directory
&& arg_image
) {
873 log_error("--directory= and --image= may not be combined.");
877 if (arg_template
&& arg_image
) {
878 log_error("--template= and --image= may not be combined.");
882 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
883 log_error("--template= needs --directory= or --machine=.");
887 if (arg_ephemeral
&& arg_template
) {
888 log_error("--ephemeral and --template= may not be combined.");
892 if (arg_ephemeral
&& arg_image
) {
893 log_error("--ephemeral and --image= may not be combined.");
897 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
898 log_error("--ephemeral and --link-journal= may not be combined.");
902 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
903 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
906 arg_parameters
= strv_copy(argv
+ optind
);
910 arg_settings_mask
|= SETTING_BOOT
;
913 /* Load all settings from .nspawn files */
914 if (mask_no_settings
)
915 arg_settings_mask
= 0;
917 /* Don't load any settings from .nspawn files */
918 if (mask_all_settings
)
919 arg_settings_mask
= _SETTINGS_MASK_ALL
;
921 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
923 r
= detect_unified_cgroup_hierarchy();
930 static int verify_arguments(void) {
932 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
933 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
937 if (arg_expose_ports
&& !arg_private_network
) {
938 log_error("Cannot use --port= without private networking.");
942 if (arg_boot
&& arg_kill_signal
<= 0)
943 arg_kill_signal
= SIGRTMIN
+3;
948 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
954 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
957 if (uid
!= UID_INVALID
) {
958 uid
+= arg_uid_shift
;
960 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
964 if (gid
!= GID_INVALID
) {
965 gid
+= (gid_t
) arg_uid_shift
;
967 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
971 if (lchown(p
, uid
, gid
) < 0)
977 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
980 q
= prefix_roota(root
, path
);
981 if (mkdir(q
, mode
) < 0) {
987 return userns_lchown(q
, uid
, gid
);
990 static int setup_timezone(const char *dest
) {
991 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
992 const char *where
, *check
, *what
;
998 /* Fix the timezone, if possible */
999 r
= readlink_malloc("/etc/localtime", &p
);
1001 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1005 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1007 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1009 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1013 where
= prefix_roota(dest
, "/etc/localtime");
1014 r
= readlink_malloc(where
, &q
);
1016 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1018 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1020 /* Already pointing to the right place? Then do nothing .. */
1021 if (y
&& streq(y
, z
))
1025 check
= strjoina("/usr/share/zoneinfo/", z
);
1026 check
= prefix_root(dest
, check
);
1027 if (laccess(check
, F_OK
) < 0) {
1028 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1033 if (r
< 0 && errno
!= ENOENT
) {
1034 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1038 what
= strjoina("../usr/share/zoneinfo/", z
);
1039 if (symlink(what
, where
) < 0) {
1040 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1044 r
= userns_lchown(where
, 0, 0);
1046 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1051 static int setup_resolv_conf(const char *dest
) {
1052 const char *where
= NULL
;
1057 if (arg_private_network
)
1060 /* Fix resolv.conf, if possible */
1061 where
= prefix_roota(dest
, "/etc/resolv.conf");
1063 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1065 /* If the file already exists as symlink, let's
1066 * suppress the warning, under the assumption that
1067 * resolved or something similar runs inside and the
1068 * symlink points there.
1070 * If the disk image is read-only, there's also no
1071 * point in complaining.
1073 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1074 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1078 r
= userns_lchown(where
, 0, 0);
1080 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1085 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1089 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1090 SD_ID128_FORMAT_VAL(id
));
1095 static int setup_boot_id(const char *dest
) {
1096 const char *from
, *to
;
1097 sd_id128_t rnd
= {};
1101 if (arg_share_system
)
1104 /* Generate a new randomized boot ID, so that each boot-up of
1105 * the container gets a new one */
1107 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1108 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1110 r
= sd_id128_randomize(&rnd
);
1112 return log_error_errno(r
, "Failed to generate random boot id: %m");
1114 id128_format_as_uuid(rnd
, as_uuid
);
1116 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1118 return log_error_errno(r
, "Failed to write boot id: %m");
1120 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1121 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1122 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1123 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1129 static int copy_devnodes(const char *dest
) {
1131 static const char devnodes
[] =
1142 _cleanup_umask_ mode_t u
;
1148 /* Create /dev/net, so that we can create /dev/net/tun in it */
1149 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1150 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1152 NULSTR_FOREACH(d
, devnodes
) {
1153 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1156 from
= strappend("/dev/", d
);
1157 to
= prefix_root(dest
, from
);
1159 if (stat(from
, &st
) < 0) {
1161 if (errno
!= ENOENT
)
1162 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1164 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1166 log_error("%s is not a char or block device, cannot copy.", from
);
1170 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1172 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1174 /* Some systems abusively restrict mknod but
1175 * allow bind mounts. */
1178 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1179 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1180 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1183 r
= userns_lchown(to
, 0, 0);
1185 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1192 static int setup_pts(const char *dest
) {
1193 _cleanup_free_
char *options
= NULL
;
1197 if (arg_selinux_apifs_context
)
1198 (void) asprintf(&options
,
1199 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1200 arg_uid_shift
+ TTY_GID
,
1201 arg_selinux_apifs_context
);
1204 (void) asprintf(&options
,
1205 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1206 arg_uid_shift
+ TTY_GID
);
1211 /* Mount /dev/pts itself */
1212 p
= prefix_roota(dest
, "/dev/pts");
1213 if (mkdir(p
, 0755) < 0)
1214 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1215 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1216 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1217 if (userns_lchown(p
, 0, 0) < 0)
1218 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1220 /* Create /dev/ptmx symlink */
1221 p
= prefix_roota(dest
, "/dev/ptmx");
1222 if (symlink("pts/ptmx", p
) < 0)
1223 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1224 if (userns_lchown(p
, 0, 0) < 0)
1225 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1227 /* And fix /dev/pts/ptmx ownership */
1228 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1229 if (userns_lchown(p
, 0, 0) < 0)
1230 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
1235 static int setup_dev_console(const char *dest
, const char *console
) {
1236 _cleanup_umask_ mode_t u
;
1245 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1247 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1249 /* We need to bind mount the right tty to /dev/console since
1250 * ptys can only exist on pts file systems. To have something
1251 * to bind mount things on we create a empty regular file. */
1253 to
= prefix_roota(dest
, "/dev/console");
1256 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1258 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1259 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
1264 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1265 const char *from
, *to
;
1266 _cleanup_umask_ mode_t u
;
1269 struct cmsghdr cmsghdr
;
1270 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1272 struct msghdr mh
= {
1273 .msg_control
= &control
,
1274 .msg_controllen
= sizeof(control
),
1276 struct cmsghdr
*cmsg
;
1278 assert(kmsg_socket
>= 0);
1282 /* We create the kmsg FIFO as /run/kmsg, but immediately
1283 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1284 * on the reading side behave very similar to /proc/kmsg,
1285 * their writing side behaves differently from /dev/kmsg in
1286 * that writing blocks when nothing is reading. In order to
1287 * avoid any problems with containers deadlocking due to this
1288 * we simply make /dev/kmsg unavailable to the container. */
1289 from
= prefix_roota(dest
, "/run/kmsg");
1290 to
= prefix_roota(dest
, "/proc/kmsg");
1292 if (mkfifo(from
, 0600) < 0)
1293 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1294 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1295 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1297 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1299 return log_error_errno(errno
, "Failed to open fifo: %m");
1301 cmsg
= CMSG_FIRSTHDR(&mh
);
1302 cmsg
->cmsg_level
= SOL_SOCKET
;
1303 cmsg
->cmsg_type
= SCM_RIGHTS
;
1304 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1305 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1307 mh
.msg_controllen
= cmsg
->cmsg_len
;
1309 /* Store away the fd in the socket, so that it stays open as
1310 * long as we run the child */
1311 k
= sendmsg(kmsg_socket
, &mh
, MSG_NOSIGNAL
);
1315 return log_error_errno(errno
, "Failed to send FIFO fd: %m");
1317 /* And now make the FIFO unavailable as /run/kmsg... */
1318 (void) unlink(from
);
1323 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1324 union in_addr_union
*exposed
= userdata
;
1330 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
1334 static int setup_hostname(void) {
1336 if (arg_share_system
)
1339 if (sethostname_idempotent(arg_machine
) < 0)
/* Connects the journal directory of the container with the one of the host,
 * as selected by arg_link_journal (LINK_NO, LINK_AUTO, LINK_GUEST, LINK_HOST)
 * and arg_link_journal_try.
 *
 * Visible steps:
 *  - read the machine ID from <directory>/etc/machine-id and validate it with
 *    sd_id128_from_string();
 *  - fetch the host's ID with sd_id128_get_machine(); identical IDs are
 *    logged as a refusal to link;
 *  - create /var, /var/log and /var/log/journal below directory with
 *    userns_mkdir();
 *  - p is /var/log/journal/<id> on the host, q is the same path prefixed with
 *    directory (i.e. inside the container tree);
 *  - LINK_GUEST: symlink(q, p); LINK_HOST: bind mount p onto q.
 *
 * directory: root of the container tree.
 * Returns the result of log_error_errno() on the visible failure paths.
 *
 * NOTE(review): several failure paths below log errno although the failing
 * call stored its result in r (userns_mkdir(), readlink_and_make_absolute());
 * the /var, /var/log and /var/log/journal paths log r instead. errno may be
 * stale at those points -- confirm and log r consistently. */
1345 static int setup_journal(const char *directory
) {
1346 sd_id128_t machine_id
, this_id
;
1347 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1348 const char *etc_machine_id
, *p
, *q
;
1352 /* Don't link journals in ephemeral mode */
1356 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1358 r
= read_one_line_file(etc_machine_id
, &b
);
1359 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1362 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1365 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1368 /* Verify validity */
1369 r
= sd_id128_from_string(id
, &machine_id
);
1371 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1373 r
= sd_id128_get_machine(&this_id
);
1375 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
/* Same ID on host and guest: only a warning in LINK_AUTO mode, an error otherwise. */
1377 if (sd_id128_equal(machine_id
, this_id
)) {
1378 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1379 "Host and machine ids are equal (%s): refusing to link journals", id
);
1380 if (arg_link_journal
== LINK_AUTO
)
1385 if (arg_link_journal
== LINK_NO
)
1388 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1390 return log_error_errno(r
, "Failed to create /var: %m");
1392 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1394 return log_error_errno(r
, "Failed to create /var/log: %m");
1396 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1398 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
/* p: journal directory on the host; q: the same directory inside the container tree. */
1400 p
= strjoina("/var/log/journal/", id
);
1401 q
= prefix_roota(directory
, p
);
1403 if (path_is_mount_point(p
, 0) > 0) {
1404 if (arg_link_journal
!= LINK_AUTO
) {
1405 log_error("%s: already a mount point, refusing to use for journal", p
);
1412 if (path_is_mount_point(q
, 0) > 0) {
1413 if (arg_link_journal
!= LINK_AUTO
) {
1414 log_error("%s: already a mount point, refusing to use for journal", q
);
1421 r
= readlink_and_make_absolute(p
, &d
);
1423 if ((arg_link_journal
== LINK_GUEST
||
1424 arg_link_journal
== LINK_AUTO
) &&
1427 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
/* NOTE(review): logs errno, but the userns_mkdir() result is in r. */
1429 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1434 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1435 } else if (r
== -EINVAL
) {
1437 if (arg_link_journal
== LINK_GUEST
&&
1440 if (errno
== ENOTDIR
) {
1441 log_error("%s already exists and is neither a symlink nor a directory", p
);
1444 log_error_errno(errno
, "Failed to remove %s: %m", p
);
1448 } else if (r
!= -ENOENT
) {
/* NOTE(review): r holds the readlink_and_make_absolute() error; errno is logged instead. */
1449 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
1453 if (arg_link_journal
== LINK_GUEST
) {
/* Makes the host path p a symlink pointing at the guest directory q. */
1455 if (symlink(q
, p
) < 0) {
1456 if (arg_link_journal_try
) {
1457 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1460 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1465 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
/* NOTE(review): logs errno, but the userns_mkdir() result is in r. */
1467 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1471 if (arg_link_journal
== LINK_HOST
) {
1472 /* don't create parents here -- if the host doesn't have
1473 * permanent journal set up, don't force it here */
1476 if (arg_link_journal_try
) {
1477 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1480 log_error_errno(errno
, "Failed to create %s: %m", p
);
1485 } else if (access(p
, F_OK
) < 0)
1488 if (dir_is_empty(q
) == 0)
1489 log_warning("%s is not empty, proceeding anyway.", q
);
1491 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
/* NOTE(review): logs errno, but the userns_mkdir() result is in r. */
1493 log_error_errno(errno
, "Failed to create %s: %m", q
);
/* Make the host's journal directory visible at the guest path. */
1497 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1498 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1503 static int drop_capabilities(void) {
1504 return capability_bounding_set_drop(~arg_retain
, false);
/* Resets the audit login UID of the calling process by writing "4294967295"
 * to /proc/self/loginuid, unless the file already contains that value.
 * arg_share_system is tested first (guarded statement not visible here).
 * On a failed write a long explanatory message is logged.
 *
 * NOTE(review): the user-facing message below reads "or to off auditing";
 * this looks like a missing word ("or to turn off auditing"). It is a runtime
 * string, so it is left untouched here. */
1507 static int reset_audit_loginuid(void) {
1508 _cleanup_free_
char *p
= NULL
;
1511 if (arg_share_system
)
1514 r
= read_one_line_file("/proc/self/loginuid", &p
);
1518 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1520 /* Already reset? */
1521 if (streq(p
, "4294967295"))
1524 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1527 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1528 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1529 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1530 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1531 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Builds and loads a seccomp filter whose default action is SCMP_ACT_ALLOW.
 *
 * Visible steps:
 *  - a static table pairs capabilities with system calls;
 *  - for each entry, the corresponding bit in arg_retain is tested and a
 *    rule failing the system call with EPERM is added via seccomp_rule_add()
 *    (the statement guarded by the arg_retain test is not visible here;
 *    presumably entries whose capability is retained are skipped -- confirm);
 *  - a rule is added with action ERRNO(EAFNOSUPPORT) matching argument 0 ==
 *    AF_NETLINK and argument 2 == NETLINK_AUDIT;
 *  - SCMP_FLTATR_CTL_NNP is set to 0 and the filter is loaded with
 *    seccomp_load();
 *  - the filter context is released with seccomp_release(). */
1539 static int setup_seccomp(void) {
1542 static const struct {
1543 uint64_t capability
;
1546 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1547 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1548 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1549 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1550 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1551 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1552 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1553 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1554 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1555 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1558 scmp_filter_ctx seccomp
;
1562 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1566 r
= seccomp_add_secondary_archs(seccomp
);
1568 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
1572 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1573 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1576 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1578 continue; /* unknown syscall */
1580 log_error_errno(r
, "Failed to block syscall: %m");
1587 Audit is broken in containers, much of the userspace audit
1588 hookup will fail if running inside a container. We don't
1589 care and just turn off creation of audit sockets.
1591 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1592 with EAFNOSUPPORT which audit userspace uses as indication
1593 that audit is disabled in the kernel.
1596 r
= seccomp_rule_add(
1598 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1601 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1602 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1604 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1608 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1610 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1614 r
= seccomp_load(seccomp
);
/* A load failure attributed to missing kernel support is only logged at debug level. */
1616 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1621 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1626 seccomp_release(seccomp
);
1634 static int setup_propagate(const char *root
) {
1637 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1638 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1639 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1640 (void) mkdir_p(p
, 0600);
1642 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
1643 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
1645 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1646 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
1648 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1649 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
1651 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1652 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1653 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1655 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1656 return log_error_errno(errno
, "Failed to make propagation mount read-only");
/* Opens the image named by arg_image and makes it available as a block
 * device.
 *
 * Visible steps:
 *  - open arg_image (read-only if arg_read_only, read-write otherwise) and
 *    fstat() it;
 *  - if it is a block device, its path is duplicated with strdup();
 *  - anything that is neither a block device nor a regular file is rejected;
 *  - for regular files a free loop device number is requested from
 *    /dev/loop-control (LOOP_CTL_GET_FREE), /dev/loop<nr> is opened, attached
 *    to the image with LOOP_SET_FD and configured with LOOP_SET_STATUS64
 *    using LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN (plus LO_FLAGS_READ_ONLY on
 *    a path whose condition is not visible here);
 *  - the loop device path is stored in *device_path.
 *
 * device_path: out parameter, asserted non-NULL.
 * loop_nr: out parameter; how it is filled is not visible here.
 *
 * NOTE(review): the "not a regular file or block device" message below is
 * logged with errno and "%m", but the preceding S_ISREG() test does not set
 * errno, so the appended error text is unrelated to the actual problem. */
1661 static int setup_image(char **device_path
, int *loop_nr
) {
1662 struct loop_info64 info
= {
1663 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1665 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1666 _cleanup_free_
char* loopdev
= NULL
;
1670 assert(device_path
);
1674 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1676 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1678 if (fstat(fd
, &st
) < 0)
1679 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1681 if (S_ISBLK(st
.st_mode
)) {
1684 p
= strdup(arg_image
);
1698 if (!S_ISREG(st
.st_mode
)) {
1699 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
1703 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1705 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1707 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1709 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1711 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1714 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1716 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1718 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1719 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1722 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1724 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1725 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1727 *device_path
= loopdev
;
/* Explanatory text describing which partition layouts are accepted for disk
 * images. It is pasted after the "%s" of several log_error() calls in
 * dissect_image(), so it is formatted as a continuation of those messages. */
1738 #define PARTITION_TABLE_BLURB \
1739 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1740 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1741 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1742 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1743 "to be bootable with systemd-nspawn."
/* Inspects the partition table of a disk image and picks the partitions to
 * use as root, /home and /srv.
 *
 * Visible steps:
 *  - a libblkid probe is attached to fd and the partition table type is read
 *    from the "PTTYPE" value; only "gpt" and "dos" are accepted;
 *  - the block device is looked up in udev by the st_rdev of fd and its
 *    child devices are enumerated; their number is compared with the number
 *    of partitions reported by blkid, re-reading the partition table with
 *    BLKRRPART (up to 20 attempts, 50ms apart) when needed;
 *  - on GPT, partitions are classified by type UUID (GPT_HOME, GPT_SRV,
 *    GPT_ROOT_NATIVE, GPT_ROOT_SECONDARY, GPT_LINUX_GENERIC); partitions
 *    flagged GPT_FLAG_NO_AUTO are tested for first, and GPT_FLAG_READ_ONLY
 *    clears the matching *_rw flag;
 *  - on MBR, the flags are compared with 0x80 and the type with 0x83;
 *  - the chosen root is the native root, else the secondary root, else a
 *    generic partition; multiple generic candidates are reported as an
 *    error.
 *
 * root_device/home_device/srv_device and the matching *_rw pointers are out
 * parameters (root_device and home_device are asserted non-NULL).
 * When built without blkid support, only an error is logged. */
1745 static int dissect_image(
1747 char **root_device
, bool *root_device_rw
,
1748 char **home_device
, bool *home_device_rw
,
1749 char **srv_device
, bool *srv_device_rw
,
1753 int home_nr
= -1, srv_nr
= -1;
1754 #ifdef GPT_ROOT_NATIVE
1757 #ifdef GPT_ROOT_SECONDARY
1758 int secondary_root_nr
= -1;
1760 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1761 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1762 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1763 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1764 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1765 struct udev_list_entry
*first
, *item
;
1766 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1767 bool is_gpt
, is_mbr
, multiple_generic
= false;
1768 const char *pttype
= NULL
;
1775 assert(root_device
);
1776 assert(home_device
);
1781 b
= blkid_new_probe();
1786 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1791 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1795 blkid_probe_enable_partitions(b
, 1);
1796 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1799 r
= blkid_do_safeprobe(b
);
/* -2 and 1 are treated as "no usable partition table"; any other non-zero result as a probe failure. */
1800 if (r
== -2 || r
== 1) {
1801 log_error("Failed to identify any partition table on\n"
1803 PARTITION_TABLE_BLURB
, arg_image
);
1805 } else if (r
!= 0) {
1808 log_error_errno(errno
, "Failed to probe: %m");
1812 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1814 is_gpt
= streq_ptr(pttype
, "gpt");
1815 is_mbr
= streq_ptr(pttype
, "dos");
1817 if (!is_gpt
&& !is_mbr
) {
1818 log_error("No GPT or MBR partition table discovered on\n"
1820 PARTITION_TABLE_BLURB
, arg_image
);
1825 pl
= blkid_probe_get_partitions(b
);
1830 log_error("Failed to list partitions of %s", arg_image
);
1838 if (fstat(fd
, &st
) < 0)
1839 return log_error_errno(errno
, "Failed to stat block device: %m");
1841 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1849 log_error("Kernel partitions never appeared.");
1853 e
= udev_enumerate_new(udev
);
1857 r
= udev_enumerate_add_match_parent(e
, d
);
1861 r
= udev_enumerate_scan_devices(e
);
1863 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1865 /* Count the partitions enumerated by the kernel */
1867 first
= udev_enumerate_get_list_entry(e
);
1868 udev_list_entry_foreach(item
, first
)
1871 /* Count the partitions enumerated by blkid */
1872 m
= blkid_partlist_numof_partitions(pl
);
1876 log_error("blkid and kernel partition list do not match.");
1882 /* The kernel has probed fewer partitions than
1883 * blkid? Maybe the kernel prober is still
1884 * running or it got EBUSY because udev
1885 * already opened the device. Let's reprobe
1886 * the device, which is a synchronous call
1887 * that waits until probing is complete. */
1889 for (j
= 0; j
< 20; j
++) {
1891 r
= ioctl(fd
, BLKRRPART
, 0);
1894 if (r
>= 0 || r
!= -EBUSY
)
1897 /* If something else has the device
1898 * open, such as an udev rule, the
1899 * ioctl will return EBUSY. Since
1900 * there's no way to wait until it
1901 * isn't busy anymore, let's just wait
1902 * a bit, and try again.
1904 * This is really something they
1905 * should fix in the kernel! */
1907 usleep(50 * USEC_PER_MSEC
);
1911 return log_error_errno(r
, "Failed to reread partition table: %m");
1914 e
= udev_enumerate_unref(e
);
/* Walk all enumerated devices and classify each partition. */
1917 first
= udev_enumerate_get_list_entry(e
);
1918 udev_list_entry_foreach(item
, first
) {
1919 _cleanup_udev_device_unref_
struct udev_device
*q
;
1921 unsigned long long flags
;
1927 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1932 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1936 qn
= udev_device_get_devnum(q
);
/* st.st_rdev is the device number of the whole disk itself. */
1940 if (st
.st_rdev
== qn
)
1943 node
= udev_device_get_devnode(q
);
1947 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1951 flags
= blkid_partition_get_flags(pp
);
1953 nr
= blkid_partition_get_partno(pp
);
1961 if (flags
& GPT_FLAG_NO_AUTO
)
1964 stype
= blkid_partition_get_type_string(pp
);
1968 if (sd_id128_from_string(stype
, &type_id
) < 0)
1971 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1973 if (home
&& nr
>= home_nr
)
1977 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1979 r
= free_and_strdup(&home
, node
);
1983 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
1985 if (srv
&& nr
>= srv_nr
)
1989 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1991 r
= free_and_strdup(&srv
, node
);
1995 #ifdef GPT_ROOT_NATIVE
1996 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
1998 if (root
&& nr
>= root_nr
)
2002 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2004 r
= free_and_strdup(&root
, node
);
2009 #ifdef GPT_ROOT_SECONDARY
2010 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
2012 if (secondary_root
&& nr
>= secondary_root_nr
)
2015 secondary_root_nr
= nr
;
2016 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2018 r
= free_and_strdup(&secondary_root
, node
);
2023 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2026 multiple_generic
= true;
2028 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2030 r
= free_and_strdup(&generic
, node
);
2036 } else if (is_mbr
) {
2039 if (flags
!= 0x80) /* Bootable flag */
2042 type
= blkid_partition_get_type(pp
);
2043 if (type
!= 0x83) /* Linux partition */
2047 multiple_generic
= true;
2051 r
= free_and_strdup(&root
, node
);
/* Hand the selected device paths over to the caller. */
2059 *root_device
= root
;
2062 *root_device_rw
= root_rw
;
2064 } else if (secondary_root
) {
2065 *root_device
= secondary_root
;
2066 secondary_root
= NULL
;
2068 *root_device_rw
= secondary_root_rw
;
2070 } else if (generic
) {
2072 /* There were no partitions with precise meanings
2073 * around, but we found generic partitions. In this
2074 * case, if there's only one, we can go ahead and boot
2075 * it, otherwise we bail out, because we really cannot
2076 * make any sense of it. */
2078 if (multiple_generic
) {
2079 log_error("Identified multiple bootable Linux partitions on\n"
2081 PARTITION_TABLE_BLURB
, arg_image
);
2085 *root_device
= generic
;
2088 *root_device_rw
= generic_rw
;
2091 log_error("Failed to identify root partition in disk image\n"
2093 PARTITION_TABLE_BLURB
, arg_image
);
2098 *home_device
= home
;
2101 *home_device_rw
= home_rw
;
2108 *srv_device_rw
= srv_rw
;
2113 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device "what" below "where".
 *
 * The file system type is detected with a libblkid superblock probe (the
 * "TYPE" value); "crypto_LUKS" is rejected explicitly. The mount point is
 * built by joining where and directory, and the device is mounted with
 * MS_NODEV, plus MS_RDONLY when rw is false.
 *
 * what: device node to mount.
 * where: base directory of the mount point.
 * directory: path appended to where (mount_devices() passes NULL for the
 *            root file system; how NULL is handled is not visible here).
 * rw: false requests a read-only mount.
 * When built without blkid support, only an error is logged.
 *
 * NOTE(review): the probe result is tested as "r == -1 || r == 1" here,
 * while dissect_image() tests "r == -2 || r == 1" for the same
 * blkid_do_safeprobe() call. Confirm against the libblkid documentation
 * which negative value is meant to take the "cannot determine" branch. */
2118 static int mount_device(const char *what
, const char *where
, const char *directory
, bool rw
) {
2120 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
2121 const char *fstype
, *p
;
2131 p
= strjoina(where
, directory
);
2136 b
= blkid_new_probe_from_filename(what
);
2140 log_error_errno(errno
, "Failed to allocate prober for %s: %m", what
);
2144 blkid_probe_enable_superblocks(b
, 1);
2145 blkid_probe_set_superblocks_flags(b
, BLKID_SUBLKS_TYPE
);
2148 r
= blkid_do_safeprobe(b
);
2149 if (r
== -1 || r
== 1) {
2150 log_error("Cannot determine file system type of %s", what
);
2152 } else if (r
!= 0) {
2155 log_error_errno(errno
, "Failed to probe %s: %m", what
);
2160 if (blkid_probe_lookup_value(b
, "TYPE", &fstype
, NULL
) < 0) {
2163 log_error("Failed to determine file system type of %s", what
);
2167 if (streq(fstype
, "crypto_LUKS")) {
2168 log_error("nspawn currently does not support LUKS disk images.");
2172 if (mount(what
, p
, fstype
, MS_NODEV
|(rw
? 0 : MS_RDONLY
), NULL
) < 0)
2173 return log_error_errno(errno
, "Failed to mount %s: %m", what
);
2177 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the root, home and server-data devices below arg_directory using
 * mount_device(): the root device with a NULL sub-path, the home device at
 * "/home" and the srv device at "/srv". Each *_rw flag is forwarded as the
 * rw argument. A failure of any of the three is logged and returned.
 * NOTE(review): the conditions deciding whether a given device is mounted
 * at all are not visible here. */
2182 static int mount_devices(
2184 const char *root_device
, bool root_device_rw
,
2185 const char *home_device
, bool home_device_rw
,
2186 const char *srv_device
, bool srv_device_rw
) {
2192 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2194 return log_error_errno(r
, "Failed to mount root directory: %m");
2198 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2200 return log_error_errno(r
, "Failed to mount home directory: %m");
2204 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2206 return log_error_errno(r
, "Failed to mount server data directory: %m");
/* Tears down a loop device.
 *
 * If image_fd points to a valid descriptor, LOOP_CLR_FD is issued on it and
 * the descriptor is closed (and *image_fd updated with the result of
 * safe_close()). Afterwards /dev/loop-control is opened and the loop device
 * with number nr is removed via LOOP_CTL_REMOVE. All failures are only
 * logged; the function returns nothing.
 *
 * nr: loop device number.
 * image_fd: optional pointer to the descriptor of the loop device. */
2212 static void loop_remove(int nr
, int *image_fd
) {
2213 _cleanup_close_
int control
= -1;
2219 if (image_fd
&& *image_fd
>= 0) {
2220 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2222 log_debug_errno(errno
, "Failed to close loop image: %m");
2223 *image_fd
= safe_close(*image_fd
);
2226 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2228 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2232 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2234 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2239 * < 0 : wait_for_terminate() failed to get the state of the
2240 * container, the container was terminated by a signal, or
2241 * failed for an unknown reason. No change is made to the
2242 * container argument.
2243 * > 0 : The program executed in the container terminated with an
2244 * error. The exit code of the program executed in the
2245 * container is returned. The container argument has been set
2246 * to CONTAINER_TERMINATED.
2247 * 0 : The container is being rebooted, has been shut down or exited
2248 * successfully. The container argument has been set to either
2249 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2251 * That is, success is indicated by a return value of zero, and an
2252 * error is indicated by a non-zero value.
/* Waits for the container process pid with wait_for_terminate() and
 * translates the resulting siginfo into a ContainerStatus and a return
 * value.
 *
 * Visible handling of status.si_code / status.si_status:
 *  - exit status 0 is logged as success, any other exit status as failure;
 *    in both cases *container becomes CONTAINER_TERMINATED and the exit
 *    status is returned;
 *  - SIGINT is logged as "has been shut down" (CONTAINER_TERMINATED);
 *  - SIGHUP is logged as "is being rebooted" (CONTAINER_REBOOTED);
 *  - other signals and unknown codes are logged as errors.
 * Informational messages are demoted to LOG_DEBUG when arg_quiet is set. */
2254 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2258 r
= wait_for_terminate(pid
, &status
);
2260 return log_warning_errno(r
, "Failed to wait for container: %m");
2262 switch (status
.si_code
) {
2265 if (status
.si_status
== 0) {
2266 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2269 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2271 *container
= CONTAINER_TERMINATED
;
2272 return status
.si_status
;
2275 if (status
.si_status
== SIGINT
) {
2277 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2278 *container
= CONTAINER_TERMINATED
;
2281 } else if (status
.si_status
== SIGHUP
) {
2283 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2284 *container
= CONTAINER_REBOOTED
;
2288 /* CLD_KILLED fallthrough */
2291 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2295 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Signal handler with an intentionally empty body: the signal number is
 * accepted and ignored.
 * NOTE(review): presumably installed so that a signal is delivered (and can
 * interrupt blocking calls) without triggering its default action -- confirm
 * at the place where the handler is registered. */
static void nop_handler(int sig) {
        /* Nothing to do. */
}
/* sd-event signal callback that asks the container to shut down.
 *
 * The PID of the container is carried in userdata (decoded with
 * PTR_TO_UINT32). If sending arg_kill_signal to it succeeds, a hint is
 * logged and the userdata of the event source is reset to NULL. The event
 * loop of the source is asked to exit with code 0 via sd_event_exit().
 * NOTE(review): the control flow between the kill() branch and the
 * sd_event_exit() call is not visible here; the log message suggests that a
 * second signal is what leads to the immediate exit -- confirm. */
2304 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2307 pid
= PTR_TO_UINT32(userdata
);
2309 if (kill(pid
, arg_kill_signal
) >= 0) {
2310 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2311 sd_event_source_set_userdata(s
, NULL
);
2316 sd_event_exit(sd_event_source_get_event(s
), 0);
/* Fills in whichever of arg_directory, arg_image and arg_machine were not
 * given explicitly.
 *
 * Visible rules:
 *  - with a template and a machine name but no directory, the directory
 *    becomes /var/lib/machines/<arg_machine>;
 *  - with neither image nor directory, the image is looked up by machine
 *    name with image_find(); IMAGE_RAW images set arg_image, other types set
 *    arg_directory, and a read-only image forces arg_read_only;
 *    alternatively the current working directory is used;
 *  - a missing machine name is derived from the host name (when the
 *    directory is "/") or from the basename of the image/directory, cleaned
 *    up with hostname_cleanup() and checked with machine_name_is_valid();
 *  - in ephemeral mode a random 64-bit hex suffix is appended to the name.
 *
 * NOTE(review): the "No image for machine" message below uses "%m" in a
 * plain log_error() call; no errno value is passed on that path, so the
 * expanded text is whatever errno happens to contain. */
2320 static int determine_names(void) {
2323 if (arg_template
&& !arg_directory
&& arg_machine
) {
2325 /* If --template= was specified then we should not
2326 * search for a machine, but instead create a new one
2327 * in /var/lib/machines. */
2329 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2334 if (!arg_image
&& !arg_directory
) {
2336 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2338 r
= image_find(arg_machine
, &i
);
2340 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2342 log_error("No image for machine '%s': %m", arg_machine
);
2346 if (i
->type
== IMAGE_RAW
)
2347 r
= set_sanitized_path(&arg_image
, i
->path
);
2349 r
= set_sanitized_path(&arg_directory
, i
->path
);
2351 return log_error_errno(r
, "Invalid image directory: %m");
2354 arg_read_only
= arg_read_only
|| i
->read_only
;
2356 arg_directory
= get_current_dir_name();
2358 if (!arg_directory
&& !arg_machine
) {
2359 log_error("Failed to determine path, please use -D or -i.");
2365 if (arg_directory
&& path_equal(arg_directory
, "/"))
2366 arg_machine
= gethostname_malloc();
2368 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2373 hostname_cleanup(arg_machine
);
2374 if (!machine_name_is_valid(arg_machine
)) {
2375 log_error("Failed to determine machine name automatically, please use -M.");
2379 if (arg_ephemeral
) {
2382 /* Add a random suffix when this is an
2383 * ephemeral machine, so that we can run many
2384 * instances at once without manually having
2385 * to specify -M each time. */
2387 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
/* Determines the UID/GID base used for user namespacing.
 *
 * If arg_uid_shift is still UID_INVALID, the owner of directory is read with
 * stat() and its UID, masked with 0xffff0000, becomes the shift; the GID
 * masked the same way must yield the same value, and arg_uid_range is set to
 * 0x10000. The shift is then checked against (uid_t) -1 - arg_uid_range so
 * that shift plus range cannot overflow, and the result is logged.
 *
 * directory: root of the container tree whose ownership is inspected. */
2398 static int determine_uid_shift(const char *directory
) {
2406 if (arg_uid_shift
== UID_INVALID
) {
2409 r
= stat(directory
, &st
);
2411 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
/* Keep only the upper 16 bits: the base is a multiple of 65536. */
2413 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2415 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2416 log_error("UID and GID base of %s don't match.", directory
);
2420 arg_uid_range
= UINT32_C(0x10000);
2423 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2424 log_error("UID base too high for UID range.");
2428 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
/* Body of the innermost child process, i.e. the process that ends up
 * executing the container's payload.
 *
 * Visible steps, in order:
 *  - synchronize with the parent through the barrier (points #1 to #3) around
 *    the UID map being written and the cgroup being set up;
 *  - mount_all(), mount_systemd_cgroup_writable(), reset_uid_gid(),
 *    setup_boot_id(), setup_kmsg();
 *  - pass the rtnl socket along when ports are exposed, drop capabilities,
 *    apply the requested personality and the SELinux exec context;
 *  - change_uid_gid() to arg_user, then assemble the environment (TERM,
 *    HOME, USER, LOGNAME, container_uuid, LISTEN_FDS, LISTEN_PID) and merge
 *    it with arg_setenv;
 *  - final barrier sync (#4), close all file descriptors not in fds;
 *  - execute an init binary (/usr/lib/systemd/systemd, /lib/systemd/systemd,
 *    /sbin/init), the user supplied command, or a login shell.
 * Reaching the end of the function means every exec attempt failed.
 *
 * NOTE(review): the capability drop failure below is logged with errno,
 * although drop_capabilities() simply forwards the return value of
 * capability_bounding_set_drop(); confirm which of the two carries the
 * error. */
2432 static int inner_child(
2434 const char *directory
,
2440 _cleanup_free_
char *home
= NULL
;
2442 const char *envp
[] = {
2443 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2444 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2449 NULL
, /* container_uuid */
2450 NULL
, /* LISTEN_FDS */
2451 NULL
, /* LISTEN_PID */
2455 _cleanup_strv_free_
char **env_use
= NULL
;
2460 assert(kmsg_socket
>= 0);
2465 /* Tell the parent, that it now can write the UID map. */
2466 (void) barrier_place(barrier
); /* #1 */
2468 /* Wait until the parent wrote the UID map */
2469 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2470 log_error("Parent died too early");
2475 r
= mount_all(NULL
, true, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2479 /* Wait until we are cgroup-ified, so that we
2480 * can mount the right cgroup path writable */
2481 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2482 log_error("Parent died too early");
2486 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2490 r
= reset_uid_gid();
2492 return log_error_errno(r
, "Couldn't become new root: %m");
2494 r
= setup_boot_id(NULL
);
2498 r
= setup_kmsg(NULL
, kmsg_socket
);
2501 kmsg_socket
= safe_close(kmsg_socket
);
2506 return log_error_errno(errno
, "setsid() failed: %m");
2508 if (arg_private_network
)
2511 if (arg_expose_ports
) {
2512 r
= expose_port_send_rtnl(rtnl_socket
);
2515 rtnl_socket
= safe_close(rtnl_socket
);
2518 if (drop_capabilities() < 0)
2519 return log_error_errno(errno
, "drop_capabilities() failed: %m");
/* An explicit personality wins; otherwise "secondary" selects PER_LINUX32. */
2523 if (arg_personality
!= PERSONALITY_INVALID
) {
2524 if (personality(arg_personality
) < 0)
2525 return log_error_errno(errno
, "personality() failed: %m");
2526 } else if (secondary
) {
2527 if (personality(PER_LINUX32
) < 0)
2528 return log_error_errno(errno
, "personality() failed: %m");
2532 if (arg_selinux_context
)
2533 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2534 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2537 r
= change_uid_gid(arg_user
, &home
);
2541 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
/* The asprintf() calls below fill the NULL placeholder slots of envp in order. */
2545 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2546 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2547 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2550 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2553 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2557 if (fdset_size(fds
) > 0) {
2558 r
= fdset_cloexec(fds
, false);
2560 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2562 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2563 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2567 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2571 /* Let the parent know that we are ready and
2572 * wait until the parent is ready with the
2574 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2575 log_error("Parent died too early");
2579 /* Now, explicitly close the log, so that we
2580 * then can close all remaining fds. Closing
2581 * the log explicitly first has the benefit
2582 * that the logging subsystem knows about it,
2583 * and is thus ready to be reopened should we
2584 * need it again. Note that the other fds
2585 * closed here are at least the locking and
2588 (void) fdset_close_others(fds
);
2594 /* Automatically search for the init system */
2596 m
= 1 + strv_length(arg_parameters
);
2597 a
= newa(char*, m
+ 1);
2598 if (strv_isempty(arg_parameters
))
2601 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
/* Each execve() only returns on failure, so the next candidate is tried. */
2603 a
[0] = (char*) "/usr/lib/systemd/systemd";
2604 execve(a
[0], a
, env_use
);
2606 a
[0] = (char*) "/lib/systemd/systemd";
2607 execve(a
[0], a
, env_use
);
2609 a
[0] = (char*) "/sbin/init";
2610 execve(a
[0], a
, env_use
);
2611 } else if (!strv_isempty(arg_parameters
))
2612 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
/* NOTE(review): the chdir() result is ignored; the shell starts in the
 * previous working directory if the home directory is not accessible. */
2614 chdir(home
?: "/root");
2615 execle("/bin/bash", "-bash", NULL
, env_use
);
2616 execle("/bin/sh", "-sh", NULL
, env_use
);
/* NOTE(review): the message names execv(), but the calls above are
 * execve(), execvpe() and execle(). */
2620 return log_error_errno(errno
, "execv() failed: %m");
/* Body of the outer child process, which prepares the container tree and
 * then forks the inner child.
 *
 * Visible steps, in order:
 *  - request SIGKILL on parent death (PR_SET_PDEATHSIG);
 *  - close stdin/stdout/stderr, open console as the new stdin and duplicate
 *    it onto stdout and stderr;
 *  - reset the audit login UID and mark the whole mount tree MS_SLAVE|MS_REC;
 *  - mount the image devices, determine the UID shift and send it to the
 *    parent over uid_shift_socket;
 *  - turn directory into a bind mount and populate it: volatile setup, base
 *    file system, optional read-only remount, API mounts, device nodes, pts,
 *    propagation directory, console, seccomp filter, timezone, resolv.conf,
 *    journal, custom mounts and cgroups;
 *  - move the root to directory and raw_clone() the inner child with the
 *    namespaces selected by arg_share_system, arg_private_network and
 *    arg_userns;
 *  - in the clone: run inner_child() and _exit(); in this process: send the
 *    new PID to the parent over pid_socket. */
2623 static int outer_child(
2625 const char *directory
,
2626 const char *console
,
2627 const char *root_device
, bool root_device_rw
,
2628 const char *home_device
, bool home_device_rw
,
2629 const char *srv_device
, bool srv_device_rw
,
2635 int uid_shift_socket
,
2645 assert(pid_socket
>= 0);
2646 assert(kmsg_socket
>= 0);
2650 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2651 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
/* With fds 0-2 closed, the console is expected to be opened as fd 0. */
2654 close_nointr(STDIN_FILENO
);
2655 close_nointr(STDOUT_FILENO
);
2656 close_nointr(STDERR_FILENO
);
2658 r
= open_terminal(console
, O_RDWR
);
2659 if (r
!= STDIN_FILENO
) {
2665 return log_error_errno(r
, "Failed to open console: %m");
2668 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2669 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2670 return log_error_errno(errno
, "Failed to duplicate console: %m");
2673 r
= reset_audit_loginuid();
2677 /* Mark everything as slave, so that we still
2678 * receive mounts from the real root, but don't
2679 * propagate mounts to the real root. */
2680 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2681 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2683 r
= mount_devices(directory
,
2684 root_device
, root_device_rw
,
2685 home_device
, home_device_rw
,
2686 srv_device
, srv_device_rw
);
2690 r
= determine_uid_shift(directory
);
2695 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2697 return log_error_errno(errno
, "Failed to send UID shift: %m");
2698 if (l
!= sizeof(arg_uid_shift
)) {
2699 log_error("Short write while sending UID shift.");
2704 /* Turn directory into bind mount */
2705 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2706 return log_error_errno(errno
, "Failed to make bind mount: %m");
2708 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2712 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2716 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2720 if (arg_read_only
) {
2721 r
= bind_remount_recursive(directory
, true);
2723 return log_error_errno(r
, "Failed to make tree read-only: %m");
2726 r
= mount_all(directory
, false, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2730 r
= copy_devnodes(directory
);
/* The return value of dev_setup() is not checked. */
2734 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2736 r
= setup_pts(directory
);
2740 r
= setup_propagate(directory
);
2744 r
= setup_dev_console(directory
, console
);
2748 r
= setup_seccomp();
2752 r
= setup_timezone(directory
);
2756 r
= setup_resolv_conf(directory
);
2760 r
= setup_journal(directory
);
2764 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2768 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2772 r
= mount_move_root(directory
);
2774 return log_error_errno(r
, "Failed to move root directory: %m");
/* The namespace flags depend on the sharing/network/userns options. */
2776 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2777 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2778 (arg_private_network
? CLONE_NEWNET
: 0) |
2779 (arg_userns
? CLONE_NEWUSER
: 0),
2782 return log_error_errno(errno
, "Failed to fork inner child: %m");
2784 pid_socket
= safe_close(pid_socket
);
2785 uid_shift_socket
= safe_close(uid_shift_socket
);
2787 /* The inner child has all namespaces that are
2788 * requested, so that we all are owned by the user if
2789 * user namespaces are turned on. */
2791 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2793 _exit(EXIT_FAILURE
);
2795 _exit(EXIT_SUCCESS
);
2798 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2800 return log_error_errno(errno
, "Failed to send PID: %m");
2801 if (l
!= sizeof(pid
)) {
2802 log_error("Short write while sending PID.");
2806 pid_socket
= safe_close(pid_socket
);
/* Writes the UID and GID maps of process pid.
 *
 * A single mapping line "0 <arg_uid_shift> <arg_uid_range>\n" is formatted
 * once and written first to /proc/<pid>/uid_map and then to
 * /proc/<pid>/gid_map, so both ID spaces get the same range. The path buffer
 * is reused for the second file ("gid_map" has the same length as
 * "uid_map").
 *
 * pid: process whose maps are written.
 * Returns the logged error of write_string_file() on failure. */
2811 static int setup_uid_map(pid_t pid
) {
2812 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2817 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2818 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2819 r
= write_string_file(uid_map
, line
, 0);
2821 return log_error_errno(r
, "Failed to write UID map: %m");
2823 /* We always assign the same UID and GID ranges */
2824 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2825 r
= write_string_file(uid_map
, line
, 0);
2827 return log_error_errno(r
, "Failed to write GID map: %m");
/* Locates the settings file named <arg_machine>.nspawn and applies it to the
 * arg_* globals.
 *
 * Search order visible here: /etc/systemd/nspawn, then /run/systemd/nspawn,
 * then a file of that name next to arg_image or, failing that, next to
 * arg_directory. The file is parsed with settings_load(). Afterwards each
 * group of settings is taken over only if its SETTING_* bit is clear in
 * arg_settings_mask and the file actually provides a value.
 *
 * Pointer-valued settings are moved rather than copied: the previous arg_*
 * value is freed, the pointer from settings is stored, and the field in
 * settings is reset to NULL so the cleanup of settings does not free it.
 *
 * NOTE(review): the embedded numbering skips lines throughout this function,
 * so return paths, else branches and the code that opens f are not visible;
 * the comments below describe only what is shown. */
2832 static int load_settings(void) {
2833 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2834 _cleanup_fclose_
FILE *f
= NULL
;
2835 _cleanup_free_
char *p
= NULL
;
2839 /* If all settings are masked, there's no point in looking for
2840 * the settings file */
2841 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* File name to look for in every candidate directory. */
2844 fn
= strjoina(arg_machine
, ".nspawn");
2846 /* We first look in the admin's directories in /etc and /run */
2847 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2848 _cleanup_free_
char *j
= NULL
;
/* j is the full candidate path inside directory i. */
2850 j
= strjoin(i
, "/", fn
, NULL
);
2859 /* By default we trust configuration from /etc and /run */
/* A negative arg_settings_trusted is treated as not decided yet; only
 * then is the default filled in. */
2860 if (arg_settings_trusted
< 0)
2861 arg_settings_trusted
= true;
/* Any error other than a missing file is fatal for this candidate. */
2866 if (errno
!= ENOENT
)
2867 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2871 /* After that, let's look for a file next to the
2872 * actual image we shall boot. */
2875 p
= file_in_same_dir(arg_image
, fn
);
2878 } else if (arg_directory
) {
2879 p
= file_in_same_dir(arg_directory
, fn
);
/* A missing file next to the image or directory is not an error. */
2886 if (!f
&& errno
!= ENOENT
)
2887 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2889 /* By default we do not trust configuration from /var/lib/machines */
2890 if (arg_settings_trusted
< 0)
2891 arg_settings_trusted
= false;
2898 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
/* Parse the opened file f; p is passed along as its path. */
2900 r
= settings_load(f
, p
, &settings
);
2904 /* Copy over bits from the settings, unless they have been
2905 * explicitly masked by command line switches. */
/* Boot: a negative settings->boot means the file does not set it. The
 * parameter vector is moved over together with the flag. */
2907 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
2908 settings
->boot
>= 0) {
2909 arg_boot
= settings
->boot
;
2911 strv_free(arg_parameters
);
2912 arg_parameters
= settings
->parameters
;
2913 settings
->parameters
= NULL
;
/* Environment: replaces arg_setenv wholesale when the file provides one. */
2916 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2917 settings
->environment
) {
2918 strv_free(arg_setenv
);
2919 arg_setenv
= settings
->environment
;
2920 settings
->environment
= NULL
;
/* User: the second half of this condition and the release of the old
 * arg_user are on lines not visible in this listing. */
2923 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2926 arg_user
= settings
->user
;
2927 settings
->user
= NULL
;
/* Capabilities: extra capabilities are added to arg_retain and dropped ones
 * are cleared from it.
 * NOTE(review): presumably the addition sits in an else branch of the
 * untrusted-file warning, which is not visible here -- confirm. */
2930 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
2932 if (!arg_settings_trusted
&& settings
->capability
!= 0)
2933 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
2935 arg_retain
|= settings
->capability
;
2937 arg_retain
&= ~settings
->drop_capability
;
/* Kill signal: only positive values from the file are taken over. */
2940 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2941 settings
->kill_signal
> 0)
2942 arg_kill_signal
= settings
->kill_signal
;
2944 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2945 settings
->personality
!= PERSONALITY_INVALID
)
2946 arg_personality
= settings
->personality
;
/* Machine ID: an all-zero ID means the file does not set one.
 * NOTE(review): as above, the assignment presumably belongs to an else
 * branch of the warning that is not visible -- confirm. */
2948 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2949 !sd_id128_is_null(settings
->machine_id
)) {
2951 if (!arg_settings_trusted
)
2952 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
2954 arg_uuid
= settings
->machine_id
;
2957 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
2958 settings
->read_only
>= 0)
2959 arg_read_only
= settings
->read_only
;
2961 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
2962 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
2963 arg_volatile_mode
= settings
->volatile_mode
;
/* Custom mounts: the array and its element count are moved as a pair. */
2965 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
2966 settings
->n_custom_mounts
> 0) {
2968 if (!arg_settings_trusted
)
2969 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
2971 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
2972 arg_custom_mounts
= settings
->custom_mounts
;
2973 arg_n_custom_mounts
= settings
->n_custom_mounts
;
2975 settings
->custom_mounts
= NULL
;
2976 settings
->n_custom_mounts
= 0;
/* Network: entered when the file sets any one of the network options. */
2980 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
2981 (settings
->private_network
>= 0 ||
2982 settings
->network_veth
>= 0 ||
2983 settings
->network_bridge
||
2984 settings
->network_interfaces
||
2985 settings
->network_macvlan
||
2986 settings
->network_ipvlan
)) {
2988 if (!arg_settings_trusted
)
2989 log_warning("Ignoring network settings, file %s is not trusted.", p
);
2991 strv_free(arg_network_interfaces
);
2992 arg_network_interfaces
= settings
->network_interfaces
;
2993 settings
->network_interfaces
= NULL
;
2995 strv_free(arg_network_macvlan
);
2996 arg_network_macvlan
= settings
->network_macvlan
;
2997 settings
->network_macvlan
= NULL
;
2999 strv_free(arg_network_ipvlan
);
3000 arg_network_ipvlan
= settings
->network_ipvlan
;
3001 settings
->network_ipvlan
= NULL
;
3003 free(arg_network_bridge
);
3004 arg_network_bridge
= settings
->network_bridge
;
3005 settings
->network_bridge
= NULL
;
/* NOTE(review): settings->network_bridge was reset to NULL by the statement
 * just above, so as shown the second operand below is always false and a
 * bridge configured in the file does not enable veth on its own. This likely
 * should test arg_network_bridge, or be evaluated before the pointer is
 * moved -- confirm and fix. */
3007 arg_network_veth
= settings
->network_veth
> 0 || settings
->network_bridge
;
3009 arg_private_network
= true; /* all these settings imply private networking */
/* Exposed ports: the list from the file replaces arg_expose_ports. */
3013 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3014 settings
->expose_ports
) {
3016 if (!arg_settings_trusted
)
3017 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3019 expose_port_free_all(arg_expose_ports
);
3020 arg_expose_ports
= settings
->expose_ports
;
3021 settings
->expose_ports
= NULL
;
/* Program entry point.
 *
 * Flow visible in this listing:
 *  1. Parse the command line, require an effective UID of 0, determine
 *     names, load the settings file and verify the arguments.
 *  2. Collect file descriptors passed to the process.
 *  3. Prepare the root: either a directory (optionally an ephemeral or
 *     template-populated btrfs snapshot) or a disk image that is set up and
 *     dissected, with a temporary directory under /tmp as arg_directory.
 *  4. Allocate and unlock a pseudo terminal, block signals and become a
 *     child subreaper.
 *  5. Create a barrier and four socket pairs, clone the outer child into a
 *     new mount namespace, receive the PID of the inner child and the UID
 *     shift, then set up networking, machine registration and cgroups.
 *  6. Run an event loop that forwards the pty, then wait for the container.
 *  7. Clean up: loop device, snapshot, propagate directory, exposed ports
 *     and the arg_* allocations.
 *
 * Returns EXIT_FAILURE when r is negative at the end, otherwise ret.
 *
 * NOTE(review): the embedded numbering skips many lines (guards, gotos,
 * labels, loop headers and some declarations), so only the statements shown
 * are described below. */
3028 int main(int argc
, char *argv
[]) {
3030 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3031 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3032 _cleanup_close_
int master
= -1, image_fd
= -1;
3033 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3034 int r
, n_fd_passed
, loop_nr
= -1;
3035 char veth_name
[IFNAMSIZ
];
3036 bool secondary
= false, remove_subvol
= false;
3039 int ret
= EXIT_SUCCESS
;
3040 union in_addr_union exposed
= {};
3041 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3044 log_parse_environment();
3047 r
= parse_argv(argc
, argv
);
/* Refuse to continue unless the effective UID is 0. */
3051 if (geteuid() != 0) {
3052 log_error("Need to be root.");
3056 r
= determine_names();
3060 r
= load_settings();
3064 r
= verify_arguments();
/* sd_listen_fds() reports how many descriptors were passed to this process;
 * when there are any they are gathered into fds. */
3068 n_fd_passed
= sd_listen_fds(false);
3069 if (n_fd_passed
> 0) {
3070 r
= fdset_new_listen_fds(&fds
, false);
3072 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory mode. */
3077 if (arg_directory
) {
/* Running directly on the host root is only allowed as an ephemeral
 * snapshot. */
3080 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3081 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3086 if (arg_ephemeral
) {
3087 _cleanup_free_
char *np
= NULL
;
3089 /* If the specified path is a mount point we
3090 * generate the new snapshot immediately
3091 * inside it under a random name. However if
3092 * the specified path is not a mount point we
3093 * create the new snapshot in the parent
3094 * directory, just next to it. */
3095 r
= path_is_mount_point(arg_directory
, 0);
3097 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
/* np receives the random snapshot path; which of the two generators runs
 * depends on the mount point check above (the branch itself is not
 * visible in this listing). */
3101 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3103 r
= tempfn_random(arg_directory
, "machine.", &np
);
3105 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* Non-blocking lock on the snapshot path: shared when read-only, exclusive
 * otherwise. */
3109 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3111 log_error_errno(r
, "Failed to lock %s: %m", np
);
/* Snapshot arg_directory into np, recursively and with the fallback-copy
 * flag set. */
3115 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
3117 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3121 free(arg_directory
);
/* Remember to remove the snapshot during cleanup at the end of main(). */
3125 remove_subvol
= true;
/* Non-ephemeral case: lock the directory tree itself. */
3128 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3130 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3134 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate arg_directory from arg_template with the same snapshot flags as
 * above. */
3139 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
3142 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3144 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3148 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity checks on the tree: it must look like an OS root ... */
3154 if (path_is_os_tree(arg_directory
) <= 0) {
3155 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
/* ... and it must contain either the absolute command given as first
 * non-option argument or, without one, /usr/bin/. */
3162 p
= strjoina(arg_directory
,
3163 argc
> optind
&& path_is_absolute(argv
[optind
]) ? argv
[optind
] : "/usr/bin/");
3164 if (access(p
, F_OK
) < 0) {
3165 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory
);
/* Image mode: mkdtemp() pattern for the temporary root directory. */
3172 char template[] = "/tmp/nspawn-root-XXXXXX";
3175 assert(!arg_template
);
3177 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3179 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3183 r
= log_error_errno(r
, "Failed to create image lock: %m");
3187 if (!mkdtemp(template)) {
3188 log_error_errno(errno
, "Failed to create temporary directory: %m");
/* The temporary directory becomes arg_directory for the rest of the run. */
3193 arg_directory
= strdup(template);
3194 if (!arg_directory
) {
/* setup_image() yields the image descriptor and fills in device_path and
 * loop_nr; dissect_image() then fills in the root, home and srv devices
 * together with their writable flags. */
3199 image_fd
= setup_image(&device_path
, &loop_nr
);
3205 r
= dissect_image(image_fd
,
3206 &root_device
, &root_device_rw
,
3207 &home_device
, &home_device_rw
,
3208 &srv_device
, &srv_device_rw
,
3214 r
= custom_mounts_prepare();
/* Tail of an expression that requires both stdin and stdout to be
 * terminals; its left-hand side is not visible in this listing. */
3219 isatty(STDIN_FILENO
) > 0 &&
3220 isatty(STDOUT_FILENO
) > 0;
/* Allocate the pty master, look up the name of its slave side into console
 * and unlock it. */
3222 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3224 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3228 r
= ptsname_malloc(master
, &console
);
3230 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3234 if (unlockpt(master
) < 0) {
3235 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3240 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3241 arg_machine
, arg_image
?: arg_directory
);
/* Block SIGCHLD, SIGWINCH, SIGTERM and SIGINT for this process. */
3243 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
/* mask_chld contains only SIGCHLD; it is used further down to unblock and
 * later re-block that one signal. */
3245 assert_se(sigemptyset(&mask_chld
) == 0);
3246 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3248 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3249 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* Per-run state. Every socket pair starts as { -1, -1 } and carries the
 * close-pair cleanup attribute. */
3254 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
3255 uid_shift_socket_pair
[2] = { -1, -1 };
3256 ContainerStatus container_status
;
3257 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
/* SIGCHLD action installed while waiting on the child: a handler named
 * nop_handler, with SA_NOCLDSTOP set. */
3258 static const struct sigaction sa
= {
3259 .sa_handler
= nop_handler
,
3260 .sa_flags
= SA_NOCLDSTOP
,
3264 _cleanup_event_unref_ sd_event
*event
= NULL
;
3265 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3266 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3269 r
= barrier_create(&barrier
);
3271 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
/* Four AF_UNIX SOCK_SEQPACKET pairs with close-on-exec set: kmsg, rtnl, pid
 * and uid shift. */
3275 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3276 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3280 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3281 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3285 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3286 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3291 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3292 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3296 /* Child can be killed before execv(), so handle SIGCHLD
3297 * in order to interrupt parent's blocking calls and
3298 * give it a chance to call wait() and terminate. */
3299 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3301 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3305 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3307 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* Clone the outer child; only a new mount namespace is requested here. */
3311 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3313 if (errno
== EINVAL
)
3314 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3316 r
= log_error_errno(errno
, "clone() failed: %m");
3322 /* The outer child only has a file system namespace. */
3323 barrier_set_role(&barrier
, BARRIER_CHILD
);
/* Child side: drop the pty master and the index 0 end of every socket pair,
 * keeping the index 1 ends that are handed to outer_child() below. */
3325 master
= safe_close(master
);
3327 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3328 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3329 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3330 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3332 (void) reset_all_signal_handlers();
3333 (void) reset_signal_mask();
3335 r
= outer_child(&barrier
,
3338 root_device
, root_device_rw
,
3339 home_device
, home_device_rw
,
3340 srv_device
, srv_device_rw
,
3344 kmsg_socket_pair
[1],
3345 rtnl_socket_pair
[1],
3346 uid_shift_socket_pair
[1],
/* The child never returns into the rest of main(); it leaves via _exit(). */
3349 _exit(EXIT_FAILURE
);
3351 _exit(EXIT_SUCCESS
);
3354 barrier_set_role(&barrier
, BARRIER_PARENT
);
/* Parent side: drop the index 1 ends that belong to the child.
 * NOTE(review): uid_shift_socket_pair[1] is not closed here alongside the
 * other three -- confirm whether that is intentional. */
3359 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3360 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3361 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3363 /* Wait for the outer child. */
3364 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3373 /* And now retrieve the PID of the inner child. */
/* pid is overwritten here: from now on it names the inner child. */
3374 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3376 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
/* NOTE(review): a short read does not set errno, so the %m in the message
 * below likely prints an unrelated error text -- consider dropping it. */
3379 if (l
!= sizeof(pid
)) {
3380 log_error("Short read while reading inner child PID: %m");
3385 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3388 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3389 log_error("Child died too early.");
/* Receive the UID shift over its socket pair into arg_uid_shift. */
3394 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3396 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
/* NOTE(review): same %m concern as for the PID short read above. */
3399 if (l
!= sizeof(arg_uid_shift
)) {
3400 log_error("Short read while reading UID shift: %m");
/* Write the UID and GID maps of the inner child. */
3405 r
= setup_uid_map(pid
);
3409 (void) barrier_place(&barrier
); /* #2 */
/* Network setup for the inner child when private networking is on: move
 * interfaces, create the veth link (attached to a bridge when one is
 * configured), then macvlan and ipvlan. */
3412 if (arg_private_network
) {
3414 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3418 if (arg_network_veth
) {
3419 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3425 if (arg_network_bridge
) {
3426 r
= setup_bridge(veth_name
, arg_network_bridge
);
3434 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3438 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
/* Machine registration; most of the argument list is not visible here. */
3444 r
= register_machine(
3451 arg_custom_mounts
, arg_n_custom_mounts
,
/* cgroup setup for the inner child. */
3459 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3463 if (arg_keep_unit
) {
3464 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3469 r
= chown_cgroup(pid
, arg_uid_shift
);
3473 /* Notify the child that the parent is ready with all
3474 * its setup (including cgroup-ification), and that
3475 * the child can now hand over control to the code to
3476 * run inside the container. */
3477 (void) barrier_place(&barrier
); /* #3 */
3479 /* Block SIGCHLD here, before notifying child.
3480 * process_pty() will handle it with the other signals. */
3481 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3483 /* Reset signal to default */
3484 r
= default_signals(SIGCHLD
, -1);
3486 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3490 /* Let the child know that we are ready and wait that the child is completely ready now. */
3491 if (!barrier_place_and_sync(&barrier
)) { /* #5 */
3492 log_error("Client died too early.");
/* Arguments of a status notification whose call line is not visible: a
 * status text plus the PID of the inner child. */
3499 "STATUS=Container running.\n"
3500 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
3502 r
= sd_event_new(&event
);
3504 log_error_errno(r
, "Failed to get default event source: %m");
/* Signal sources for the event loop. With a kill signal configured, SIGINT
 * and SIGTERM go to on_orderly_shutdown with the PID as user data; the
 * other pair passes no handler.
 * NOTE(review): the return values of these sd_event_add_signal() calls are
 * discarded. */
3508 if (arg_kill_signal
> 0) {
3509 /* Try to kill the init system on SIGINT or SIGTERM */
3510 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3511 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3513 /* Immediately exit */
3514 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3515 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3518 /* simply exit on sigchld */
3519 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
/* With exposed ports configured, watch address changes through the rtnl
 * socket and apply the port rules once right away. */
3521 if (arg_expose_ports
) {
3522 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3526 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3529 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* Forward data between the pty master and this process through the event
 * loop. */
3531 r
= pty_forward_new(event
, master
, true, !interactive
, &forward
);
3533 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3537 r
= sd_event_loop(event
);
3539 log_error_errno(r
, "Failed to run event loop: %m");
/* Fetch the last forwarded character before the forwarder is freed; it is
 * compared against a newline below. */
3543 pty_forward_get_last_char(forward
, &last_char
);
3545 forward
= pty_forward_free(forward
);
3547 if (!arg_quiet
&& last_char
!= '\n')
3550 /* Kill if it is not dead yet anyway */
3551 if (arg_register
&& !arg_keep_unit
)
3552 terminate_machine(pid
);
3554 /* Normally redundant, but better safe than sorry */
/* Collect the final state of the container into container_status. */
3557 r
= wait_for_container(pid
, &container_status
);
3561 /* We failed to wait for the container, or the
3562 * container exited abnormally */
3564 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3565 /* The container exited with a non-zero
3566 * status, or with zero status and no reboot
3572 /* CONTAINER_REBOOTED, loop again */
3574 if (arg_keep_unit
) {
3575 /* Special handling if we are running as a
3576 * service: instead of simply restarting the
3577 * machine we want to restart the entire
3578 * service, so let's inform systemd about this
3579 * with the special exit code 133. The service
3580 * file uses RestartForceExitStatus=133 so
3581 * that this results in a full nspawn
3582 * restart. This is necessary since we might
3583 * have cgroup parameters set we want to have
3590 expose_port_flush(arg_expose_ports
, &exposed
);
3596 "STATUS=Terminating...");
3601 /* Try to flush whatever is still queued in the pty */
3603 (void) copy_bytes(master
, STDOUT_FILENO
, (off_t
) -1, false);
/* Cleanup of everything set up above; loop_nr and image_fd are only set in
 * image mode. */
3605 loop_remove(loop_nr
, &image_fd
);
/* Remove the ephemeral snapshot created earlier; a failure only produces a
 * warning. */
3607 if (remove_subvol
&& arg_directory
) {
3610 k
= btrfs_subvol_remove(arg_directory
, true);
3612 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
/* Remove the per-machine propagate directory under /run. */
3618 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3619 (void) rm_rf(p
, REMOVE_ROOT
);
3622 expose_port_flush(arg_expose_ports
, &exposed
);
/* Release the arg_* allocations. */
3624 free(arg_directory
);
3629 strv_free(arg_setenv
);
3630 free(arg_network_bridge
);
3631 strv_free(arg_network_interfaces
);
3632 strv_free(arg_network_macvlan
);
3633 strv_free(arg_network_ipvlan
);
3634 strv_free(arg_parameters
);
3635 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3636 expose_port_free_all(arg_expose_ports
);
/* A negative r always maps to EXIT_FAILURE; otherwise ret is the exit code. */
3638 return r
< 0 ? EXIT_FAILURE
: ret
;