1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <blkid/blkid.h>
27 #include <linux/loop.h>
33 #include <selinux/selinux.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
46 #include "sd-daemon.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
54 #include "capability.h"
55 #include "cgroup-util.h"
57 #include "dev-setup.h"
59 #include "event-util.h"
62 #include "formats-util.h"
64 #include "hostname-util.h"
66 #include "loopback-setup.h"
67 #include "machine-image.h"
71 #include "netlink-util.h"
72 #include "path-util.h"
73 #include "process-util.h"
75 #include "random-util.h"
78 #include "seccomp-util.h"
80 #include "signal-util.h"
82 #include "terminal-util.h"
83 #include "udev-util.h"
86 #include "nspawn-cgroup.h"
87 #include "nspawn-expose-ports.h"
88 #include "nspawn-mount.h"
89 #include "nspawn-network.h"
90 #include "nspawn-register.h"
91 #include "nspawn-settings.h"
92 #include "nspawn-setuid.h"
94 typedef enum ContainerStatus
{
99 typedef enum LinkJournal
{
/* Command-line option state shared by the rest of this file. Each variable
 * carries the default shown in its initializer; parse_argv() assigns many of
 * them while processing options. */
106 static char *arg_directory
= NULL
;
107 static char *arg_template
= NULL
;
108 static char *arg_user
= NULL
;
109 static sd_id128_t arg_uuid
= {};
110 static char *arg_machine
= NULL
;
111 static const char *arg_selinux_context
= NULL
;
112 static const char *arg_selinux_apifs_context
= NULL
;
113 static const char *arg_slice
= NULL
;
114 static bool arg_private_network
= false;
115 static bool arg_read_only
= false;
116 static bool arg_boot
= false;
117 static bool arg_ephemeral
= false;
118 static LinkJournal arg_link_journal
= LINK_AUTO
;
119 static bool arg_link_journal_try
= false;
/* Capability bitmask with one bit per CAP_* constant listed below.
 * parse_argv() later ORs in the requested extra capabilities (plus
 * CAP_NET_ADMIN when arg_private_network is set) and masks out the dropped
 * ones. NOTE(review): the end of this initializer is not visible in this
 * excerpt. */
120 static uint64_t arg_retain
=
121 (1ULL << CAP_CHOWN
) |
122 (1ULL << CAP_DAC_OVERRIDE
) |
123 (1ULL << CAP_DAC_READ_SEARCH
) |
124 (1ULL << CAP_FOWNER
) |
125 (1ULL << CAP_FSETID
) |
126 (1ULL << CAP_IPC_OWNER
) |
128 (1ULL << CAP_LEASE
) |
129 (1ULL << CAP_LINUX_IMMUTABLE
) |
130 (1ULL << CAP_NET_BIND_SERVICE
) |
131 (1ULL << CAP_NET_BROADCAST
) |
132 (1ULL << CAP_NET_RAW
) |
133 (1ULL << CAP_SETGID
) |
134 (1ULL << CAP_SETFCAP
) |
135 (1ULL << CAP_SETPCAP
) |
136 (1ULL << CAP_SETUID
) |
137 (1ULL << CAP_SYS_ADMIN
) |
138 (1ULL << CAP_SYS_CHROOT
) |
139 (1ULL << CAP_SYS_NICE
) |
140 (1ULL << CAP_SYS_PTRACE
) |
141 (1ULL << CAP_SYS_TTY_CONFIG
) |
142 (1ULL << CAP_SYS_RESOURCE
) |
143 (1ULL << CAP_SYS_BOOT
) |
144 (1ULL << CAP_AUDIT_WRITE
) |
145 (1ULL << CAP_AUDIT_CONTROL
) |
147 static CustomMount
*arg_custom_mounts
= NULL
;
148 static unsigned arg_n_custom_mounts
= 0;
149 static char **arg_setenv
= NULL
;
150 static bool arg_quiet
= false;
151 static bool arg_share_system
= false;
152 static bool arg_register
= true;
153 static bool arg_keep_unit
= false;
154 static char **arg_network_interfaces
= NULL
;
155 static char **arg_network_macvlan
= NULL
;
156 static char **arg_network_ipvlan
= NULL
;
157 static bool arg_network_veth
= false;
158 static char *arg_network_bridge
= NULL
;
159 static unsigned long arg_personality
= PERSONALITY_INVALID
;
160 static char *arg_image
= NULL
;
161 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
162 static ExposePort
*arg_expose_ports
= NULL
;
163 static char **arg_property
= NULL
;
/* arg_uid_shift stays UID_INVALID until a shift is chosen; the default range
 * is 0x10000 ids. */
164 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
165 static bool arg_userns
= false;
166 static int arg_kill_signal
= 0;
167 static bool arg_unified_cgroup_hierarchy
= false;
168 static SettingsMask arg_settings_mask
= 0;
/* Tri-state: parse_argv() stores true, false or -1 here depending on the
 * --settings= argument. */
169 static int arg_settings_trusted
= -1;
170 static char **arg_parameters
= NULL
;
/* Prints the usage summary with printf(), substituting
 * program_invocation_short_name for the leading %s.
 *
 * NOTE(review): some usage strings look slightly off but are runtime text and
 * are left untouched here: the --bind-ro= synopsis has fewer closing brackets
 * than the --bind= synopsis above it, the --network-ipvlan= description says
 * a ipvlan (presumably an ipvlan), and the --overlay= description has a
 * trailing space before its newline. Confirm and fix in the strings. */
172 static void help(void) {
173 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
175 " -h --help Show this help\n"
176 " --version Print version string\n"
177 " -q --quiet Do not show status information\n"
178 " -D --directory=PATH Root directory for the container\n"
179 " --template=PATH Initialize root directory from template directory,\n"
181 " -x --ephemeral Run container with snapshot of root directory, and\n"
182 " remove it after exit\n"
183 " -i --image=PATH File system device or disk image for the container\n"
184 " -b --boot Boot up full system (i.e. invoke init)\n"
185 " -u --user=USER Run the command under specified user or uid\n"
186 " -M --machine=NAME Set the machine name for the container\n"
187 " --uuid=UUID Set a specific machine UUID for the container\n"
188 " -S --slice=SLICE Place the container in the specified slice\n"
189 " --property=NAME=VALUE Set scope unit property\n"
190 " --private-users[=UIDBASE[:NUIDS]]\n"
191 " Run within user namespace\n"
192 " --private-network Disable network in container\n"
193 " --network-interface=INTERFACE\n"
194 " Assign an existing network interface to the\n"
196 " --network-macvlan=INTERFACE\n"
197 " Create a macvlan network interface based on an\n"
198 " existing network interface to the container\n"
199 " --network-ipvlan=INTERFACE\n"
200 " Create a ipvlan network interface based on an\n"
201 " existing network interface to the container\n"
202 " -n --network-veth Add a virtual ethernet connection between host\n"
204 " --network-bridge=INTERFACE\n"
205 " Add a virtual ethernet connection between host\n"
206 " and container and add it to an existing bridge on\n"
208 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
209 " Expose a container IP port on the host\n"
210 " -Z --selinux-context=SECLABEL\n"
211 " Set the SELinux security context to be used by\n"
212 " processes in the container\n"
213 " -L --selinux-apifs-context=SECLABEL\n"
214 " Set the SELinux security context to be used by\n"
215 " API/tmpfs file systems in the container\n"
216 " --capability=CAP In addition to the default, retain specified\n"
218 " --drop-capability=CAP Drop the specified capability from the default set\n"
219 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
220 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
221 " try-guest, try-host\n"
222 " -j Equivalent to --link-journal=try-guest\n"
223 " --read-only Mount the root directory read-only\n"
224 " --bind=PATH[:PATH[:OPTIONS]]\n"
225 " Bind mount a file or directory from the host into\n"
227 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
228 " Similar, but creates a read-only bind mount\n"
229 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
230 " --overlay=PATH[:PATH...]:PATH\n"
231 " Create an overlay mount from the host to \n"
233 " --overlay-ro=PATH[:PATH...]:PATH\n"
234 " Similar, but creates a read-only overlay mount\n"
235 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
236 " --share-system Share system namespaces with host\n"
237 " --register=BOOLEAN Register container as machine\n"
238 " --keep-unit Do not register a scope for the machine, reuse\n"
239 " the service unit nspawn is running in\n"
240 " --volatile[=MODE] Run the system in volatile mode\n"
241 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
242 , program_invocation_short_name
);
/* Prepares arg_custom_mounts for use. The array is sorted with
 * custom_mount_compare and then walked entry by entry. A mount whose
 * destination is / is refused when arg_userns is set and arg_uid_shift is
 * still UID_INVALID. For overlay mounts a work directory name derived from
 * m->source is generated with tempfn_random() and stored in m->work_dir.
 * Return statements other than the tempfn_random() failure path are not
 * visible in this excerpt. */
246 static int custom_mounts_prepare(void) {
250 /* Ensure the mounts are applied prefix first. */
251 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
253 /* Allocate working directories for the overlay file systems that need it */
254 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
255 CustomMount
*m
= &arg_custom_mounts
[i
];
257 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
258 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
/* Only CUSTOM_MOUNT_OVERLAY entries get a work directory; presumably other
 * types are skipped here (the statement under this test is not visible). */
262 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
271 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
273 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Stores a sanitized copy of path in *b. The visible code calls
 * canonicalize_file_name() and path_make_absolute_cwd() on path (presumably
 * the latter as a fallback; the conditions between the two calls are not
 * visible) and passes the result through path_kill_slashes() before
 * assigning it to *b. */
279 static int set_sanitized_path(char **b
, const char *path
) {
285 p
= canonicalize_file_name(path
);
290 p
= path_make_absolute_cwd(path
);
296 *b
= path_kill_slashes(p
);
/* Decides the value of arg_unified_cgroup_hierarchy. The environment variable
 * UNIFIED_CGROUP_HIERARCHY is read with getenv(); its value is parsed with
 * parse_boolean() and a parse failure is reported as an error (the test for
 * an unset variable is not visible here). Otherwise, per the comment below,
 * the host default is used; the call that queries it is not visible in this
 * excerpt. */
300 static int detect_unified_cgroup_hierarchy(void) {
304 /* Allow the user to control whether the unified hierarchy is used */
305 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
307 r
= parse_boolean(e
);
309 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
311 arg_unified_cgroup_hierarchy
= r
;
315 /* Otherwise inherit the default from the host system */
318 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
320 arg_unified_cgroup_hierarchy
= r
;
/* Parses the command line with getopt_long() and stores the results in the
 * arg_* variables. Many option handlers also set a SETTING_* bit in
 * arg_settings_mask; per the comments near the end of the function, a zero
 * mask means all settings are loaded from .nspawn files and
 * _SETTINGS_MASK_ALL means none are. After the option loop the function
 * rejects incompatible option combinations, copies the remaining arguments
 * into arg_parameters, computes the final arg_retain mask and calls
 * detect_unified_cgroup_hierarchy(). Many lines of the original function
 * (case labels, returns, braces) are not visible in this excerpt. */
324 static int parse_argv(int argc
, char *argv
[]) {
343 ARG_NETWORK_INTERFACE
,
356 static const struct option options
[] = {
357 { "help", no_argument
, NULL
, 'h' },
358 { "version", no_argument
, NULL
, ARG_VERSION
},
359 { "directory", required_argument
, NULL
, 'D' },
360 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
361 { "ephemeral", no_argument
, NULL
, 'x' },
362 { "user", required_argument
, NULL
, 'u' },
363 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
364 { "boot", no_argument
, NULL
, 'b' },
365 { "uuid", required_argument
, NULL
, ARG_UUID
},
366 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
367 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
368 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
369 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
370 { "bind", required_argument
, NULL
, ARG_BIND
},
371 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
372 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
373 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
374 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
375 { "machine", required_argument
, NULL
, 'M' },
376 { "slice", required_argument
, NULL
, 'S' },
377 { "setenv", required_argument
, NULL
, ARG_SETENV
},
378 { "selinux-context", required_argument
, NULL
, 'Z' },
379 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
380 { "quiet", no_argument
, NULL
, 'q' },
381 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
382 { "register", required_argument
, NULL
, ARG_REGISTER
},
383 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
384 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
385 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
386 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
387 { "network-veth", no_argument
, NULL
, 'n' },
388 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
389 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
390 { "image", required_argument
, NULL
, 'i' },
391 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
392 { "port", required_argument
, NULL
, 'p' },
393 { "property", required_argument
, NULL
, ARG_PROPERTY
},
394 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
395 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
396 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
/* plus and minus collect the capability bits requested through --capability=
 * and --drop-capability=; they are applied to arg_retain after the loop. */
401 uint64_t plus
= 0, minus
= 0;
402 bool mask_all_settings
= false, mask_no_settings
= false;
407 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
419 r
= set_sanitized_path(&arg_directory
, optarg
);
421 return log_error_errno(r
, "Invalid root directory: %m");
426 r
= set_sanitized_path(&arg_template
, optarg
);
428 return log_error_errno(r
, "Invalid template directory: %m");
433 r
= set_sanitized_path(&arg_image
, optarg
);
435 return log_error_errno(r
, "Invalid image path: %m");
440 arg_ephemeral
= true;
444 r
= free_and_strdup(&arg_user
, optarg
);
448 arg_settings_mask
|= SETTING_USER
;
451 case ARG_NETWORK_BRIDGE
:
452 r
= free_and_strdup(&arg_network_bridge
, optarg
);
459 arg_network_veth
= true;
460 arg_private_network
= true;
461 arg_settings_mask
|= SETTING_NETWORK
;
464 case ARG_NETWORK_INTERFACE
:
465 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
468 arg_private_network
= true;
469 arg_settings_mask
|= SETTING_NETWORK
;
472 case ARG_NETWORK_MACVLAN
:
473 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
476 arg_private_network
= true;
477 arg_settings_mask
|= SETTING_NETWORK
;
/* NOTE(review): unlike the interface and macvlan cases above, no assignment
 * to arg_private_network or arg_settings_mask is visible for the ipvlan case
 * before the next label; presumably it falls through into
 * ARG_PRIVATE_NETWORK on purpose. Confirm against the full source. */
480 case ARG_NETWORK_IPVLAN
:
481 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
486 case ARG_PRIVATE_NETWORK
:
487 arg_private_network
= true;
488 arg_settings_mask
|= SETTING_NETWORK
;
493 arg_settings_mask
|= SETTING_BOOT
;
497 r
= sd_id128_from_string(optarg
, &arg_uuid
);
499 log_error("Invalid UUID: %s", optarg
);
503 arg_settings_mask
|= SETTING_MACHINE_ID
;
512 arg_machine
= mfree(arg_machine
);
514 if (!machine_name_is_valid(optarg
)) {
515 log_error("Invalid machine name: %s", optarg
);
519 r
= free_and_strdup(&arg_machine
, optarg
);
527 arg_selinux_context
= optarg
;
531 arg_selinux_apifs_context
= optarg
;
535 arg_read_only
= true;
536 arg_settings_mask
|= SETTING_READ_ONLY
;
540 case ARG_DROP_CAPABILITY
: {
541 const char *state
, *word
;
544 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
545 _cleanup_free_
char *t
;
547 t
= strndup(word
, length
);
/* The word all selects every capability bit at once. */
551 if (streq(t
, "all")) {
552 if (c
== ARG_CAPABILITY
)
553 plus
= (uint64_t) -1;
555 minus
= (uint64_t) -1;
559 cap
= capability_from_name(t
);
561 log_error("Failed to parse capability %s.", t
);
565 if (c
== ARG_CAPABILITY
)
566 plus
|= 1ULL << (uint64_t) cap
;
568 minus
|= 1ULL << (uint64_t) cap
;
572 arg_settings_mask
|= SETTING_CAPABILITY
;
577 arg_link_journal
= LINK_GUEST
;
578 arg_link_journal_try
= true;
581 case ARG_LINK_JOURNAL
:
582 if (streq(optarg
, "auto")) {
583 arg_link_journal
= LINK_AUTO
;
584 arg_link_journal_try
= false;
585 } else if (streq(optarg
, "no")) {
586 arg_link_journal
= LINK_NO
;
587 arg_link_journal_try
= false;
588 } else if (streq(optarg
, "guest")) {
589 arg_link_journal
= LINK_GUEST
;
590 arg_link_journal_try
= false;
591 } else if (streq(optarg
, "host")) {
592 arg_link_journal
= LINK_HOST
;
593 arg_link_journal_try
= false;
594 } else if (streq(optarg
, "try-guest")) {
595 arg_link_journal
= LINK_GUEST
;
596 arg_link_journal_try
= true;
597 } else if (streq(optarg
, "try-host")) {
598 arg_link_journal
= LINK_HOST
;
599 arg_link_journal_try
= true;
601 log_error("Failed to parse link journal mode %s", optarg
);
609 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
611 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
613 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
617 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
619 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
621 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
625 case ARG_OVERLAY_RO
: {
626 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
627 _cleanup_strv_free_
char **lower
= NULL
;
632 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
636 log_error("Invalid overlay specification: %s", optarg
);
640 STRV_FOREACH(i
, lower
) {
641 if (!path_is_absolute(*i
)) {
642 log_error("Overlay path %s is not absolute.", *i
);
650 log_error("--overlay= needs at least two colon-separated directories specified.");
655 /* If two parameters are specified,
656 * the first one is the lower, the
657 * second one the upper directory. And
658 * we'll also define the destination
659 * mount point the same as the upper. */
663 destination
= strdup(upper
);
/* NOTE(review): upper and destination take pointers that are owned by the
 * lower vector here, and all three are auto-freed at scope exit. The
 * matching slots in lower must be cleared to avoid a double free; that
 * statement is not visible in this excerpt. Confirm. */
668 upper
= lower
[n
- 2];
669 destination
= lower
[n
- 1];
673 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
677 m
->destination
= destination
;
680 m
->read_only
= c
== ARG_OVERLAY_RO
;
/* Clearing the locals keeps the automatic cleanup from freeing strings that
 * were handed over to the CustomMount entry above. */
682 upper
= destination
= NULL
;
685 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
692 if (!env_assignment_is_valid(optarg
)) {
693 log_error("Environment variable assignment '%s' is not valid.", optarg
);
/* strv_env_set() hands back its result in n and the old arg_setenv vector is
 * freed below; presumably arg_setenv is then replaced by n (that assignment
 * is not visible here). */
697 n
= strv_env_set(arg_setenv
, optarg
);
701 strv_free(arg_setenv
);
704 arg_settings_mask
|= SETTING_ENVIRONMENT
;
712 case ARG_SHARE_SYSTEM
:
713 arg_share_system
= true;
717 r
= parse_boolean(optarg
);
719 log_error("Failed to parse --register= argument: %s", optarg
);
727 arg_keep_unit
= true;
730 case ARG_PERSONALITY
:
732 arg_personality
= personality_from_string(optarg
);
733 if (arg_personality
== PERSONALITY_INVALID
) {
734 log_error("Unknown or unsupported personality '%s'.", optarg
);
738 arg_settings_mask
|= SETTING_PERSONALITY
;
744 arg_volatile_mode
= VOLATILE_YES
;
748 m
= volatile_mode_from_string(optarg
);
750 log_error("Failed to parse --volatile= argument: %s", optarg
);
753 arg_volatile_mode
= m
;
756 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
760 r
= expose_port_parse(&arg_expose_ports
, optarg
);
762 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
764 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
766 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
770 if (strv_extend(&arg_property
, optarg
) < 0)
775 case ARG_PRIVATE_USERS
:
777 _cleanup_free_
char *buffer
= NULL
;
778 const char *range
, *shift
;
/* The optional argument is split at the first colon: the part before it is
 * copied into buffer, and range is parsed into arg_uid_range (zero is
 * rejected). shift is parsed into arg_uid_shift. */
780 range
= strchr(optarg
, ':');
782 buffer
= strndup(optarg
, range
- optarg
);
788 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
789 log_error("Failed to parse UID range: %s", range
);
795 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
796 log_error("Failed to parse UID: %s", optarg
);
804 case ARG_KILL_SIGNAL
:
805 arg_kill_signal
= signal_from_string_try_harder(optarg
);
806 if (arg_kill_signal
< 0) {
807 log_error("Cannot parse signal: %s", optarg
);
811 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
816 /* no → do not read files
817 * yes → read files, do not override cmdline, trust only subset
818 * override → read files, override cmdline, trust only subset
819 * trusted → read files, do not override cmdline, trust all
822 r
= parse_boolean(optarg
);
824 if (streq(optarg
, "trusted")) {
825 mask_all_settings
= false;
826 mask_no_settings
= false;
827 arg_settings_trusted
= true;
829 } else if (streq(optarg
, "override")) {
830 mask_all_settings
= false;
831 mask_no_settings
= true;
832 arg_settings_trusted
= -1;
834 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
837 mask_all_settings
= false;
838 mask_no_settings
= false;
839 arg_settings_trusted
= -1;
842 mask_all_settings
= true;
843 mask_no_settings
= false;
844 arg_settings_trusted
= false;
853 assert_not_reached("Unhandled option");
856 if (arg_share_system
)
857 arg_register
= false;
859 if (arg_boot
&& arg_share_system
) {
860 log_error("--boot and --share-system may not be combined.");
864 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
865 log_error("--keep-unit may not be used when invoked from a user session.");
869 if (arg_directory
&& arg_image
) {
870 log_error("--directory= and --image= may not be combined.");
874 if (arg_template
&& arg_image
) {
875 log_error("--template= and --image= may not be combined.");
879 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
880 log_error("--template= needs --directory= or --machine=.");
884 if (arg_ephemeral
&& arg_template
) {
885 log_error("--ephemeral and --template= may not be combined.");
889 if (arg_ephemeral
&& arg_image
) {
890 log_error("--ephemeral and --image= may not be combined.");
894 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
895 log_error("--ephemeral and --link-journal= may not be combined.");
899 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
900 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
903 arg_parameters
= strv_copy(argv
+ optind
);
907 arg_settings_mask
|= SETTING_BOOT
;
910 /* Load all settings from .nspawn files */
911 if (mask_no_settings
)
912 arg_settings_mask
= 0;
914 /* Don't load any settings from .nspawn files */
915 if (mask_all_settings
)
916 arg_settings_mask
= _SETTINGS_MASK_ALL
;
/* Final capability set: defaults plus requested additions, plus
 * CAP_NET_ADMIN when a private network is used, minus everything named in
 * --drop-capability= (the drop mask is applied last, so it wins). */
918 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
920 r
= detect_unified_cgroup_hierarchy();
/* Cross-checks option values: --read-only cannot be combined with a volatile
 * mode, and exposed ports require private networking. When booting (arg_boot)
 * with no kill signal selected (arg_kill_signal <= 0), SIGRTMIN+3 is chosen.
 * The return statements are not visible in this excerpt. */
927 static int verify_arguments(void) {
929 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
930 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
934 if (arg_expose_ports
&& !arg_private_network
) {
935 log_error("Cannot use --port= without private networking.");
939 if (arg_boot
&& arg_kill_signal
<= 0)
940 arg_kill_signal
= SIGRTMIN
+3;
/* Changes the ownership of p with lchown() after shifting uid and gid by
 * arg_uid_shift. An id equal to UID_INVALID or GID_INVALID is not shifted and
 * is passed on as is; when both are invalid the function takes an early path
 * (statement not visible, presumably a successful return). Each shifted id is
 * checked against the range starting at arg_uid_shift and spanning
 * arg_uid_range ids; the statements executed on a range violation and on
 * lchown() failure are not visible in this excerpt.
 *
 * NOTE(review): the upper bound arg_uid_shift + arg_uid_range is computed in
 * uid_t arithmetic and could wrap for a shift close to the type maximum.
 * Confirm that option validation rules this out. */
945 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
951 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
954 if (uid
!= UID_INVALID
) {
955 uid
+= arg_uid_shift
;
/* The first comparison catches wrap-around of the addition above. */
957 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
961 if (gid
!= GID_INVALID
) {
962 gid
+= (gid_t
) arg_uid_shift
;
964 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
968 if (lchown(p
, uid
, gid
) < 0)
/* Creates the directory path below root (joined with prefix_roota()) using
 * mode, then passes the new path to userns_lchown() with uid and gid and
 * returns its result. The handling of a failing mkdir() is not visible in
 * this excerpt. */
974 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
977 q
= prefix_roota(root
, path
);
978 if (mkdir(q
, mode
) < 0) {
984 return userns_lchown(q
, uid
, gid
);
/* Points /etc/localtime below dest at the same zone the host /etc/localtime
 * symlink names. The host link target must start with
 * ../usr/share/zoneinfo/ or /usr/share/zoneinfo/; otherwise a warning is
 * logged. Nothing is changed when the link below dest already names the same
 * zone, and a warning is logged when the zone file does not exist below dest.
 * The new link is created with the target ../usr/share/zoneinfo/ followed by
 * the zone name and is then chowned via userns_lchown(where, 0, 0). Most
 * return statements are not visible in this excerpt. */
987 static int setup_timezone(const char *dest
) {
988 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
989 const char *where
, *check
, *what
;
995 /* Fix the timezone, if possible */
996 r
= readlink_malloc("/etc/localtime", &p
);
998 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1002 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1004 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1006 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1010 where
= prefix_roota(dest
, "/etc/localtime");
1011 r
= readlink_malloc(where
, &q
);
1013 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1015 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1017 /* Already pointing to the right place? Then do nothing .. */
1018 if (y
&& streq(y
, z
))
1022 check
= strjoina("/usr/share/zoneinfo/", z
);
/* NOTE(review): every other path in this function is built with
 * prefix_roota(), but this one uses prefix_root(). copy_devnodes() stores a
 * prefix_root() result in a _cleanup_free_ pointer, which suggests it returns
 * heap memory; if so, the value assigned to the const pointer check is never
 * freed and no NULL test is visible. Confirm and consider prefix_roota(). */
1023 check
= prefix_root(dest
, check
);
1024 if (laccess(check
, F_OK
) < 0) {
1025 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
/* r is presumably the result of removing the existing link (that call is not
 * visible here); a missing file (ENOENT) is tolerated. */
1030 if (r
< 0 && errno
!= ENOENT
) {
1031 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1035 what
= strjoina("../usr/share/zoneinfo/", z
);
1036 if (symlink(what
, where
) < 0) {
1037 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1041 r
= userns_lchown(where
, 0, 0);
1043 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
/* Copies the host /etc/resolv.conf to the same path below dest with
 * copy_file() (flags O_TRUNC|O_NOFOLLOW, mode 0644) and then chowns the copy
 * via userns_lchown(where, 0, 0). arg_private_network is tested first; the
 * statement under that test is not visible, presumably an early return
 * without copying. A copy failure is logged at LOG_DEBUG for -ELOOP and
 * -EROFS and at LOG_WARNING otherwise, for the reasons given in the comment
 * inside the function. A chown failure is logged as a warning.
 *
 * NOTE(review): in this excerpt the block comment inside the function has no
 * visible closing delimiter line; it is left untouched here. */
1048 static int setup_resolv_conf(const char *dest
) {
1049 const char *where
= NULL
;
1054 if (arg_private_network
)
1057 /* Fix resolv.conf, if possible */
1058 where
= prefix_roota(dest
, "/etc/resolv.conf");
1060 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1062 /* If the file already exists as symlink, let's
1063 * suppress the warning, under the assumption that
1064 * resolved or something similar runs inside and the
1065 * symlink points there.
1067 * If the disk image is read-only, there's also no
1068 * point in complaining.
1070 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1071 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1075 r
= userns_lchown(where
, 0, 0);
1077 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
/* Formats id as hex digits grouped 8-4-4-4-12 with dashes, as given by the
 * format string below. That is 36 characters, which fits the 37-byte buffer s
 * together with a terminating NUL. The formatting call and the return
 * statement are not visible in this excerpt; presumably s is returned. */
1082 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1086 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1087 SD_ID128_FORMAT_VAL(id
));
1092 static int setup_boot_id(const char *dest
) {
1093 const char *from
, *to
;
1094 sd_id128_t rnd
= {};
1098 if (arg_share_system
)
1101 /* Generate a new randomized boot ID, so that each boot-up of
1102 * the container gets a new one */
/* Flow of this function: a random id from sd_id128_randomize() is formatted
 * by id128_format_as_uuid(), written with write_string_file() to the file
 * from (below dest in /run), and that file is bind mounted over to (the
 * boot_id file in /proc below dest). A failed bind mount is an error whose
 * code is kept in r; a failed read-only remount only logs a warning.
 * arg_share_system is tested at the top; the statement under that test and
 * the final return are not visible in this excerpt. */
1104 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1105 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1107 r
= sd_id128_randomize(&rnd
);
1109 return log_error_errno(r
, "Failed to generate random boot id: %m");
1111 id128_format_as_uuid(rnd
, as_uuid
);
1113 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1115 return log_error_errno(r
, "Failed to write boot id: %m");
1117 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1118 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1119 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1120 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
/* Populates /dev below dest with the device nodes named in devnodes (the
 * list contents are not visible in this excerpt). /dev/net is created first.
 * For each name the host node is examined with stat(): a missing host node
 * (ENOENT) is tolerated, any other stat() error is fatal, and a file that is
 * neither a character nor a block device is rejected. The node is recreated
 * with mknod() using the host mode and device number. Where mknod() is not
 * usable, the host node is bind mounted onto the target path instead, which
 * judging by the error message is first created by a touch call; the
 * condition selecting this fallback is not visible. Each result is chowned
 * via userns_lchown(to, 0, 0). */
1126 static int copy_devnodes(const char *dest
) {
1128 static const char devnodes
[] =
1139 _cleanup_umask_ mode_t u
;
1145 /* Create /dev/net, so that we can create /dev/net/tun in it */
/* NOTE(review): the result of userns_mkdir() is only compared against 0 and
 * is not stored, yet the error path logs r. Unless r was assigned on a line
 * not visible here, the message and return value do not reflect the actual
 * failure; presumably this should be r = userns_mkdir(...) followed by a
 * test of r. Confirm against the full source. */
1146 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1147 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1149 NULSTR_FOREACH(d
, devnodes
) {
1150 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1153 from
= strappend("/dev/", d
);
1154 to
= prefix_root(dest
, from
);
1156 if (stat(from
, &st
) < 0) {
1158 if (errno
!= ENOENT
)
1159 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1161 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1163 log_error("%s is not a char or block device, cannot copy.", from
);
1167 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1169 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1171 /* Some systems abusively restrict mknod but
1172 * allow bind mounts. */
1175 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1176 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1177 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1180 r
= userns_lchown(to
, 0, 0);
1182 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
/* Mounts a new devpts instance at /dev/pts below dest and creates the
 * /dev/ptmx symlink pointing to pts/ptmx. The mount options request
 * newinstance, ptmxmode=0666, mode=620 and gid set to arg_uid_shift + TTY_GID,
 * plus an SELinux context when arg_selinux_apifs_context is set. /dev/pts,
 * /dev/ptmx and /dev/pts/ptmx are each chowned via userns_lchown(p, 0, 0). */
1189 static int setup_pts(const char *dest
) {
1190 _cleanup_free_
char *options
= NULL
;
/* The asprintf() results are discarded; presumably a failed allocation is
 * detected by testing options afterwards (not visible here). */
1194 if (arg_selinux_apifs_context
)
1195 (void) asprintf(&options
,
1196 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1197 arg_uid_shift
+ TTY_GID
,
1198 arg_selinux_apifs_context
);
1201 (void) asprintf(&options
,
1202 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1203 arg_uid_shift
+ TTY_GID
);
1208 /* Mount /dev/pts itself */
1209 p
= prefix_roota(dest
, "/dev/pts");
1210 if (mkdir(p
, 0755) < 0)
1211 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1212 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1213 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
/* NOTE(review): here and in the two userns_lchown() calls below, the return
 * value is only compared against 0 and the message is built from errno.
 * Other callers in this file log the returned code instead. userns_lchown()
 * also has range-check failure paths that may not set errno; confirm and
 * consider logging the returned value. */
1214 if (userns_lchown(p
, 0, 0) < 0)
1215 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1217 /* Create /dev/ptmx symlink */
1218 p
= prefix_roota(dest
, "/dev/ptmx");
1219 if (symlink("pts/ptmx", p
) < 0)
1220 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1221 if (userns_lchown(p
, 0, 0) < 0)
1222 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1224 /* And fix /dev/pts/ptmx ownership */
1225 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1226 if (userns_lchown(p
, 0, 0) < 0)
1227 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
/* Makes the TTY named by console available as /dev/console below dest. The
 * TTY is first set to mode 0600 with owner and group arg_uid_shift via
 * chmod_and_chown(); console is then bind mounted onto /dev/console. The
 * call that creates the mount target file is not visible in this excerpt
 * apart from its error message. */
1232 static int setup_dev_console(const char *dest
, const char *console
) {
1233 _cleanup_umask_ mode_t u
;
1242 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1244 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1246 /* We need to bind mount the right tty to /dev/console since
1247 * ptys can only exist on pts file systems. To have something
1248 * to bind mount things on we create an empty regular file. */
1250 to
= prefix_roota(dest
, "/dev/console");
1253 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1255 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1256 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
/* Provides /proc/kmsg below dest as a FIFO. The FIFO is created as /run/kmsg
 * with mkfifo() (mode 0600), bind mounted over /proc/kmsg, opened read-write
 * and non-blocking, and its fd is passed over kmsg_socket with send_one_fd().
 * Finally /run/kmsg is unlinked. kmsg_socket must be a valid descriptor
 * (asserted). */
1261 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1262 const char *from
, *to
;
1263 _cleanup_umask_ mode_t u
;
1266 assert(kmsg_socket
>= 0);
1270 /* We create the kmsg FIFO as /run/kmsg, but immediately
1271 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1272 * on the reading side behave very similar to /proc/kmsg,
1273 * their writing side behaves differently from /dev/kmsg in
1274 * that writing blocks when nothing is reading. In order to
1275 * avoid any problems with containers deadlocking due to this
1276 * we simply make /dev/kmsg unavailable to the container. */
1277 from
= prefix_roota(dest
, "/run/kmsg");
1278 to
= prefix_roota(dest
, "/proc/kmsg");
1280 if (mkfifo(from
, 0600) < 0)
1281 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1282 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1283 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1285 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1287 return log_error_errno(errno
, "Failed to open fifo: %m");
1289 /* Store away the fd in the socket, so that it stays open as
1290 * long as we run the child */
1291 r
= send_one_fd(kmsg_socket
, fd
, 0);
1295 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1297 /* And now make the FIFO unavailable as /run/kmsg... */
1298 (void) unlink(from
);
/* Netlink message callback. userdata is used as a pointer to a
 * union in_addr_union (named exposed) and is passed, together with rtnl and
 * arg_expose_ports, to expose_port_execute(). The message m is not used in
 * the visible lines; the return statement is not visible in this excerpt. */
1303 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1304 union in_addr_union
*exposed
= userdata
;
1310 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
/* Sets the hostname to arg_machine with sethostname_idempotent().
 * arg_share_system is tested first; the statements under both tests are not
 * visible in this excerpt (presumably an early return and an error return
 * respectively). */
1314 static int setup_hostname(void) {
1316 if (arg_share_system
)
1319 if (sethostname_idempotent(arg_machine
) < 0)
1325 static int setup_journal(const char *directory
) {
1326 sd_id128_t machine_id
, this_id
;
1327 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1328 const char *etc_machine_id
, *p
, *q
;
1332 /* Don't link journals in ephemeral mode */
1336 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1338 r
= read_one_line_file(etc_machine_id
, &b
);
1339 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1342 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1345 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1348 /* Verify validity */
1349 r
= sd_id128_from_string(id
, &machine_id
);
1351 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1353 r
= sd_id128_get_machine(&this_id
);
1355 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1357 if (sd_id128_equal(machine_id
, this_id
)) {
1358 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1359 "Host and machine ids are equal (%s): refusing to link journals", id
);
1360 if (arg_link_journal
== LINK_AUTO
)
1365 if (arg_link_journal
== LINK_NO
)
1368 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1370 return log_error_errno(r
, "Failed to create /var: %m");
1372 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1374 return log_error_errno(r
, "Failed to create /var/log: %m");
1376 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1378 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1380 p
= strjoina("/var/log/journal/", id
);
1381 q
= prefix_roota(directory
, p
);
1383 if (path_is_mount_point(p
, 0) > 0) {
1384 if (arg_link_journal
!= LINK_AUTO
) {
1385 log_error("%s: already a mount point, refusing to use for journal", p
);
1392 if (path_is_mount_point(q
, 0) > 0) {
1393 if (arg_link_journal
!= LINK_AUTO
) {
1394 log_error("%s: already a mount point, refusing to use for journal", q
);
1401 r
= readlink_and_make_absolute(p
, &d
);
1403 if ((arg_link_journal
== LINK_GUEST
||
1404 arg_link_journal
== LINK_AUTO
) &&
1407 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1409 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1414 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1415 } else if (r
== -EINVAL
) {
1417 if (arg_link_journal
== LINK_GUEST
&&
1420 if (errno
== ENOTDIR
) {
1421 log_error("%s already exists and is neither a symlink nor a directory", p
);
1424 log_error_errno(errno
, "Failed to remove %s: %m", p
);
1428 } else if (r
!= -ENOENT
) {
1429 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
1433 if (arg_link_journal
== LINK_GUEST
) {
1435 if (symlink(q
, p
) < 0) {
1436 if (arg_link_journal_try
) {
1437 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1440 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1445 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1447 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1451 if (arg_link_journal
== LINK_HOST
) {
1452 /* don't create parents here -- if the host doesn't have
1453 * permanent journal set up, don't force it here */
1456 if (arg_link_journal_try
) {
1457 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1460 log_error_errno(errno
, "Failed to create %s: %m", p
);
1465 } else if (access(p
, F_OK
) < 0)
1468 if (dir_is_empty(q
) == 0)
1469 log_warning("%s is not empty, proceeding anyway.", q
);
1471 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1473 log_error_errno(errno
, "Failed to create %s: %m", q
);
1477 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1478 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
/* Drops every capability that is not set in arg_retain from the
 * bounding set and returns the result of
 * capability_bounding_set_drop(). */
1483 static int drop_capabilities(void) {
1484 return capability_bounding_set_drop(~arg_retain
, false);
1487 static int reset_audit_loginuid(void) {
1488 _cleanup_free_
char *p
= NULL
;
1491 if (arg_share_system
)
1494 r
= read_one_line_file("/proc/self/loginuid", &p
);
1498 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1500 /* Already reset? */
1501 if (streq(p
, "4294967295"))
1504 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1507 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1508 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1509 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1510 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1511 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Builds and loads a seccomp filter for the container.
 *
 * The filter is created with SCMP_ACT_ALLOW as default action. For
 * each entry of the blacklist table a rule returning EPERM is added
 * for the listed syscall; the entry's capability bit is tested
 * against arg_retain first (presumably to skip syscalls whose
 * capability is retained -- the statement after the test is not
 * visible here). One more rule makes socket(AF_NETLINK, *,
 * NETLINK_AUDIT) fail with EAFNOSUPPORT. SCMP_FLTATR_CTL_NNP is set
 * to 0 before seccomp_load(), and the filter context is released
 * with seccomp_release() at the end. */
1519 static int setup_seccomp(void) {
/* Table pairing a capability with a syscall that is blocked for it */
1522 static const struct {
1523 uint64_t capability
;
1526 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1527 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1528 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1529 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1530 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1531 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1532 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1533 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1534 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1535 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1538 scmp_filter_ctx seccomp
;
/* Everything is allowed unless a rule added below says otherwise */
1542 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1546 r
= seccomp_add_secondary_archs(seccomp
);
1548 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
1552 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1553 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1556 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1558 continue; /* unknown syscall */
1560 log_error_errno(r
, "Failed to block syscall: %m");
1567 Audit is broken in containers, much of the userspace audit
1568 hookup will fail if running inside a container. We don't
1569 care and just turn off creation of audit sockets.
1571 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1572 with EAFNOSUPPORT which audit userspace uses as indication
1573 that audit is disabled in the kernel.
1576 r
= seccomp_rule_add(
1578 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1581 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1582 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1584 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1588 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1590 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1594 r
= seccomp_load(seccomp
);
1596 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1601 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1606 seccomp_release(seccomp
);
1614 static int setup_propagate(const char *root
) {
1617 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1618 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1619 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1620 (void) mkdir_p(p
, 0600);
1622 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
1623 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
1625 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1626 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
1628 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1629 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
1631 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1632 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1633 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1635 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1636 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1641 static int setup_image(char **device_path
, int *loop_nr
) {
1642 struct loop_info64 info
= {
1643 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1645 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1646 _cleanup_free_
char* loopdev
= NULL
;
1650 assert(device_path
);
1654 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1656 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1658 if (fstat(fd
, &st
) < 0)
1659 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1661 if (S_ISBLK(st
.st_mode
)) {
1664 p
= strdup(arg_image
);
1678 if (!S_ISREG(st
.st_mode
)) {
1679 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
1683 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1685 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1687 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1689 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1691 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1694 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1696 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1698 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1699 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1702 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1704 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1705 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1707 *device_path
= loopdev
;
/* Explanatory text describing which partition layouts are accepted;
 * it is appended to the dissect_image() error messages about
 * unusable partition tables. */
1718 #define PARTITION_TABLE_BLURB \
1719 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1720 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1721 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1722 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1723 "to be bootable with systemd-nspawn."
/* Finds the partitions to mount in the disk image arg_image.
 *
 * The partition table is probed with libblkid on fd; only "gpt" and
 * "dos" (MBR) tables are accepted. The partitions known to the
 * kernel are enumerated through udev and compared in number with
 * what blkid found; BLKRRPART is issued, with retries, to have the
 * kernel re-read the table. Each partition is then classified:
 *
 *   GPT: by type UUID (GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE,
 *        GPT_ROOT_SECONDARY, GPT_LINUX_GENERIC); GPT_FLAG_NO_AUTO
 *        and GPT_FLAG_READ_ONLY are evaluated.
 *   MBR: flags 0x80 (bootable) and type 0x83 (Linux) are tested.
 *
 * The root device handed back is the native root, else the secondary
 * root, else a generic Linux partition; more than one generic
 * candidate is reported as an error. Device node strings and their
 * read-write flags are stored through the output parameters.
 *
 * NOTE(review): large parts of the control flow (returns, braces,
 * #if/#endif lines) are not visible in this excerpt. */
1725 static int dissect_image(
1727 char **root_device
, bool *root_device_rw
,
1728 char **home_device
, bool *home_device_rw
,
1729 char **srv_device
, bool *srv_device_rw
,
1733 int home_nr
= -1, srv_nr
= -1;
1734 #ifdef GPT_ROOT_NATIVE
1737 #ifdef GPT_ROOT_SECONDARY
1738 int secondary_root_nr
= -1;
1740 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1741 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1742 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1743 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1744 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1745 struct udev_list_entry
*first
, *item
;
1746 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1747 bool is_gpt
, is_mbr
, multiple_generic
= false;
1748 const char *pttype
= NULL
;
1755 assert(root_device
);
1756 assert(home_device
);
/* Step 1: probe the partition table of the image with libblkid */
1761 b
= blkid_new_probe();
1766 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1771 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1775 blkid_probe_enable_partitions(b
, 1);
1776 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1779 r
= blkid_do_safeprobe(b
);
1780 if (r
== -2 || r
== 1) {
1781 log_error("Failed to identify any partition table on\n"
1783 PARTITION_TABLE_BLURB
, arg_image
);
1785 } else if (r
!= 0) {
1788 log_error_errno(errno
, "Failed to probe: %m");
1792 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1794 is_gpt
= streq_ptr(pttype
, "gpt");
1795 is_mbr
= streq_ptr(pttype
, "dos");
1797 if (!is_gpt
&& !is_mbr
) {
1798 log_error("No GPT or MBR partition table discovered on\n"
1800 PARTITION_TABLE_BLURB
, arg_image
);
1805 pl
= blkid_probe_get_partitions(b
);
1810 log_error("Failed to list partitions of %s", arg_image
);
/* Step 2: look the block device up in udev and enumerate its children */
1818 if (fstat(fd
, &st
) < 0)
1819 return log_error_errno(errno
, "Failed to stat block device: %m");
1821 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1829 log_error("Kernel partitions never appeared.");
1833 e
= udev_enumerate_new(udev
);
1837 r
= udev_enumerate_add_match_parent(e
, d
);
1841 r
= udev_enumerate_scan_devices(e
);
1843 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1845 /* Count the partitions enumerated by the kernel */
1847 first
= udev_enumerate_get_list_entry(e
);
1848 udev_list_entry_foreach(item
, first
)
1851 /* Count the partitions enumerated by blkid */
1852 m
= blkid_partlist_numof_partitions(pl
);
1856 log_error("blkid and kernel partition list do not match.");
1862 /* The kernel has probed fewer partitions than
1863 * blkid? Maybe the kernel prober is still
1864 * running or it got EBUSY because udev
1865 * already opened the device. Let's reprobe
1866 * the device, which is a synchronous call
1867 * that waits until probing is complete. */
1869 for (j
= 0; j
< 20; j
++) {
1871 r
= ioctl(fd
, BLKRRPART
, 0);
1874 if (r
>= 0 || r
!= -EBUSY
)
1877 /* If something else has the device
1878 * open, such as an udev rule, the
1879 * ioctl will return EBUSY. Since
1880 * there's no way to wait until it
1881 * isn't busy anymore, let's just wait
1882 * a bit, and try again.
1884 * This is really something they
1885 * should fix in the kernel! */
1887 usleep(50 * USEC_PER_MSEC
);
1891 return log_error_errno(r
, "Failed to reread partition table: %m");
1894 e
= udev_enumerate_unref(e
);
/* Step 3: classify every partition device found */
1897 first
= udev_enumerate_get_list_entry(e
);
1898 udev_list_entry_foreach(item
, first
) {
1899 _cleanup_udev_device_unref_
struct udev_device
*q
;
1901 unsigned long long flags
;
1907 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1912 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1916 qn
= udev_device_get_devnum(q
);
1920 if (st
.st_rdev
== qn
)
1923 node
= udev_device_get_devnode(q
);
1927 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1931 flags
= blkid_partition_get_flags(pp
);
1933 nr
= blkid_partition_get_partno(pp
);
1941 if (flags
& GPT_FLAG_NO_AUTO
)
1944 stype
= blkid_partition_get_type_string(pp
);
1948 if (sd_id128_from_string(stype
, &type_id
) < 0)
1951 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1953 if (home
&& nr
>= home_nr
)
1957 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1959 r
= free_and_strdup(&home
, node
);
1963 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
1965 if (srv
&& nr
>= srv_nr
)
1969 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1971 r
= free_and_strdup(&srv
, node
);
1975 #ifdef GPT_ROOT_NATIVE
1976 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
1978 if (root
&& nr
>= root_nr
)
1982 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1984 r
= free_and_strdup(&root
, node
);
1989 #ifdef GPT_ROOT_SECONDARY
1990 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
1992 if (secondary_root
&& nr
>= secondary_root_nr
)
1995 secondary_root_nr
= nr
;
1996 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1998 r
= free_and_strdup(&secondary_root
, node
);
2003 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2006 multiple_generic
= true;
2008 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2010 r
= free_and_strdup(&generic
, node
);
2016 } else if (is_mbr
) {
2019 if (flags
!= 0x80) /* Bootable flag */
2022 type
= blkid_partition_get_type(pp
);
2023 if (type
!= 0x83) /* Linux partition */
2027 multiple_generic
= true;
2031 r
= free_and_strdup(&root
, node
);
/* Step 4: hand the selected devices to the caller */
2039 *root_device
= root
;
2042 *root_device_rw
= root_rw
;
2044 } else if (secondary_root
) {
2045 *root_device
= secondary_root
;
2046 secondary_root
= NULL
;
2048 *root_device_rw
= secondary_root_rw
;
2050 } else if (generic
) {
2052 /* There were no partitions with precise meanings
2053 * around, but we found generic partitions. In this
2054 * case, if there's only one, we can go ahead and boot
2055 * it, otherwise we bail out, because we really cannot
2056 * make any sense of it. */
2058 if (multiple_generic
) {
2059 log_error("Identified multiple bootable Linux partitions on\n"
2061 PARTITION_TABLE_BLURB
, arg_image
);
2065 *root_device
= generic
;
2068 *root_device_rw
= generic_rw
;
2071 log_error("Failed to identify root partition in disk image\n"
2073 PARTITION_TABLE_BLURB
, arg_image
);
2078 *home_device
= home
;
2081 *home_device_rw
= home_rw
;
2088 *srv_device_rw
= srv_rw
;
2093 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device "what" with the file system type detected
 * by libblkid.
 *
 * what:      device node to mount.
 * where:     directory the container tree is mounted at.
 * directory: optional path that is appended to "where" to form the
 *            mount point; if NULL the device is mounted on "where".
 * rw:        mount read-write; forced to false if arg_read_only is set.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, so clear it first to
         * be able to tell "no errno" apart from a real one. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A plain
                 * -1 is a probing error and handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
/* Mounts the root, home and srv partitions by calling mount_device()
 * for each of them: the root device on arg_directory itself, the
 * home device on "/home" and the srv device on "/srv" below it. A
 * failure of any mount is logged and returned.
 * NOTE(review): the calls use the global arg_directory; the first
 * parameter of this function and the conditions guarding each call
 * are not visible in this excerpt. */
2162 static int mount_devices(
2164 const char *root_device
, bool root_device_rw
,
2165 const char *home_device
, bool home_device_rw
,
2166 const char *srv_device
, bool srv_device_rw
) {
2172 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2174 return log_error_errno(r
, "Failed to mount root directory: %m");
2178 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2180 return log_error_errno(r
, "Failed to mount home directory: %m");
2184 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2186 return log_error_errno(r
, "Failed to mount server data directory: %m");
2192 static void loop_remove(int nr
, int *image_fd
) {
2193 _cleanup_close_
int control
= -1;
2199 if (image_fd
&& *image_fd
>= 0) {
2200 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2202 log_debug_errno(errno
, "Failed to close loop image: %m");
2203 *image_fd
= safe_close(*image_fd
);
2206 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2208 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2212 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2214 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2219 * < 0 : wait_for_terminate() failed to get the state of the
2220 * container, the container was terminated by a signal, or
2221 * failed for an unknown reason. No change is made to the
2222 * container argument.
2223 * > 0 : The program executed in the container terminated with an
2224 * error. The exit code of the program executed in the
2225 * container is returned. The container argument has been set
2226 * to CONTAINER_TERMINATED.
2227 * 0 : The container is being rebooted, has been shut down or exited
2228 * successfully. The container argument has been set to either
2229 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2231 * That is, success is indicated by a return value of zero, and an
2232 * error is indicated by a non-zero value.
2234 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2238 r
= wait_for_terminate(pid
, &status
);
2240 return log_warning_errno(r
, "Failed to wait for container: %m");
2242 switch (status
.si_code
) {
2245 if (status
.si_status
== 0) {
2246 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2249 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2251 *container
= CONTAINER_TERMINATED
;
2252 return status
.si_status
;
2255 if (status
.si_status
== SIGINT
) {
2257 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2258 *container
= CONTAINER_TERMINATED
;
2261 } else if (status
.si_status
== SIGHUP
) {
2263 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2264 *container
= CONTAINER_REBOOTED
;
2268 /* CLD_KILLED fallthrough */
2271 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2275 log_error("Container %s failed due to unknown reason.", arg_machine
);
2282 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2285 pid
= PTR_TO_UINT32(userdata
);
2287 if (kill(pid
, arg_kill_signal
) >= 0) {
2288 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2289 sd_event_source_set_userdata(s
, NULL
);
2294 sd_event_exit(sd_event_source_get_event(s
), 0);
2298 static int determine_names(void) {
2301 if (arg_template
&& !arg_directory
&& arg_machine
) {
2303 /* If --template= was specified then we should not
2304 * search for a machine, but instead create a new one
2305 * in /var/lib/machine. */
2307 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2312 if (!arg_image
&& !arg_directory
) {
2314 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2316 r
= image_find(arg_machine
, &i
);
2318 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2320 log_error("No image for machine '%s': %m", arg_machine
);
2324 if (i
->type
== IMAGE_RAW
)
2325 r
= set_sanitized_path(&arg_image
, i
->path
);
2327 r
= set_sanitized_path(&arg_directory
, i
->path
);
2329 return log_error_errno(r
, "Invalid image directory: %m");
2332 arg_read_only
= arg_read_only
|| i
->read_only
;
2334 arg_directory
= get_current_dir_name();
2336 if (!arg_directory
&& !arg_machine
) {
2337 log_error("Failed to determine path, please use -D or -i.");
2343 if (arg_directory
&& path_equal(arg_directory
, "/"))
2344 arg_machine
= gethostname_malloc();
2346 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2351 hostname_cleanup(arg_machine
);
2352 if (!machine_name_is_valid(arg_machine
)) {
2353 log_error("Failed to determine machine name automatically, please use -M.");
2357 if (arg_ephemeral
) {
2360 /* Add a random suffix when this is an
2361 * ephemeral machine, so that we can run many
2362 * instances at once without manually having
2363 * to specify -M each time. */
2365 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
2376 static int determine_uid_shift(const char *directory
) {
2384 if (arg_uid_shift
== UID_INVALID
) {
2387 r
= stat(directory
, &st
);
2389 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2391 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2393 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2394 log_error("UID and GID base of %s don't match.", directory
);
2398 arg_uid_range
= UINT32_C(0x10000);
2401 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2402 log_error("UID base too high for UID range.");
2406 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
/* Body of the innermost child process, which ends by exec()ing the
 * container payload.
 *
 * In the order visible here it: synchronizes with the parent over
 * "barrier" (points #1 to #4), mounts the API file systems
 * (mount_all(), mount_sysfs(), mount_systemd_cgroup_writable()),
 * calls reset_uid_gid(), setup_boot_id() and setup_kmsg(), starts a
 * new session with setsid(), sends the rtnl socket if ports are
 * exposed, drops capabilities, applies the personality and SELinux
 * exec context, switches to arg_user with change_uid_gid(),
 * assembles the environment block (TERM, HOME, USER, LOGNAME,
 * container_uuid, LISTEN_FDS, LISTEN_PID, merged with arg_setenv)
 * and finally tries the exec calls. It only returns on error.
 *
 * NOTE(review): several parameters, the return statements after the
 * barrier failures and the conditions selecting between the exec
 * variants are not visible in this excerpt. */
2410 static int inner_child(
2412 const char *directory
,
2418 _cleanup_free_
char *home
= NULL
;
2420 const char *envp
[] = {
2421 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2422 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2427 NULL
, /* container_uuid */
2428 NULL
, /* LISTEN_FDS */
2429 NULL
, /* LISTEN_PID */
2433 _cleanup_strv_free_
char **env_use
= NULL
;
2438 assert(kmsg_socket
>= 0);
2443 /* Tell the parent, that it now can write the UID map. */
2444 (void) barrier_place(barrier
); /* #1 */
2446 /* Wait until the parent wrote the UID map */
2447 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2448 log_error("Parent died too early");
2453 r
= mount_all(NULL
, arg_userns
, true, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2457 r
= mount_sysfs(NULL
);
2461 /* Wait until we are cgroup-ified, so that we
2462 * can mount the right cgroup path writable */
2463 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2464 log_error("Parent died too early");
2468 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2472 r
= reset_uid_gid();
2474 return log_error_errno(r
, "Couldn't become new root: %m");
2476 r
= setup_boot_id(NULL
);
2480 r
= setup_kmsg(NULL
, kmsg_socket
);
2483 kmsg_socket
= safe_close(kmsg_socket
);
2488 return log_error_errno(errno
, "setsid() failed: %m");
2490 if (arg_private_network
)
2493 if (arg_expose_ports
) {
2494 r
= expose_port_send_rtnl(rtnl_socket
);
2497 rtnl_socket
= safe_close(rtnl_socket
);
2500 if (drop_capabilities() < 0)
2501 return log_error_errno(errno
, "drop_capabilities() failed: %m");
2505 if (arg_personality
!= PERSONALITY_INVALID
) {
2506 if (personality(arg_personality
) < 0)
2507 return log_error_errno(errno
, "personality() failed: %m");
2508 } else if (secondary
) {
2509 if (personality(PER_LINUX32
) < 0)
2510 return log_error_errno(errno
, "personality() failed: %m");
2514 if (arg_selinux_context
)
2515 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2516 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2519 r
= change_uid_gid(arg_user
, &home
);
/* Fill the free slots of envp[], n_env is the next slot to use */
2523 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2527 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2528 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2529 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2532 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2535 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2539 if (fdset_size(fds
) > 0) {
2540 r
= fdset_cloexec(fds
, false);
2542 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2544 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2545 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2549 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2553 /* Let the parent know that we are ready and
2554 * wait until the parent is ready with the
2556 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2557 log_error("Parent died too early");
2561 /* Now, explicitly close the log, so that we
2562 * then can close all remaining fds. Closing
2563 * the log explicitly first has the benefit
2564 * that the logging subsystem knows about it,
2565 * and is thus ready to be reopened should we
2566 * need it again. Note that the other fds
2567 * closed here are at least the locking and
2570 (void) fdset_close_others(fds
);
2576 /* Automatically search for the init system */
2578 m
= 1 + strv_length(arg_parameters
);
2579 a
= newa(char*, m
+ 1);
2580 if (strv_isempty(arg_parameters
))
2583 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
/* Candidate init binaries are tried one after the other */
2585 a
[0] = (char*) "/usr/lib/systemd/systemd";
2586 execve(a
[0], a
, env_use
);
2588 a
[0] = (char*) "/lib/systemd/systemd";
2589 execve(a
[0], a
, env_use
);
2591 a
[0] = (char*) "/sbin/init";
2592 execve(a
[0], a
, env_use
);
2593 } else if (!strv_isempty(arg_parameters
))
2594 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2596 chdir(home
?: "/root");
2597 execle("/bin/bash", "-bash", NULL
, env_use
);
2598 execle("/bin/sh", "-sh", NULL
, env_use
);
2602 return log_error_errno(errno
, "execv() failed: %m");
/* Body of the outer child process, which prepares the container
 * tree and then forks the inner child.
 *
 * In the order visible here it: arranges to be killed when the
 * parent dies (PR_SET_PDEATHSIG), reopens stdin/stdout/stderr on
 * "console", resets the audit login UID, marks all mounts as slave,
 * mounts the image partitions, determines the UID shift and sends
 * it over uid_shift_socket, turns "directory" into a bind mount and
 * populates it (volatile setup, base file system, API mounts, device
 * nodes, pts, propagation directory, /dev/console, seccomp,
 * timezone, resolv.conf, journal, custom mounts, cgroups), moves the
 * root with mount_move_root() and finally calls raw_clone() with the
 * namespace flags selected by arg_share_system, arg_private_network
 * and arg_userns. The child of that clone runs inner_child(); the
 * PID returned by raw_clone() is sent over pid_socket.
 *
 * NOTE(review): several parameters and most error-return statements
 * are not visible in this excerpt. */
2605 static int outer_child(
2607 const char *directory
,
2608 const char *console
,
2609 const char *root_device
, bool root_device_rw
,
2610 const char *home_device
, bool home_device_rw
,
2611 const char *srv_device
, bool srv_device_rw
,
2617 int uid_shift_socket
,
2627 assert(pid_socket
>= 0);
2628 assert(kmsg_socket
>= 0);
2632 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2633 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
/* Close the standard fds and reopen all three on the console */
2636 close_nointr(STDIN_FILENO
);
2637 close_nointr(STDOUT_FILENO
);
2638 close_nointr(STDERR_FILENO
);
2640 r
= open_terminal(console
, O_RDWR
);
2641 if (r
!= STDIN_FILENO
) {
2647 return log_error_errno(r
, "Failed to open console: %m");
2650 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2651 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2652 return log_error_errno(errno
, "Failed to duplicate console: %m");
2655 r
= reset_audit_loginuid();
2659 /* Mark everything as slave, so that we still
2660 * receive mounts from the real root, but don't
2661 * propagate mounts to the real root. */
2662 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2663 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2665 r
= mount_devices(directory
,
2666 root_device
, root_device_rw
,
2667 home_device
, home_device_rw
,
2668 srv_device
, srv_device_rw
);
2672 r
= determine_uid_shift(directory
);
2677 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2679 return log_error_errno(errno
, "Failed to send UID shift: %m");
2680 if (l
!= sizeof(arg_uid_shift
)) {
2681 log_error("Short write while sending UID shift.");
2686 /* Turn directory into bind mount */
2687 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2688 return log_error_errno(errno
, "Failed to make bind mount: %m");
2690 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2694 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2698 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2702 if (arg_read_only
) {
2703 r
= bind_remount_recursive(directory
, true);
2705 return log_error_errno(r
, "Failed to make tree read-only: %m");
2708 r
= mount_all(directory
, arg_userns
, false, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2712 r
= copy_devnodes(directory
);
2716 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2718 r
= setup_pts(directory
);
2722 r
= setup_propagate(directory
);
2726 r
= setup_dev_console(directory
, console
);
2730 r
= setup_seccomp();
2734 r
= setup_timezone(directory
);
2738 r
= setup_resolv_conf(directory
);
2742 r
= setup_journal(directory
);
2746 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2750 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2754 r
= mount_move_root(directory
);
2756 return log_error_errno(r
, "Failed to move root directory: %m");
/* Create the inner child in the namespaces requested by the options */
2758 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2759 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2760 (arg_private_network
? CLONE_NEWNET
: 0) |
2761 (arg_userns
? CLONE_NEWUSER
: 0),
2764 return log_error_errno(errno
, "Failed to fork inner child: %m");
2766 pid_socket
= safe_close(pid_socket
);
2767 uid_shift_socket
= safe_close(uid_shift_socket
);
2769 /* The inner child has all namespaces that are
2770 * requested, so that we all are owned by the user if
2771 * user namespaces are turned on. */
2773 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2775 _exit(EXIT_FAILURE
);
2777 _exit(EXIT_SUCCESS
);
2780 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2782 return log_error_errno(errno
, "Failed to send PID: %m");
2783 if (l
!= sizeof(pid
)) {
2784 log_error("Short write while sending PID.");
2788 pid_socket
= safe_close(pid_socket
);
2789 kmsg_socket
= safe_close(kmsg_socket
);
2790 rtnl_socket
= safe_close(rtnl_socket
);
2795 static int setup_uid_map(pid_t pid
) {
2796 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2801 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2802 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2803 r
= write_string_file(uid_map
, line
, 0);
2805 return log_error_errno(r
, "Failed to write UID map: %m");
2807 /* We always assign the same UID and GID ranges */
2808 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2809 r
= write_string_file(uid_map
, line
, 0);
2811 return log_error_errno(r
, "Failed to write GID map: %m");
/* Looks for the settings file "MACHINE.nspawn" (MACHINE being arg_machine)
 * and merges its contents into the arg_* globals.
 *
 * Search order, as visible below: /etc/systemd/nspawn, /run/systemd/nspawn,
 * then a file of the same name next to arg_image or arg_directory. Unless
 * arg_settings_trusted was already decided (>= 0), files from /etc and /run
 * are trusted and files found next to the image/directory are not.
 *
 * Every group of settings is skipped when its SETTING_* bit is set in
 * arg_settings_mask. For the groups that log an "Ignoring ..." warning the
 * file must be trusted.
 *
 * NOTE(review): this listing omits a number of lines (declarations, the
 * fopen() calls, else branches, returns). In particular the assignments
 * following each "Ignoring ..." warning are presumably inside an else
 * branch that is not shown -- confirm against the full source. The return
 * value on success is likewise not visible (presumably 0); visible failure
 * paths return the logged negative errno. */
2816 static int load_settings(void) {
/* settings, f and p are released automatically by their cleanup handlers. */
2817 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2818 _cleanup_fclose_
FILE *f
= NULL
;
2819 _cleanup_free_
char *p
= NULL
;
2823 /* If all settings are masked, there's no point in looking for
2824 * the settings file */
2825 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* File name to look for: machine name plus ".nspawn". */
2828 fn
= strjoina(arg_machine
, ".nspawn");
2830 /* We first look in the admin's directories in /etc and /run */
2831 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2832 _cleanup_free_
char *j
= NULL
;
2834 j
= strjoin(i
, "/", fn
, NULL
);
2843 /* By default we trust configuration from /etc and /run */
2844 if (arg_settings_trusted
< 0)
2845 arg_settings_trusted
= true;
/* A missing file (ENOENT) is tolerated; any other errno is logged and returned. */
2850 if (errno
!= ENOENT
)
2851 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2855 /* After that, let's look for a file next to the
2856 * actual image we shall boot. */
/* NOTE(review): the condition selecting this first branch is not visible;
 * presumably it tests arg_image, mirroring the arg_directory branch below. */
2859 p
= file_in_same_dir(arg_image
, fn
);
2862 } else if (arg_directory
) {
2863 p
= file_in_same_dir(arg_directory
, fn
);
2870 if (!f
&& errno
!= ENOENT
)
2871 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2873 /* By default we do not trust configuration from /var/lib/machines */
2874 if (arg_settings_trusted
< 0)
2875 arg_settings_trusted
= false;
2882 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
/* Load the opened file f (path p) into `settings`. */
2884 r
= settings_load(f
, p
, &settings
);
2888 /* Copy over bits from the settings, unless they have been
2889 * explicitly masked by command line switches. */
/* Boot and parameters: the parameter vector is moved, not copied; the
 * pointer in `settings` is cleared after the hand-over (presumably so that
 * settings_freep() does not free it a second time). The same move pattern
 * is used for every pointer-valued setting below. */
2891 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
2892 settings
->boot
>= 0) {
2893 arg_boot
= settings
->boot
;
2895 strv_free(arg_parameters
);
2896 arg_parameters
= settings
->parameters
;
2897 settings
->parameters
= NULL
;
/* Environment variables. */
2900 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2901 settings
->environment
) {
2902 strv_free(arg_setenv
);
2903 arg_setenv
= settings
->environment
;
2904 settings
->environment
= NULL
;
/* User to run as. */
2907 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2910 arg_user
= settings
->user
;
2911 settings
->user
= NULL
;
/* Capabilities: added capabilities are refused with a warning when the file
 * is untrusted; dropped capabilities are removed from arg_retain. */
2914 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
2916 if (!arg_settings_trusted
&& settings
->capability
!= 0)
2917 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
2919 arg_retain
|= settings
->capability
;
2921 arg_retain
&= ~settings
->drop_capability
;
/* Kill signal. */
2924 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2925 settings
->kill_signal
> 0)
2926 arg_kill_signal
= settings
->kill_signal
;
/* Personality. */
2928 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2929 settings
->personality
!= PERSONALITY_INVALID
)
2930 arg_personality
= settings
->personality
;
/* Machine ID: needs a trusted file. */
2932 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2933 !sd_id128_is_null(settings
->machine_id
)) {
2935 if (!arg_settings_trusted
)
2936 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
2938 arg_uuid
= settings
->machine_id
;
/* Read-only flag. */
2941 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
2942 settings
->read_only
>= 0)
2943 arg_read_only
= settings
->read_only
;
/* Volatile mode. */
2945 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
2946 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
2947 arg_volatile_mode
= settings
->volatile_mode
;
/* Custom mounts: need a trusted file; the array and its length are moved
 * together. */
2949 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
2950 settings
->n_custom_mounts
> 0) {
2952 if (!arg_settings_trusted
)
2953 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
2955 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
2956 arg_custom_mounts
= settings
->custom_mounts
;
2957 arg_n_custom_mounts
= settings
->n_custom_mounts
;
2959 settings
->custom_mounts
= NULL
;
2960 settings
->n_custom_mounts
= 0;
/* Network settings: applied as a group as soon as any one of them is set;
 * need a trusted file. */
2964 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
2965 (settings
->private_network
>= 0 ||
2966 settings
->network_veth
>= 0 ||
2967 settings
->network_bridge
||
2968 settings
->network_interfaces
||
2969 settings
->network_macvlan
||
2970 settings
->network_ipvlan
)) {
2972 if (!arg_settings_trusted
)
2973 log_warning("Ignoring network settings, file %s is not trusted.", p
);
2975 strv_free(arg_network_interfaces
);
2976 arg_network_interfaces
= settings
->network_interfaces
;
2977 settings
->network_interfaces
= NULL
;
2979 strv_free(arg_network_macvlan
);
2980 arg_network_macvlan
= settings
->network_macvlan
;
2981 settings
->network_macvlan
= NULL
;
2983 strv_free(arg_network_ipvlan
);
2984 arg_network_ipvlan
= settings
->network_ipvlan
;
2985 settings
->network_ipvlan
= NULL
;
2987 free(arg_network_bridge
);
2988 arg_network_bridge
= settings
->network_bridge
;
2989 settings
->network_bridge
= NULL
;
/* A configured bridge implies a veth link. The bridge name has just been
 * moved into arg_network_bridge and settings->network_bridge is NULL by
 * now, so the moved pointer has to be tested here; testing the settings
 * field at this point would always yield false. */
2991 arg_network_veth
= settings
->network_veth
> 0 || arg_network_bridge
;
2993 arg_private_network
= true; /* all these settings imply private networking */
/* Exposed ports: need a trusted file. */
2997 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
2998 settings
->expose_ports
) {
3000 if (!arg_settings_trusted
)
3001 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3003 expose_port_free_all(arg_expose_ports
);
3004 arg_expose_ports
= settings
->expose_ports
;
3005 settings
->expose_ports
= NULL
;
/* Program entry point.
 *
 * Visible flow: parse the command line, require root, determine names, load
 * the settings file and verify the arguments; prepare the container tree
 * (either a directory, optionally an ephemeral or template-populated btrfs
 * snapshot, or a disk image that is set up and dissected); allocate a
 * pseudo terminal; fork an "outer child" in a new mount namespace, which in
 * turn starts the inner child; synchronise with the children through a
 * barrier and four socket pairs; set up UID maps, networking, machine
 * registration and cgroups from the parent; run an event loop that forwards
 * the pty and reacts to signals; finally wait for the container and clean
 * up.
 *
 * Returns EXIT_FAILURE if r is negative at the end, otherwise `ret`.
 *
 * NOTE(review): this listing omits many lines of the function (several
 * declarations, most "if (r < 0)" checks, else branches, goto statements,
 * the loop header and the cleanup label); the comments below describe only
 * what is visible and mark assumptions as such. */
3012 int main(int argc
, char *argv
[]) {
3014 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3015 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3016 _cleanup_close_
int master
= -1, image_fd
= -1;
3017 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3018 int r
, n_fd_passed
, loop_nr
= -1;
3019 char veth_name
[IFNAMSIZ
];
3020 bool secondary
= false, remove_subvol
= false;
3023 int ret
= EXIT_SUCCESS
;
3024 union in_addr_union exposed
= {};
3025 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
/* Logging configuration from the environment, then the command line. */
3028 log_parse_environment();
3031 r
= parse_argv(argc
, argv
);
/* Refuse to run unless the effective UID is 0. */
3035 if (geteuid() != 0) {
3036 log_error("Need to be root.");
/* determine_names(), load_settings() and verify_arguments() run in this
 * order, so values merged from the settings file are verified too. */
3040 r
= determine_names();
3044 r
= load_settings();
3048 r
= verify_arguments();
/* Collect file descriptors handed to us (sd_listen_fds) into `fds`. */
3052 n_fd_passed
= sd_listen_fds(false);
3053 if (n_fd_passed
> 0) {
3054 r
= fdset_new_listen_fds(&fds
, false);
3056 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory-based container. */
3061 if (arg_directory
) {
3064 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3065 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
/* Ephemeral mode: run from a freshly created snapshot of the directory. */
3070 if (arg_ephemeral
) {
3071 _cleanup_free_
char *np
= NULL
;
3073 /* If the specified path is a mount point we
3074 * generate the new snapshot immediately
3075 * inside it under a random name. However if
3076 * the specified is not a mount point we
3077 * create the new snapshot in the parent
3078 * directory, just next to it. */
3079 r
= path_is_mount_point(arg_directory
, 0);
3081 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
/* NOTE(review): the condition choosing between the next two calls is not
 * visible; per the comment above, presumably tempfn_random_child() is used
 * for a mount point and tempfn_random() otherwise. */
3085 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3087 r
= tempfn_random(arg_directory
, "machine.", &np
);
3089 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* Lock the snapshot path: shared lock when read-only, exclusive otherwise,
 * never blocking. */
3093 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3095 log_error_errno(r
, "Failed to lock %s: %m", np
);
/* Recursive snapshot with fallback to a plain copy. */
3099 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
3101 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
/* NOTE(review): presumably arg_directory is replaced by the snapshot path
 * np right after this free(); those lines are not visible. */
3105 free(arg_directory
);
/* Remember to delete the snapshot during cleanup. */
3109 remove_subvol
= true;
/* NOTE(review): presumably the non-ephemeral branch: lock the directory
 * tree itself with the same lock flags as above. */
3112 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3114 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3118 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate the directory from arg_template by snapshotting it; the log
 * messages below show that an already existing directory is left alone. */
3123 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
3126 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3128 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3132 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity checks on the tree: it must look like an OS tree ... */
3138 if (path_is_os_tree(arg_directory
) <= 0) {
3139 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
/* ... or contain the thing to execute: an absolute path given as first
 * non-option argument is checked inside the tree, otherwise "/usr/bin/". */
3146 p
= strjoina(arg_directory
,
3147 argc
> optind
&& path_is_absolute(argv
[optind
]) ? argv
[optind
] : "/usr/bin/");
3148 if (access(p
, F_OK
) < 0) {
3149 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory
);
/* Image-based container: lock the image, create a temporary directory that
 * becomes arg_directory, then set up and dissect the image. */
3156 char template[] = "/tmp/nspawn-root-XXXXXX";
3159 assert(!arg_template
);
3161 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3163 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3167 r
= log_error_errno(r
, "Failed to create image lock: %m");
3171 if (!mkdtemp(template)) {
3172 log_error_errno(errno
, "Failed to create temporary directory: %m");
3177 arg_directory
= strdup(template);
3178 if (!arg_directory
) {
/* setup_image() yields the image fd and fills device_path and loop_nr. */
3183 image_fd
= setup_image(&device_path
, &loop_nr
);
/* Find the root, home and srv devices in the image, each with its
 * read-write flag. */
3189 r
= dissect_image(image_fd
,
3190 &root_device
, &root_device_rw
,
3191 &home_device
, &home_device_rw
,
3192 &srv_device
, &srv_device_rw
,
3198 r
= custom_mounts_prepare();
/* NOTE(review): tail of an expression whose beginning is not visible;
 * presumably it sets `interactive` (used further down) and requires both
 * stdin and stdout to be terminals. */
3203 isatty(STDIN_FILENO
) > 0 &&
3204 isatty(STDOUT_FILENO
) > 0;
/* Allocate the pseudo terminal master, look up the name of its slave side
 * into `console`, and unlock it. */
3206 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3208 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3212 r
= ptsname_malloc(master
, &console
);
3214 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3218 if (unlockpt(master
) < 0) {
3219 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3224 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3225 arg_machine
, arg_image
?: arg_directory
);
/* Block SIGCHLD, SIGWINCH, SIGTERM and SIGINT; mask_chld holds SIGCHLD
 * alone so it can be unblocked and blocked separately below. */
3227 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3229 assert_se(sigemptyset(&mask_chld
) == 0);
3230 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
/* Become a child subreaper. */
3232 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3233 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* NOTE(review): the declarations below appear to open the body of a loop
 * that runs once per container boot (see the "CONTAINER_REBOOTED, loop
 * again" comment further down); the loop header is not visible. */
3238 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
3239 uid_shift_socket_pair
[2] = { -1, -1 };
3240 ContainerStatus container_status
;
3241 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
/* SIGCHLD handler that does nothing; SA_NOCLDSTOP limits it to child
 * termination. */
3242 static const struct sigaction sa
= {
3243 .sa_handler
= nop_signal_handler
,
3244 .sa_flags
= SA_NOCLDSTOP
,
3248 _cleanup_event_unref_ sd_event
*event
= NULL
;
3249 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3250 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3253 r
= barrier_create(&barrier
);
3255 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
/* Four AF_UNIX/SOCK_SEQPACKET socket pairs shared with the children: kmsg,
 * rtnl, inner-child PID and UID shift. Index 0 stays in the parent, index 1
 * goes to the child (see the safe_close() calls on both sides below). */
3259 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3260 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3264 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3265 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3269 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3270 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3275 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3276 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3280 /* Child can be killed before execv(), so handle SIGCHLD
3281 * in order to interrupt parent's blocking calls and
3282 * give it a chance to call wait() and terminate. */
3283 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3285 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3289 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3291 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* Fork the outer child; it only gets a new mount namespace here. */
3295 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3297 if (errno
== EINVAL
)
3298 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3300 r
= log_error_errno(errno
, "clone() failed: %m");
3306 /* The outer child only has a file system namespace. */
3307 barrier_set_role(&barrier
, BARRIER_CHILD
);
/* Child side: drop the pty master and the parent's ends of the socket
 * pairs, reset signal handling, then run outer_child() and exit with its
 * result. */
3309 master
= safe_close(master
);
3311 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3312 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3313 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3314 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3316 (void) reset_all_signal_handlers();
3317 (void) reset_signal_mask();
3319 r
= outer_child(&barrier
,
3322 root_device
, root_device_rw
,
3323 home_device
, home_device_rw
,
3324 srv_device
, srv_device_rw
,
3328 kmsg_socket_pair
[1],
3329 rtnl_socket_pair
[1],
3330 uid_shift_socket_pair
[1],
3333 _exit(EXIT_FAILURE
);
3335 _exit(EXIT_SUCCESS
);
/* Parent side: release the passed-in fds and the child's socket ends. */
3338 barrier_set_role(&barrier
, BARRIER_PARENT
);
3340 fds
= fdset_free(fds
);
3342 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3343 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3344 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3345 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3347 /* Wait for the outer child. */
3348 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3357 /* And now retrieve the PID of the inner child. */
/* From here on `pid` refers to the inner child; anything other than a
 * full-size read is reported as an error. */
3358 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3360 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3363 if (l
!= sizeof(pid
)) {
3364 log_error("Short read while reading inner child PID.");
3369 log_debug("Init process invoked as PID " PID_FMT
, pid
);
/* Barrier sync #1 with the child. NOTE(review): the condition guarding this
 * block and the UID shift handling below is not visible; presumably it
 * applies only when user namespaces are in use. */
3372 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3373 log_error("Child died too early.");
/* Receive the UID shift from the child, then write the UID/GID maps of the
 * inner child. */
3378 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3380 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3383 if (l
!= sizeof(arg_uid_shift
)) {
3384 log_error("Short read while reading UID shift.");
3389 r
= setup_uid_map(pid
);
3393 (void) barrier_place(&barrier
); /* #2 */
/* Network setup for the container, done from the parent: move interfaces,
 * create the veth link (optionally attached to a bridge), macvlan and
 * ipvlan links. */
3396 if (arg_private_network
) {
3398 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3402 if (arg_network_veth
) {
3403 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3409 if (arg_network_bridge
) {
3410 r
= setup_bridge(veth_name
, arg_network_bridge
);
3418 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3422 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
/* Register the machine and set up its cgroups. */
3428 r
= register_machine(
3435 arg_custom_mounts
, arg_n_custom_mounts
,
3443 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3447 if (arg_keep_unit
) {
3448 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3453 r
= chown_cgroup(pid
, arg_uid_shift
);
3457 /* Notify the child that the parent is ready with all
3458 * its setup (including cgroup-ification), and that
3459 * the child can now hand over control to the code to
3460 * run inside the container. */
3461 (void) barrier_place(&barrier
); /* #3 */
3463 /* Block SIGCHLD here, before notifying child.
3464 * process_pty() will handle it with the other signals. */
3465 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3467 /* Reset signal to default */
3468 r
= default_signals(SIGCHLD
, -1);
3470 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3474 /* Let the child know that we are ready and wait that the child is completely ready now. */
3475 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3476 log_error("Child died too early.");
/* NOTE(review): arguments of a status notification whose first line is not
 * visible; it reports the running state and the leader PID. */
3483 "STATUS=Container running.\n"
3484 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
/* Event loop setup. */
3486 r
= sd_event_new(&event
);
3488 log_error_errno(r
, "Failed to get default event source: %m");
/* With a kill signal configured, SIGINT/SIGTERM go to on_orderly_shutdown()
 * with the child PID as userdata. NOTE(review): the else separating this
 * from the "Immediately exit" registrations below is not visible. */
3492 if (arg_kill_signal
> 0) {
3493 /* Try to kill the init system on SIGINT or SIGTERM */
3494 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3495 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3497 /* Immediately exit */
3498 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3499 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3502 /* simply exit on sigchld */
3503 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
/* Port exposure: watch address changes via the rtnl socket received from
 * the child and apply the exposed ports once right away. */
3505 if (arg_expose_ports
) {
3506 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3510 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3513 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* Forward data between our terminal and the container's pty master while
 * the event loop runs. */
3515 r
= pty_forward_new(event
, master
, true, !interactive
, &forward
);
3517 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3521 r
= sd_event_loop(event
);
3523 log_error_errno(r
, "Failed to run event loop: %m");
/* Fetch the last forwarded character and free the forwarder; the check
 * below acts when output did not end in a newline and quiet mode is off
 * (the action itself is not visible). */
3527 pty_forward_get_last_char(forward
, &last_char
);
3529 forward
= pty_forward_free(forward
);
3531 if (!arg_quiet
&& last_char
!= '\n')
3534 /* Kill if it is not dead yet anyway */
3535 if (arg_register
&& !arg_keep_unit
)
3536 terminate_machine(pid
);
3538 /* Normally redundant, but better safe than sorry */
/* Collect the container's exit state into container_status. */
3541 r
= wait_for_container(pid
, &container_status
);
3545 /* We failed to wait for the container, or the
3546 * container exited abnormally */
3548 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3549 /* The container exited with a non-zero
3550 * status, or with zero status and no reboot
3556 /* CONTAINER_REBOOTED, loop again */
3558 if (arg_keep_unit
) {
3559 /* Special handling if we are running as a
3560 * service: instead of simply restarting the
3561 * machine we want to restart the entire
3562 * service, so let's inform systemd about this
3563 * with the special exit code 133. The service
3564 * file uses RestartForceExitStatus=133 so
3565 * that this results in a full nspawn
3566 * restart. This is necessary since we might
3567 * have cgroup parameters set we want to have
3574 expose_port_flush(arg_expose_ports
, &exposed
);
3580 "STATUS=Terminating...");
3585 /* Try to flush whatever is still queued in the pty */
3587 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
/* Cleanup: detach the loop device, remove the ephemeral snapshot if one was
 * created, remove the propagation directory of this machine, undo exposed
 * ports and free the argument globals. */
3589 loop_remove(loop_nr
, &image_fd
);
3591 if (remove_subvol
&& arg_directory
) {
3594 k
= btrfs_subvol_remove(arg_directory
, true);
3596 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
3602 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3603 (void) rm_rf(p
, REMOVE_ROOT
);
3606 expose_port_flush(arg_expose_ports
, &exposed
);
3608 free(arg_directory
);
3613 strv_free(arg_setenv
);
3614 free(arg_network_bridge
);
3615 strv_free(arg_network_interfaces
);
3616 strv_free(arg_network_macvlan
);
3617 strv_free(arg_network_ipvlan
);
3618 strv_free(arg_parameters
);
3619 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3620 expose_port_free_all(arg_expose_ports
);
/* A negative r always means failure; otherwise the exit status kept in
 * `ret` is returned. */
3622 return r
< 0 ? EXIT_FAILURE
: ret
;