1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <blkid/blkid.h>
27 #include <linux/loop.h>
33 #include <selinux/selinux.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
46 #include "sd-daemon.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
54 #include "capability.h"
55 #include "cgroup-util.h"
57 #include "dev-setup.h"
59 #include "event-util.h"
62 #include "formats-util.h"
64 #include "hostname-util.h"
66 #include "loopback-setup.h"
67 #include "machine-image.h"
71 #include "netlink-util.h"
72 #include "path-util.h"
73 #include "process-util.h"
75 #include "random-util.h"
78 #include "seccomp-util.h"
80 #include "signal-util.h"
82 #include "terminal-util.h"
83 #include "udev-util.h"
86 #include "nspawn-cgroup.h"
87 #include "nspawn-expose-ports.h"
88 #include "nspawn-mount.h"
89 #include "nspawn-network.h"
90 #include "nspawn-register.h"
91 #include "nspawn-settings.h"
92 #include "nspawn-setuid.h"
94 typedef enum ContainerStatus
{
99 typedef enum LinkJournal
{
106 static char *arg_directory
= NULL
;
107 static char *arg_template
= NULL
;
108 static char *arg_user
= NULL
;
109 static sd_id128_t arg_uuid
= {};
110 static char *arg_machine
= NULL
;
111 static const char *arg_selinux_context
= NULL
;
112 static const char *arg_selinux_apifs_context
= NULL
;
113 static const char *arg_slice
= NULL
;
114 static bool arg_private_network
= false;
115 static bool arg_read_only
= false;
116 static bool arg_boot
= false;
117 static bool arg_ephemeral
= false;
118 static LinkJournal arg_link_journal
= LINK_AUTO
;
119 static bool arg_link_journal_try
= false;
120 static uint64_t arg_retain
=
121 (1ULL << CAP_CHOWN
) |
122 (1ULL << CAP_DAC_OVERRIDE
) |
123 (1ULL << CAP_DAC_READ_SEARCH
) |
124 (1ULL << CAP_FOWNER
) |
125 (1ULL << CAP_FSETID
) |
126 (1ULL << CAP_IPC_OWNER
) |
128 (1ULL << CAP_LEASE
) |
129 (1ULL << CAP_LINUX_IMMUTABLE
) |
130 (1ULL << CAP_NET_BIND_SERVICE
) |
131 (1ULL << CAP_NET_BROADCAST
) |
132 (1ULL << CAP_NET_RAW
) |
133 (1ULL << CAP_SETGID
) |
134 (1ULL << CAP_SETFCAP
) |
135 (1ULL << CAP_SETPCAP
) |
136 (1ULL << CAP_SETUID
) |
137 (1ULL << CAP_SYS_ADMIN
) |
138 (1ULL << CAP_SYS_CHROOT
) |
139 (1ULL << CAP_SYS_NICE
) |
140 (1ULL << CAP_SYS_PTRACE
) |
141 (1ULL << CAP_SYS_TTY_CONFIG
) |
142 (1ULL << CAP_SYS_RESOURCE
) |
143 (1ULL << CAP_SYS_BOOT
) |
144 (1ULL << CAP_AUDIT_WRITE
) |
145 (1ULL << CAP_AUDIT_CONTROL
) |
147 static CustomMount
*arg_custom_mounts
= NULL
;
148 static unsigned arg_n_custom_mounts
= 0;
149 static char **arg_setenv
= NULL
;
150 static bool arg_quiet
= false;
151 static bool arg_share_system
= false;
152 static bool arg_register
= true;
153 static bool arg_keep_unit
= false;
154 static char **arg_network_interfaces
= NULL
;
155 static char **arg_network_macvlan
= NULL
;
156 static char **arg_network_ipvlan
= NULL
;
157 static bool arg_network_veth
= false;
158 static char *arg_network_bridge
= NULL
;
159 static unsigned long arg_personality
= PERSONALITY_INVALID
;
160 static char *arg_image
= NULL
;
161 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
162 static ExposePort
*arg_expose_ports
= NULL
;
163 static char **arg_property
= NULL
;
164 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
165 static bool arg_userns
= false;
166 static int arg_kill_signal
= 0;
167 static bool arg_unified_cgroup_hierarchy
= false;
168 static SettingsMask arg_settings_mask
= 0;
169 static int arg_settings_trusted
= -1;
170 static char **arg_parameters
= NULL
;
172 static void help(void) {
173 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
175 " -h --help Show this help\n"
176 " --version Print version string\n"
177 " -q --quiet Do not show status information\n"
178 " -D --directory=PATH Root directory for the container\n"
179 " --template=PATH Initialize root directory from template directory,\n"
181 " -x --ephemeral Run container with snapshot of root directory, and\n"
182 " remove it after exit\n"
183 " -i --image=PATH File system device or disk image for the container\n"
184 " -b --boot Boot up full system (i.e. invoke init)\n"
185 " -u --user=USER Run the command under specified user or uid\n"
186 " -M --machine=NAME Set the machine name for the container\n"
187 " --uuid=UUID Set a specific machine UUID for the container\n"
188 " -S --slice=SLICE Place the container in the specified slice\n"
189 " --property=NAME=VALUE Set scope unit property\n"
190 " --private-users[=UIDBASE[:NUIDS]]\n"
191 " Run within user namespace\n"
192 " --private-network Disable network in container\n"
193 " --network-interface=INTERFACE\n"
194 " Assign an existing network interface to the\n"
196 " --network-macvlan=INTERFACE\n"
197 " Create a macvlan network interface based on an\n"
198 " existing network interface to the container\n"
199 " --network-ipvlan=INTERFACE\n"
200 " Create a ipvlan network interface based on an\n"
201 " existing network interface to the container\n"
202 " -n --network-veth Add a virtual ethernet connection between host\n"
204 " --network-bridge=INTERFACE\n"
205 " Add a virtual ethernet connection between host\n"
206 " and container and add it to an existing bridge on\n"
208 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
209 " Expose a container IP port on the host\n"
210 " -Z --selinux-context=SECLABEL\n"
211 " Set the SELinux security context to be used by\n"
212 " processes in the container\n"
213 " -L --selinux-apifs-context=SECLABEL\n"
214 " Set the SELinux security context to be used by\n"
215 " API/tmpfs file systems in the container\n"
216 " --capability=CAP In addition to the default, retain specified\n"
218 " --drop-capability=CAP Drop the specified capability from the default set\n"
219 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
220 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
221 " try-guest, try-host\n"
222 " -j Equivalent to --link-journal=try-guest\n"
223 " --read-only Mount the root directory read-only\n"
224 " --bind=PATH[:PATH[:OPTIONS]]\n"
225 " Bind mount a file or directory from the host into\n"
227 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
228 " Similar, but creates a read-only bind mount\n"
229 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
230 " --overlay=PATH[:PATH...]:PATH\n"
231 " Create an overlay mount from the host to \n"
233 " --overlay-ro=PATH[:PATH...]:PATH\n"
234 " Similar, but creates a read-only overlay mount\n"
235 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
236 " --share-system Share system namespaces with host\n"
237 " --register=BOOLEAN Register container as machine\n"
238 " --keep-unit Do not register a scope for the machine, reuse\n"
239 " the service unit nspawn is running in\n"
240 " --volatile[=MODE] Run the system in volatile mode\n"
241 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
242 , program_invocation_short_name
);
246 static int custom_mounts_prepare(void) {
250 /* Ensure the mounts are applied prefix first. */
251 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
253 /* Allocate working directories for the overlay file systems that need it */
254 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
255 CustomMount
*m
= &arg_custom_mounts
[i
];
257 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
258 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
262 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
271 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
273 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
279 static int set_sanitized_path(char **b
, const char *path
) {
286 p
= canonicalize_file_name(path
);
291 r
= path_make_absolute_cwd(path
, &p
);
297 *b
= path_kill_slashes(p
);
301 static int detect_unified_cgroup_hierarchy(void) {
305 /* Allow the user to control whether the unified hierarchy is used */
306 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
308 r
= parse_boolean(e
);
310 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
312 arg_unified_cgroup_hierarchy
= r
;
316 /* Otherwise inherit the default from the host system */
319 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
321 arg_unified_cgroup_hierarchy
= r
;
325 static int parse_argv(int argc
, char *argv
[]) {
344 ARG_NETWORK_INTERFACE
,
357 static const struct option options
[] = {
358 { "help", no_argument
, NULL
, 'h' },
359 { "version", no_argument
, NULL
, ARG_VERSION
},
360 { "directory", required_argument
, NULL
, 'D' },
361 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
362 { "ephemeral", no_argument
, NULL
, 'x' },
363 { "user", required_argument
, NULL
, 'u' },
364 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
365 { "boot", no_argument
, NULL
, 'b' },
366 { "uuid", required_argument
, NULL
, ARG_UUID
},
367 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
368 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
369 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
370 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
371 { "bind", required_argument
, NULL
, ARG_BIND
},
372 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
373 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
374 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
375 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
376 { "machine", required_argument
, NULL
, 'M' },
377 { "slice", required_argument
, NULL
, 'S' },
378 { "setenv", required_argument
, NULL
, ARG_SETENV
},
379 { "selinux-context", required_argument
, NULL
, 'Z' },
380 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
381 { "quiet", no_argument
, NULL
, 'q' },
382 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
383 { "register", required_argument
, NULL
, ARG_REGISTER
},
384 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
385 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
386 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
387 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
388 { "network-veth", no_argument
, NULL
, 'n' },
389 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
390 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
391 { "image", required_argument
, NULL
, 'i' },
392 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
393 { "port", required_argument
, NULL
, 'p' },
394 { "property", required_argument
, NULL
, ARG_PROPERTY
},
395 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
396 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
397 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
402 uint64_t plus
= 0, minus
= 0;
403 bool mask_all_settings
= false, mask_no_settings
= false;
408 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
420 r
= set_sanitized_path(&arg_directory
, optarg
);
422 return log_error_errno(r
, "Invalid root directory: %m");
427 r
= set_sanitized_path(&arg_template
, optarg
);
429 return log_error_errno(r
, "Invalid template directory: %m");
434 r
= set_sanitized_path(&arg_image
, optarg
);
436 return log_error_errno(r
, "Invalid image path: %m");
441 arg_ephemeral
= true;
445 r
= free_and_strdup(&arg_user
, optarg
);
449 arg_settings_mask
|= SETTING_USER
;
452 case ARG_NETWORK_BRIDGE
:
453 r
= free_and_strdup(&arg_network_bridge
, optarg
);
460 arg_network_veth
= true;
461 arg_private_network
= true;
462 arg_settings_mask
|= SETTING_NETWORK
;
465 case ARG_NETWORK_INTERFACE
:
466 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
469 arg_private_network
= true;
470 arg_settings_mask
|= SETTING_NETWORK
;
473 case ARG_NETWORK_MACVLAN
:
474 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
477 arg_private_network
= true;
478 arg_settings_mask
|= SETTING_NETWORK
;
481 case ARG_NETWORK_IPVLAN
:
482 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
487 case ARG_PRIVATE_NETWORK
:
488 arg_private_network
= true;
489 arg_settings_mask
|= SETTING_NETWORK
;
494 arg_settings_mask
|= SETTING_BOOT
;
498 r
= sd_id128_from_string(optarg
, &arg_uuid
);
500 log_error("Invalid UUID: %s", optarg
);
504 arg_settings_mask
|= SETTING_MACHINE_ID
;
513 arg_machine
= mfree(arg_machine
);
515 if (!machine_name_is_valid(optarg
)) {
516 log_error("Invalid machine name: %s", optarg
);
520 r
= free_and_strdup(&arg_machine
, optarg
);
528 arg_selinux_context
= optarg
;
532 arg_selinux_apifs_context
= optarg
;
536 arg_read_only
= true;
537 arg_settings_mask
|= SETTING_READ_ONLY
;
541 case ARG_DROP_CAPABILITY
: {
542 const char *state
, *word
;
545 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
546 _cleanup_free_
char *t
;
548 t
= strndup(word
, length
);
552 if (streq(t
, "all")) {
553 if (c
== ARG_CAPABILITY
)
554 plus
= (uint64_t) -1;
556 minus
= (uint64_t) -1;
560 cap
= capability_from_name(t
);
562 log_error("Failed to parse capability %s.", t
);
566 if (c
== ARG_CAPABILITY
)
567 plus
|= 1ULL << (uint64_t) cap
;
569 minus
|= 1ULL << (uint64_t) cap
;
573 arg_settings_mask
|= SETTING_CAPABILITY
;
578 arg_link_journal
= LINK_GUEST
;
579 arg_link_journal_try
= true;
582 case ARG_LINK_JOURNAL
:
583 if (streq(optarg
, "auto")) {
584 arg_link_journal
= LINK_AUTO
;
585 arg_link_journal_try
= false;
586 } else if (streq(optarg
, "no")) {
587 arg_link_journal
= LINK_NO
;
588 arg_link_journal_try
= false;
589 } else if (streq(optarg
, "guest")) {
590 arg_link_journal
= LINK_GUEST
;
591 arg_link_journal_try
= false;
592 } else if (streq(optarg
, "host")) {
593 arg_link_journal
= LINK_HOST
;
594 arg_link_journal_try
= false;
595 } else if (streq(optarg
, "try-guest")) {
596 arg_link_journal
= LINK_GUEST
;
597 arg_link_journal_try
= true;
598 } else if (streq(optarg
, "try-host")) {
599 arg_link_journal
= LINK_HOST
;
600 arg_link_journal_try
= true;
602 log_error("Failed to parse link journal mode %s", optarg
);
610 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
612 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
614 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
618 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
620 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
622 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
626 case ARG_OVERLAY_RO
: {
627 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
628 _cleanup_strv_free_
char **lower
= NULL
;
633 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
637 log_error("Invalid overlay specification: %s", optarg
);
641 STRV_FOREACH(i
, lower
) {
642 if (!path_is_absolute(*i
)) {
643 log_error("Overlay path %s is not absolute.", *i
);
651 log_error("--overlay= needs at least two colon-separated directories specified.");
656 /* If two parameters are specified,
657 * the first one is the lower, the
658 * second one the upper directory. And
659 * we'll also define the destination
660 * mount point the same as the upper. */
664 destination
= strdup(upper
);
669 upper
= lower
[n
- 2];
670 destination
= lower
[n
- 1];
674 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
678 m
->destination
= destination
;
681 m
->read_only
= c
== ARG_OVERLAY_RO
;
683 upper
= destination
= NULL
;
686 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
693 if (!env_assignment_is_valid(optarg
)) {
694 log_error("Environment variable assignment '%s' is not valid.", optarg
);
698 n
= strv_env_set(arg_setenv
, optarg
);
702 strv_free(arg_setenv
);
705 arg_settings_mask
|= SETTING_ENVIRONMENT
;
713 case ARG_SHARE_SYSTEM
:
714 arg_share_system
= true;
718 r
= parse_boolean(optarg
);
720 log_error("Failed to parse --register= argument: %s", optarg
);
728 arg_keep_unit
= true;
731 case ARG_PERSONALITY
:
733 arg_personality
= personality_from_string(optarg
);
734 if (arg_personality
== PERSONALITY_INVALID
) {
735 log_error("Unknown or unsupported personality '%s'.", optarg
);
739 arg_settings_mask
|= SETTING_PERSONALITY
;
745 arg_volatile_mode
= VOLATILE_YES
;
749 m
= volatile_mode_from_string(optarg
);
751 log_error("Failed to parse --volatile= argument: %s", optarg
);
754 arg_volatile_mode
= m
;
757 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
761 r
= expose_port_parse(&arg_expose_ports
, optarg
);
763 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
765 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
767 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
771 if (strv_extend(&arg_property
, optarg
) < 0)
776 case ARG_PRIVATE_USERS
:
778 _cleanup_free_
char *buffer
= NULL
;
779 const char *range
, *shift
;
781 range
= strchr(optarg
, ':');
783 buffer
= strndup(optarg
, range
- optarg
);
789 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
790 log_error("Failed to parse UID range: %s", range
);
796 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
797 log_error("Failed to parse UID: %s", optarg
);
805 case ARG_KILL_SIGNAL
:
806 arg_kill_signal
= signal_from_string_try_harder(optarg
);
807 if (arg_kill_signal
< 0) {
808 log_error("Cannot parse signal: %s", optarg
);
812 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
817 /* no → do not read files
818 * yes → read files, do not override cmdline, trust only subset
819 * override → read files, override cmdline, trust only subset
820 * trusted → read files, do not override cmdline, trust all
823 r
= parse_boolean(optarg
);
825 if (streq(optarg
, "trusted")) {
826 mask_all_settings
= false;
827 mask_no_settings
= false;
828 arg_settings_trusted
= true;
830 } else if (streq(optarg
, "override")) {
831 mask_all_settings
= false;
832 mask_no_settings
= true;
833 arg_settings_trusted
= -1;
835 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
838 mask_all_settings
= false;
839 mask_no_settings
= false;
840 arg_settings_trusted
= -1;
843 mask_all_settings
= true;
844 mask_no_settings
= false;
845 arg_settings_trusted
= false;
854 assert_not_reached("Unhandled option");
857 if (arg_share_system
)
858 arg_register
= false;
860 if (arg_boot
&& arg_share_system
) {
861 log_error("--boot and --share-system may not be combined.");
865 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
866 log_error("--keep-unit may not be used when invoked from a user session.");
870 if (arg_directory
&& arg_image
) {
871 log_error("--directory= and --image= may not be combined.");
875 if (arg_template
&& arg_image
) {
876 log_error("--template= and --image= may not be combined.");
880 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
881 log_error("--template= needs --directory= or --machine=.");
885 if (arg_ephemeral
&& arg_template
) {
886 log_error("--ephemeral and --template= may not be combined.");
890 if (arg_ephemeral
&& arg_image
) {
891 log_error("--ephemeral and --image= may not be combined.");
895 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
896 log_error("--ephemeral and --link-journal= may not be combined.");
900 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
901 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
904 arg_parameters
= strv_copy(argv
+ optind
);
908 arg_settings_mask
|= SETTING_BOOT
;
911 /* Load all settings from .nspawn files */
912 if (mask_no_settings
)
913 arg_settings_mask
= 0;
915 /* Don't load any settings from .nspawn files */
916 if (mask_all_settings
)
917 arg_settings_mask
= _SETTINGS_MASK_ALL
;
919 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
921 r
= detect_unified_cgroup_hierarchy();
928 static int verify_arguments(void) {
930 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
931 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
935 if (arg_expose_ports
&& !arg_private_network
) {
936 log_error("Cannot use --port= without private networking.");
940 if (arg_boot
&& arg_kill_signal
<= 0)
941 arg_kill_signal
= SIGRTMIN
+3;
946 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
952 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
955 if (uid
!= UID_INVALID
) {
956 uid
+= arg_uid_shift
;
958 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
962 if (gid
!= GID_INVALID
) {
963 gid
+= (gid_t
) arg_uid_shift
;
965 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
969 if (lchown(p
, uid
, gid
) < 0)
975 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
978 q
= prefix_roota(root
, path
);
979 if (mkdir(q
, mode
) < 0) {
985 return userns_lchown(q
, uid
, gid
);
988 static int setup_timezone(const char *dest
) {
989 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
990 const char *where
, *check
, *what
;
996 /* Fix the timezone, if possible */
997 r
= readlink_malloc("/etc/localtime", &p
);
999 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1003 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1005 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1007 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1011 where
= prefix_roota(dest
, "/etc/localtime");
1012 r
= readlink_malloc(where
, &q
);
1014 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1016 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1018 /* Already pointing to the right place? Then do nothing .. */
1019 if (y
&& streq(y
, z
))
1023 check
= strjoina("/usr/share/zoneinfo/", z
);
1024 check
= prefix_root(dest
, check
);
1025 if (laccess(check
, F_OK
) < 0) {
1026 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1031 if (r
< 0 && errno
!= ENOENT
) {
1032 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1036 what
= strjoina("../usr/share/zoneinfo/", z
);
1037 if (symlink(what
, where
) < 0) {
1038 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1042 r
= userns_lchown(where
, 0, 0);
1044 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1049 static int setup_resolv_conf(const char *dest
) {
1050 const char *where
= NULL
;
1055 if (arg_private_network
)
1058 /* Fix resolv.conf, if possible */
1059 where
= prefix_roota(dest
, "/etc/resolv.conf");
1061 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1063 /* If the file already exists as symlink, let's
1064 * suppress the warning, under the assumption that
1065 * resolved or something similar runs inside and the
1066 * symlink points there.
1068 * If the disk image is read-only, there's also no
1069 * point in complaining.
1071 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1072 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1076 r
= userns_lchown(where
, 0, 0);
1078 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1083 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1087 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1088 SD_ID128_FORMAT_VAL(id
));
1093 static int setup_boot_id(const char *dest
) {
1094 const char *from
, *to
;
1095 sd_id128_t rnd
= {};
1099 if (arg_share_system
)
1102 /* Generate a new randomized boot ID, so that each boot-up of
1103 * the container gets a new one */
1105 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1106 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1108 r
= sd_id128_randomize(&rnd
);
1110 return log_error_errno(r
, "Failed to generate random boot id: %m");
1112 id128_format_as_uuid(rnd
, as_uuid
);
1114 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1116 return log_error_errno(r
, "Failed to write boot id: %m");
1118 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1119 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1120 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1121 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1127 static int copy_devnodes(const char *dest
) {
1129 static const char devnodes
[] =
1140 _cleanup_umask_ mode_t u
;
1146 /* Create /dev/net, so that we can create /dev/net/tun in it */
1147 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1148 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1150 NULSTR_FOREACH(d
, devnodes
) {
1151 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1154 from
= strappend("/dev/", d
);
1155 to
= prefix_root(dest
, from
);
1157 if (stat(from
, &st
) < 0) {
1159 if (errno
!= ENOENT
)
1160 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1162 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1164 log_error("%s is not a char or block device, cannot copy.", from
);
1168 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1170 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1172 /* Some systems abusively restrict mknod but
1173 * allow bind mounts. */
1176 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1177 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1178 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1181 r
= userns_lchown(to
, 0, 0);
1183 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1190 static int setup_pts(const char *dest
) {
1191 _cleanup_free_
char *options
= NULL
;
1195 if (arg_selinux_apifs_context
)
1196 (void) asprintf(&options
,
1197 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1198 arg_uid_shift
+ TTY_GID
,
1199 arg_selinux_apifs_context
);
1202 (void) asprintf(&options
,
1203 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1204 arg_uid_shift
+ TTY_GID
);
1209 /* Mount /dev/pts itself */
1210 p
= prefix_roota(dest
, "/dev/pts");
1211 if (mkdir(p
, 0755) < 0)
1212 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1213 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1214 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1215 if (userns_lchown(p
, 0, 0) < 0)
1216 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1218 /* Create /dev/ptmx symlink */
1219 p
= prefix_roota(dest
, "/dev/ptmx");
1220 if (symlink("pts/ptmx", p
) < 0)
1221 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1222 if (userns_lchown(p
, 0, 0) < 0)
1223 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1225 /* And fix /dev/pts/ptmx ownership */
1226 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1227 if (userns_lchown(p
, 0, 0) < 0)
1228 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
1233 static int setup_dev_console(const char *dest
, const char *console
) {
1234 _cleanup_umask_ mode_t u
;
1243 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1245 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1247 /* We need to bind mount the right tty to /dev/console since
1248 * ptys can only exist on pts file systems. To have something
1249 * to bind mount things on we create a empty regular file. */
1251 to
= prefix_roota(dest
, "/dev/console");
1254 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1256 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1257 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
1262 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1263 const char *from
, *to
;
1264 _cleanup_umask_ mode_t u
;
1267 assert(kmsg_socket
>= 0);
1271 /* We create the kmsg FIFO as /run/kmsg, but immediately
1272 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1273 * on the reading side behave very similar to /proc/kmsg,
1274 * their writing side behaves differently from /dev/kmsg in
1275 * that writing blocks when nothing is reading. In order to
1276 * avoid any problems with containers deadlocking due to this
1277 * we simply make /dev/kmsg unavailable to the container. */
1278 from
= prefix_roota(dest
, "/run/kmsg");
1279 to
= prefix_roota(dest
, "/proc/kmsg");
1281 if (mkfifo(from
, 0600) < 0)
1282 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1283 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1284 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1286 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1288 return log_error_errno(errno
, "Failed to open fifo: %m");
1290 /* Store away the fd in the socket, so that it stays open as
1291 * long as we run the child */
1292 r
= send_one_fd(kmsg_socket
, fd
, 0);
1296 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1298 /* And now make the FIFO unavailable as /run/kmsg... */
1299 (void) unlink(from
);
1304 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1305 union in_addr_union
*exposed
= userdata
;
1311 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
1315 static int setup_hostname(void) {
1317 if (arg_share_system
)
1320 if (sethostname_idempotent(arg_machine
) < 0)
/* setup_journal(): connect the journal directory of the container with the
 * one of the host, according to arg_link_journal.
 *
 * directory - root of the container tree
 *
 * Visible flow:
 *  - read the machine ID from <directory>/etc/machine-id and parse it;
 *    LINK_AUTO gets special treatment when the file is missing or empty
 *  - compare it with the host machine ID and refuse to link when both match
 *  - create /var, /var/log and /var/log/journal inside the container
 *  - p is /var/log/journal/<id> on the host, q is the same path below
 *    directory; neither may already be a mount point
 *  - depending on what p currently is (symlink, directory, missing) and on
 *    the link mode, either symlink p to q (LINK_GUEST) or bind mount p onto
 *    q (final mount() call)
 *
 * Fix in this revision: userns_mkdir() and readlink_and_make_absolute()
 * report failure through their return value, which is stored in r. The log
 * calls that follow them used errno, which those helpers are not shown to
 * set; they now log r, matching the other userns_mkdir() call sites in this
 * function.
 *
 * NOTE(review): this listing omits some lines of the function (declarations
 * of r and id, several guards and returns); confirm against the full file. */
1326 static int setup_journal(const char *directory
) {
1327 sd_id128_t machine_id
, this_id
;
1328 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1329 const char *etc_machine_id
, *p
, *q
;
1333 /* Don't link journals in ephemeral mode */
1337 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1339 r
= read_one_line_file(etc_machine_id
, &b
);
1340 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1343 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1346 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1349 /* Verify validity */
1350 r
= sd_id128_from_string(id
, &machine_id
);
1352 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1354 r
= sd_id128_get_machine(&this_id
);
1356 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1358 if (sd_id128_equal(machine_id
, this_id
)) {
1359 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1360 "Host and machine ids are equal (%s): refusing to link journals", id
);
1361 if (arg_link_journal
== LINK_AUTO
)
1366 if (arg_link_journal
== LINK_NO
)
1369 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1371 return log_error_errno(r
, "Failed to create /var: %m");
1373 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1375 return log_error_errno(r
, "Failed to create /var/log: %m");
1377 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1379 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
/* p: journal directory for this machine ID on the host,
 * q: the same path inside the container tree. */
1381 p
= strjoina("/var/log/journal/", id
);
1382 q
= prefix_roota(directory
, p
);
1384 if (path_is_mount_point(p
, 0) > 0) {
1385 if (arg_link_journal
!= LINK_AUTO
) {
1386 log_error("%s: already a mount point, refusing to use for journal", p
);
1393 if (path_is_mount_point(q
, 0) > 0) {
1394 if (arg_link_journal
!= LINK_AUTO
) {
1395 log_error("%s: already a mount point, refusing to use for journal", q
);
1402 r
= readlink_and_make_absolute(p
, &d
);
1404 if ((arg_link_journal
== LINK_GUEST
||
1405 arg_link_journal
== LINK_AUTO
) &&
1408 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
/* The failure code is in r, not in errno. */
1410 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1415 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1416 } else if (r
== -EINVAL
) {
1418 if (arg_link_journal
== LINK_GUEST
&&
1421 if (errno
== ENOTDIR
) {
1422 log_error("%s already exists and is neither a symlink nor a directory", p
);
1425 log_error_errno(errno
, "Failed to remove %s: %m", p
);
1429 } else if (r
!= -ENOENT
) {
/* r still holds the readlink_and_make_absolute() result here. */
1430 log_error_errno(r
, "readlink(%s) failed: %m", p
);
1434 if (arg_link_journal
== LINK_GUEST
) {
1436 if (symlink(q
, p
) < 0) {
1437 if (arg_link_journal_try
) {
1438 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1441 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1446 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
/* The failure code is in r, not in errno. */
1448 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1452 if (arg_link_journal
== LINK_HOST
) {
1453 /* don't create parents here -- if the host doesn't have
1454 * permanent journal set up, don't force it here */
1457 if (arg_link_journal_try
) {
1458 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1461 log_error_errno(errno
, "Failed to create %s: %m", p
);
1466 } else if (access(p
, F_OK
) < 0)
1469 if (dir_is_empty(q
) == 0)
1470 log_warning("%s is not empty, proceeding anyway.", q
);
1472 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
/* The failure code is in r, not in errno. */
1474 log_error_errno(r
, "Failed to create %s: %m", q
);
1478 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1479 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
/* drop_capabilities(): pass the bitwise complement of arg_retain to
 * capability_bounding_set_drop() and return its result unchanged. */
1484 static int drop_capabilities(void) {
1485 return capability_bounding_set_drop(~arg_retain
, false);
/* reset_audit_loginuid(): reset the audit login UID of this process.
 *
 * Visible steps: read /proc/self/loginuid, compare it with the string
 * 4294967295 (the value written below, i.e. the unset marker used by this
 * code) and, when it differs, write that value back. A failing write
 * produces the long explanatory message at the end.
 *
 * The arg_share_system check comes first; the statement it guards is not
 * part of this listing. */
1488 static int reset_audit_loginuid(void) {
1489 _cleanup_free_
char *p
= NULL
;
1492 if (arg_share_system
)
1495 r
= read_one_line_file("/proc/self/loginuid", &p
);
1499 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1501 /* Already reset? */
1502 if (streq(p
, "4294967295"))
1505 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1508 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1509 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1510 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1511 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1512 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* setup_seccomp(): build and load a seccomp filter for the container.
 *
 * Visible steps:
 *  - start from a filter whose default action is SCMP_ACT_ALLOW
 *  - add the secondary architectures
 *  - walk the blacklist table: an entry is skipped when its capability bit
 *    is set in arg_retain, otherwise the syscall is made to fail with EPERM
 *  - add a rule that makes socket-style calls with AF_NETLINK as argument 0
 *    and NETLINK_AUDIT as argument 2 fail with EAFNOSUPPORT
 *  - clear the SCMP_FLTATR_CTL_NNP attribute, then load the filter
 *  - release the filter context at the end
 *
 * NOTE(review): this listing omits some lines of the function (the
 * syscall_num member and name of the table, declarations of r and i, the
 * error checks and the cleanup label); confirm against the full file. */
1520 static int setup_seccomp(void) {
/* Pairs of (capability, syscall): the syscall is blocked unless the
 * capability is retained. */
1523 static const struct {
1524 uint64_t capability
;
1527 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1528 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1529 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1530 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1531 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1532 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1533 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1534 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1535 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1536 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1539 scmp_filter_ctx seccomp
;
1543 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1547 r
= seccomp_add_secondary_archs(seccomp
);
1549 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
1553 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1554 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1557 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1559 continue; /* unknown syscall */
1561 log_error_errno(r
, "Failed to block syscall: %m");
1568 Audit is broken in containers, much of the userspace audit
1569 hookup will fail if running inside a container. We don't
1570 care and just turn off creation of audit sockets.
1572 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1573 with EAFNOSUPPORT which audit userspace uses as indication
1574 that audit is disabled in the kernel.
1577 r
= seccomp_rule_add(
1579 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1582 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1583 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1585 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1589 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1591 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1595 r
= seccomp_load(seccomp
);
1597 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1602 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1607 seccomp_release(seccomp
);
/* setup_propagate(): set up the directory used to propagate mounts from the
 * host into the container.
 *
 * root - root of the container tree
 *
 * Visible steps:
 *  - create /run/systemd/nspawn/propagate/<arg_machine> on the host
 *    (best effort, results ignored)
 *  - create /run/systemd, /run/systemd/nspawn and
 *    /run/systemd/nspawn/incoming inside the container
 *  - bind mount the host directory onto the incoming directory, then
 *    remount that bind mount read-only
 *
 * Fix in this revision: the two mount() failure messages passed errno to
 * log_error_errno() but had no %m in the format string, so the reason for
 * the failure was never printed. Both now end in ": %m" like every other
 * message in this function.
 *
 * NOTE(review): the userns_mkdir() failures below are logged with errno,
 * while other callers in this file log the value userns_mkdir() returns.
 * Left unchanged here because fixing it needs a local variable whose
 * declaration is outside this listing. */
1615 static int setup_propagate(const char *root
) {
1618 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1619 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1620 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1621 (void) mkdir_p(p
, 0600);
1623 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
1624 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
1626 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1627 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
1629 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1630 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
1632 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1633 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1634 return log_error_errno(errno
, "Failed to install propagation bind mount: %m");
/* A bind mount only becomes read-only through a separate remount. */
1636 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1637 return log_error_errno(errno
, "Failed to make propagation mount read-only: %m");
/* setup_image(): open arg_image and make it available as a block device.
 *
 * device_path - out: path of the block device to use (asserted non-NULL)
 * loop_nr     - out: loop device number; its assignment is not part of this
 *               listing
 *
 * Visible flow:
 *  - open arg_image read-only or read-write depending on arg_read_only
 *  - if it already is a block device, its path is duplicated
 *  - anything that is not a regular file is rejected
 *  - otherwise a free loop device is requested from /dev/loop-control,
 *    opened, attached to the image with LOOP_SET_FD and configured with
 *    LOOP_SET_STATUS64 (autoclear + partition scan, plus read-only when
 *    requested); its path is handed back through device_path
 *
 * Fix in this revision: the "not a regular file or block device" message
 * was emitted through log_error_errno(errno, "...: %m"). No call has failed
 * at that point (fstat() succeeded), so errno is stale and the text after
 * the colon was unrelated to the problem. It is now a plain log_error().
 *
 * NOTE(review): this listing omits some lines of the function (declarations
 * of st, nr, r and p, the error checks and the returns); confirm against
 * the full file. */
1642 static int setup_image(char **device_path
, int *loop_nr
) {
1643 struct loop_info64 info
= {
1644 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1646 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1647 _cleanup_free_
char* loopdev
= NULL
;
1651 assert(device_path
);
1655 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1657 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1659 if (fstat(fd
, &st
) < 0)
1660 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1662 if (S_ISBLK(st
.st_mode
)) {
1665 p
= strdup(arg_image
);
1679 if (!S_ISREG(st
.st_mode
)) {
/* Not a system call failure, so there is no errno to report. */
1680 log_error("%s is not a regular file or block device.", arg_image
);
1684 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1686 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1688 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1690 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1692 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1695 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1697 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1699 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1700 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1703 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1705 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1706 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1708 *device_path
= loopdev
;
1719 #define PARTITION_TABLE_BLURB \
1720 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1721 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1722 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1723 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1724 "to be bootable with systemd-nspawn."
/* dissect_image(): find the root, home and srv partitions of a disk image.
 *
 * root_device / root_device_rw - out: device node chosen as root and
 *                                whether it may be mounted writable
 * home_device / home_device_rw - out: same for the home partition
 * srv_device  / srv_device_rw  - out: same for the srv partition
 * (the first parameter, an fd referred to as fd below, and the secondary
 * out parameter are declared on lines missing from this listing)
 *
 * Visible flow:
 *  - probe the partition table with libblkid; only PTTYPE gpt and dos are
 *    accepted
 *  - enumerate the child devices of the block device through udev and
 *    compare their count with what blkid reports; when they differ the
 *    partition table is re-read with BLKRRPART, retrying up to 20 times
 *    with 50ms pauses
 *  - for every partition device: on GPT, skip entries flagged
 *    GPT_FLAG_NO_AUTO and classify by type UUID (home, srv, native root,
 *    secondary root, generic Linux), preferring the lowest partition
 *    number of each kind; on MBR, accept only entries with flags 0x80 and
 *    type 0x83
 *  - pick the root device in the order native root, secondary root,
 *    generic (only when exactly one generic partition was seen)
 *  - hand the results back through the out parameters
 *
 * The final log_error() belongs to the build variant without blkid support.
 *
 * NOTE(review): this listing omits many lines of the function; confirm
 * details against the full file before changing code here. */
1726 static int dissect_image(
1728 char **root_device
, bool *root_device_rw
,
1729 char **home_device
, bool *home_device_rw
,
1730 char **srv_device
, bool *srv_device_rw
,
1734 int home_nr
= -1, srv_nr
= -1;
1735 #ifdef GPT_ROOT_NATIVE
1738 #ifdef GPT_ROOT_SECONDARY
1739 int secondary_root_nr
= -1;
1741 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1742 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1743 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1744 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1745 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1746 struct udev_list_entry
*first
, *item
;
1747 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1748 bool is_gpt
, is_mbr
, multiple_generic
= false;
1749 const char *pttype
= NULL
;
1756 assert(root_device
);
1757 assert(home_device
);
1762 b
= blkid_new_probe();
1767 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1772 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1776 blkid_probe_enable_partitions(b
, 1);
1777 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
/* The result codes -2 and 1 are reported as "no partition table
 * identified"; any other non-zero result is a probe failure. */
1780 r
= blkid_do_safeprobe(b
);
1781 if (r
== -2 || r
== 1) {
1782 log_error("Failed to identify any partition table on\n"
1784 PARTITION_TABLE_BLURB
, arg_image
);
1786 } else if (r
!= 0) {
1789 log_error_errno(errno
, "Failed to probe: %m");
1793 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1795 is_gpt
= streq_ptr(pttype
, "gpt");
1796 is_mbr
= streq_ptr(pttype
, "dos");
1798 if (!is_gpt
&& !is_mbr
) {
1799 log_error("No GPT or MBR partition table discovered on\n"
1801 PARTITION_TABLE_BLURB
, arg_image
);
1806 pl
= blkid_probe_get_partitions(b
);
1811 log_error("Failed to list partitions of %s", arg_image
);
1819 if (fstat(fd
, &st
) < 0)
1820 return log_error_errno(errno
, "Failed to stat block device: %m");
1822 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1830 log_error("Kernel partitions never appeared.");
1834 e
= udev_enumerate_new(udev
);
1838 r
= udev_enumerate_add_match_parent(e
, d
);
1842 r
= udev_enumerate_scan_devices(e
);
1844 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1846 /* Count the partitions enumerated by the kernel */
1848 first
= udev_enumerate_get_list_entry(e
);
1849 udev_list_entry_foreach(item
, first
)
1852 /* Count the partitions enumerated by blkid */
1853 m
= blkid_partlist_numof_partitions(pl
);
1857 log_error("blkid and kernel partition list do not match.");
1863 /* The kernel has probed fewer partitions than
1864 * blkid? Maybe the kernel prober is still
1865 * running or it got EBUSY because udev
1866 * already opened the device. Let's reprobe
1867 * the device, which is a synchronous call
1868 * that waits until probing is complete. */
1870 for (j
= 0; j
< 20; j
++) {
1872 r
= ioctl(fd
, BLKRRPART
, 0);
1875 if (r
>= 0 || r
!= -EBUSY
)
1878 /* If something else has the device
1879 * open, such as an udev rule, the
1880 * ioctl will return EBUSY. Since
1881 * there's no way to wait until it
1882 * isn't busy anymore, let's just wait
1883 * a bit, and try again.
1885 * This is really something they
1886 * should fix in the kernel! */
1888 usleep(50 * USEC_PER_MSEC
);
1892 return log_error_errno(r
, "Failed to reread partition table: %m");
1895 e
= udev_enumerate_unref(e
);
/* Second pass: classify every partition device found by udev. */
1898 first
= udev_enumerate_get_list_entry(e
);
1899 udev_list_entry_foreach(item
, first
) {
1900 _cleanup_udev_device_unref_
struct udev_device
*q
;
1902 unsigned long long flags
;
1908 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1913 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1917 qn
= udev_device_get_devnum(q
);
/* The whole-disk device itself shows up in the enumeration too. */
1921 if (st
.st_rdev
== qn
)
1924 node
= udev_device_get_devnode(q
);
1928 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1932 flags
= blkid_partition_get_flags(pp
);
1934 nr
= blkid_partition_get_partno(pp
);
1942 if (flags
& GPT_FLAG_NO_AUTO
)
1945 stype
= blkid_partition_get_type_string(pp
);
1949 if (sd_id128_from_string(stype
, &type_id
) < 0)
1952 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1954 if (home
&& nr
>= home_nr
)
1958 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1960 r
= free_and_strdup(&home
, node
);
1964 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
1966 if (srv
&& nr
>= srv_nr
)
1970 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1972 r
= free_and_strdup(&srv
, node
);
1976 #ifdef GPT_ROOT_NATIVE
1977 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
1979 if (root
&& nr
>= root_nr
)
1983 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1985 r
= free_and_strdup(&root
, node
);
1990 #ifdef GPT_ROOT_SECONDARY
1991 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
1993 if (secondary_root
&& nr
>= secondary_root_nr
)
1996 secondary_root_nr
= nr
;
1997 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1999 r
= free_and_strdup(&secondary_root
, node
);
2004 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2007 multiple_generic
= true;
2009 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2011 r
= free_and_strdup(&generic
, node
);
2017 } else if (is_mbr
) {
2020 if (flags
!= 0x80) /* Bootable flag */
2023 type
= blkid_partition_get_type(pp
);
2024 if (type
!= 0x83) /* Linux partition */
2028 multiple_generic
= true;
2032 r
= free_and_strdup(&root
, node
);
/* Hand the selected devices to the caller. */
2040 *root_device
= root
;
2043 *root_device_rw
= root_rw
;
2045 } else if (secondary_root
) {
2046 *root_device
= secondary_root
;
2047 secondary_root
= NULL
;
2049 *root_device_rw
= secondary_root_rw
;
2051 } else if (generic
) {
2053 /* There were no partitions with precise meanings
2054 * around, but we found generic partitions. In this
2055 * case, if there's only one, we can go ahead and boot
2056 * it, otherwise we bail out, because we really cannot
2057 * make any sense of it. */
2059 if (multiple_generic
) {
2060 log_error("Identified multiple bootable Linux partitions on\n"
2062 PARTITION_TABLE_BLURB
, arg_image
);
2066 *root_device
= generic
;
2069 *root_device_rw
= generic_rw
;
2072 log_error("Failed to identify root partition in disk image\n"
2074 PARTITION_TABLE_BLURB
, arg_image
);
2079 *home_device
= home
;
2082 *home_device_rw
= home_rw
;
2089 *srv_device_rw
= srv_rw
;
2094 log_error("--image= is not supported, compiled without blkid support.");
/* mount_device(): detect the file system on a block device and mount it.
 *
 * what      - device node to mount
 * where     - base directory of the mount
 * directory - path appended to where to form the mount point
 * rw        - mount writable when true, MS_RDONLY otherwise
 *
 * Visible flow: probe the superblock type with libblkid, refuse
 * crypto_LUKS, then mount() with MS_NODEV. The final log_error() belongs to
 * the build variant without blkid support.
 *
 * Fix in this revision: the result of blkid_do_safeprobe() was compared
 * against -1 and 1 for the "cannot determine file system type" case.
 * libblkid documents 1 as "nothing detected", -2 as "ambivalent result"
 * and -1 as an error, and dissect_image() in this file tests -2 and 1 for
 * the equivalent case. With -1 here, a real probe error was reported as an
 * undeterminable type, and an ambivalent result fell into the probe
 * failure branch and was logged with a stale errno. The check now uses -2.
 *
 * NOTE(review): this listing omits some lines of the function (declaration
 * of r, the guard around the strjoina() call, the returns); confirm against
 * the full file. */
2099 static int mount_device(const char *what
, const char *where
, const char *directory
, bool rw
) {
2101 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
2102 const char *fstype
, *p
;
2112 p
= strjoina(where
, directory
);
2117 b
= blkid_new_probe_from_filename(what
);
2121 log_error_errno(errno
, "Failed to allocate prober for %s: %m", what
);
2125 blkid_probe_enable_superblocks(b
, 1);
2126 blkid_probe_set_superblocks_flags(b
, BLKID_SUBLKS_TYPE
);
/* -2: ambivalent result, 1: nothing detected. -1 is a real error and is
 * handled by the branch below. */
2129 r
= blkid_do_safeprobe(b
);
2130 if (r
== -2 || r
== 1) {
2131 log_error("Cannot determine file system type of %s", what
);
2133 } else if (r
!= 0) {
2136 log_error_errno(errno
, "Failed to probe %s: %m", what
);
2141 if (blkid_probe_lookup_value(b
, "TYPE", &fstype
, NULL
) < 0) {
2144 log_error("Failed to determine file system type of %s", what
);
2148 if (streq(fstype
, "crypto_LUKS")) {
2149 log_error("nspawn currently does not support LUKS disk images.");
2153 if (mount(what
, p
, fstype
, MS_NODEV
|(rw
? 0 : MS_RDONLY
), NULL
) < 0)
2154 return log_error_errno(errno
, "Failed to mount %s: %m", what
);
2158 log_error("--image= is not supported, compiled without blkid support.");
/* mount_devices(): mount the partitions found in a disk image.
 *
 * root_device / root_device_rw - root partition and its writable flag
 * home_device / home_device_rw - home partition and its writable flag
 * srv_device  / srv_device_rw  - srv partition and its writable flag
 * (the first parameter is declared on a line missing from this listing)
 *
 * Each device is passed to mount_device() with arg_directory as base: the
 * root device with no sub-path, the home device at /home and the srv
 * device at /srv. A failure is logged and returned.
 *
 * NOTE(review): the conditions that decide whether each device is mounted
 * at all are not part of this listing. */
2163 static int mount_devices(
2165 const char *root_device
, bool root_device_rw
,
2166 const char *home_device
, bool home_device_rw
,
2167 const char *srv_device
, bool srv_device_rw
) {
2173 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2175 return log_error_errno(r
, "Failed to mount root directory: %m");
2179 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2181 return log_error_errno(r
, "Failed to mount home directory: %m");
2185 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2187 return log_error_errno(r
, "Failed to mount server data directory: %m");
/* loop_remove(): detach and remove a loop device (best effort).
 *
 * nr       - loop device number passed to LOOP_CTL_REMOVE
 * image_fd - optional pointer to an fd; when it holds a valid fd,
 *            LOOP_CLR_FD is issued on it and the fd is closed and the
 *            variable updated with the safe_close() result
 *
 * Failures are only logged (debug or warning level); nothing is returned. */
2193 static void loop_remove(int nr
, int *image_fd
) {
2194 _cleanup_close_
int control
= -1;
2200 if (image_fd
&& *image_fd
>= 0) {
2201 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2203 log_debug_errno(errno
, "Failed to close loop image: %m");
2204 *image_fd
= safe_close(*image_fd
);
2207 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2209 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2213 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2215 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2220 * < 0 : wait_for_terminate() failed to get the state of the
2221 * container, the container was terminated by a signal, or
2222 * failed for an unknown reason. No change is made to the
2223 * container argument.
2224 * > 0 : The program executed in the container terminated with an
2225 * error. The exit code of the program executed in the
2226 * container is returned. The container argument has been set
2227 * to CONTAINER_TERMINATED.
2228 * 0 : The container is being rebooted, has been shut down or exited
2229 * successfully. The container argument has been set to either
2230 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2232 * That is, success is indicated by a return value of zero, and an
2233 * error is indicated by a non-zero value.
/* wait_for_container(): wait for the container process and classify how it
 * ended.
 *
 * pid       - process to wait for via wait_for_terminate()
 * container - out: set to CONTAINER_TERMINATED or CONTAINER_REBOOTED in
 *             the branches shown below
 *
 * Visible behaviour, keyed on status.si_code:
 *  - normal exit: logs success or the error code, sets *container to
 *    CONTAINER_TERMINATED and returns status.si_status
 *  - killed by SIGINT: treated as a shutdown (CONTAINER_TERMINATED)
 *  - killed by SIGHUP: treated as a reboot (CONTAINER_REBOOTED)
 *  - any other signal: logged as an error with the signal name
 *  - anything else: logged as failure for an unknown reason
 *
 * NOTE(review): the case labels and the return statements of the signal
 * branches are not part of this listing. */
2235 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2239 r
= wait_for_terminate(pid
, &status
);
2241 return log_warning_errno(r
, "Failed to wait for container: %m");
2243 switch (status
.si_code
) {
2246 if (status
.si_status
== 0) {
2247 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2250 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2252 *container
= CONTAINER_TERMINATED
;
2253 return status
.si_status
;
2256 if (status
.si_status
== SIGINT
) {
2258 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2259 *container
= CONTAINER_TERMINATED
;
2262 } else if (status
.si_status
== SIGHUP
) {
2264 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2265 *container
= CONTAINER_REBOOTED
;
2269 /* CLD_KILLED fallthrough */
2272 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2276 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* on_orderly_shutdown(): signal handler callback of the event loop.
 *
 * s        - the signal event source
 * si       - signal details (not used in the visible lines)
 * userdata - carries the container PID, decoded with PTR_TO_UINT32()
 *
 * Sends arg_kill_signal to the PID. When that works, the user is told that
 * a second SIGTERM terminates immediately and the userdata of the source
 * is cleared. The visible last statement asks the event loop to exit with
 * code 0.
 *
 * NOTE(review): the guard around the kill() branch and the returns are not
 * part of this listing. */
2283 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2286 pid
= PTR_TO_UINT32(userdata
);
2288 if (kill(pid
, arg_kill_signal
) >= 0) {
2289 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2290 sd_event_source_set_userdata(s
, NULL
);
2295 sd_event_exit(sd_event_source_get_event(s
), 0);
/* determine_names(): fill in whichever of arg_directory, arg_image and
 * arg_machine were not given on the command line.
 *
 * Visible flow:
 *  - with a template and a machine name but no directory, the directory
 *    becomes /var/lib/machines/<machine>
 *  - with neither image nor directory, the machine name is looked up with
 *    image_find(); a raw image sets arg_image, anything else sets
 *    arg_directory, and a read-only image forces arg_read_only. The
 *    current working directory is used in a branch whose condition is not
 *    part of this listing.
 *  - a missing machine name is derived from the host name (directory is /)
 *    or from the basename of the image or directory, then cleaned up and
 *    validated
 *  - in ephemeral mode a random 64 bit hex suffix is appended
 *
 * Fix in this revision: the "No image for machine" message ended in
 * ": %m" although it is emitted with plain log_error() on a not-found
 * result, where no errno value describes the problem; the stale errno text
 * is dropped. The comment about the template directory now names the
 * directory the code actually uses.
 *
 * NOTE(review): this listing omits some lines of the function (declaration
 * of r, several guards and returns); confirm against the full file. */
2299 static int determine_names(void) {
2302 if (arg_template
&& !arg_directory
&& arg_machine
) {
2304 /* If --template= was specified then we should not
2305 * search for a machine, but instead create a new one
2306 * in /var/lib/machines/. */
2308 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2313 if (!arg_image
&& !arg_directory
) {
2315 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2317 r
= image_find(arg_machine
, &i
);
2319 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
/* Not found is not a system call failure, so no %m here. */
2321 log_error("No image for machine '%s'.", arg_machine
);
2325 if (i
->type
== IMAGE_RAW
)
2326 r
= set_sanitized_path(&arg_image
, i
->path
);
2328 r
= set_sanitized_path(&arg_directory
, i
->path
);
2330 return log_error_errno(r
, "Invalid image directory: %m");
2333 arg_read_only
= arg_read_only
|| i
->read_only
;
2335 arg_directory
= get_current_dir_name();
2337 if (!arg_directory
&& !arg_machine
) {
2338 log_error("Failed to determine path, please use -D or -i.");
2344 if (arg_directory
&& path_equal(arg_directory
, "/"))
2345 arg_machine
= gethostname_malloc();
2347 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2352 hostname_cleanup(arg_machine
);
2353 if (!machine_name_is_valid(arg_machine
)) {
2354 log_error("Failed to determine machine name automatically, please use -M.");
2358 if (arg_ephemeral
) {
2361 /* Add a random suffix when this is an
2362 * ephemeral machine, so that we can run many
2363 * instances at once without manually having
2364 * to specify -M each time. */
2366 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
/* determine_uid_shift(): settle the UID base and range used for the user
 * namespace.
 *
 * directory - container root whose ownership is inspected
 *
 * When arg_uid_shift is still UID_INVALID, the directory is stat()ed and
 * the shift becomes the owner UID with the low 16 bits masked off. The
 * owner GID, masked the same way, must give the same value. The range is
 * then set to 0x10000. After that the shift is checked so that shift plus
 * range cannot exceed the largest uid_t value, and the result is logged.
 *
 * NOTE(review): the guard that limits this to user namespace mode and the
 * returns are not part of this listing. */
2377 static int determine_uid_shift(const char *directory
) {
2385 if (arg_uid_shift
== UID_INVALID
) {
2388 r
= stat(directory
, &st
);
2390 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
/* Keep only the upper 16 bits: the base of the 64K block that owns the
 * directory. */
2392 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2394 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2395 log_error("UID and GID base of %s don't match.", directory
);
2399 arg_uid_range
= UINT32_C(0x10000);
2402 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2403 log_error("UID base too high for UID range.");
2407 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
/* inner_child(): body of the process that runs inside all requested
 * namespaces and finally executes the container payload.
 *
 * directory - container root (further parameters such as barrier,
 *             secondary, kmsg_socket, rtnl_socket and fds are declared on
 *             lines missing from this listing)
 *
 * Visible flow:
 *  - synchronise with the parent over the barrier (points #1 to #4) while
 *    the parent writes the UID map and sets up cgroups
 *  - mount API file systems, sysfs and the writable systemd cgroup
 *  - become root inside the namespace, set up boot ID and kmsg
 *  - send the rtnl socket when ports are exposed, drop capabilities, apply
 *    personality and SELinux exec context, switch to the target user
 *  - build the environment (TERM, HOME, USER, LOGNAME, container_uuid,
 *    LISTEN_FDS, LISTEN_PID) and merge it with arg_setenv
 *  - close remaining fds and exec: an init binary from a fixed list, the
 *    given command line, or a login shell
 *
 * NOTE(review): this listing omits many lines of the function, and two
 * comments near the end lost their closing line; confirm against the full
 * file before changing code here. */
2411 static int inner_child(
2413 const char *directory
,
2419 _cleanup_free_
char *home
= NULL
;
2421 const char *envp
[] = {
2422 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2423 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2428 NULL
, /* container_uuid */
2429 NULL
, /* LISTEN_FDS */
2430 NULL
, /* LISTEN_PID */
2434 _cleanup_strv_free_
char **env_use
= NULL
;
2439 assert(kmsg_socket
>= 0);
2444 /* Tell the parent, that it now can write the UID map. */
2445 (void) barrier_place(barrier
); /* #1 */
2447 /* Wait until the parent wrote the UID map */
2448 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2449 log_error("Parent died too early");
/* NOTE(review): arguments 4 and 5 here are (arg_uid_shift,
 * arg_private_network), while the mount_all() call in outer_child passes
 * (arg_private_network, arg_uid_shift) in those positions. One of the two
 * orders is presumably wrong; check the mount_all() prototype. */
2454 r
= mount_all(NULL
, arg_userns
, true, arg_uid_shift
, arg_private_network
, arg_uid_range
, arg_selinux_apifs_context
);
2458 r
= mount_sysfs(NULL
);
2462 /* Wait until we are cgroup-ified, so that we
2463 * can mount the right cgroup path writable */
2464 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2465 log_error("Parent died too early");
2469 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2473 r
= reset_uid_gid();
2475 return log_error_errno(r
, "Couldn't become new root: %m");
2477 r
= setup_boot_id(NULL
);
2481 r
= setup_kmsg(NULL
, kmsg_socket
);
2484 kmsg_socket
= safe_close(kmsg_socket
);
2489 return log_error_errno(errno
, "setsid() failed: %m");
2491 if (arg_private_network
)
2494 if (arg_expose_ports
) {
2495 r
= expose_port_send_rtnl(rtnl_socket
);
2498 rtnl_socket
= safe_close(rtnl_socket
);
/* NOTE(review): drop_capabilities() returns the result of
 * capability_bounding_set_drop(); whether errno is meaningful after a
 * failure is not visible here - consider logging the return value. */
2501 if (drop_capabilities() < 0)
2502 return log_error_errno(errno
, "drop_capabilities() failed: %m");
2506 if (arg_personality
!= PERSONALITY_INVALID
) {
2507 if (personality(arg_personality
) < 0)
2508 return log_error_errno(errno
, "personality() failed: %m");
2509 } else if (secondary
) {
2510 if (personality(PER_LINUX32
) < 0)
2511 return log_error_errno(errno
, "personality() failed: %m");
2515 if (arg_selinux_context
)
2516 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2517 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2520 r
= change_uid_gid(arg_user
, &home
);
/* From here on the NULL slots of envp are filled in order, n_env being
 * the index of the next free slot. */
2524 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2528 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2529 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2530 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2533 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2536 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2540 if (fdset_size(fds
) > 0) {
2541 r
= fdset_cloexec(fds
, false);
2543 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2545 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2546 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2550 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2554 /* Let the parent know that we are ready and
2555 * wait until the parent is ready with the
2557 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2558 log_error("Parent died too early");
2562 /* Now, explicitly close the log, so that we
2563 * then can close all remaining fds. Closing
2564 * the log explicitly first has the benefit
2565 * that the logging subsystem knows about it,
2566 * and is thus ready to be reopened should we
2567 * need it again. Note that the other fds
2568 * closed here are at least the locking and
2571 (void) fdset_close_others(fds
);
2577 /* Automatically search for the init system */
2579 m
= 1 + strv_length(arg_parameters
);
2580 a
= newa(char*, m
+ 1);
2581 if (strv_isempty(arg_parameters
))
2584 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
/* Try the known init locations in turn; execve() only returns on
 * failure, so each later attempt runs only if the previous one failed. */
2586 a
[0] = (char*) "/usr/lib/systemd/systemd";
2587 execve(a
[0], a
, env_use
);
2589 a
[0] = (char*) "/lib/systemd/systemd";
2590 execve(a
[0], a
, env_use
);
2592 a
[0] = (char*) "/sbin/init";
2593 execve(a
[0], a
, env_use
);
2594 } else if (!strv_isempty(arg_parameters
))
2595 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2597 chdir(home
?: "/root");
2598 execle("/bin/bash", "-bash", NULL
, env_use
);
2599 execle("/bin/sh", "-sh", NULL
, env_use
);
2603 return log_error_errno(errno
, "execv() failed: %m");
/* outer_child(): body of the intermediate process that prepares the
 * container tree and then clones the inner child.
 *
 * directory        - container root
 * console          - terminal opened as stdin/stdout/stderr
 * root_device/_rw, home_device/_rw, srv_device/_rw
 *                  - partitions handed on to mount_devices()
 * uid_shift_socket - socket used to report arg_uid_shift to the parent
 * (further parameters such as barrier, secondary, pid_socket, kmsg_socket,
 * rtnl_socket and fds are declared on lines missing from this listing)
 *
 * Visible flow:
 *  - request SIGKILL on parent death, replace stdio with the console
 *  - reset the audit login UID, mark all mounts as slave, mount the image
 *    partitions, determine and send the UID shift
 *  - turn the directory into a bind mount and populate it: volatile
 *    setup, base file system, optional read-only remount, API mounts,
 *    device nodes, pts, propagation directory, console, seccomp, timezone,
 *    resolv.conf, journal, custom mounts, cgroups
 *  - move the root, then raw_clone() the inner child with the namespace
 *    flags selected by arg_share_system, arg_private_network and
 *    arg_userns
 *  - in the child: close the sockets it does not need and run
 *    inner_child(); in this process: send the child PID over pid_socket
 *    and close the remaining sockets
 *
 * NOTE(review): this listing omits many lines of the function (error
 * checks after most r = ... calls, the pid == 0 branch structure); confirm
 * against the full file before changing code here. */
2606 static int outer_child(
2608 const char *directory
,
2609 const char *console
,
2610 const char *root_device
, bool root_device_rw
,
2611 const char *home_device
, bool home_device_rw
,
2612 const char *srv_device
, bool srv_device_rw
,
2618 int uid_shift_socket
,
2628 assert(pid_socket
>= 0);
2629 assert(kmsg_socket
>= 0);
2633 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2634 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
/* With fds 0-2 closed, the terminal opened next is expected to land on
 * STDIN_FILENO; that is what the comparison below checks. */
2637 close_nointr(STDIN_FILENO
);
2638 close_nointr(STDOUT_FILENO
);
2639 close_nointr(STDERR_FILENO
);
2641 r
= open_terminal(console
, O_RDWR
);
2642 if (r
!= STDIN_FILENO
) {
2648 return log_error_errno(r
, "Failed to open console: %m");
2651 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2652 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2653 return log_error_errno(errno
, "Failed to duplicate console: %m");
2656 r
= reset_audit_loginuid();
2660 /* Mark everything as slave, so that we still
2661 * receive mounts from the real root, but don't
2662 * propagate mounts to the real root. */
2663 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2664 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2666 r
= mount_devices(directory
,
2667 root_device
, root_device_rw
,
2668 home_device
, home_device_rw
,
2669 srv_device
, srv_device_rw
);
2673 r
= determine_uid_shift(directory
);
2678 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2680 return log_error_errno(errno
, "Failed to send UID shift: %m");
2681 if (l
!= sizeof(arg_uid_shift
)) {
2682 log_error("Short write while sending UID shift.");
2687 /* Turn directory into bind mount */
2688 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2689 return log_error_errno(errno
, "Failed to make bind mount: %m");
2691 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2695 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2699 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2703 if (arg_read_only
) {
2704 r
= bind_remount_recursive(directory
, true);
2706 return log_error_errno(r
, "Failed to make tree read-only: %m");
/* NOTE(review): arguments 4 and 5 here are (arg_private_network,
 * arg_uid_shift); the mount_all() call in inner_child passes them the
 * other way round. Check the mount_all() prototype. */
2709 r
= mount_all(directory
, arg_userns
, false, arg_private_network
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2713 r
= copy_devnodes(directory
);
2717 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2719 r
= setup_pts(directory
);
2723 r
= setup_propagate(directory
);
2727 r
= setup_dev_console(directory
, console
);
2731 r
= setup_seccomp();
2735 r
= setup_timezone(directory
);
2739 r
= setup_resolv_conf(directory
);
2743 r
= setup_journal(directory
);
2747 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2751 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2755 r
= mount_move_root(directory
);
2757 return log_error_errno(r
, "Failed to move root directory: %m");
/* A new mount namespace is always requested; the other namespaces depend
 * on the command line settings. */
2759 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2760 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2761 (arg_private_network
? CLONE_NEWNET
: 0) |
2762 (arg_userns
? CLONE_NEWUSER
: 0),
2765 return log_error_errno(errno
, "Failed to fork inner child: %m");
2767 pid_socket
= safe_close(pid_socket
);
2768 uid_shift_socket
= safe_close(uid_shift_socket
);
2770 /* The inner child has all namespaces that are
2771 * requested, so that we all are owned by the user if
2772 * user namespaces are turned on. */
2774 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2776 _exit(EXIT_FAILURE
);
2778 _exit(EXIT_SUCCESS
);
2781 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2783 return log_error_errno(errno
, "Failed to send PID: %m");
2784 if (l
!= sizeof(pid
)) {
2785 log_error("Short write while sending PID.");
2789 pid_socket
= safe_close(pid_socket
);
2790 kmsg_socket
= safe_close(kmsg_socket
);
2791 rtnl_socket
= safe_close(rtnl_socket
);
/* setup_uid_map(): write the UID and GID maps of a process in a new user
 * namespace.
 *
 * pid - process whose /proc/<pid>/uid_map and /proc/<pid>/gid_map are
 *       written
 *
 * A single mapping line is written to both files: inside ID 0 maps to
 * arg_uid_shift outside, for arg_uid_range IDs. The path buffer is sized
 * for the longer literal part plus a decimal uid_t and is reused for the
 * gid_map path, which has the same length. A failing write is logged and
 * returned.
 *
 * NOTE(review): the declaration of r and the error checks are not part of
 * this listing. */
2796 static int setup_uid_map(pid_t pid
) {
2797 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2802 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2803 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2804 r
= write_string_file(uid_map
, line
, 0);
2806 return log_error_errno(r
, "Failed to write UID map: %m");
2808 /* We always assign the same UID and GID ranges */
2809 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2810 r
= write_string_file(uid_map
, line
, 0);
2812 return log_error_errno(r
, "Failed to write GID map: %m");
/* Locate and load the "<arg_machine>.nspawn" settings file and merge its
 * contents into the arg_* globals.
 *
 * Search order visible here: /etc/systemd/nspawn, then /run/systemd/nspawn,
 * then a file of the same name next to arg_image or arg_directory.
 *
 * A setting is only considered when its SETTING_* bit is NOT set in
 * arg_settings_mask and the loaded settings actually carry a value for it.
 * Several groups (capabilities, machine ID, custom mounts, network, ports)
 * additionally consult arg_settings_trusted and log an "Ignoring ..."
 * warning when the file is not trusted.
 *
 * NOTE(review): this listing carries the original line numbers inline and
 * has gaps in them; the declarations of `fn`, `i`, `r` and `plus`, the
 * fopen() calls, several `else` arms and the return statements are not
 * visible here -- confirm against the full file. */
2817 static int load_settings(void) {
/* `settings`, `f` and `p` are declared with cleanup attributes
 * (settings_freep, fclose, free). */
2818 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2819 _cleanup_fclose_
FILE *f
= NULL
;
2820 _cleanup_free_
char *p
= NULL
;
2824 /* If all settings are masked, there's no point in looking for
2825 * the settings file */
2826 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* The settings file name is the machine name with ".nspawn" appended. */
2829 fn
= strjoina(arg_machine
, ".nspawn");
2831 /* We first look in the admin's directories in /etc and /run */
2832 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2833 _cleanup_free_
char *j
= NULL
;
2835 j
= strjoin(i
, "/", fn
, NULL
);
2844 /* By default we trust configuration from /etc and /run */
/* A negative arg_settings_trusted is treated as "not decided yet"; only
 * then is the default applied. */
2845 if (arg_settings_trusted
< 0)
2846 arg_settings_trusted
= true;
/* ENOENT is tolerated here; any other errno is reported as a failure to
 * open the candidate path. */
2851 if (errno
!= ENOENT
)
2852 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2856 /* After that, let's look for a file next to the
2857 * actual image we shall boot. */
2860 p
= file_in_same_dir(arg_image
, fn
);
2863 } else if (arg_directory
) {
2864 p
= file_in_same_dir(arg_directory
, fn
);
2871 if (!f
&& errno
!= ENOENT
)
2872 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2874 /* By default we do not trust configuration from /var/lib/machines */
2875 if (arg_settings_trusted
< 0)
2876 arg_settings_trusted
= false;
2883 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
2885 r
= settings_load(f
, p
, &settings
);
2889 /* Copy over bits from the settings, unless they have been
2890 * explicitly masked by command line switches. */
/* Boot=/Parameters=: the parameter vector is moved, not copied -- the old
 * arg_parameters is freed, the pointer is taken over and the field in
 * `settings` is cleared (presumably so that the settings_freep cleanup does
 * not free it a second time). The same move pattern recurs below. */
2892 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
2893 settings
->boot
>= 0) {
2894 arg_boot
= settings
->boot
;
2896 strv_free(arg_parameters
);
2897 arg_parameters
= settings
->parameters
;
2898 settings
->parameters
= NULL
;
2901 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2902 settings
->environment
) {
2903 strv_free(arg_setenv
);
2904 arg_setenv
= settings
->environment
;
2905 settings
->environment
= NULL
;
2908 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2911 arg_user
= settings
->user
;
2912 settings
->user
= NULL
;
/* Capabilities: `plus` starts from the capability set of the settings
 * file, and CAP_NET_ADMIN is added when the settings ask for a private
 * network. For untrusted files a warning is logged if Capability= was set.
 * The drop_capability bits are cleared from arg_retain. */
2915 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
2918 plus
= settings
->capability
;
2919 if (settings_private_network(settings
))
2920 plus
|= (1ULL << CAP_NET_ADMIN
);
2922 if (!arg_settings_trusted
&& plus
!= 0) {
2923 if (settings
->capability
!= 0)
2924 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
2928 arg_retain
&= ~settings
->drop_capability
;
2931 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2932 settings
->kill_signal
> 0)
2933 arg_kill_signal
= settings
->kill_signal
;
2935 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2936 settings
->personality
!= PERSONALITY_INVALID
)
2937 arg_personality
= settings
->personality
;
2939 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2940 !sd_id128_is_null(settings
->machine_id
)) {
2942 if (!arg_settings_trusted
)
2943 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
2945 arg_uuid
= settings
->machine_id
;
2948 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
2949 settings
->read_only
>= 0)
2950 arg_read_only
= settings
->read_only
;
2952 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
2953 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
2954 arg_volatile_mode
= settings
->volatile_mode
;
/* Custom mounts: the existing array is released and both the array and its
 * element count are taken over from `settings`, which is then reset to an
 * empty list. */
2956 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
2957 settings
->n_custom_mounts
> 0) {
2959 if (!arg_settings_trusted
)
2960 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
2962 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
2963 arg_custom_mounts
= settings
->custom_mounts
;
2964 arg_n_custom_mounts
= settings
->n_custom_mounts
;
2966 settings
->custom_mounts
= NULL
;
2967 settings
->n_custom_mounts
= 0;
/* Network: this group is entered when any of the network-related fields of
 * the settings file is set. */
2971 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
2972 (settings
->private_network
>= 0 ||
2973 settings
->network_veth
>= 0 ||
2974 settings
->network_bridge
||
2975 settings
->network_interfaces
||
2976 settings
->network_macvlan
||
2977 settings
->network_ipvlan
)) {
2979 if (!arg_settings_trusted
)
2980 log_warning("Ignoring network settings, file %s is not trusted.", p
);
/* NOTE(review): as shown, arg_network_veth is assigned from
 * settings_private_network(), the same call used for arg_private_network on
 * the next statement -- confirm this is intended rather than a veth-specific
 * helper. */
2982 arg_network_veth
= settings_private_network(settings
);
2983 arg_private_network
= settings_private_network(settings
);
2985 strv_free(arg_network_interfaces
);
2986 arg_network_interfaces
= settings
->network_interfaces
;
2987 settings
->network_interfaces
= NULL
;
2989 strv_free(arg_network_macvlan
);
2990 arg_network_macvlan
= settings
->network_macvlan
;
2991 settings
->network_macvlan
= NULL
;
2993 strv_free(arg_network_ipvlan
);
2994 arg_network_ipvlan
= settings
->network_ipvlan
;
2995 settings
->network_ipvlan
= NULL
;
2997 free(arg_network_bridge
);
2998 arg_network_bridge
= settings
->network_bridge
;
2999 settings
->network_bridge
= NULL
;
3003 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3004 settings
->expose_ports
) {
3006 if (!arg_settings_trusted
)
3007 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3009 expose_port_free_all(arg_expose_ports
);
3010 arg_expose_ports
= settings
->expose_ports
;
3011 settings
->expose_ports
= NULL
;
/* Program entry point.
 *
 * Stages visible in this listing, in order:
 *   1. parse arguments, require euid 0, determine names, load the settings
 *      file and verify the arguments;
 *   2. collect file descriptors passed in via sd_listen_fds();
 *   3. prepare the root: either a directory (optionally an ephemeral or
 *      template-populated btrfs snapshot, protected by image_path_lock()) or
 *      a disk image (locked, attached via setup_image() and dissected);
 *   4. allocate a pseudo TTY, block signals and become a child subreaper;
 *   5. create the barrier and socket pairs, clone the outer child into a new
 *      mount namespace, and in the parent complete UID map, network,
 *      machine registration and cgroup setup in lock-step with the child
 *      through the barrier;
 *   6. run an sd-event loop that forwards the PTY and reacts to signals, then
 *      wait for the container and act on its exit status;
 *   7. clean up (loop device, snapshot, propagation directory, exposed
 *      ports, arg_* allocations) and return.
 *
 * Returns EXIT_FAILURE when the last `r` is negative, otherwise `ret`.
 *
 * NOTE(review): this listing carries the original line numbers inline and
 * has gaps in them; most `if (r < 0)` guards, `goto` statements, closing
 * braces and a few declarations (e.g. `pid`, `l`, `p`, `k`, `interactive`,
 * `last_char`, `mask_chld`) are not visible here -- confirm against the full
 * file. */
3018 int main(int argc
, char *argv
[]) {
3020 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3021 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3022 _cleanup_close_
int master
= -1, image_fd
= -1;
3023 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3024 int r
, n_fd_passed
, loop_nr
= -1;
3025 char veth_name
[IFNAMSIZ
];
3026 bool secondary
= false, remove_subvol
= false;
3029 int ret
= EXIT_SUCCESS
;
3030 union in_addr_union exposed
= {};
3031 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3034 log_parse_environment();
3037 r
= parse_argv(argc
, argv
);
/* Running as an effective UID other than 0 is rejected. */
3041 if (geteuid() != 0) {
3042 log_error("Need to be root.");
3046 r
= determine_names();
3050 r
= load_settings();
3054 r
= verify_arguments();
/* Collect file descriptors handed to us by the service manager, if any,
 * into `fds`. */
3058 n_fd_passed
= sd_listen_fds(false);
3059 if (n_fd_passed
> 0) {
3060 r
= fdset_new_listen_fds(&fds
, false);
3062 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory-based container. */
3067 if (arg_directory
) {
3070 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3071 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3076 if (arg_ephemeral
) {
3077 _cleanup_free_
char *np
= NULL
;
3079 /* If the specified path is a mount point we
3080 * generate the new snapshot immediately
3081 * inside it under a random name. However if
3082 * the specified is not a mount point we
3083 * create the new snapshot in the parent
3084 * directory, just next to it. */
3085 r
= path_is_mount_point(arg_directory
, 0);
3087 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3091 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3093 r
= tempfn_random(arg_directory
, "machine.", &np
);
3095 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* The lock is shared when the tree is read-only and exclusive otherwise;
 * LOCK_NB is always added to the flags. */
3099 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3101 log_error_errno(r
, "Failed to lock %s: %m", np
);
3105 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3107 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3111 free(arg_directory
);
/* Remember that the snapshot has to be removed again during cleanup. */
3115 remove_subvol
= true;
3118 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3120 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3124 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate the directory from arg_template via a snapshot. */
3129 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3132 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3134 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3138 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity checks on the tree that is about to be used as root. */
3144 if (path_is_os_tree(arg_directory
) <= 0) {
3145 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3152 p
= strjoina(arg_directory
, "/usr/");
3153 if (laccess(p
, F_OK
) < 0) {
3154 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
/* Image-based container: a temporary directory created from this template
 * becomes arg_directory. */
3161 char template[] = "/tmp/nspawn-root-XXXXXX";
3164 assert(!arg_template
);
3166 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3168 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3172 r
= log_error_errno(r
, "Failed to create image lock: %m");
3176 if (!mkdtemp(template)) {
3177 log_error_errno(errno
, "Failed to create temporary directory: %m");
3182 arg_directory
= strdup(template);
3183 if (!arg_directory
) {
3188 image_fd
= setup_image(&device_path
, &loop_nr
);
/* dissect_image() fills in the root, home and srv device paths together
 * with their read-write flags. */
3194 r
= dissect_image(image_fd
,
3195 &root_device
, &root_device_rw
,
3196 &home_device
, &home_device_rw
,
3197 &srv_device
, &srv_device_rw
,
3203 r
= custom_mounts_prepare();
/* Both stdin and stdout have to be terminals for this expression to be
 * true. */
3208 isatty(STDIN_FILENO
) > 0 &&
3209 isatty(STDOUT_FILENO
) > 0;
/* Allocate the pseudo TTY master, look up the name of its slave side and
 * unlock it. */
3211 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3213 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3217 r
= ptsname_malloc(master
, &console
);
3219 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3223 if (unlockpt(master
) < 0) {
3224 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3229 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3230 arg_machine
, arg_image
?: arg_directory
);
/* Block the signals that are handled later on, and prepare a mask holding
 * only SIGCHLD for the temporary unblock/re-block further down. */
3232 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3234 assert_se(sigemptyset(&mask_chld
) == 0);
3235 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3237 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3238 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* State of a single container run: four socket pairs (kmsg, rtnl, PID and
 * UID shift) plus the barrier shared with the child. */
3243 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
3244 uid_shift_socket_pair
[2] = { -1, -1 };
3245 ContainerStatus container_status
;
3246 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3247 static const struct sigaction sa
= {
3248 .sa_handler
= nop_signal_handler
,
3249 .sa_flags
= SA_NOCLDSTOP
,
3253 _cleanup_event_unref_ sd_event
*event
= NULL
;
3254 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3255 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3258 r
= barrier_create(&barrier
);
3260 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3264 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3265 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3269 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3270 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3274 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3275 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3280 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3281 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3285 /* Child can be killed before execv(), so handle SIGCHLD
3286 * in order to interrupt parent's blocking calls and
3287 * give it a chance to call wait() and terminate. */
3288 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3290 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3294 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3296 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* Create the outer child; at this point only a new mount namespace is
 * requested. */
3300 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3302 if (errno
== EINVAL
)
3303 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3305 r
= log_error_errno(errno
, "clone() failed: %m");
3311 /* The outer child only has a file system namespace. */
3312 barrier_set_role(&barrier
, BARRIER_CHILD
);
/* In the child: drop the PTY master and index [0] of every socket pair,
 * then reset signal handlers and mask before running outer_child(). */
3314 master
= safe_close(master
);
3316 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3317 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3318 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3319 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3321 (void) reset_all_signal_handlers();
3322 (void) reset_signal_mask();
3324 r
= outer_child(&barrier
,
3327 root_device
, root_device_rw
,
3328 home_device
, home_device_rw
,
3329 srv_device
, srv_device_rw
,
3333 kmsg_socket_pair
[1],
3334 rtnl_socket_pair
[1],
3335 uid_shift_socket_pair
[1],
3338 _exit(EXIT_FAILURE
);
3340 _exit(EXIT_SUCCESS
);
/* In the parent: release the passed-in fd set and index [1] of every
 * socket pair. */
3343 barrier_set_role(&barrier
, BARRIER_PARENT
);
3345 fds
= fdset_free(fds
);
3347 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3348 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3349 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3350 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3352 /* Wait for the outer child. */
3353 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3362 /* And now retrieve the PID of the inner child. */
/* From here on `pid` holds the value received over the PID socket. */
3363 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3365 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3368 if (l
!= sizeof(pid
)) {
3369 log_error("Short read while reading inner child PID.");
3374 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3377 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3378 log_error("Child died too early.");
/* Receive the UID shift into arg_uid_shift, then write the UID/GID maps
 * for `pid`. */
3383 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3385 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3388 if (l
!= sizeof(arg_uid_shift
)) {
3389 log_error("Short read while reading UID shift.");
3394 r
= setup_uid_map(pid
);
3398 (void) barrier_place(&barrier
); /* #2 */
/* Network setup for the container process. */
3401 if (arg_private_network
) {
3403 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3407 if (arg_network_veth
) {
3408 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3414 if (arg_network_bridge
) {
3415 r
= setup_bridge(veth_name
, arg_network_bridge
);
3423 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3427 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
3433 r
= register_machine(
3440 arg_custom_mounts
, arg_n_custom_mounts
,
/* cgroup setup for the container process. */
3448 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3452 if (arg_keep_unit
) {
3453 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3458 r
= chown_cgroup(pid
, arg_uid_shift
);
3462 /* Notify the child that the parent is ready with all
3463 * its setup (including cgroup-ification), and that
3464 * the child can now hand over control to the code to
3465 * run inside the container. */
3466 (void) barrier_place(&barrier
); /* #3 */
3468 /* Block SIGCHLD here, before notifying child.
3469 * process_pty() will handle it with the other signals. */
3470 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3472 /* Reset signal to default */
3473 r
= default_signals(SIGCHLD
, -1);
3475 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3479 /* Let the child know that we are ready and wait that the child is completely ready now. */
3480 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3481 log_error("Child died too early.");
3488 "STATUS=Container running.\n"
3489 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
3491 r
= sd_event_new(&event
);
3493 log_error_errno(r
, "Failed to get default event source: %m");
/* Signal sources: SIGINT/SIGTERM go to on_orderly_shutdown() with the
 * container PID as userdata when a kill signal is configured; the variant
 * registered with a NULL handler is used otherwise (the `else` itself is not
 * visible in this listing). */
3497 if (arg_kill_signal
> 0) {
3498 /* Try to kill the init system on SIGINT or SIGTERM */
3499 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3500 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
3502 /* Immediately exit */
3503 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3504 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3507 /* simply exit on sigchld */
3508 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3510 if (arg_expose_ports
) {
3511 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3515 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3518 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* Forward the PTY; the forwarder is made read-only when the session is not
 * interactive. */
3520 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
3522 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3526 r
= sd_event_loop(event
);
3528 log_error_errno(r
, "Failed to run event loop: %m");
3532 pty_forward_get_last_char(forward
, &last_char
);
3534 forward
= pty_forward_free(forward
);
3536 if (!arg_quiet
&& last_char
!= '\n')
3539 /* Kill if it is not dead yet anyway */
3540 if (arg_register
&& !arg_keep_unit
)
3541 terminate_machine(pid
);
3543 /* Normally redundant, but better safe than sorry */
3546 r
= wait_for_container(pid
, &container_status
);
3550 /* We failed to wait for the container, or the
3551 * container exited abnormally */
3553 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3554 /* The container exited with a non-zero
3555 * status, or with zero status and no reboot
3561 /* CONTAINER_REBOOTED, loop again */
3563 if (arg_keep_unit
) {
3564 /* Special handling if we are running as a
3565 * service: instead of simply restarting the
3566 * machine we want to restart the entire
3567 * service, so let's inform systemd about this
3568 * with the special exit code 133. The service
3569 * file uses RestartForceExitStatus=133 so
3570 * that this results in a full nspawn
3571 * restart. This is necessary since we might
3572 * have cgroup parameters set we want to have
3579 expose_port_flush(arg_expose_ports
, &exposed
);
3585 "STATUS=Terminating...");
3590 /* Try to flush whatever is still queued in the pty */
3592 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
/* Cleanup: detach the loop device, remove the ephemeral snapshot if one was
 * created, remove the per-machine propagation directory, flush exposed
 * ports and release the arg_* allocations. */
3594 loop_remove(loop_nr
, &image_fd
);
3596 if (remove_subvol
&& arg_directory
) {
3599 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
3601 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
3607 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3608 (void) rm_rf(p
, REMOVE_ROOT
);
3611 expose_port_flush(arg_expose_ports
, &exposed
);
3613 free(arg_directory
);
3618 strv_free(arg_setenv
);
3619 free(arg_network_bridge
);
3620 strv_free(arg_network_interfaces
);
3621 strv_free(arg_network_macvlan
);
3622 strv_free(arg_network_ipvlan
);
3623 strv_free(arg_parameters
);
3624 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3625 expose_port_free_all(arg_expose_ports
);
/* A negative `r` always maps to EXIT_FAILURE; otherwise `ret` is the exit
 * code. */
3627 return r
< 0 ? EXIT_FAILURE
: ret
;