2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21 #include <blkid/blkid.h>
25 #include <linux/loop.h>
31 #include <selinux/selinux.h>
38 #include <sys/mount.h>
39 #include <sys/personality.h>
40 #include <sys/prctl.h>
41 #include <sys/types.h>
44 #include "sd-daemon.h"
47 #include "alloc-util.h"
49 #include "base-filesystem.h"
50 #include "blkid-util.h"
51 #include "btrfs-util.h"
53 #include "capability-util.h"
54 #include "cgroup-util.h"
56 #include "dev-setup.h"
61 #include "formats-util.h"
64 #include "hostname-util.h"
66 #include "loopback-setup.h"
67 #include "machine-id-setup.h"
68 #include "machine-image.h"
72 #include "mount-util.h"
73 #include "netlink-util.h"
74 #include "nspawn-cgroup.h"
75 #include "nspawn-expose-ports.h"
76 #include "nspawn-mount.h"
77 #include "nspawn-network.h"
78 #include "nspawn-register.h"
79 #include "nspawn-settings.h"
80 #include "nspawn-setuid.h"
81 #include "nspawn-stub-pid1.h"
82 #include "parse-util.h"
83 #include "path-util.h"
84 #include "process-util.h"
86 #include "random-util.h"
89 #include "seccomp-util.h"
91 #include "selinux-util.h"
92 #include "signal-util.h"
93 #include "socket-util.h"
94 #include "stat-util.h"
95 #include "stdio-util.h"
96 #include "string-util.h"
98 #include "terminal-util.h"
99 #include "udev-util.h"
100 #include "umask-util.h"
101 #include "user-util.h"
/* File-scope state of the tool: two enum declarations (only fragments of them
 * survive in this listing) followed by the arg_* settings that parse_argv()
 * fills in from the command line. Every variable starts at the default shown
 * in its initializer.
 * NOTE(review): this listing has dropped source lines (the embedded line
 * numbers skip), so the enum members and parts of the arg_retain initializer
 * are not visible here. */
104 typedef enum ContainerStatus
{
105 CONTAINER_TERMINATED
,
109 typedef enum LinkJournal
{
116 static char *arg_directory
= NULL
;
117 static char *arg_template
= NULL
;
118 static char *arg_chdir
= NULL
;
119 static char *arg_user
= NULL
;
120 static sd_id128_t arg_uuid
= {};
121 static char *arg_machine
= NULL
;
122 static const char *arg_selinux_context
= NULL
;
123 static const char *arg_selinux_apifs_context
= NULL
;
124 static const char *arg_slice
= NULL
;
125 static bool arg_private_network
= false;
126 static bool arg_read_only
= false;
127 static StartMode arg_start_mode
= START_PID1
;
128 static bool arg_ephemeral
= false;
129 static LinkJournal arg_link_journal
= LINK_AUTO
;
130 static bool arg_link_journal_try
= false;
/* Capability bit mask, one bit per CAP_* constant. parse_argv() later ORs in
 * the bits collected from --capability= and masks out those collected from
 * --drop-capability=. */
131 static uint64_t arg_retain
=
132 (1ULL << CAP_CHOWN
) |
133 (1ULL << CAP_DAC_OVERRIDE
) |
134 (1ULL << CAP_DAC_READ_SEARCH
) |
135 (1ULL << CAP_FOWNER
) |
136 (1ULL << CAP_FSETID
) |
137 (1ULL << CAP_IPC_OWNER
) |
139 (1ULL << CAP_LEASE
) |
140 (1ULL << CAP_LINUX_IMMUTABLE
) |
141 (1ULL << CAP_NET_BIND_SERVICE
) |
142 (1ULL << CAP_NET_BROADCAST
) |
143 (1ULL << CAP_NET_RAW
) |
144 (1ULL << CAP_SETGID
) |
145 (1ULL << CAP_SETFCAP
) |
146 (1ULL << CAP_SETPCAP
) |
147 (1ULL << CAP_SETUID
) |
148 (1ULL << CAP_SYS_ADMIN
) |
149 (1ULL << CAP_SYS_CHROOT
) |
150 (1ULL << CAP_SYS_NICE
) |
151 (1ULL << CAP_SYS_PTRACE
) |
152 (1ULL << CAP_SYS_TTY_CONFIG
) |
153 (1ULL << CAP_SYS_RESOURCE
) |
154 (1ULL << CAP_SYS_BOOT
) |
155 (1ULL << CAP_AUDIT_WRITE
) |
156 (1ULL << CAP_AUDIT_CONTROL
) |
/* Table of --bind=/--tmpfs=/--overlay= mounts and its element count. */
158 static CustomMount
*arg_custom_mounts
= NULL
;
159 static unsigned arg_n_custom_mounts
= 0;
160 static char **arg_setenv
= NULL
;
161 static bool arg_quiet
= false;
162 static bool arg_share_system
= false;
/* Registration is on by default; parse_argv() turns it off when
 * arg_share_system is set. */
163 static bool arg_register
= true;
164 static bool arg_keep_unit
= false;
165 static char **arg_network_interfaces
= NULL
;
166 static char **arg_network_macvlan
= NULL
;
167 static char **arg_network_ipvlan
= NULL
;
168 static bool arg_network_veth
= false;
169 static char **arg_network_veth_extra
= NULL
;
170 static char *arg_network_bridge
= NULL
;
171 static unsigned long arg_personality
= PERSONALITY_INVALID
;
172 static char *arg_image
= NULL
;
173 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
174 static ExposePort
*arg_expose_ports
= NULL
;
175 static char **arg_property
= NULL
;
/* The shift stays UID_INVALID until one is parsed from --private-users=;
 * the default range is 0x10000 (65536) IDs. */
176 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
177 static bool arg_userns
= false;
178 static int arg_kill_signal
= 0;
179 static bool arg_unified_cgroup_hierarchy
= false;
/* One SETTING_* bit per setting that was given on the command line. */
180 static SettingsMask arg_settings_mask
= 0;
/* Starts at -1; the --settings= handling in parse_argv() assigns true, false
 * or -1 again depending on the argument. */
181 static int arg_settings_trusted
= -1;
182 static char **arg_parameters
= NULL
;
/* Default name; parse_argv() assigns the value of
 * $SYSTEMD_NSPAWN_CONTAINER_SERVICE to it. */
183 static const char *arg_container_service_name
= "systemd-nspawn";
/* Print the command line usage summary with printf(). The leading %s of the
 * format string is filled with program_invocation_short_name.
 * The --bind-ro= syntax line carries the same balanced bracket form as the
 * --bind= line: PATH[:PATH[:OPTIONS]].
 * NOTE(review): some continuation lines of the help text and the closing
 * brace are absent from this listing. */
185 static void help(void) {
186 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
187 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
188 " -h --help Show this help\n"
189 " --version Print version string\n"
190 " -q --quiet Do not show status information\n"
191 " -D --directory=PATH Root directory for the container\n"
192 " --template=PATH Initialize root directory from template directory,\n"
194 " -x --ephemeral Run container with snapshot of root directory, and\n"
195 " remove it after exit\n"
196 " -i --image=PATH File system device or disk image for the container\n"
197 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
198 " -b --boot Boot up full system (i.e. invoke init)\n"
199 " --chdir=PATH Set working directory in the container\n"
200 " -u --user=USER Run the command under specified user or uid\n"
201 " -M --machine=NAME Set the machine name for the container\n"
202 " --uuid=UUID Set a specific machine UUID for the container\n"
203 " -S --slice=SLICE Place the container in the specified slice\n"
204 " --property=NAME=VALUE Set scope unit property\n"
205 " --private-users[=UIDBASE[:NUIDS]]\n"
206 " Run within user namespace\n"
207 " --private-network Disable network in container\n"
208 " --network-interface=INTERFACE\n"
209 " Assign an existing network interface to the\n"
211 " --network-macvlan=INTERFACE\n"
212 " Create a macvlan network interface based on an\n"
213 " existing network interface to the container\n"
214 " --network-ipvlan=INTERFACE\n"
215 " Create a ipvlan network interface based on an\n"
216 " existing network interface to the container\n"
217 " -n --network-veth Add a virtual Ethernet connection between host\n"
219 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
220 " Add an additional virtual Ethernet link between\n"
221 " host and container\n"
222 " --network-bridge=INTERFACE\n"
223 " Add a virtual Ethernet connection between host\n"
224 " and container and add it to an existing bridge on\n"
226 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
227 " Expose a container IP port on the host\n"
228 " -Z --selinux-context=SECLABEL\n"
229 " Set the SELinux security context to be used by\n"
230 " processes in the container\n"
231 " -L --selinux-apifs-context=SECLABEL\n"
232 " Set the SELinux security context to be used by\n"
233 " API/tmpfs file systems in the container\n"
234 " --capability=CAP In addition to the default, retain specified\n"
236 " --drop-capability=CAP Drop the specified capability from the default set\n"
237 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
238 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
239 " host, try-guest, try-host\n"
240 " -j Equivalent to --link-journal=try-guest\n"
241 " --read-only Mount the root directory read-only\n"
242 " --bind=PATH[:PATH[:OPTIONS]]\n"
243 " Bind mount a file or directory from the host into\n"
245 " --bind-ro=PATH[:PATH[:OPTIONS]]\n"
246 " Similar, but creates a read-only bind mount\n"
247 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
248 " --overlay=PATH[:PATH...]:PATH\n"
249 " Create an overlay mount from the host to \n"
251 " --overlay-ro=PATH[:PATH...]:PATH\n"
252 " Similar, but creates a read-only overlay mount\n"
253 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
254 " --share-system Share system namespaces with host\n"
255 " --register=BOOLEAN Register container as machine\n"
256 " --keep-unit Do not register a scope for the machine, reuse\n"
257 " the service unit nspawn is running in\n"
258 " --volatile[=MODE] Run the system in volatile mode\n"
259 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
260 , program_invocation_short_name
);
/* Prepare the custom mount table before it is used: sort arg_custom_mounts
 * with custom_mount_compare, then walk every entry. An entry whose
 * destination is "/" is reported as an error when user namespacing is on and
 * no explicit UID shift was given. For overlay entries a randomized work
 * directory name derived from m->source is stored in m->work_dir.
 * NOTE(review): declarations, the statements guarded by the visible
 * conditions and the return paths are absent from this listing, so the exact
 * return values are not shown here. */
264 static int custom_mounts_prepare(void) {
268 /* Ensure the mounts are applied prefix first. */
269 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
271 /* Allocate working directories for the overlay file systems that need it */
272 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
273 CustomMount
*m
= &arg_custom_mounts
[i
];
/* With an automatic shift (arg_uid_shift still UID_INVALID) a custom mount
 * on the root directory is refused. */
275 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
276 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
/* Presumably non-overlay entries are skipped here; the guarded statement is
 * not visible. */
280 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
289 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
291 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Decide whether the unified cgroup hierarchy is used and record the answer
 * in arg_unified_cgroup_hierarchy. The environment variable
 * UNIFIED_CGROUP_HIERARCHY is read first and parsed as a boolean; otherwise
 * the value is inherited from the host.
 * NOTE(review): the call that queries the host, and the conditions around the
 * two error returns, are absent from this listing. */
297 static int detect_unified_cgroup_hierarchy(void) {
301 /* Allow the user to control whether the unified hierarchy is used */
302 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
304 r
= parse_boolean(e
);
306 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
308 arg_unified_cgroup_hierarchy
= r
;
312 /* Otherwise inherit the default from the host system */
315 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
317 arg_unified_cgroup_hierarchy
= r
;
/* Parse the command line with getopt_long() into the file-scope arg_*
 * settings. Most options also set their SETTING_* bit in arg_settings_mask.
 * After the option loop the function rejects combinations that exclude each
 * other, copies the remaining argv entries into arg_parameters, applies the
 * --settings= decision to arg_settings_mask, folds the collected capability
 * bits into arg_retain (adding CAP_NET_ADMIN when private networking is on),
 * calls detect_unified_cgroup_hierarchy() and assigns the value of
 * $SYSTEMD_NSPAWN_CONTAINER_SERVICE to arg_container_service_name.
 *
 * NOTE(review): many lines are absent from this listing (case labels, the
 * "if (r < 0)" style guards, return and break statements, closing braces),
 * so control flow between the visible statements cannot be confirmed here.
 * NOTE(review): the check "arg_start_mode != START_PID1 && arg_share_system"
 * also matches START_PID2, yet its message only names --boot; consider
 * rewording it.
 * NOTE(review): the capability error path logs t, which is initialised to
 * NULL; confirm that extract_first_word() has filled it in on failure. */
321 static int parse_argv(int argc
, char *argv
[]) {
340 ARG_NETWORK_INTERFACE
,
344 ARG_NETWORK_VETH_EXTRA
,
/* Long option table; options without a short letter map to ARG_* values. */
355 static const struct option options
[] = {
356 { "help", no_argument
, NULL
, 'h' },
357 { "version", no_argument
, NULL
, ARG_VERSION
},
358 { "directory", required_argument
, NULL
, 'D' },
359 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
360 { "ephemeral", no_argument
, NULL
, 'x' },
361 { "user", required_argument
, NULL
, 'u' },
362 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
363 { "as-pid2", no_argument
, NULL
, 'a' },
364 { "boot", no_argument
, NULL
, 'b' },
365 { "uuid", required_argument
, NULL
, ARG_UUID
},
366 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
367 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
368 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
369 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
370 { "bind", required_argument
, NULL
, ARG_BIND
},
371 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
372 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
373 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
374 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
375 { "machine", required_argument
, NULL
, 'M' },
376 { "slice", required_argument
, NULL
, 'S' },
377 { "setenv", required_argument
, NULL
, ARG_SETENV
},
378 { "selinux-context", required_argument
, NULL
, 'Z' },
379 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
380 { "quiet", no_argument
, NULL
, 'q' },
381 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
382 { "register", required_argument
, NULL
, ARG_REGISTER
},
383 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
384 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
385 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
386 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
387 { "network-veth", no_argument
, NULL
, 'n' },
388 { "network-veth-extra", required_argument
, NULL
, ARG_NETWORK_VETH_EXTRA
},
389 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
390 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
391 { "image", required_argument
, NULL
, 'i' },
392 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
393 { "port", required_argument
, NULL
, 'p' },
394 { "property", required_argument
, NULL
, ARG_PROPERTY
},
395 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
396 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
397 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
398 { "chdir", required_argument
, NULL
, ARG_CHDIR
},
/* Bits collected from --capability= (plus) and --drop-capability= (minus);
 * they are applied to arg_retain after the option loop. */
404 uint64_t plus
= 0, minus
= 0;
/* Set by --settings=; after the loop they decide whether arg_settings_mask
 * is reset to 0 or set to _SETTINGS_MASK_ALL. */
405 bool mask_all_settings
= false, mask_no_settings
= false;
/* A leading '+' in the option string makes getopt_long() stop at the first
 * non-option argument. */
410 while ((c
= getopt_long(argc
, argv
, "+hD:u:abL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
422 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
428 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
434 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
440 arg_ephemeral
= true;
444 r
= free_and_strdup(&arg_user
, optarg
);
448 arg_settings_mask
|= SETTING_USER
;
/* The networking options whose handling is visible below also switch on
 * arg_private_network. */
451 case ARG_NETWORK_BRIDGE
:
452 r
= free_and_strdup(&arg_network_bridge
, optarg
);
459 arg_network_veth
= true;
460 arg_private_network
= true;
461 arg_settings_mask
|= SETTING_NETWORK
;
464 case ARG_NETWORK_VETH_EXTRA
:
465 r
= veth_extra_parse(&arg_network_veth_extra
, optarg
);
467 return log_error_errno(r
, "Failed to parse --network-veth-extra= parameter: %s", optarg
);
469 arg_private_network
= true;
470 arg_settings_mask
|= SETTING_NETWORK
;
473 case ARG_NETWORK_INTERFACE
:
474 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
477 arg_private_network
= true;
478 arg_settings_mask
|= SETTING_NETWORK
;
481 case ARG_NETWORK_MACVLAN
:
482 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
485 arg_private_network
= true;
486 arg_settings_mask
|= SETTING_NETWORK
;
489 case ARG_NETWORK_IPVLAN
:
490 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
495 case ARG_PRIVATE_NETWORK
:
496 arg_private_network
= true;
497 arg_settings_mask
|= SETTING_NETWORK
;
/* --boot and --as-pid2 select different start modes and reject each other. */
501 if (arg_start_mode
== START_PID2
) {
502 log_error("--boot and --as-pid2 may not be combined.");
506 arg_start_mode
= START_BOOT
;
507 arg_settings_mask
|= SETTING_START_MODE
;
511 if (arg_start_mode
== START_BOOT
) {
512 log_error("--boot and --as-pid2 may not be combined.");
516 arg_start_mode
= START_PID2
;
517 arg_settings_mask
|= SETTING_START_MODE
;
521 r
= sd_id128_from_string(optarg
, &arg_uuid
);
523 log_error("Invalid UUID: %s", optarg
);
527 arg_settings_mask
|= SETTING_MACHINE_ID
;
536 arg_machine
= mfree(arg_machine
);
538 if (!machine_name_is_valid(optarg
)) {
539 log_error("Invalid machine name: %s", optarg
);
543 r
= free_and_strdup(&arg_machine
, optarg
);
/* The SELinux contexts keep pointing at optarg; no copy is made. */
551 arg_selinux_context
= optarg
;
555 arg_selinux_apifs_context
= optarg
;
559 arg_read_only
= true;
560 arg_settings_mask
|= SETTING_READ_ONLY
;
/* Capability lists are split at ','. The word "all" selects every bit;
 * any other word is looked up with capability_from_name(). */
564 case ARG_DROP_CAPABILITY
: {
567 _cleanup_free_
char *t
= NULL
;
569 r
= extract_first_word(&p
, &t
, ",", 0);
571 return log_error_errno(r
, "Failed to parse capability %s.", t
);
576 if (streq(t
, "all")) {
577 if (c
== ARG_CAPABILITY
)
578 plus
= (uint64_t) -1;
580 minus
= (uint64_t) -1;
584 cap
= capability_from_name(t
);
586 log_error("Failed to parse capability %s.", t
);
590 if (c
== ARG_CAPABILITY
)
591 plus
|= 1ULL << (uint64_t) cap
;
593 minus
|= 1ULL << (uint64_t) cap
;
597 arg_settings_mask
|= SETTING_CAPABILITY
;
602 arg_link_journal
= LINK_GUEST
;
603 arg_link_journal_try
= true;
/* Each mode string sets the link target and whether linking is only
 * attempted (the try- variants). */
606 case ARG_LINK_JOURNAL
:
607 if (streq(optarg
, "auto")) {
608 arg_link_journal
= LINK_AUTO
;
609 arg_link_journal_try
= false;
610 } else if (streq(optarg
, "no")) {
611 arg_link_journal
= LINK_NO
;
612 arg_link_journal_try
= false;
613 } else if (streq(optarg
, "guest")) {
614 arg_link_journal
= LINK_GUEST
;
615 arg_link_journal_try
= false;
616 } else if (streq(optarg
, "host")) {
617 arg_link_journal
= LINK_HOST
;
618 arg_link_journal_try
= false;
619 } else if (streq(optarg
, "try-guest")) {
620 arg_link_journal
= LINK_GUEST
;
621 arg_link_journal_try
= true;
622 } else if (streq(optarg
, "try-host")) {
623 arg_link_journal
= LINK_HOST
;
624 arg_link_journal_try
= true;
626 log_error("Failed to parse link journal mode %s", optarg
);
/* --bind= and --bind-ro= share this call; the last argument is the
 * read-only flag. */
634 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
636 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
638 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
642 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
644 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
646 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
/* Overlay specifications are split at ':' into a list of absolute paths. */
650 case ARG_OVERLAY_RO
: {
651 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
652 _cleanup_strv_free_
char **lower
= NULL
;
657 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
661 log_error("Invalid overlay specification: %s", optarg
);
665 STRV_FOREACH(i
, lower
) {
666 if (!path_is_absolute(*i
)) {
667 log_error("Overlay path %s is not absolute.", *i
);
675 log_error("--overlay= needs at least two colon-separated directories specified.");
680 /* If two parameters are specified,
681 * the first one is the lower, the
682 * second one the upper directory. And
683 * we'll also define the destination
684 * mount point the same as the upper. */
688 destination
= strdup(upper
);
693 upper
= lower
[n
- 2];
694 destination
= lower
[n
- 1];
698 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
702 m
->destination
= destination
;
705 m
->read_only
= c
== ARG_OVERLAY_RO
;
/* Clearing the locals keeps their _cleanup_free_ handlers from freeing the
 * strings now referenced from m. */
707 upper
= destination
= NULL
;
710 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
717 if (!env_assignment_is_valid(optarg
)) {
718 log_error("Environment variable assignment '%s' is not valid.", optarg
);
722 n
= strv_env_set(arg_setenv
, optarg
);
726 strv_free(arg_setenv
);
729 arg_settings_mask
|= SETTING_ENVIRONMENT
;
737 case ARG_SHARE_SYSTEM
:
738 arg_share_system
= true;
742 r
= parse_boolean(optarg
);
744 log_error("Failed to parse --register= argument: %s", optarg
);
752 arg_keep_unit
= true;
755 case ARG_PERSONALITY
:
757 arg_personality
= personality_from_string(optarg
);
758 if (arg_personality
== PERSONALITY_INVALID
) {
759 log_error("Unknown or unsupported personality '%s'.", optarg
);
763 arg_settings_mask
|= SETTING_PERSONALITY
;
769 arg_volatile_mode
= VOLATILE_YES
;
773 m
= volatile_mode_from_string(optarg
);
775 log_error("Failed to parse --volatile= argument: %s", optarg
);
778 arg_volatile_mode
= m
;
781 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
785 r
= expose_port_parse(&arg_expose_ports
, optarg
);
787 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
789 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
791 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
795 if (strv_extend(&arg_property
, optarg
) < 0)
/* The optional argument is searched for ':'; the part in front of it is
 * copied into buffer, the range goes to arg_uid_range and the shift to
 * arg_uid_shift. A range of 0 is rejected. */
800 case ARG_PRIVATE_USERS
:
802 _cleanup_free_
char *buffer
= NULL
;
803 const char *range
, *shift
;
805 range
= strchr(optarg
, ':');
807 buffer
= strndup(optarg
, range
- optarg
);
813 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
814 log_error("Failed to parse UID range: %s", range
);
820 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
821 log_error("Failed to parse UID: %s", optarg
);
829 case ARG_KILL_SIGNAL
:
830 arg_kill_signal
= signal_from_string_try_harder(optarg
);
831 if (arg_kill_signal
< 0) {
832 log_error("Cannot parse signal: %s", optarg
);
836 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
841 /* no → do not read files
842 * yes → read files, do not override cmdline, trust only subset
843 * override → read files, override cmdline, trust only subset
844 * trusted → read files, do not override cmdline, trust all
847 r
= parse_boolean(optarg
);
849 if (streq(optarg
, "trusted")) {
850 mask_all_settings
= false;
851 mask_no_settings
= false;
852 arg_settings_trusted
= true;
854 } else if (streq(optarg
, "override")) {
855 mask_all_settings
= false;
856 mask_no_settings
= true;
857 arg_settings_trusted
= -1;
859 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
862 mask_all_settings
= false;
863 mask_no_settings
= false;
864 arg_settings_trusted
= -1;
867 mask_all_settings
= true;
868 mask_no_settings
= false;
869 arg_settings_trusted
= false;
875 if (!path_is_absolute(optarg
)) {
876 log_error("Working directory %s is not an absolute path.", optarg
);
880 r
= free_and_strdup(&arg_chdir
, optarg
);
884 arg_settings_mask
|= SETTING_WORKING_DIRECTORY
;
891 assert_not_reached("Unhandled option");
894 if (arg_share_system
)
895 arg_register
= false;
897 if (arg_start_mode
!= START_PID1
&& arg_share_system
) {
898 log_error("--boot and --share-system may not be combined.");
902 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
903 log_error("--keep-unit may not be used when invoked from a user session.");
907 if (arg_directory
&& arg_image
) {
908 log_error("--directory= and --image= may not be combined.");
912 if (arg_template
&& arg_image
) {
913 log_error("--template= and --image= may not be combined.");
917 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
918 log_error("--template= needs --directory= or --machine=.");
922 if (arg_ephemeral
&& arg_template
) {
923 log_error("--ephemeral and --template= may not be combined.");
927 if (arg_ephemeral
&& arg_image
) {
928 log_error("--ephemeral and --image= may not be combined.");
932 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
933 log_error("--ephemeral and --link-journal= may not be combined.");
937 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
938 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
941 arg_parameters
= strv_copy(argv
+ optind
);
945 arg_settings_mask
|= SETTING_START_MODE
;
948 /* Load all settings from .nspawn files */
949 if (mask_no_settings
)
950 arg_settings_mask
= 0;
952 /* Don't load any settings from .nspawn files */
953 if (mask_all_settings
)
954 arg_settings_mask
= _SETTINGS_MASK_ALL
;
956 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
958 r
= detect_unified_cgroup_hierarchy();
962 e
= getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
964 arg_container_service_name
= e
;
/* Cross-option checks on the arg_* settings: --read-only may not be combined
 * with a volatile mode, --port= needs private networking, and --port= is
 * reported as unsupported when built without libiptc. Finally a booted
 * container without a kill signal (arg_kill_signal <= 0) gets SIGRTMIN+3.
 * NOTE(review): the second arg_expose_ports check is presumably inside a
 * conditional-compilation block; preprocessor lines, return statements and
 * closing braces are absent from this listing. */
969 static int verify_arguments(void) {
971 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
972 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
976 if (arg_expose_ports
&& !arg_private_network
) {
977 log_error("Cannot use --port= without private networking.");
982 if (arg_expose_ports
) {
983 log_error("--port= is not supported, compiled without libiptc support.");
988 if (arg_start_mode
== START_BOOT
&& arg_kill_signal
<= 0)
989 arg_kill_signal
= SIGRTMIN
+3;
/* lchown() the path p after moving uid and gid into the container's ID range.
 * An ID that is not UID_INVALID / GID_INVALID has arg_uid_shift added to it
 * and is then checked against the window
 * [arg_uid_shift, arg_uid_shift + arg_uid_range); the "< arg_uid_shift" half
 * of that check catches wrap-around of the addition.
 * NOTE(review): the statements guarded by the visible conditions, including
 * every return, are absent from this listing, so the returned error codes
 * are not shown here. */
994 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
/* Nothing was requested when both IDs are the invalid marker. */
1000 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1003 if (uid
!= UID_INVALID
) {
1004 uid
+= arg_uid_shift
;
1006 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1010 if (gid
!= GID_INVALID
) {
1011 gid
+= (gid_t
) arg_uid_shift
;
1013 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1017 if (lchown(p
, uid
, gid
) < 0)
/* Create the directory path below root with the given mode, then pass the
 * resulting path to userns_lchown() with the requested uid and gid and
 * return its result. A mkdir() failure with EEXIST is treated specially.
 * NOTE(review): what happens on EEXIST and on other mkdir() errors is not
 * visible in this listing. */
1023 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
1026 q
= prefix_roota(root
, path
);
1027 if (mkdir(q
, mode
) < 0) {
1028 if (errno
== EEXIST
)
1033 return userns_lchown(q
, uid
, gid
);
/* Point <dest>/etc/localtime at the same zone as the host's /etc/localtime.
 * The host link target is read with readlink_malloc() and must lead into
 * /usr/share/zoneinfo/, either as "../usr/share/zoneinfo/" or as an absolute
 * path. If the container link already names the same zone nothing more is
 * needed. Otherwise the zone file is checked for existence inside the
 * container, and a "../usr/share/zoneinfo/<zone>" symlink is created at
 * <dest>/etc/localtime and handed to userns_lchown(where, 0, 0).
 * NOTE(review): declarations, guards, the removal of the old link and the
 * return statements are absent from this listing. */
1036 static int setup_timezone(const char *dest
) {
1037 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1038 const char *where
, *check
, *what
;
1044 /* Fix the timezone, if possible */
1045 r
= readlink_malloc("/etc/localtime", &p
);
1047 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z ends up as the zone name relative to the zoneinfo directory. */
1051 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1053 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1055 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
/* Same extraction for the link that already exists in the container. */
1059 where
= prefix_roota(dest
, "/etc/localtime");
1060 r
= readlink_malloc(where
, &q
);
1062 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1064 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1066 /* Already pointing to the right place? Then do nothing .. */
1067 if (y
&& streq(y
, z
))
1071 check
= strjoina("/usr/share/zoneinfo/", z
);
1072 check
= prefix_roota(dest
, check
);
1073 if (laccess(check
, F_OK
) < 0) {
1074 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
/* A missing old link (ENOENT) is not treated as a failure. */
1079 if (r
< 0 && errno
!= ENOENT
) {
1080 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1084 what
= strjoina("../usr/share/zoneinfo/", z
);
1085 if (symlink(what
, where
) < 0) {
1086 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1090 r
= userns_lchown(where
, 0, 0);
1092 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
/* Copy the host's /etc/resolv.conf to <dest>/etc/resolv.conf with copy_file()
 * (flags O_TRUNC|O_NOFOLLOW, mode 0644) and hand the copy to
 * userns_lchown(where, 0, 0). A copy failure is logged at debug level when
 * the error is -ELOOP or -EROFS and as a warning otherwise; a chown failure
 * is logged as a warning. The arg_private_network check comes first.
 * NOTE(review): the statement guarded by the arg_private_network check, the
 * error guards and the return statements are absent from this listing. */
1097 static int setup_resolv_conf(const char *dest
) {
1098 const char *where
= NULL
;
1103 if (arg_private_network
)
1106 /* Fix resolv.conf, if possible */
1107 where
= prefix_roota(dest
, "/etc/resolv.conf");
1109 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1111 /* If the file already exists as symlink, let's
1112 * suppress the warning, under the assumption that
1113 * resolved or something similar runs inside and the
1114 * symlink points there.
1116 * If the disk image is read-only, there's also no
1117 * point in complaining.
1119 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1120 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1124 r
= userns_lchown(where
, 0, 0);
1126 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
/* Format the 128-bit id into s using the 8-4-4-4-12 hexadecimal UUID layout.
 * The buffer s holds 37 bytes: 36 characters plus the terminating NUL.
 * NOTE(review): the formatting call that owns the visible arguments and the
 * return statement are absent from this listing. */
1131 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1135 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1136 SD_ID128_FORMAT_VAL(id
));
/* Give the container its own boot ID. A random 128-bit id is formatted as a
 * UUID, written to <dest>/run/proc-sys-kernel-random-boot-id and bind mounted
 * over <dest>/proc/sys/kernel/random/boot_id. The bind mount is then
 * remounted read-only; a failure of that remount is only logged as a
 * warning. The arg_share_system check comes first.
 * NOTE(review): the statement guarded by the arg_share_system check, the
 * error guards, the cleanup of the temporary file and the final return are
 * absent from this listing. */
1141 static int setup_boot_id(const char *dest
) {
1142 const char *from
, *to
;
1143 sd_id128_t rnd
= {};
1147 if (arg_share_system
)
1150 /* Generate a new randomized boot ID, so that each boot-up of
1151 * the container gets a new one */
1153 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1154 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1156 r
= sd_id128_randomize(&rnd
);
1158 return log_error_errno(r
, "Failed to generate random boot id: %m");
1160 id128_format_as_uuid(rnd
, as_uuid
);
1162 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1164 return log_error_errno(r
, "Failed to write boot id: %m");
/* The bind mount failure is kept in r; the read-only remount is only tried
 * when the bind mount worked. */
1166 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1167 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1168 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1169 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
/* Populate <dest>/dev with the device nodes named in the devnodes list.
 * <dest>/dev/net is created first. Each listed host node under /dev/ is
 * stat()ed: a missing node (ENOENT) is tolerated, anything that is neither a
 * character nor a block device is reported as an error, and otherwise the
 * node is recreated below dest with mknod(). When mknod() is not possible the
 * host node is bind mounted onto a placeholder instead. Every created node is
 * handed to userns_lchown(to, 0, 0).
 * The result of userns_mkdir() is stored in r before it is tested, so that
 * the error that is logged and returned is the one the call produced.
 * NOTE(review): the devnodes list contents, declarations, some guards and the
 * return statements are absent from this listing. */
1175 static int copy_devnodes(const char *dest
) {
1177 static const char devnodes
[] =
1188 _cleanup_umask_ mode_t u
;
1194 /* Create /dev/net, so that we can create /dev/net/tun in it */
1195 r = userns_mkdir(dest
, "/dev/net", 0755, 0, 0);
if (r < 0)
1196 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1198 NULSTR_FOREACH(d
, devnodes
) {
1199 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1202 from
= strappend("/dev/", d
);
1203 to
= prefix_root(dest
, from
);
1205 if (stat(from
, &st
) < 0) {
/* Only errors other than "node does not exist on the host" are fatal. */
1207 if (errno
!= ENOENT
)
1208 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1210 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1212 log_error("%s is not a char or block device, cannot copy.", from
);
1216 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1218 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1220 /* Some systems abusively restrict mknod but
1221 * allow bind mounts. */
1224 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1225 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1226 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1229 r
= userns_lchown(to
, 0, 0);
1231 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
/* Mount a new devpts instance at <dest>/dev/pts and set up the ptmx entries.
 * The mount options are newinstance, ptmxmode=0666, mode=620 and a gid of
 * arg_uid_shift + TTY_GID; when arg_selinux_apifs_context is set a context=
 * option carrying it is appended. Afterwards <dest>/dev/ptmx is created as a
 * symlink to "pts/ptmx". /dev/pts, /dev/ptmx and /dev/pts/ptmx are each
 * handed to userns_lchown(p, 0, 0).
 * NOTE(review): the asprintf() results are discarded with (void); how a
 * failed allocation of options is detected is not visible here. The error
 * guards and the final return are absent from this listing as well. */
1238 static int setup_pts(const char *dest
) {
1239 _cleanup_free_
char *options
= NULL
;
1244 if (arg_selinux_apifs_context
)
1245 (void) asprintf(&options
,
1246 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1247 arg_uid_shift
+ TTY_GID
,
1248 arg_selinux_apifs_context
);
1251 (void) asprintf(&options
,
1252 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1253 arg_uid_shift
+ TTY_GID
);
1258 /* Mount /dev/pts itself */
1259 p
= prefix_roota(dest
, "/dev/pts");
1260 if (mkdir(p
, 0755) < 0)
1261 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1262 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1263 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1264 r
= userns_lchown(p
, 0, 0);
1266 return log_error_errno(r
, "Failed to chown /dev/pts: %m");
1268 /* Create /dev/ptmx symlink */
1269 p
= prefix_roota(dest
, "/dev/ptmx");
1270 if (symlink("pts/ptmx", p
) < 0)
1271 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1272 r
= userns_lchown(p
, 0, 0);
1274 return log_error_errno(r
, "Failed to chown /dev/ptmx: %m");
1276 /* And fix /dev/pts/ptmx ownership */
1277 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1278 r
= userns_lchown(p
, 0, 0);
1280 return log_error_errno(r
, "Failed to chown /dev/pts/ptmx: %m");
/* Expose the terminal named by console as <dest>/dev/console. The terminal
 * gets mode 0600 and arg_uid_shift as both owner arguments through
 * chmod_and_chown(), then it is bind mounted onto a regular file at
 * <dest>/dev/console.
 * NOTE(review): the call that creates that regular file (the error message
 * names touch()), the error guards and the final return are absent from this
 * listing. */
1285 static int setup_dev_console(const char *dest
, const char *console
) {
1286 _cleanup_umask_ mode_t u
;
1295 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1297 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1299 /* We need to bind mount the right tty to /dev/console since
1300 * ptys can only exist on pts file systems. To have something
1301 * to bind mount things on we create a empty regular file. */
1303 to
= prefix_roota(dest
, "/dev/console");
1306 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1308 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1309 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
/* Provide a /proc/kmsg stand-in for the container. A FIFO is created at
 * <dest>/run/kmsg with mode 0600 and bind mounted onto <dest>/proc/kmsg. The
 * FIFO is then opened with O_RDWR|O_NDELAY|O_CLOEXEC and the descriptor is
 * sent over kmsg_socket with send_one_fd(). Finally the /run/kmsg name is
 * unlinked, ignoring the result. kmsg_socket must be a valid descriptor
 * (asserted >= 0).
 * NOTE(review): declarations, the error guards, any closing of fd and the
 * final return are absent from this listing. */
1314 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1315 const char *from
, *to
;
1316 _cleanup_umask_ mode_t u
;
1319 assert(kmsg_socket
>= 0);
1323 /* We create the kmsg FIFO as /run/kmsg, but immediately
1324 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1325 * on the reading side behave very similar to /proc/kmsg,
1326 * their writing side behaves differently from /dev/kmsg in
1327 * that writing blocks when nothing is reading. In order to
1328 * avoid any problems with containers deadlocking due to this
1329 * we simply make /dev/kmsg unavailable to the container. */
1330 from
= prefix_roota(dest
, "/run/kmsg");
1331 to
= prefix_roota(dest
, "/proc/kmsg");
1333 if (mkfifo(from
, 0600) < 0)
1334 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1335 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1336 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1338 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1340 return log_error_errno(errno
, "Failed to open fifo: %m");
1342 /* Store away the fd in the socket, so that it stays open as
1343 * long as we run the child */
1344 r
= send_one_fd(kmsg_socket
, fd
, 0);
1348 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1350 /* And now make the FIFO unavailable as /run/kmsg... */
1351 (void) unlink(from
);
/* Netlink message callback. userdata is used as a pointer to a
 * union in_addr_union and passed, together with rtnl and arg_expose_ports,
 * to expose_port_execute(). The message m is not used in the visible lines.
 * NOTE(review): the return statement is absent from this listing. */
1356 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1357 union in_addr_union
*exposed
= userdata
;
1363 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
/* Set the host name to arg_machine through sethostname_idempotent(). The
 * arg_share_system check comes first.
 * NOTE(review): the statements guarded by both conditions and the final
 * return are absent from this listing. */
1367 static int setup_hostname(void) {
1369 if (arg_share_system
)
1372 if (sethostname_idempotent(arg_machine
) < 0)
1378 static int setup_journal(const char *directory
) {
1380 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1386 /* Don't link journals in ephemeral mode */
1390 if (arg_link_journal
== LINK_NO
)
1393 try = arg_link_journal_try
|| arg_link_journal
== LINK_AUTO
;
1395 r
= sd_id128_get_machine(&this_id
);
1397 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1399 if (sd_id128_equal(arg_uuid
, this_id
)) {
1400 log_full(try ? LOG_WARNING
: LOG_ERR
,
1401 "Host and machine ids are equal (%s): refusing to link journals", id
);
1407 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1409 return log_error_errno(r
, "Failed to create /var: %m");
1411 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1413 return log_error_errno(r
, "Failed to create /var/log: %m");
1415 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1417 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1419 (void) sd_id128_to_string(arg_uuid
, id
);
1421 p
= strjoina("/var/log/journal/", id
);
1422 q
= prefix_roota(directory
, p
);
1424 if (path_is_mount_point(p
, 0) > 0) {
1428 log_error("%s: already a mount point, refusing to use for journal", p
);
1432 if (path_is_mount_point(q
, 0) > 0) {
1436 log_error("%s: already a mount point, refusing to use for journal", q
);
1440 r
= readlink_and_make_absolute(p
, &d
);
1442 if ((arg_link_journal
== LINK_GUEST
||
1443 arg_link_journal
== LINK_AUTO
) &&
1446 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1448 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1453 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1454 } else if (r
== -EINVAL
) {
1456 if (arg_link_journal
== LINK_GUEST
&&
1459 if (errno
== ENOTDIR
) {
1460 log_error("%s already exists and is neither a symlink nor a directory", p
);
1463 return log_error_errno(errno
, "Failed to remove %s: %m", p
);
1465 } else if (r
!= -ENOENT
)
1466 return log_error_errno(r
, "readlink(%s) failed: %m", p
);
1468 if (arg_link_journal
== LINK_GUEST
) {
1470 if (symlink(q
, p
) < 0) {
1472 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1475 return log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1478 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1480 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1484 if (arg_link_journal
== LINK_HOST
) {
1485 /* don't create parents here -- if the host doesn't have
1486 * permanent journal set up, don't force it here */
1488 if (mkdir(p
, 0755) < 0 && errno
!= EEXIST
) {
1490 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1493 return log_error_errno(errno
, "Failed to create %s: %m", p
);
1496 } else if (access(p
, F_OK
) < 0)
1499 if (dir_is_empty(q
) == 0)
1500 log_warning("%s is not empty, proceeding anyway.", q
);
1502 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1504 return log_error_errno(r
, "Failed to create %s: %m", q
);
1506 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1507 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1512 static int drop_capabilities(void) {
1513 return capability_bounding_set_drop(arg_retain
, false);
1516 static int reset_audit_loginuid(void) {
1517 _cleanup_free_
char *p
= NULL
;
1520 if (arg_share_system
)
1523 r
= read_one_line_file("/proc/self/loginuid", &p
);
1527 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1529 /* Already reset? */
1530 if (streq(p
, "4294967295"))
1533 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1536 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1537 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1538 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1539 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1540 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Installs a seccomp filter (only when built with HAVE_SECCOMP; otherwise
 * this is a no-op returning 0).
 *
 * The filter allows everything by default and then:
 *   - makes each syscall in the table below fail with EPERM, unless the
 *     capability listed next to it is part of arg_retain;
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns 0 on success, and also when seccomp_load() reports -EINVAL;
 * otherwise the negative error of the failing libseccomp call. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned idx;
        int rc;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        rc = seccomp_add_secondary_archs(ctx);
        if (rc < 0) {
                log_error_errno(rc, "Failed to add secondary archs to seccomp filter: %m");
                goto release;
        }

        for (idx = 0; idx < ELEMENTSOF(blacklist); idx++) {
                bool retained;

                /* A retained capability keeps its syscalls usable */
                retained = !!(arg_retain & (1ULL << blacklist[idx].capability));
                if (retained)
                        continue;

                rc = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[idx].syscall_num, 0);
                if (rc == -EFAULT)
                        continue; /* unknown syscall */
                if (rc < 0) {
                        log_error_errno(rc, "Failed to block syscall: %m");
                        goto release;
                }
        }

        /*
           Audit does not work inside containers, and much of the
           userspace audit hookup fails when run in one. We do not
           care and simply prevent audit sockets from being created.

           socket(AF_NETLINK, *, NETLINK_AUDIT) will then fail with
           EAFNOSUPPORT, which audit userspace takes as the sign that
           audit is disabled in the kernel.
         */

        rc = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (rc < 0) {
                log_error_errno(rc, "Failed to add audit seccomp rule: %m");
                goto release;
        }

        rc = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (rc < 0) {
                log_error_errno(rc, "Failed to unset NO_NEW_PRIVS: %m");
                goto release;
        }

        rc = seccomp_load(ctx);
        if (rc == -EINVAL) {
                log_debug_errno(rc, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                rc = 0;
        } else if (rc < 0)
                log_error_errno(rc, "Failed to install seccomp audit filter: %m");

release:
        seccomp_release(ctx);
        return rc;
#else
        return 0;
#endif

}
1643 static int setup_propagate(const char *root
) {
1647 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1648 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1649 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1650 (void) mkdir_p(p
, 0600);
1652 r
= userns_mkdir(root
, "/run/systemd", 0755, 0, 0);
1654 return log_error_errno(r
, "Failed to create /run/systemd: %m");
1656 r
= userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0);
1658 return log_error_errno(r
, "Failed to create /run/systemd/nspawn: %m");
1660 r
= userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1662 return log_error_errno(r
, "Failed to create /run/systemd/nspawn/incoming: %m");
1664 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1665 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1666 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1668 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1669 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1674 static int setup_image(char **device_path
, int *loop_nr
) {
1675 struct loop_info64 info
= {
1676 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1678 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1679 _cleanup_free_
char* loopdev
= NULL
;
1683 assert(device_path
);
1687 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1689 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1691 if (fstat(fd
, &st
) < 0)
1692 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1694 if (S_ISBLK(st
.st_mode
)) {
1697 p
= strdup(arg_image
);
1711 if (!S_ISREG(st
.st_mode
)) {
1712 log_error("%s is not a regular file or block device.", arg_image
);
1716 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1718 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1720 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1722 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1724 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1727 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1729 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1731 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1732 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1735 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1737 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1738 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1740 *device_path
= loopdev
;
1751 #define PARTITION_TABLE_BLURB \
1752 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1753 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1754 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1755 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1756 "to be bootable with systemd-nspawn."
1758 static int dissect_image(
1760 char **root_device
, bool *root_device_rw
,
1761 char **home_device
, bool *home_device_rw
,
1762 char **srv_device
, bool *srv_device_rw
,
1766 int home_nr
= -1, srv_nr
= -1;
1767 #ifdef GPT_ROOT_NATIVE
1770 #ifdef GPT_ROOT_SECONDARY
1771 int secondary_root_nr
= -1;
1773 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1774 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1775 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1776 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1777 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1778 struct udev_list_entry
*first
, *item
;
1779 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1780 bool is_gpt
, is_mbr
, multiple_generic
= false;
1781 const char *pttype
= NULL
;
1788 assert(root_device
);
1789 assert(home_device
);
1794 b
= blkid_new_probe();
1799 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1804 return log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1807 blkid_probe_enable_partitions(b
, 1);
1808 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1811 r
= blkid_do_safeprobe(b
);
1812 if (r
== -2 || r
== 1) {
1813 log_error("Failed to identify any partition table on\n"
1815 PARTITION_TABLE_BLURB
, arg_image
);
1817 } else if (r
!= 0) {
1820 return log_error_errno(errno
, "Failed to probe: %m");
1823 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1825 is_gpt
= streq_ptr(pttype
, "gpt");
1826 is_mbr
= streq_ptr(pttype
, "dos");
1828 if (!is_gpt
&& !is_mbr
) {
1829 log_error("No GPT or MBR partition table discovered on\n"
1831 PARTITION_TABLE_BLURB
, arg_image
);
1836 pl
= blkid_probe_get_partitions(b
);
1841 log_error("Failed to list partitions of %s", arg_image
);
1849 if (fstat(fd
, &st
) < 0)
1850 return log_error_errno(errno
, "Failed to stat block device: %m");
1852 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1860 log_error("Kernel partitions never appeared.");
1864 e
= udev_enumerate_new(udev
);
1868 r
= udev_enumerate_add_match_parent(e
, d
);
1872 r
= udev_enumerate_scan_devices(e
);
1874 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1876 /* Count the partitions enumerated by the kernel */
1878 first
= udev_enumerate_get_list_entry(e
);
1879 udev_list_entry_foreach(item
, first
)
1882 /* Count the partitions enumerated by blkid */
1883 m
= blkid_partlist_numof_partitions(pl
);
1887 log_error("blkid and kernel partition list do not match.");
1893 /* The kernel has probed fewer partitions than
1894 * blkid? Maybe the kernel prober is still
1895 * running or it got EBUSY because udev
1896 * already opened the device. Let's reprobe
1897 * the device, which is a synchronous call
1898 * that waits until probing is complete. */
1900 for (j
= 0; j
< 20; j
++) {
1902 r
= ioctl(fd
, BLKRRPART
, 0);
1905 if (r
>= 0 || r
!= -EBUSY
)
1908 /* If something else has the device
1909 * open, such as an udev rule, the
1910 * ioctl will return EBUSY. Since
1911 * there's no way to wait until it
1912 * isn't busy anymore, let's just wait
1913 * a bit, and try again.
1915 * This is really something they
1916 * should fix in the kernel! */
1918 usleep(50 * USEC_PER_MSEC
);
1922 return log_error_errno(r
, "Failed to reread partition table: %m");
1925 e
= udev_enumerate_unref(e
);
1928 first
= udev_enumerate_get_list_entry(e
);
1929 udev_list_entry_foreach(item
, first
) {
1930 _cleanup_udev_device_unref_
struct udev_device
*q
;
1932 unsigned long long flags
;
1938 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1943 return log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1946 qn
= udev_device_get_devnum(q
);
1950 if (st
.st_rdev
== qn
)
1953 node
= udev_device_get_devnode(q
);
1957 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1961 flags
= blkid_partition_get_flags(pp
);
1963 nr
= blkid_partition_get_partno(pp
);
1971 if (flags
& GPT_FLAG_NO_AUTO
)
1974 stype
= blkid_partition_get_type_string(pp
);
1978 if (sd_id128_from_string(stype
, &type_id
) < 0)
1981 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1983 if (home
&& nr
>= home_nr
)
1987 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1989 r
= free_and_strdup(&home
, node
);
1993 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
1995 if (srv
&& nr
>= srv_nr
)
1999 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2001 r
= free_and_strdup(&srv
, node
);
2005 #ifdef GPT_ROOT_NATIVE
2006 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
2008 if (root
&& nr
>= root_nr
)
2012 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2014 r
= free_and_strdup(&root
, node
);
2019 #ifdef GPT_ROOT_SECONDARY
2020 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
2022 if (secondary_root
&& nr
>= secondary_root_nr
)
2025 secondary_root_nr
= nr
;
2026 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2028 r
= free_and_strdup(&secondary_root
, node
);
2033 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2036 multiple_generic
= true;
2038 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2040 r
= free_and_strdup(&generic
, node
);
2046 } else if (is_mbr
) {
2049 if (flags
!= 0x80) /* Bootable flag */
2052 type
= blkid_partition_get_type(pp
);
2053 if (type
!= 0x83) /* Linux partition */
2057 multiple_generic
= true;
2061 r
= free_and_strdup(&root
, node
);
2069 *root_device
= root
;
2072 *root_device_rw
= root_rw
;
2074 } else if (secondary_root
) {
2075 *root_device
= secondary_root
;
2076 secondary_root
= NULL
;
2078 *root_device_rw
= secondary_root_rw
;
2080 } else if (generic
) {
2082 /* There were no partitions with precise meanings
2083 * around, but we found generic partitions. In this
2084 * case, if there's only one, we can go ahead and boot
2085 * it, otherwise we bail out, because we really cannot
2086 * make any sense of it. */
2088 if (multiple_generic
) {
2089 log_error("Identified multiple bootable Linux partitions on\n"
2091 PARTITION_TABLE_BLURB
, arg_image
);
2095 *root_device
= generic
;
2098 *root_device_rw
= generic_rw
;
2101 log_error("Failed to identify root partition in disk image\n"
2103 PARTITION_TABLE_BLURB
, arg_image
);
2108 *home_device
= home
;
2111 *home_device_rw
= home_rw
;
2118 *srv_device_rw
= srv_rw
;
2123 log_error("--image= is not supported, compiled without blkid support.");
/* Probes the file system type of block device `what` with libblkid and
 * mounts it at `where` + `directory` (or at `where` itself if `directory`
 * is NULL).
 *
 * The mount uses MS_NODEV, and is read-only if `rw` is false or
 * arg_read_only is set. LUKS containers are refused with -EOPNOTSUPP.
 * When built without HAVE_BLKID the function always fails with
 * -EOPNOTSUPP.
 *
 * Returns 0 on success, negative errno-style value on failure. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence clear it before each
         * call and substitute a default if it is still unset afterwards */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if
         * the result is ambiguous; -1 is a real error and is reported via
         * errno below. This matches the check done in dissect_image(). */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2190 static int setup_machine_id(const char *directory
) {
2192 const char *etc_machine_id
, *t
;
2193 _cleanup_free_
char *s
= NULL
;
2195 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
2197 r
= read_one_line_file(etc_machine_id
, &s
);
2199 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
2204 r
= sd_id128_from_string(t
, &arg_uuid
);
2206 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
2208 if (sd_id128_is_null(arg_uuid
)) {
2209 r
= sd_id128_randomize(&arg_uuid
);
2211 return log_error_errno(r
, "Failed to generate random machine ID: %m");
2215 r
= machine_id_setup(directory
, arg_uuid
);
2217 return log_error_errno(r
, "Failed to setup machine ID: %m");
2222 static int mount_devices(
2224 const char *root_device
, bool root_device_rw
,
2225 const char *home_device
, bool home_device_rw
,
2226 const char *srv_device
, bool srv_device_rw
) {
2232 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2234 return log_error_errno(r
, "Failed to mount root directory: %m");
2238 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2240 return log_error_errno(r
, "Failed to mount home directory: %m");
2244 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2246 return log_error_errno(r
, "Failed to mount server data directory: %m");
2252 static void loop_remove(int nr
, int *image_fd
) {
2253 _cleanup_close_
int control
= -1;
2259 if (image_fd
&& *image_fd
>= 0) {
2260 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2262 log_debug_errno(errno
, "Failed to close loop image: %m");
2263 *image_fd
= safe_close(*image_fd
);
2266 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2268 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2272 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2274 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2279 * < 0 : wait_for_terminate() failed to get the state of the
2280 * container, the container was terminated by a signal, or
2281 * failed for an unknown reason. No change is made to the
2282 * container argument.
2283 * > 0 : The program executed in the container terminated with an
2284 * error. The exit code of the program executed in the
2285 * container is returned. The container argument has been set
2286 * to CONTAINER_TERMINATED.
2287 * 0 : The container is being rebooted, has been shut down or exited
2288 * successfully. The container argument has been set to either
2289 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2291 * That is, success is indicated by a return value of zero, and an
2292 * error is indicated by a non-zero value.
2294 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2298 r
= wait_for_terminate(pid
, &status
);
2300 return log_warning_errno(r
, "Failed to wait for container: %m");
2302 switch (status
.si_code
) {
2305 if (status
.si_status
== 0) {
2306 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2309 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2311 *container
= CONTAINER_TERMINATED
;
2312 return status
.si_status
;
2315 if (status
.si_status
== SIGINT
) {
2317 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2318 *container
= CONTAINER_TERMINATED
;
2321 } else if (status
.si_status
== SIGHUP
) {
2323 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2324 *container
= CONTAINER_REBOOTED
;
2328 /* CLD_KILLED fallthrough */
2331 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2335 log_error("Container %s failed due to unknown reason.", arg_machine
);
2342 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2345 pid
= PTR_TO_PID(userdata
);
2347 if (kill(pid
, arg_kill_signal
) >= 0) {
2348 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2349 sd_event_source_set_userdata(s
, NULL
);
2354 sd_event_exit(sd_event_source_get_event(s
), 0);
2358 static int determine_names(void) {
2361 if (arg_template
&& !arg_directory
&& arg_machine
) {
2363 /* If --template= was specified then we should not
2364 * search for a machine, but instead create a new one
2365 * in /var/lib/machine. */
2367 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2372 if (!arg_image
&& !arg_directory
) {
2374 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2376 r
= image_find(arg_machine
, &i
);
2378 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2380 log_error("No image for machine '%s': %m", arg_machine
);
2384 if (i
->type
== IMAGE_RAW
)
2385 r
= free_and_strdup(&arg_image
, i
->path
);
2387 r
= free_and_strdup(&arg_directory
, i
->path
);
2389 return log_error_errno(r
, "Invalid image directory: %m");
2392 arg_read_only
= arg_read_only
|| i
->read_only
;
2394 arg_directory
= get_current_dir_name();
2396 if (!arg_directory
&& !arg_machine
) {
2397 log_error("Failed to determine path, please use -D or -i.");
2403 if (arg_directory
&& path_equal(arg_directory
, "/"))
2404 arg_machine
= gethostname_malloc();
2406 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2411 hostname_cleanup(arg_machine
);
2412 if (!machine_name_is_valid(arg_machine
)) {
2413 log_error("Failed to determine machine name automatically, please use -M.");
2417 if (arg_ephemeral
) {
2420 /* Add a random suffix when this is an
2421 * ephemeral machine, so that we can run many
2422 * instances at once without manually having
2423 * to specify -M each time. */
2425 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
2436 static int determine_uid_shift(const char *directory
) {
2444 if (arg_uid_shift
== UID_INVALID
) {
2447 r
= stat(directory
, &st
);
2449 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2451 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2453 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2454 log_error("UID and GID base of %s don't match.", directory
);
2458 arg_uid_range
= UINT32_C(0x10000);
2461 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2462 log_error("UID base too high for UID range.");
2466 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
2470 static int inner_child(
2472 const char *directory
,
2478 _cleanup_free_
char *home
= NULL
;
2481 const char *envp
[] = {
2482 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2483 NULL
, /* container */
2488 NULL
, /* container_uuid */
2489 NULL
, /* LISTEN_FDS */
2490 NULL
, /* LISTEN_PID */
2494 _cleanup_strv_free_
char **env_use
= NULL
;
2499 assert(kmsg_socket
>= 0);
2504 /* Tell the parent, that it now can write the UID map. */
2505 (void) barrier_place(barrier
); /* #1 */
2507 /* Wait until the parent wrote the UID map */
2508 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2509 log_error("Parent died too early");
2514 r
= mount_all(NULL
, arg_userns
, true, arg_uid_shift
, arg_private_network
, arg_uid_range
, arg_selinux_apifs_context
);
2518 r
= mount_sysfs(NULL
);
2522 /* Wait until we are cgroup-ified, so that we
2523 * can mount the right cgroup path writable */
2524 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2525 log_error("Parent died too early");
2529 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2533 r
= reset_uid_gid();
2535 return log_error_errno(r
, "Couldn't become new root: %m");
2537 r
= setup_boot_id(NULL
);
2541 r
= setup_kmsg(NULL
, kmsg_socket
);
2544 kmsg_socket
= safe_close(kmsg_socket
);
2549 return log_error_errno(errno
, "setsid() failed: %m");
2551 if (arg_private_network
)
2554 if (arg_expose_ports
) {
2555 r
= expose_port_send_rtnl(rtnl_socket
);
2558 rtnl_socket
= safe_close(rtnl_socket
);
2561 r
= drop_capabilities();
2563 return log_error_errno(r
, "drop_capabilities() failed: %m");
2567 if (arg_personality
!= PERSONALITY_INVALID
) {
2568 if (personality(arg_personality
) < 0)
2569 return log_error_errno(errno
, "personality() failed: %m");
2570 } else if (secondary
) {
2571 if (personality(PER_LINUX32
) < 0)
2572 return log_error_errno(errno
, "personality() failed: %m");
2576 if (arg_selinux_context
)
2577 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2578 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2581 r
= change_uid_gid(arg_user
, &home
);
2585 /* LXC sets container=lxc, so follow the scheme here */
2586 envp
[n_env
++] = strjoina("container=", arg_container_service_name
);
2588 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2592 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2593 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2594 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2597 assert(!sd_id128_equal(arg_uuid
, SD_ID128_NULL
));
2599 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2602 if (fdset_size(fds
) > 0) {
2603 r
= fdset_cloexec(fds
, false);
2605 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2607 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2608 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2612 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2616 /* Let the parent know that we are ready and
2617 * wait until the parent is ready with the
2619 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2620 log_error("Parent died too early");
2625 if (chdir(arg_chdir
) < 0)
2626 return log_error_errno(errno
, "Failed to change to specified working directory %s: %m", arg_chdir
);
2628 if (arg_start_mode
== START_PID2
) {
2634 /* Now, explicitly close the log, so that we
2635 * then can close all remaining fds. Closing
2636 * the log explicitly first has the benefit
2637 * that the logging subsystem knows about it,
2638 * and is thus ready to be reopened should we
2639 * need it again. Note that the other fds
2640 * closed here are at least the locking and
2643 (void) fdset_close_others(fds
);
2645 if (arg_start_mode
== START_BOOT
) {
2649 /* Automatically search for the init system */
2651 m
= strv_length(arg_parameters
);
2652 a
= newa(char*, m
+ 2);
2653 memcpy_safe(a
+ 1, arg_parameters
, m
* sizeof(char*));
2656 a
[0] = (char*) "/usr/lib/systemd/systemd";
2657 execve(a
[0], a
, env_use
);
2659 a
[0] = (char*) "/lib/systemd/systemd";
2660 execve(a
[0], a
, env_use
);
2662 a
[0] = (char*) "/sbin/init";
2663 execve(a
[0], a
, env_use
);
2664 } else if (!strv_isempty(arg_parameters
))
2665 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2668 chdir(home
?: "/root");
2670 execle("/bin/bash", "-bash", NULL
, env_use
);
2671 execle("/bin/sh", "-sh", NULL
, env_use
);
2676 return log_error_errno(r
, "execv() failed: %m");
2679 static int outer_child(
2681 const char *directory
,
2682 const char *console
,
2683 const char *root_device
, bool root_device_rw
,
2684 const char *home_device
, bool home_device_rw
,
2685 const char *srv_device
, bool srv_device_rw
,
2692 int uid_shift_socket
,
2702 assert(pid_socket
>= 0);
2703 assert(uuid_socket
>= 0);
2704 assert(kmsg_socket
>= 0);
2708 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2709 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
2712 close_nointr(STDIN_FILENO
);
2713 close_nointr(STDOUT_FILENO
);
2714 close_nointr(STDERR_FILENO
);
2716 r
= open_terminal(console
, O_RDWR
);
2717 if (r
!= STDIN_FILENO
) {
2723 return log_error_errno(r
, "Failed to open console: %m");
2726 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2727 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2728 return log_error_errno(errno
, "Failed to duplicate console: %m");
2731 r
= reset_audit_loginuid();
2735 /* Mark everything as slave, so that we still
2736 * receive mounts from the real root, but don't
2737 * propagate mounts to the real root. */
2738 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2739 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2741 r
= mount_devices(directory
,
2742 root_device
, root_device_rw
,
2743 home_device
, home_device_rw
,
2744 srv_device
, srv_device_rw
);
2748 r
= determine_uid_shift(directory
);
2753 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2755 return log_error_errno(errno
, "Failed to send UID shift: %m");
2756 if (l
!= sizeof(arg_uid_shift
)) {
2757 log_error("Short write while sending UID shift.");
2762 /* Turn directory into bind mount */
2763 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2764 return log_error_errno(errno
, "Failed to make bind mount: %m");
2766 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2770 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2774 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2778 if (arg_read_only
) {
2779 r
= bind_remount_recursive(directory
, true);
2781 return log_error_errno(r
, "Failed to make tree read-only: %m");
2784 r
= mount_all(directory
, arg_userns
, false, arg_private_network
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2788 r
= copy_devnodes(directory
);
2792 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2794 r
= setup_pts(directory
);
2798 r
= setup_propagate(directory
);
2802 r
= setup_dev_console(directory
, console
);
2806 r
= setup_seccomp();
2810 r
= setup_timezone(directory
);
2814 r
= setup_resolv_conf(directory
);
2818 r
= setup_machine_id(directory
);
2822 r
= setup_journal(directory
);
2826 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2830 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2834 r
= mount_move_root(directory
);
2836 return log_error_errno(r
, "Failed to move root directory: %m");
2838 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2839 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2840 (arg_private_network
? CLONE_NEWNET
: 0) |
2841 (arg_userns
? CLONE_NEWUSER
: 0),
2844 return log_error_errno(errno
, "Failed to fork inner child: %m");
2846 pid_socket
= safe_close(pid_socket
);
2847 uuid_socket
= safe_close(uuid_socket
);
2848 uid_shift_socket
= safe_close(uid_shift_socket
);
2850 /* The inner child has all namespaces that are
2851 * requested, so that we all are owned by the user if
2852 * user namespaces are turned on. */
2854 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2856 _exit(EXIT_FAILURE
);
2858 _exit(EXIT_SUCCESS
);
2861 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2863 return log_error_errno(errno
, "Failed to send PID: %m");
2864 if (l
!= sizeof(pid
)) {
2865 log_error("Short write while sending PID.");
2869 l
= send(uuid_socket
, &arg_uuid
, sizeof(arg_uuid
), MSG_NOSIGNAL
);
2871 return log_error_errno(errno
, "Failed to send machine ID: %m");
2872 if (l
!= sizeof(arg_uuid
)) {
2873 log_error("Short write while sending machine ID.");
2877 pid_socket
= safe_close(pid_socket
);
2878 uuid_socket
= safe_close(uuid_socket
);
2879 kmsg_socket
= safe_close(kmsg_socket
);
2880 rtnl_socket
= safe_close(rtnl_socket
);
2885 static int setup_uid_map(pid_t pid
) {
2886 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2891 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2892 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2893 r
= write_string_file(uid_map
, line
, 0);
2895 return log_error_errno(r
, "Failed to write UID map: %m");
2897 /* We always assign the same UID and GID ranges */
2898 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2899 r
= write_string_file(uid_map
, line
, 0);
2901 return log_error_errno(r
, "Failed to write GID map: %m");
/* load_settings: locate and load the .nspawn settings file of the machine
 * and merge its contents into the arg_* globals.
 *
 * Lookup order, as far as visible here:
 *   1. the file name arg_machine + .nspawn inside /etc/systemd/nspawn and
 *      /run/systemd/nspawn; when arg_settings_trusted is still negative
 *      (undecided) it is set to true for these locations;
 *   2. a file of the same name next to arg_image, or else next to
 *      arg_directory; here an undecided arg_settings_trusted becomes false.
 *
 * The file is parsed with settings_load(). Each group of settings is copied
 * only when its SETTING_* bit is not set in arg_settings_mask and the
 * settings object actually carries a value for it. Pointer members that are
 * taken over are set to NULL in the settings object after the hand-over.
 *
 * For Capability=, MachineID=, custom mounts, network settings and Port= a
 * warning is logged when the file is not trusted.
 *
 * NOTE(review): a number of lines are not visible in this excerpt (early
 * returns, the code that opens the file, else branches, closing braces and
 * the final return). In particular, whether the untrusted cases skip the
 * assignments that follow the warnings cannot be confirmed from here,
 * although the warning texts say the settings are ignored. */
2906 static int load_settings(void) {
2907 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2908 _cleanup_fclose_
FILE *f
= NULL
;
2909 _cleanup_free_
char *p
= NULL
;
2913 /* If all settings are masked, there's no point in looking for
2914 * the settings file */
2915 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* fn is the bare file name that is searched for in every location. */
2918 fn
= strjoina(arg_machine
, ".nspawn");
2920 /* We first look in the admin's directories in /etc and /run */
2921 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2922 _cleanup_free_
char *j
= NULL
;
2924 j
= strjoin(i
, "/", fn
, NULL
);
2933 /* By default, we trust configuration from /etc and /run */
2934 if (arg_settings_trusted
< 0)
2935 arg_settings_trusted
= true;
/* A missing file (ENOENT) is not an error; any other errno is reported. */
2940 if (errno
!= ENOENT
)
2941 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2945 /* After that, let's look for a file next to the
2946 * actual image we shall boot. */
2949 p
= file_in_same_dir(arg_image
, fn
);
2952 } else if (arg_directory
) {
2953 p
= file_in_same_dir(arg_directory
, fn
);
2960 if (!f
&& errno
!= ENOENT
)
2961 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2963 /* By default, we do not trust configuration from /var/lib/machines */
2964 if (arg_settings_trusted
< 0)
2965 arg_settings_trusted
= false;
2972 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
2974 r
= settings_load(f
, p
, &settings
);
2978 /* Copy over bits from the settings, unless they have been
2979 * explicitly masked by command line switches. */
/* Start mode together with the parameter list. */
2981 if ((arg_settings_mask
& SETTING_START_MODE
) == 0 &&
2982 settings
->start_mode
>= 0) {
2983 arg_start_mode
= settings
->start_mode
;
2985 strv_free(arg_parameters
);
2986 arg_parameters
= settings
->parameters
;
2987 settings
->parameters
= NULL
;
/* Working directory. */
2990 if ((arg_settings_mask
& SETTING_WORKING_DIRECTORY
) == 0 &&
2991 settings
->working_directory
) {
2993 arg_chdir
= settings
->working_directory
;
2994 settings
->working_directory
= NULL
;
/* Environment variables. */
2997 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2998 settings
->environment
) {
2999 strv_free(arg_setenv
);
3000 arg_setenv
= settings
->environment
;
3001 settings
->environment
= NULL
;
/* User. */
3004 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
3007 arg_user
= settings
->user
;
3008 settings
->user
= NULL
;
/* Capabilities: plus collects the capabilities requested by the file, with
 * CAP_NET_ADMIN added when settings_private_network() reports true. The
 * capabilities listed in drop_capability are cleared from arg_retain. */
3011 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
3014 plus
= settings
->capability
;
3015 if (settings_private_network(settings
))
3016 plus
|= (1ULL << CAP_NET_ADMIN
);
3018 if (!arg_settings_trusted
&& plus
!= 0) {
3019 if (settings
->capability
!= 0)
3020 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
3024 arg_retain
&= ~settings
->drop_capability
;
/* Kill signal. */
3027 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
3028 settings
->kill_signal
> 0)
3029 arg_kill_signal
= settings
->kill_signal
;
/* Personality. */
3031 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
3032 settings
->personality
!= PERSONALITY_INVALID
)
3033 arg_personality
= settings
->personality
;
/* Machine ID. */
3035 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
3036 !sd_id128_is_null(settings
->machine_id
)) {
3038 if (!arg_settings_trusted
)
3039 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
3041 arg_uuid
= settings
->machine_id
;
/* Read-only flag. */
3044 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
3045 settings
->read_only
>= 0)
3046 arg_read_only
= settings
->read_only
;
/* Volatile mode. */
3048 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
3049 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
3050 arg_volatile_mode
= settings
->volatile_mode
;
/* Custom mounts: the existing list is freed and replaced by the list from
 * the settings object, which then gives up ownership of it. */
3052 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
3053 settings
->n_custom_mounts
> 0) {
3055 if (!arg_settings_trusted
)
3056 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
3058 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3059 arg_custom_mounts
= settings
->custom_mounts
;
3060 arg_n_custom_mounts
= settings
->n_custom_mounts
;
3062 settings
->custom_mounts
= NULL
;
3063 settings
->n_custom_mounts
= 0;
/* Network: taken over as one group as soon as any network related member of
 * the settings object is set. */
3067 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
3068 (settings
->private_network
>= 0 ||
3069 settings
->network_veth
>= 0 ||
3070 settings
->network_bridge
||
3071 settings
->network_interfaces
||
3072 settings
->network_macvlan
||
3073 settings
->network_ipvlan
||
3074 settings
->network_veth_extra
)) {
3076 if (!arg_settings_trusted
)
3077 log_warning("Ignoring network settings, file %s is not trusted.", p
);
3079 arg_network_veth
= settings_network_veth(settings
);
3080 arg_private_network
= settings_private_network(settings
);
3082 strv_free(arg_network_interfaces
);
3083 arg_network_interfaces
= settings
->network_interfaces
;
3084 settings
->network_interfaces
= NULL
;
3086 strv_free(arg_network_macvlan
);
3087 arg_network_macvlan
= settings
->network_macvlan
;
3088 settings
->network_macvlan
= NULL
;
3090 strv_free(arg_network_ipvlan
);
3091 arg_network_ipvlan
= settings
->network_ipvlan
;
3092 settings
->network_ipvlan
= NULL
;
3094 strv_free(arg_network_veth_extra
);
3095 arg_network_veth_extra
= settings
->network_veth_extra
;
3096 settings
->network_veth_extra
= NULL
;
3098 free(arg_network_bridge
);
3099 arg_network_bridge
= settings
->network_bridge
;
3100 settings
->network_bridge
= NULL
;
/* Exposed ports. */
3104 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3105 settings
->expose_ports
) {
3107 if (!arg_settings_trusted
)
3108 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3110 expose_port_free_all(arg_expose_ports
);
3111 arg_expose_ports
= settings
->expose_ports
;
3112 settings
->expose_ports
= NULL
;
/* main: entry point of the container manager.
 *
 * Phases, in the order in which they appear below:
 *   - parse the command line, require an effective UID of 0, then run
 *     determine_names(), load_settings() and verify_arguments();
 *   - collect passed-in file descriptors into fds;
 *   - prepare the root of the container: either a directory (optionally an
 *     ephemeral snapshot, or populated from arg_template) or a disk image
 *     that is locked, set up and dissected;
 *   - allocate and unlock a pseudo terminal;
 *   - create the barrier and the socket pairs, fork the outer child with
 *     raw_clone(), and in the parent receive the PID of the inner child, the
 *     machine ID and the UID shift over the socket pairs;
 *   - set up UID maps, networking, machine registration and cgroups while
 *     synchronising with the child through the barrier (steps 1 to 4);
 *   - run the event loop with PTY forwarding, then wait for the container;
 *   - clean up and free the arg_* globals.
 *
 * Returns EXIT_FAILURE when r is negative at the end, otherwise ret.
 *
 * NOTE(review): many lines of this function are not visible in this excerpt
 * (error checks, jumps to the cleanup code, else branches, the header of the
 * restart loop and several declarations), so the comments below describe
 * only what the visible lines show. */
3119 int main(int argc
, char *argv
[]) {
3121 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3122 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3123 _cleanup_close_
int master
= -1, image_fd
= -1;
3124 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3125 int r
, n_fd_passed
, loop_nr
= -1;
3126 char veth_name
[IFNAMSIZ
];
3127 bool secondary
= false, remove_subvol
= false;
3130 int ret
= EXIT_SUCCESS
;
3131 union in_addr_union exposed
= {};
3132 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3135 log_parse_environment();
3138 /* Make sure rename_process() in the stub init process can work */
3142 r
= parse_argv(argc
, argv
);
3146 if (geteuid() != 0) {
3147 log_error("Need to be root.");
3151 r
= determine_names();
3155 r
= load_settings();
3159 r
= verify_arguments();
/* Pick up file descriptors passed to us: when sd_listen_fds() reports some,
 * they are gathered into fds. */
3163 n_fd_passed
= sd_listen_fds(false);
3164 if (n_fd_passed
> 0) {
3165 r
= fdset_new_listen_fds(&fds
, false);
3167 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory mode (arg_directory is set). */
3172 if (arg_directory
) {
3175 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3176 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
/* Ephemeral mode: lock a random machine.* name, snapshot arg_directory to it
 * and set remove_subvol so that the cleanup code near the end of main()
 * calls btrfs_subvol_remove(). */
3181 if (arg_ephemeral
) {
3182 _cleanup_free_
char *np
= NULL
;
3184 /* If the specified path is a mount point we
3185 * generate the new snapshot immediately
3186 * inside it under a random name. However if
3187 * the specified path is not a mount point we
3188 * create the new snapshot in the parent
3189 * directory, just next to it. */
3190 r
= path_is_mount_point(arg_directory
, 0);
3192 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3196 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3198 r
= tempfn_random(arg_directory
, "machine.", &np
);
3200 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* The lock is shared for read-only use and exclusive otherwise, and is
 * always requested non-blocking (LOCK_NB). */
3204 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3206 log_error_errno(r
, "Failed to lock %s: %m", np
);
3210 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3212 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3216 free(arg_directory
);
3220 remove_subvol
= true;
/* Non-ephemeral directory: lock the tree itself. */
3223 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3225 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3229 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate arg_directory from arg_template by snapshotting the template. */
3234 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3237 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3239 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3243 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity checks on the tree: in START_BOOT mode it has to pass
 * path_is_os_tree(); the other visible check requires a usr directory. */
3248 if (arg_start_mode
== START_BOOT
) {
3249 if (path_is_os_tree(arg_directory
) <= 0) {
3250 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3257 p
= strjoina(arg_directory
, "/usr/");
3258 if (laccess(p
, F_OK
) < 0) {
3259 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
/* Image mode: lock arg_image, create a temporary directory that becomes
 * arg_directory, then set up and dissect the image. */
3266 char template[] = "/tmp/nspawn-root-XXXXXX";
3269 assert(!arg_template
);
3271 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3273 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3277 r
= log_error_errno(r
, "Failed to create image lock: %m");
3281 if (!mkdtemp(template)) {
3282 log_error_errno(errno
, "Failed to create temporary directory: %m");
3287 arg_directory
= strdup(template);
3288 if (!arg_directory
) {
3293 image_fd
= setup_image(&device_path
, &loop_nr
);
3299 r
= dissect_image(image_fd
,
3300 &root_device
, &root_device_rw
,
3301 &home_device
, &home_device_rw
,
3302 &srv_device
, &srv_device_rw
,
3308 r
= custom_mounts_prepare();
3313 isatty(STDIN_FILENO
) > 0 &&
3314 isatty(STDOUT_FILENO
) > 0;
/* Allocate the pseudo terminal master; its tty name is stored in console. */
3316 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3318 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3322 r
= ptsname_malloc(master
, &console
);
3324 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3328 if (arg_selinux_apifs_context
) {
3329 r
= mac_selinux_apply(console
, arg_selinux_apifs_context
);
3334 if (unlockpt(master
) < 0) {
3335 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3340 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3341 arg_machine
, arg_image
?: arg_directory
);
/* Block the signals that are handled through the event loop later on. */
3343 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3345 assert_se(sigemptyset(&mask_chld
) == 0);
3346 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3348 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3349 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* State for one container run.
 * NOTE(review): the CONTAINER_REBOOTED comment further down suggests that
 * this section sits inside a loop whose header is not visible in this
 * excerpt. */
3354 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 },
3355 pid_socket_pair
[2] = { -1, -1 }, uuid_socket_pair
[2] = { -1, -1 }, uid_shift_socket_pair
[2] = { -1, -1 };
3356 ContainerStatus container_status
;
3357 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3358 static const struct sigaction sa
= {
3359 .sa_handler
= nop_signal_handler
,
3360 .sa_flags
= SA_NOCLDSTOP
,
3364 _cleanup_(sd_event_unrefp
) sd_event
*event
= NULL
;
3365 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3366 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
3369 r
= barrier_create(&barrier
);
3371 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
/* One socket pair per kind of data exchanged with the children. */
3375 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3376 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3380 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3381 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3385 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3386 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3390 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uuid_socket_pair
) < 0) {
3391 r
= log_error_errno(errno
, "Failed to create id socket pair: %m");
3396 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3397 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3401 /* Child can be killed before execv(), so handle SIGCHLD
3402 * in order to interrupt parent's blocking calls and
3403 * give it a chance to call wait() and terminate. */
3404 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3406 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3410 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3412 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* Fork off the outer child with raw_clone(), requesting only a new mount
 * namespace (CLONE_NEWNS). */
3416 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3418 if (errno
== EINVAL
)
3419 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3421 r
= log_error_errno(errno
, "clone() failed: %m");
3427 /* The outer child only has a file system namespace. */
3428 barrier_set_role(&barrier
, BARRIER_CHILD
);
/* Child side: close the pty master and the [0] ends of the socket pairs,
 * reset signal handling, then run outer_child() with the [1] ends. */
3430 master
= safe_close(master
);
3432 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3433 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3434 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3435 uuid_socket_pair
[0] = safe_close(uuid_socket_pair
[0]);
3436 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3438 (void) reset_all_signal_handlers();
3439 (void) reset_signal_mask();
3441 r
= outer_child(&barrier
,
3444 root_device
, root_device_rw
,
3445 home_device
, home_device_rw
,
3446 srv_device
, srv_device_rw
,
3450 uuid_socket_pair
[1],
3451 kmsg_socket_pair
[1],
3452 rtnl_socket_pair
[1],
3453 uid_shift_socket_pair
[1],
3456 _exit(EXIT_FAILURE
);
3458 _exit(EXIT_SUCCESS
);
/* Parent side: free fds and close the [1] ends of all socket pairs. */
3461 barrier_set_role(&barrier
, BARRIER_PARENT
);
3463 fds
= fdset_free(fds
);
3465 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3466 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3467 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3468 uuid_socket_pair
[1] = safe_close(uuid_socket_pair
[1]);
3469 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3471 /* Wait for the outer child. */
3472 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3481 /* And now retrieve the PID of the inner child. */
/* From here on pid holds the PID received over the socket. */
3482 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3484 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3487 if (l
!= sizeof(pid
)) {
3488 log_error("Short read while reading inner child PID.");
3493 /* We also retrieve container UUID in case it was generated by outer child */
3494 l
= recv(uuid_socket_pair
[0], &arg_uuid
, sizeof(arg_uuid
), 0);
3496 r
= log_error_errno(errno
, "Failed to read container machine ID: %m");
3499 if (l
!= sizeof(arg_uuid
)) {
/* NOTE(review): the word machined in the message below looks like a typo for
 * machine; the string is left untouched here. */
3500 log_error("Short read while reading container machined ID.");
3505 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3508 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3509 log_error("Child died too early.");
3514 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3516 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3519 if (l
!= sizeof(arg_uid_shift
)) {
3520 log_error("Short read while reading UID shift.");
/* Write the UID and GID maps of the inner child. */
3525 r
= setup_uid_map(pid
);
3529 (void) barrier_place(&barrier
); /* #2 */
/* Network setup for the inner child. */
3532 if (arg_private_network
) {
3534 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3538 if (arg_network_veth
) {
3539 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3545 if (arg_network_bridge
) {
3546 r
= setup_bridge(veth_name
, arg_network_bridge
);
3554 r
= setup_veth_extra(arg_machine
, pid
, arg_network_veth_extra
);
3558 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3562 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
/* Machine registration and cgroup setup. */
3568 r
= register_machine(
3575 arg_custom_mounts
, arg_n_custom_mounts
,
3579 arg_container_service_name
);
3584 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3588 if (arg_keep_unit
) {
3589 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3594 r
= chown_cgroup(pid
, arg_uid_shift
);
3598 /* Notify the child that the parent is ready with all
3599 * its setup (including cgroup-ification), and that
3600 * the child can now hand over control to the code to
3601 * run inside the container. */
3602 (void) barrier_place(&barrier
); /* #3 */
3604 /* Block SIGCHLD here, before notifying child.
3605 * process_pty() will handle it with the other signals. */
3606 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3608 /* Reset signal to default */
3609 r
= default_signals(SIGCHLD
, -1);
3611 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3615 /* Let the child know that we are ready and wait that the child is completely ready now. */
3616 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3617 log_error("Child died too early.");
3624 "STATUS=Container running.\n"
3625 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
/* NOTE(review): the error message below speaks of a default event source
 * although the call is sd_event_new(); confirm the intended wording. */
3627 r
= sd_event_new(&event
);
3629 log_error_errno(r
, "Failed to get default event source: %m");
3633 if (arg_kill_signal
> 0) {
3634 /* Try to kill the init system on SIGINT or SIGTERM */
3635 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3636 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3638 /* Immediately exit */
3639 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3640 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3643 /* simply exit on sigchld */
3644 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3646 if (arg_expose_ports
) {
3647 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3651 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3654 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* Forward the pty; forwarding is read-only when not interactive. */
3656 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
3658 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3662 r
= sd_event_loop(event
);
3664 log_error_errno(r
, "Failed to run event loop: %m");
3668 pty_forward_get_last_char(forward
, &last_char
);
3670 forward
= pty_forward_free(forward
);
3672 if (!arg_quiet
&& last_char
!= '\n')
3675 /* Kill if it is not dead yet anyway */
3676 if (arg_register
&& !arg_keep_unit
)
3677 terminate_machine(pid
);
3679 /* Normally redundant, but better safe than sorry */
3682 r
= wait_for_container(pid
, &container_status
);
3686 /* We failed to wait for the container, or the
3687 * container exited abnormally */
3689 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
) {
3690 /* The container exited with a non-zero
3691 * status, or with zero status and no reboot
3697 /* CONTAINER_REBOOTED, loop again */
3699 if (arg_keep_unit
) {
3700 /* Special handling if we are running as a
3701 * service: instead of simply restarting the
3702 * machine we want to restart the entire
3703 * service, so let's inform systemd about this
3704 * with the special exit code 133. The service
3705 * file uses RestartForceExitStatus=133 so
3706 * that this results in a full nspawn
3707 * restart. This is necessary since we might
3708 * have cgroup parameters set we want to have
3715 expose_port_flush(arg_expose_ports
, &exposed
);
3721 "STATUS=Terminating...");
3726 /* Try to flush whatever is still queued in the pty */
3728 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
/* Final cleanup: detach the loop device, remove the ephemeral snapshot if
 * one was created, remove the propagate directory of the machine, flush the
 * exposed ports and free the arg_* globals. */
3730 loop_remove(loop_nr
, &image_fd
);
3732 if (remove_subvol
&& arg_directory
) {
3735 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
3737 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
3743 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3744 (void) rm_rf(p
, REMOVE_ROOT
);
3747 expose_port_flush(arg_expose_ports
, &exposed
);
3749 free(arg_directory
);
3755 strv_free(arg_setenv
);
3756 free(arg_network_bridge
);
3757 strv_free(arg_network_interfaces
);
3758 strv_free(arg_network_macvlan
);
3759 strv_free(arg_network_ipvlan
);
3760 strv_free(arg_network_veth_extra
);
3761 strv_free(arg_parameters
);
3762 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3763 expose_port_free_all(arg_expose_ports
);
/* A negative r always maps to EXIT_FAILURE; otherwise ret is returned. */
3765 return r
< 0 ? EXIT_FAILURE
: ret
;