1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <blkid/blkid.h>
27 #include <linux/loop.h>
33 #include <selinux/selinux.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
46 #include "sd-daemon.h"
49 #include "alloc-util.h"
51 #include "base-filesystem.h"
52 #include "blkid-util.h"
53 #include "btrfs-util.h"
55 #include "capability-util.h"
56 #include "cgroup-util.h"
58 #include "dev-setup.h"
63 #include "formats-util.h"
66 #include "hostname-util.h"
68 #include "loopback-setup.h"
69 #include "machine-image.h"
73 #include "mount-util.h"
74 #include "netlink-util.h"
75 #include "nspawn-cgroup.h"
76 #include "nspawn-expose-ports.h"
77 #include "nspawn-mount.h"
78 #include "nspawn-network.h"
79 #include "nspawn-register.h"
80 #include "nspawn-settings.h"
81 #include "nspawn-setuid.h"
82 #include "nspawn-stub-pid1.h"
83 #include "parse-util.h"
84 #include "path-util.h"
85 #include "process-util.h"
87 #include "random-util.h"
90 #include "seccomp-util.h"
92 #include "signal-util.h"
93 #include "socket-util.h"
94 #include "stat-util.h"
95 #include "stdio-util.h"
96 #include "string-util.h"
98 #include "terminal-util.h"
99 #include "udev-util.h"
100 #include "umask-util.h"
101 #include "user-util.h"
104 typedef enum ContainerStatus
{
105 CONTAINER_TERMINATED
,
109 typedef enum LinkJournal
{
116 static char *arg_directory
= NULL
;
117 static char *arg_template
= NULL
;
118 static char *arg_chdir
= NULL
;
119 static char *arg_user
= NULL
;
120 static sd_id128_t arg_uuid
= {};
121 static char *arg_machine
= NULL
;
122 static const char *arg_selinux_context
= NULL
;
123 static const char *arg_selinux_apifs_context
= NULL
;
124 static const char *arg_slice
= NULL
;
125 static bool arg_private_network
= false;
126 static bool arg_read_only
= false;
127 static StartMode arg_start_mode
= START_PID1
;
128 static bool arg_ephemeral
= false;
129 static LinkJournal arg_link_journal
= LINK_AUTO
;
130 static bool arg_link_journal_try
= false;
131 static uint64_t arg_retain
=
132 (1ULL << CAP_CHOWN
) |
133 (1ULL << CAP_DAC_OVERRIDE
) |
134 (1ULL << CAP_DAC_READ_SEARCH
) |
135 (1ULL << CAP_FOWNER
) |
136 (1ULL << CAP_FSETID
) |
137 (1ULL << CAP_IPC_OWNER
) |
139 (1ULL << CAP_LEASE
) |
140 (1ULL << CAP_LINUX_IMMUTABLE
) |
141 (1ULL << CAP_NET_BIND_SERVICE
) |
142 (1ULL << CAP_NET_BROADCAST
) |
143 (1ULL << CAP_NET_RAW
) |
144 (1ULL << CAP_SETGID
) |
145 (1ULL << CAP_SETFCAP
) |
146 (1ULL << CAP_SETPCAP
) |
147 (1ULL << CAP_SETUID
) |
148 (1ULL << CAP_SYS_ADMIN
) |
149 (1ULL << CAP_SYS_CHROOT
) |
150 (1ULL << CAP_SYS_NICE
) |
151 (1ULL << CAP_SYS_PTRACE
) |
152 (1ULL << CAP_SYS_TTY_CONFIG
) |
153 (1ULL << CAP_SYS_RESOURCE
) |
154 (1ULL << CAP_SYS_BOOT
) |
155 (1ULL << CAP_AUDIT_WRITE
) |
156 (1ULL << CAP_AUDIT_CONTROL
) |
158 static CustomMount
*arg_custom_mounts
= NULL
;
159 static unsigned arg_n_custom_mounts
= 0;
160 static char **arg_setenv
= NULL
;
161 static bool arg_quiet
= false;
162 static bool arg_share_system
= false;
163 static bool arg_register
= true;
164 static bool arg_keep_unit
= false;
165 static char **arg_network_interfaces
= NULL
;
166 static char **arg_network_macvlan
= NULL
;
167 static char **arg_network_ipvlan
= NULL
;
168 static bool arg_network_veth
= false;
169 static char **arg_network_veth_extra
= NULL
;
170 static char *arg_network_bridge
= NULL
;
171 static unsigned long arg_personality
= PERSONALITY_INVALID
;
172 static char *arg_image
= NULL
;
173 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
174 static ExposePort
*arg_expose_ports
= NULL
;
175 static char **arg_property
= NULL
;
176 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
177 static bool arg_userns
= false;
178 static int arg_kill_signal
= 0;
179 static bool arg_unified_cgroup_hierarchy
= false;
180 static SettingsMask arg_settings_mask
= 0;
181 static int arg_settings_trusted
= -1;
182 static char **arg_parameters
= NULL
;
183 static const char *arg_container_service_name
= "systemd-nspawn";
185 static void help(void) {
186 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
187 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
188 " -h --help Show this help\n"
189 " --version Print version string\n"
190 " -q --quiet Do not show status information\n"
191 " -D --directory=PATH Root directory for the container\n"
192 " --template=PATH Initialize root directory from template directory,\n"
194 " -x --ephemeral Run container with snapshot of root directory, and\n"
195 " remove it after exit\n"
196 " -i --image=PATH File system device or disk image for the container\n"
197 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
198 " -b --boot Boot up full system (i.e. invoke init)\n"
199 " --chdir=PATH Set working directory in the container\n"
200 " -u --user=USER Run the command under specified user or uid\n"
201 " -M --machine=NAME Set the machine name for the container\n"
202 " --uuid=UUID Set a specific machine UUID for the container\n"
203 " -S --slice=SLICE Place the container in the specified slice\n"
204 " --property=NAME=VALUE Set scope unit property\n"
205 " --private-users[=UIDBASE[:NUIDS]]\n"
206 " Run within user namespace\n"
207 " --private-network Disable network in container\n"
208 " --network-interface=INTERFACE\n"
209 " Assign an existing network interface to the\n"
211 " --network-macvlan=INTERFACE\n"
212 " Create a macvlan network interface based on an\n"
213 " existing network interface to the container\n"
214 " --network-ipvlan=INTERFACE\n"
215 " Create a ipvlan network interface based on an\n"
216 " existing network interface to the container\n"
217 " -n --network-veth Add a virtual Ethernet connection between host\n"
219 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
220 " Add an additional virtual Ethernet link between\n"
221 " host and container\n"
222 " --network-bridge=INTERFACE\n"
223 " Add a virtual Ethernet connection between host\n"
224 " and container and add it to an existing bridge on\n"
226 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
227 " Expose a container IP port on the host\n"
228 " -Z --selinux-context=SECLABEL\n"
229 " Set the SELinux security context to be used by\n"
230 " processes in the container\n"
231 " -L --selinux-apifs-context=SECLABEL\n"
232 " Set the SELinux security context to be used by\n"
233 " API/tmpfs file systems in the container\n"
234 " --capability=CAP In addition to the default, retain specified\n"
236 " --drop-capability=CAP Drop the specified capability from the default set\n"
237 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
238 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
239 " host, try-guest, try-host\n"
240 " -j Equivalent to --link-journal=try-guest\n"
241 " --read-only Mount the root directory read-only\n"
242 " --bind=PATH[:PATH[:OPTIONS]]\n"
243 " Bind mount a file or directory from the host into\n"
245 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
246 " Similar, but creates a read-only bind mount\n"
247 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
248 " --overlay=PATH[:PATH...]:PATH\n"
249 " Create an overlay mount from the host to \n"
251 " --overlay-ro=PATH[:PATH...]:PATH\n"
252 " Similar, but creates a read-only overlay mount\n"
253 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
254 " --share-system Share system namespaces with host\n"
255 " --register=BOOLEAN Register container as machine\n"
256 " --keep-unit Do not register a scope for the machine, reuse\n"
257 " the service unit nspawn is running in\n"
258 " --volatile[=MODE] Run the system in volatile mode\n"
259 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
260 , program_invocation_short_name
);
264 static int custom_mounts_prepare(void) {
268 /* Ensure the mounts are applied prefix first. */
269 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
271 /* Allocate working directories for the overlay file systems that need it */
272 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
273 CustomMount
*m
= &arg_custom_mounts
[i
];
275 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
276 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
280 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
289 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
291 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
297 static int detect_unified_cgroup_hierarchy(void) {
301 /* Allow the user to control whether the unified hierarchy is used */
302 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
304 r
= parse_boolean(e
);
306 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
308 arg_unified_cgroup_hierarchy
= r
;
312 /* Otherwise inherit the default from the host system */
315 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
317 arg_unified_cgroup_hierarchy
= r
;
321 static int parse_argv(int argc
, char *argv
[]) {
340 ARG_NETWORK_INTERFACE
,
344 ARG_NETWORK_VETH_EXTRA
,
355 static const struct option options
[] = {
356 { "help", no_argument
, NULL
, 'h' },
357 { "version", no_argument
, NULL
, ARG_VERSION
},
358 { "directory", required_argument
, NULL
, 'D' },
359 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
360 { "ephemeral", no_argument
, NULL
, 'x' },
361 { "user", required_argument
, NULL
, 'u' },
362 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
363 { "as-pid2", no_argument
, NULL
, 'a' },
364 { "boot", no_argument
, NULL
, 'b' },
365 { "uuid", required_argument
, NULL
, ARG_UUID
},
366 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
367 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
368 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
369 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
370 { "bind", required_argument
, NULL
, ARG_BIND
},
371 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
372 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
373 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
374 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
375 { "machine", required_argument
, NULL
, 'M' },
376 { "slice", required_argument
, NULL
, 'S' },
377 { "setenv", required_argument
, NULL
, ARG_SETENV
},
378 { "selinux-context", required_argument
, NULL
, 'Z' },
379 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
380 { "quiet", no_argument
, NULL
, 'q' },
381 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
382 { "register", required_argument
, NULL
, ARG_REGISTER
},
383 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
384 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
385 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
386 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
387 { "network-veth", no_argument
, NULL
, 'n' },
388 { "network-veth-extra", required_argument
, NULL
, ARG_NETWORK_VETH_EXTRA
},
389 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
390 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
391 { "image", required_argument
, NULL
, 'i' },
392 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
393 { "port", required_argument
, NULL
, 'p' },
394 { "property", required_argument
, NULL
, ARG_PROPERTY
},
395 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
396 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
397 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
398 { "chdir", required_argument
, NULL
, ARG_CHDIR
},
404 uint64_t plus
= 0, minus
= 0;
405 bool mask_all_settings
= false, mask_no_settings
= false;
410 while ((c
= getopt_long(argc
, argv
, "+hD:u:abL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
422 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
428 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
434 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
440 arg_ephemeral
= true;
444 r
= free_and_strdup(&arg_user
, optarg
);
448 arg_settings_mask
|= SETTING_USER
;
451 case ARG_NETWORK_BRIDGE
:
452 r
= free_and_strdup(&arg_network_bridge
, optarg
);
459 arg_network_veth
= true;
460 arg_private_network
= true;
461 arg_settings_mask
|= SETTING_NETWORK
;
464 case ARG_NETWORK_VETH_EXTRA
:
465 r
= veth_extra_parse(&arg_network_veth_extra
, optarg
);
467 return log_error_errno(r
, "Failed to parse --network-veth-extra= parameter: %s", optarg
);
469 arg_private_network
= true;
470 arg_settings_mask
|= SETTING_NETWORK
;
473 case ARG_NETWORK_INTERFACE
:
474 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
477 arg_private_network
= true;
478 arg_settings_mask
|= SETTING_NETWORK
;
481 case ARG_NETWORK_MACVLAN
:
482 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
485 arg_private_network
= true;
486 arg_settings_mask
|= SETTING_NETWORK
;
489 case ARG_NETWORK_IPVLAN
:
490 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
495 case ARG_PRIVATE_NETWORK
:
496 arg_private_network
= true;
497 arg_settings_mask
|= SETTING_NETWORK
;
501 if (arg_start_mode
== START_PID2
) {
502 log_error("--boot and --as-pid2 may not be combined.");
506 arg_start_mode
= START_BOOT
;
507 arg_settings_mask
|= SETTING_START_MODE
;
511 if (arg_start_mode
== START_BOOT
) {
512 log_error("--boot and --as-pid2 may not be combined.");
516 arg_start_mode
= START_PID2
;
517 arg_settings_mask
|= SETTING_START_MODE
;
521 r
= sd_id128_from_string(optarg
, &arg_uuid
);
523 log_error("Invalid UUID: %s", optarg
);
527 arg_settings_mask
|= SETTING_MACHINE_ID
;
536 arg_machine
= mfree(arg_machine
);
538 if (!machine_name_is_valid(optarg
)) {
539 log_error("Invalid machine name: %s", optarg
);
543 r
= free_and_strdup(&arg_machine
, optarg
);
551 arg_selinux_context
= optarg
;
555 arg_selinux_apifs_context
= optarg
;
559 arg_read_only
= true;
560 arg_settings_mask
|= SETTING_READ_ONLY
;
564 case ARG_DROP_CAPABILITY
: {
567 _cleanup_free_
char *t
= NULL
;
569 r
= extract_first_word(&p
, &t
, ",", 0);
571 return log_error_errno(r
, "Failed to parse capability %s.", t
);
576 if (streq(t
, "all")) {
577 if (c
== ARG_CAPABILITY
)
578 plus
= (uint64_t) -1;
580 minus
= (uint64_t) -1;
584 cap
= capability_from_name(t
);
586 log_error("Failed to parse capability %s.", t
);
590 if (c
== ARG_CAPABILITY
)
591 plus
|= 1ULL << (uint64_t) cap
;
593 minus
|= 1ULL << (uint64_t) cap
;
597 arg_settings_mask
|= SETTING_CAPABILITY
;
602 arg_link_journal
= LINK_GUEST
;
603 arg_link_journal_try
= true;
606 case ARG_LINK_JOURNAL
:
607 if (streq(optarg
, "auto")) {
608 arg_link_journal
= LINK_AUTO
;
609 arg_link_journal_try
= false;
610 } else if (streq(optarg
, "no")) {
611 arg_link_journal
= LINK_NO
;
612 arg_link_journal_try
= false;
613 } else if (streq(optarg
, "guest")) {
614 arg_link_journal
= LINK_GUEST
;
615 arg_link_journal_try
= false;
616 } else if (streq(optarg
, "host")) {
617 arg_link_journal
= LINK_HOST
;
618 arg_link_journal_try
= false;
619 } else if (streq(optarg
, "try-guest")) {
620 arg_link_journal
= LINK_GUEST
;
621 arg_link_journal_try
= true;
622 } else if (streq(optarg
, "try-host")) {
623 arg_link_journal
= LINK_HOST
;
624 arg_link_journal_try
= true;
626 log_error("Failed to parse link journal mode %s", optarg
);
634 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
636 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
638 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
642 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
644 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
646 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
650 case ARG_OVERLAY_RO
: {
651 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
652 _cleanup_strv_free_
char **lower
= NULL
;
657 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
661 log_error("Invalid overlay specification: %s", optarg
);
665 STRV_FOREACH(i
, lower
) {
666 if (!path_is_absolute(*i
)) {
667 log_error("Overlay path %s is not absolute.", *i
);
675 log_error("--overlay= needs at least two colon-separated directories specified.");
680 /* If two parameters are specified,
681 * the first one is the lower, the
682 * second one the upper directory. And
683 * we'll also define the destination
684 * mount point the same as the upper. */
688 destination
= strdup(upper
);
693 upper
= lower
[n
- 2];
694 destination
= lower
[n
- 1];
698 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
702 m
->destination
= destination
;
705 m
->read_only
= c
== ARG_OVERLAY_RO
;
707 upper
= destination
= NULL
;
710 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
717 if (!env_assignment_is_valid(optarg
)) {
718 log_error("Environment variable assignment '%s' is not valid.", optarg
);
722 n
= strv_env_set(arg_setenv
, optarg
);
726 strv_free(arg_setenv
);
729 arg_settings_mask
|= SETTING_ENVIRONMENT
;
737 case ARG_SHARE_SYSTEM
:
738 arg_share_system
= true;
742 r
= parse_boolean(optarg
);
744 log_error("Failed to parse --register= argument: %s", optarg
);
752 arg_keep_unit
= true;
755 case ARG_PERSONALITY
:
757 arg_personality
= personality_from_string(optarg
);
758 if (arg_personality
== PERSONALITY_INVALID
) {
759 log_error("Unknown or unsupported personality '%s'.", optarg
);
763 arg_settings_mask
|= SETTING_PERSONALITY
;
769 arg_volatile_mode
= VOLATILE_YES
;
773 m
= volatile_mode_from_string(optarg
);
775 log_error("Failed to parse --volatile= argument: %s", optarg
);
778 arg_volatile_mode
= m
;
781 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
785 r
= expose_port_parse(&arg_expose_ports
, optarg
);
787 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
789 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
791 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
795 if (strv_extend(&arg_property
, optarg
) < 0)
800 case ARG_PRIVATE_USERS
:
802 _cleanup_free_
char *buffer
= NULL
;
803 const char *range
, *shift
;
805 range
= strchr(optarg
, ':');
807 buffer
= strndup(optarg
, range
- optarg
);
813 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
814 log_error("Failed to parse UID range: %s", range
);
820 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
821 log_error("Failed to parse UID: %s", optarg
);
829 case ARG_KILL_SIGNAL
:
830 arg_kill_signal
= signal_from_string_try_harder(optarg
);
831 if (arg_kill_signal
< 0) {
832 log_error("Cannot parse signal: %s", optarg
);
836 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
841 /* no → do not read files
842 * yes → read files, do not override cmdline, trust only subset
843 * override → read files, override cmdline, trust only subset
844 * trusted → read files, do not override cmdline, trust all
847 r
= parse_boolean(optarg
);
849 if (streq(optarg
, "trusted")) {
850 mask_all_settings
= false;
851 mask_no_settings
= false;
852 arg_settings_trusted
= true;
854 } else if (streq(optarg
, "override")) {
855 mask_all_settings
= false;
856 mask_no_settings
= true;
857 arg_settings_trusted
= -1;
859 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
862 mask_all_settings
= false;
863 mask_no_settings
= false;
864 arg_settings_trusted
= -1;
867 mask_all_settings
= true;
868 mask_no_settings
= false;
869 arg_settings_trusted
= false;
875 if (!path_is_absolute(optarg
)) {
876 log_error("Working directory %s is not an absolute path.", optarg
);
880 r
= free_and_strdup(&arg_chdir
, optarg
);
884 arg_settings_mask
|= SETTING_WORKING_DIRECTORY
;
891 assert_not_reached("Unhandled option");
894 if (arg_share_system
)
895 arg_register
= false;
897 if (arg_start_mode
!= START_PID1
&& arg_share_system
) {
898 log_error("--boot and --share-system may not be combined.");
902 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
903 log_error("--keep-unit may not be used when invoked from a user session.");
907 if (arg_directory
&& arg_image
) {
908 log_error("--directory= and --image= may not be combined.");
912 if (arg_template
&& arg_image
) {
913 log_error("--template= and --image= may not be combined.");
917 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
918 log_error("--template= needs --directory= or --machine=.");
922 if (arg_ephemeral
&& arg_template
) {
923 log_error("--ephemeral and --template= may not be combined.");
927 if (arg_ephemeral
&& arg_image
) {
928 log_error("--ephemeral and --image= may not be combined.");
932 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
933 log_error("--ephemeral and --link-journal= may not be combined.");
937 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
938 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
941 arg_parameters
= strv_copy(argv
+ optind
);
945 arg_settings_mask
|= SETTING_START_MODE
;
948 /* Load all settings from .nspawn files */
949 if (mask_no_settings
)
950 arg_settings_mask
= 0;
952 /* Don't load any settings from .nspawn files */
953 if (mask_all_settings
)
954 arg_settings_mask
= _SETTINGS_MASK_ALL
;
956 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
958 r
= detect_unified_cgroup_hierarchy();
962 e
= getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
964 arg_container_service_name
= e
;
969 static int verify_arguments(void) {
971 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
972 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
976 if (arg_expose_ports
&& !arg_private_network
) {
977 log_error("Cannot use --port= without private networking.");
981 if (arg_start_mode
== START_BOOT
&& arg_kill_signal
<= 0)
982 arg_kill_signal
= SIGRTMIN
+3;
987 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
993 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
996 if (uid
!= UID_INVALID
) {
997 uid
+= arg_uid_shift
;
999 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1003 if (gid
!= GID_INVALID
) {
1004 gid
+= (gid_t
) arg_uid_shift
;
1006 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1010 if (lchown(p
, uid
, gid
) < 0)
1016 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
1019 q
= prefix_roota(root
, path
);
1020 if (mkdir(q
, mode
) < 0) {
1021 if (errno
== EEXIST
)
1026 return userns_lchown(q
, uid
, gid
);
1029 static int setup_timezone(const char *dest
) {
1030 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1031 const char *where
, *check
, *what
;
1037 /* Fix the timezone, if possible */
1038 r
= readlink_malloc("/etc/localtime", &p
);
1040 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1044 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1046 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1048 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1052 where
= prefix_roota(dest
, "/etc/localtime");
1053 r
= readlink_malloc(where
, &q
);
1055 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1057 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1059 /* Already pointing to the right place? Then do nothing .. */
1060 if (y
&& streq(y
, z
))
1064 check
= strjoina("/usr/share/zoneinfo/", z
);
1065 check
= prefix_roota(dest
, check
);
1066 if (laccess(check
, F_OK
) < 0) {
1067 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1072 if (r
< 0 && errno
!= ENOENT
) {
1073 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1077 what
= strjoina("../usr/share/zoneinfo/", z
);
1078 if (symlink(what
, where
) < 0) {
1079 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1083 r
= userns_lchown(where
, 0, 0);
1085 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1090 static int setup_resolv_conf(const char *dest
) {
1091 const char *where
= NULL
;
1096 if (arg_private_network
)
1099 /* Fix resolv.conf, if possible */
1100 where
= prefix_roota(dest
, "/etc/resolv.conf");
1102 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1104 /* If the file already exists as symlink, let's
1105 * suppress the warning, under the assumption that
1106 * resolved or something similar runs inside and the
1107 * symlink points there.
1109 * If the disk image is read-only, there's also no
1110 * point in complaining.
1112 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1113 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1117 r
= userns_lchown(where
, 0, 0);
1119 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1124 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1128 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1129 SD_ID128_FORMAT_VAL(id
));
1134 static int setup_boot_id(const char *dest
) {
1135 const char *from
, *to
;
1136 sd_id128_t rnd
= {};
1140 if (arg_share_system
)
1143 /* Generate a new randomized boot ID, so that each boot-up of
1144 * the container gets a new one */
1146 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1147 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1149 r
= sd_id128_randomize(&rnd
);
1151 return log_error_errno(r
, "Failed to generate random boot id: %m");
1153 id128_format_as_uuid(rnd
, as_uuid
);
1155 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1157 return log_error_errno(r
, "Failed to write boot id: %m");
1159 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1160 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1161 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1162 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1168 static int copy_devnodes(const char *dest
) {
1170 static const char devnodes
[] =
1181 _cleanup_umask_ mode_t u
;
1187 /* Create /dev/net, so that we can create /dev/net/tun in it */
1188 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1189 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1191 NULSTR_FOREACH(d
, devnodes
) {
1192 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1195 from
= strappend("/dev/", d
);
1196 to
= prefix_root(dest
, from
);
1198 if (stat(from
, &st
) < 0) {
1200 if (errno
!= ENOENT
)
1201 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1203 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1205 log_error("%s is not a char or block device, cannot copy.", from
);
1209 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1211 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1213 /* Some systems abusively restrict mknod but
1214 * allow bind mounts. */
1217 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1218 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1219 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1222 r
= userns_lchown(to
, 0, 0);
1224 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1231 static int setup_pts(const char *dest
) {
1232 _cleanup_free_
char *options
= NULL
;
1237 if (arg_selinux_apifs_context
)
1238 (void) asprintf(&options
,
1239 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1240 arg_uid_shift
+ TTY_GID
,
1241 arg_selinux_apifs_context
);
1244 (void) asprintf(&options
,
1245 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1246 arg_uid_shift
+ TTY_GID
);
1251 /* Mount /dev/pts itself */
1252 p
= prefix_roota(dest
, "/dev/pts");
1253 if (mkdir(p
, 0755) < 0)
1254 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1255 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1256 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1257 r
= userns_lchown(p
, 0, 0);
1259 return log_error_errno(r
, "Failed to chown /dev/pts: %m");
1261 /* Create /dev/ptmx symlink */
1262 p
= prefix_roota(dest
, "/dev/ptmx");
1263 if (symlink("pts/ptmx", p
) < 0)
1264 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1265 r
= userns_lchown(p
, 0, 0);
1267 return log_error_errno(r
, "Failed to chown /dev/ptmx: %m");
1269 /* And fix /dev/pts/ptmx ownership */
1270 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1271 r
= userns_lchown(p
, 0, 0);
1273 return log_error_errno(r
, "Failed to chown /dev/pts/ptmx: %m");
1278 static int setup_dev_console(const char *dest
, const char *console
) {
1279 _cleanup_umask_ mode_t u
;
1288 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1290 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1292 /* We need to bind mount the right tty to /dev/console since
1293 * ptys can only exist on pts file systems. To have something
1294 * to bind mount things on we create a empty regular file. */
1296 to
= prefix_roota(dest
, "/dev/console");
1299 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1301 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1302 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
1307 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1308 const char *from
, *to
;
1309 _cleanup_umask_ mode_t u
;
1312 assert(kmsg_socket
>= 0);
1316 /* We create the kmsg FIFO as /run/kmsg, but immediately
1317 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1318 * on the reading side behave very similar to /proc/kmsg,
1319 * their writing side behaves differently from /dev/kmsg in
1320 * that writing blocks when nothing is reading. In order to
1321 * avoid any problems with containers deadlocking due to this
1322 * we simply make /dev/kmsg unavailable to the container. */
1323 from
= prefix_roota(dest
, "/run/kmsg");
1324 to
= prefix_roota(dest
, "/proc/kmsg");
1326 if (mkfifo(from
, 0600) < 0)
1327 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1328 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1329 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1331 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1333 return log_error_errno(errno
, "Failed to open fifo: %m");
1335 /* Store away the fd in the socket, so that it stays open as
1336 * long as we run the child */
1337 r
= send_one_fd(kmsg_socket
, fd
, 0);
1341 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1343 /* And now make the FIFO unavailable as /run/kmsg... */
1344 (void) unlink(from
);
1349 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1350 union in_addr_union
*exposed
= userdata
;
1356 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
1360 static int setup_hostname(void) {
1362 if (arg_share_system
)
1365 if (sethostname_idempotent(arg_machine
) < 0)
1371 static int setup_journal(const char *directory
) {
1372 sd_id128_t machine_id
, this_id
;
1373 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1374 const char *etc_machine_id
, *p
, *q
;
1379 /* Don't link journals in ephemeral mode */
1383 if (arg_link_journal
== LINK_NO
)
1386 try = arg_link_journal_try
|| arg_link_journal
== LINK_AUTO
;
1388 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1390 r
= read_one_line_file(etc_machine_id
, &b
);
1391 if (r
== -ENOENT
&& try)
1394 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1397 if (isempty(id
) && try)
1400 /* Verify validity */
1401 r
= sd_id128_from_string(id
, &machine_id
);
1403 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1405 r
= sd_id128_get_machine(&this_id
);
1407 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1409 if (sd_id128_equal(machine_id
, this_id
)) {
1410 log_full(try ? LOG_WARNING
: LOG_ERR
,
1411 "Host and machine ids are equal (%s): refusing to link journals", id
);
1417 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1419 return log_error_errno(r
, "Failed to create /var: %m");
1421 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1423 return log_error_errno(r
, "Failed to create /var/log: %m");
1425 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1427 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1429 p
= strjoina("/var/log/journal/", id
);
1430 q
= prefix_roota(directory
, p
);
1432 if (path_is_mount_point(p
, 0) > 0) {
1436 log_error("%s: already a mount point, refusing to use for journal", p
);
1440 if (path_is_mount_point(q
, 0) > 0) {
1444 log_error("%s: already a mount point, refusing to use for journal", q
);
1448 r
= readlink_and_make_absolute(p
, &d
);
1450 if ((arg_link_journal
== LINK_GUEST
||
1451 arg_link_journal
== LINK_AUTO
) &&
1454 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1456 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1461 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1462 } else if (r
== -EINVAL
) {
1464 if (arg_link_journal
== LINK_GUEST
&&
1467 if (errno
== ENOTDIR
) {
1468 log_error("%s already exists and is neither a symlink nor a directory", p
);
1471 return log_error_errno(errno
, "Failed to remove %s: %m", p
);
1473 } else if (r
!= -ENOENT
)
1474 return log_error_errno(r
, "readlink(%s) failed: %m", p
);
1476 if (arg_link_journal
== LINK_GUEST
) {
1478 if (symlink(q
, p
) < 0) {
1480 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1483 return log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1486 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1488 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1492 if (arg_link_journal
== LINK_HOST
) {
1493 /* don't create parents here -- if the host doesn't have
1494 * permanent journal set up, don't force it here */
1496 if (mkdir(p
, 0755) < 0 && errno
!= EEXIST
) {
1498 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1501 return log_error_errno(errno
, "Failed to create %s: %m", p
);
1504 } else if (access(p
, F_OK
) < 0)
1507 if (dir_is_empty(q
) == 0)
1508 log_warning("%s is not empty, proceeding anyway.", q
);
1510 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1512 return log_error_errno(r
, "Failed to create %s: %m", q
);
1514 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1515 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1520 static int drop_capabilities(void) {
1521 return capability_bounding_set_drop(arg_retain
, false);
1524 static int reset_audit_loginuid(void) {
1525 _cleanup_free_
char *p
= NULL
;
1528 if (arg_share_system
)
1531 r
= read_one_line_file("/proc/self/loginuid", &p
);
1535 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1537 /* Already reset? */
1538 if (streq(p
, "4294967295"))
1541 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1544 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1545 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1546 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1547 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1548 "using systemd-nspawn. Sleeping for 5s... (%m)");
1556 static int setup_seccomp(void) {
1559 static const struct {
1560 uint64_t capability
;
1563 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1564 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1565 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1566 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1567 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1568 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1569 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1570 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1571 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1572 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1575 scmp_filter_ctx seccomp
;
1579 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1583 r
= seccomp_add_secondary_archs(seccomp
);
1585 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
1589 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1590 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1593 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1595 continue; /* unknown syscall */
1597 log_error_errno(r
, "Failed to block syscall: %m");
1604 Audit is broken in containers, much of the userspace audit
1605 hookup will fail if running inside a container. We don't
1606 care and just turn off creation of audit sockets.
1608 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1609 with EAFNOSUPPORT which audit userspace uses as indication
1610 that audit is disabled in the kernel.
1613 r
= seccomp_rule_add(
1615 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1618 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1619 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1621 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1625 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1627 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1631 r
= seccomp_load(seccomp
);
1633 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1638 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1643 seccomp_release(seccomp
);
1651 static int setup_propagate(const char *root
) {
1655 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1656 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1657 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1658 (void) mkdir_p(p
, 0600);
1660 r
= userns_mkdir(root
, "/run/systemd", 0755, 0, 0);
1662 return log_error_errno(r
, "Failed to create /run/systemd: %m");
1664 r
= userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0);
1666 return log_error_errno(r
, "Failed to create /run/systemd/nspawn: %m");
1668 r
= userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1670 return log_error_errno(r
, "Failed to create /run/systemd/nspawn/incoming: %m");
1672 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1673 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1674 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1676 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1677 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1682 static int setup_image(char **device_path
, int *loop_nr
) {
1683 struct loop_info64 info
= {
1684 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1686 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1687 _cleanup_free_
char* loopdev
= NULL
;
1691 assert(device_path
);
1695 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1697 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1699 if (fstat(fd
, &st
) < 0)
1700 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1702 if (S_ISBLK(st
.st_mode
)) {
1705 p
= strdup(arg_image
);
1719 if (!S_ISREG(st
.st_mode
)) {
1720 log_error("%s is not a regular file or block device.", arg_image
);
1724 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1726 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1728 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1730 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1732 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1735 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1737 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1739 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1740 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1743 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1745 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1746 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1748 *device_path
= loopdev
;
1759 #define PARTITION_TABLE_BLURB \
1760 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1761 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1762 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1763 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1764 "to be bootable with systemd-nspawn."
1766 static int dissect_image(
1768 char **root_device
, bool *root_device_rw
,
1769 char **home_device
, bool *home_device_rw
,
1770 char **srv_device
, bool *srv_device_rw
,
1774 int home_nr
= -1, srv_nr
= -1;
1775 #ifdef GPT_ROOT_NATIVE
1778 #ifdef GPT_ROOT_SECONDARY
1779 int secondary_root_nr
= -1;
1781 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1782 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1783 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1784 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1785 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1786 struct udev_list_entry
*first
, *item
;
1787 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1788 bool is_gpt
, is_mbr
, multiple_generic
= false;
1789 const char *pttype
= NULL
;
1796 assert(root_device
);
1797 assert(home_device
);
1802 b
= blkid_new_probe();
1807 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1812 return log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1815 blkid_probe_enable_partitions(b
, 1);
1816 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1819 r
= blkid_do_safeprobe(b
);
1820 if (r
== -2 || r
== 1) {
1821 log_error("Failed to identify any partition table on\n"
1823 PARTITION_TABLE_BLURB
, arg_image
);
1825 } else if (r
!= 0) {
1828 return log_error_errno(errno
, "Failed to probe: %m");
1831 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1833 is_gpt
= streq_ptr(pttype
, "gpt");
1834 is_mbr
= streq_ptr(pttype
, "dos");
1836 if (!is_gpt
&& !is_mbr
) {
1837 log_error("No GPT or MBR partition table discovered on\n"
1839 PARTITION_TABLE_BLURB
, arg_image
);
1844 pl
= blkid_probe_get_partitions(b
);
1849 log_error("Failed to list partitions of %s", arg_image
);
1857 if (fstat(fd
, &st
) < 0)
1858 return log_error_errno(errno
, "Failed to stat block device: %m");
1860 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1868 log_error("Kernel partitions never appeared.");
1872 e
= udev_enumerate_new(udev
);
1876 r
= udev_enumerate_add_match_parent(e
, d
);
1880 r
= udev_enumerate_scan_devices(e
);
1882 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1884 /* Count the partitions enumerated by the kernel */
1886 first
= udev_enumerate_get_list_entry(e
);
1887 udev_list_entry_foreach(item
, first
)
1890 /* Count the partitions enumerated by blkid */
1891 m
= blkid_partlist_numof_partitions(pl
);
1895 log_error("blkid and kernel partition list do not match.");
1901 /* The kernel has probed fewer partitions than
1902 * blkid? Maybe the kernel prober is still
1903 * running or it got EBUSY because udev
1904 * already opened the device. Let's reprobe
1905 * the device, which is a synchronous call
1906 * that waits until probing is complete. */
1908 for (j
= 0; j
< 20; j
++) {
1910 r
= ioctl(fd
, BLKRRPART
, 0);
1913 if (r
>= 0 || r
!= -EBUSY
)
1916 /* If something else has the device
1917 * open, such as an udev rule, the
1918 * ioctl will return EBUSY. Since
1919 * there's no way to wait until it
1920 * isn't busy anymore, let's just wait
1921 * a bit, and try again.
1923 * This is really something they
1924 * should fix in the kernel! */
1926 usleep(50 * USEC_PER_MSEC
);
1930 return log_error_errno(r
, "Failed to reread partition table: %m");
1933 e
= udev_enumerate_unref(e
);
1936 first
= udev_enumerate_get_list_entry(e
);
1937 udev_list_entry_foreach(item
, first
) {
1938 _cleanup_udev_device_unref_
struct udev_device
*q
;
1940 unsigned long long flags
;
1946 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1951 return log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1954 qn
= udev_device_get_devnum(q
);
1958 if (st
.st_rdev
== qn
)
1961 node
= udev_device_get_devnode(q
);
1965 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1969 flags
= blkid_partition_get_flags(pp
);
1971 nr
= blkid_partition_get_partno(pp
);
1979 if (flags
& GPT_FLAG_NO_AUTO
)
1982 stype
= blkid_partition_get_type_string(pp
);
1986 if (sd_id128_from_string(stype
, &type_id
) < 0)
1989 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1991 if (home
&& nr
>= home_nr
)
1995 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1997 r
= free_and_strdup(&home
, node
);
2001 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
2003 if (srv
&& nr
>= srv_nr
)
2007 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2009 r
= free_and_strdup(&srv
, node
);
2013 #ifdef GPT_ROOT_NATIVE
2014 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
2016 if (root
&& nr
>= root_nr
)
2020 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2022 r
= free_and_strdup(&root
, node
);
2027 #ifdef GPT_ROOT_SECONDARY
2028 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
2030 if (secondary_root
&& nr
>= secondary_root_nr
)
2033 secondary_root_nr
= nr
;
2034 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2036 r
= free_and_strdup(&secondary_root
, node
);
2041 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2044 multiple_generic
= true;
2046 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2048 r
= free_and_strdup(&generic
, node
);
2054 } else if (is_mbr
) {
2057 if (flags
!= 0x80) /* Bootable flag */
2060 type
= blkid_partition_get_type(pp
);
2061 if (type
!= 0x83) /* Linux partition */
2065 multiple_generic
= true;
2069 r
= free_and_strdup(&root
, node
);
2077 *root_device
= root
;
2080 *root_device_rw
= root_rw
;
2082 } else if (secondary_root
) {
2083 *root_device
= secondary_root
;
2084 secondary_root
= NULL
;
2086 *root_device_rw
= secondary_root_rw
;
2088 } else if (generic
) {
2090 /* There were no partitions with precise meanings
2091 * around, but we found generic partitions. In this
2092 * case, if there's only one, we can go ahead and boot
2093 * it, otherwise we bail out, because we really cannot
2094 * make any sense of it. */
2096 if (multiple_generic
) {
2097 log_error("Identified multiple bootable Linux partitions on\n"
2099 PARTITION_TABLE_BLURB
, arg_image
);
2103 *root_device
= generic
;
2106 *root_device_rw
= generic_rw
;
2109 log_error("Failed to identify root partition in disk image\n"
2111 PARTITION_TABLE_BLURB
, arg_image
);
2116 *home_device
= home
;
2119 *home_device_rw
= home_rw
;
2126 *srv_device_rw
= srv_rw
;
2131 log_error("--image= is not supported, compiled without blkid support.");
/* Probe the file system type of block device 'what' with libblkid and mount it.
 *
 * what:      device node to mount
 * where:     base directory of the mount point
 * directory: optional path appended to 'where'; NULL mounts on 'where' itself
 * rw:        mount writable; forced to false when arg_read_only is set
 *
 * Returns 0 on success or a negative errno-style value after logging the
 * failure. Without blkid support this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno; clear it so that 0 can be told apart below */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* -2: ambivalent result, 1: nothing detected. Same test as dissect_image() uses;
         * a plain -1 is a real probing error and is reported as such below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}

2198 static int mount_devices(
2200 const char *root_device
, bool root_device_rw
,
2201 const char *home_device
, bool home_device_rw
,
2202 const char *srv_device
, bool srv_device_rw
) {
2208 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2210 return log_error_errno(r
, "Failed to mount root directory: %m");
2214 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2216 return log_error_errno(r
, "Failed to mount home directory: %m");
2220 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2222 return log_error_errno(r
, "Failed to mount server data directory: %m");
2228 static void loop_remove(int nr
, int *image_fd
) {
2229 _cleanup_close_
int control
= -1;
2235 if (image_fd
&& *image_fd
>= 0) {
2236 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2238 log_debug_errno(errno
, "Failed to close loop image: %m");
2239 *image_fd
= safe_close(*image_fd
);
2242 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2244 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2248 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2250 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
/*
 * Return values:
 * < 0 : wait_for_terminate() failed to get the state of the
 *       container, the container was terminated by a signal, or
 *       failed for an unknown reason.  No change is made to the
 *       container argument.
 * > 0 : The program executed in the container terminated with an
 *       error. The exit code of the program executed in the
 *       container is returned. The container argument has been set
 *       to CONTAINER_TERMINATED.
 *   0 : The container is being rebooted, has been shut down or exited
 *       successfully. The container argument has been set to either
 *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
 *
 * That is, success is indicated by a return value of zero, and an
 * error is indicated by a non-zero value.
 */
2270 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2274 r
= wait_for_terminate(pid
, &status
);
2276 return log_warning_errno(r
, "Failed to wait for container: %m");
2278 switch (status
.si_code
) {
2281 if (status
.si_status
== 0) {
2282 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2285 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2287 *container
= CONTAINER_TERMINATED
;
2288 return status
.si_status
;
2291 if (status
.si_status
== SIGINT
) {
2293 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2294 *container
= CONTAINER_TERMINATED
;
2297 } else if (status
.si_status
== SIGHUP
) {
2299 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2300 *container
= CONTAINER_REBOOTED
;
2304 /* CLD_KILLED fallthrough */
2307 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2311 log_error("Container %s failed due to unknown reason.", arg_machine
);
2318 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2321 pid
= PTR_TO_PID(userdata
);
2323 if (kill(pid
, arg_kill_signal
) >= 0) {
2324 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2325 sd_event_source_set_userdata(s
, NULL
);
2330 sd_event_exit(sd_event_source_get_event(s
), 0);
2334 static int determine_names(void) {
2337 if (arg_template
&& !arg_directory
&& arg_machine
) {
2339 /* If --template= was specified then we should not
2340 * search for a machine, but instead create a new one
2341 * in /var/lib/machine. */
2343 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2348 if (!arg_image
&& !arg_directory
) {
2350 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2352 r
= image_find(arg_machine
, &i
);
2354 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2356 log_error("No image for machine '%s': %m", arg_machine
);
2360 if (i
->type
== IMAGE_RAW
)
2361 r
= free_and_strdup(&arg_image
, i
->path
);
2363 r
= free_and_strdup(&arg_directory
, i
->path
);
2365 return log_error_errno(r
, "Invalid image directory: %m");
2368 arg_read_only
= arg_read_only
|| i
->read_only
;
2370 arg_directory
= get_current_dir_name();
2372 if (!arg_directory
&& !arg_machine
) {
2373 log_error("Failed to determine path, please use -D or -i.");
2379 if (arg_directory
&& path_equal(arg_directory
, "/"))
2380 arg_machine
= gethostname_malloc();
2382 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2387 hostname_cleanup(arg_machine
);
2388 if (!machine_name_is_valid(arg_machine
)) {
2389 log_error("Failed to determine machine name automatically, please use -M.");
2393 if (arg_ephemeral
) {
2396 /* Add a random suffix when this is an
2397 * ephemeral machine, so that we can run many
2398 * instances at once without manually having
2399 * to specify -M each time. */
2401 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
2412 static int determine_uid_shift(const char *directory
) {
2420 if (arg_uid_shift
== UID_INVALID
) {
2423 r
= stat(directory
, &st
);
2425 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2427 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2429 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2430 log_error("UID and GID base of %s don't match.", directory
);
2434 arg_uid_range
= UINT32_C(0x10000);
2437 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2438 log_error("UID base too high for UID range.");
2442 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
2446 static int inner_child(
2448 const char *directory
,
2454 _cleanup_free_
char *home
= NULL
;
2456 const char *envp
[] = {
2457 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2458 NULL
, /* container */
2463 NULL
, /* container_uuid */
2464 NULL
, /* LISTEN_FDS */
2465 NULL
, /* LISTEN_PID */
2469 _cleanup_strv_free_
char **env_use
= NULL
;
2474 assert(kmsg_socket
>= 0);
2479 /* Tell the parent, that it now can write the UID map. */
2480 (void) barrier_place(barrier
); /* #1 */
2482 /* Wait until the parent wrote the UID map */
2483 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2484 log_error("Parent died too early");
2489 r
= mount_all(NULL
, arg_userns
, true, arg_uid_shift
, arg_private_network
, arg_uid_range
, arg_selinux_apifs_context
);
2493 r
= mount_sysfs(NULL
);
2497 /* Wait until we are cgroup-ified, so that we
2498 * can mount the right cgroup path writable */
2499 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2500 log_error("Parent died too early");
2504 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2508 r
= reset_uid_gid();
2510 return log_error_errno(r
, "Couldn't become new root: %m");
2512 r
= setup_boot_id(NULL
);
2516 r
= setup_kmsg(NULL
, kmsg_socket
);
2519 kmsg_socket
= safe_close(kmsg_socket
);
2524 return log_error_errno(errno
, "setsid() failed: %m");
2526 if (arg_private_network
)
2529 if (arg_expose_ports
) {
2530 r
= expose_port_send_rtnl(rtnl_socket
);
2533 rtnl_socket
= safe_close(rtnl_socket
);
2536 r
= drop_capabilities();
2538 return log_error_errno(r
, "drop_capabilities() failed: %m");
2542 if (arg_personality
!= PERSONALITY_INVALID
) {
2543 if (personality(arg_personality
) < 0)
2544 return log_error_errno(errno
, "personality() failed: %m");
2545 } else if (secondary
) {
2546 if (personality(PER_LINUX32
) < 0)
2547 return log_error_errno(errno
, "personality() failed: %m");
2551 if (arg_selinux_context
)
2552 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2553 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2556 r
= change_uid_gid(arg_user
, &home
);
2560 /* LXC sets container=lxc, so follow the scheme here */
2561 envp
[n_env
++] = strjoina("container=", arg_container_service_name
);
2563 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2567 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2568 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2569 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2572 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2575 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2579 if (fdset_size(fds
) > 0) {
2580 r
= fdset_cloexec(fds
, false);
2582 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2584 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2585 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2589 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2593 /* Let the parent know that we are ready and
2594 * wait until the parent is ready with the
2596 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2597 log_error("Parent died too early");
2602 if (chdir(arg_chdir
) < 0)
2603 return log_error_errno(errno
, "Failed to change to specified working directory %s: %m", arg_chdir
);
2605 if (arg_start_mode
== START_PID2
) {
2611 /* Now, explicitly close the log, so that we
2612 * then can close all remaining fds. Closing
2613 * the log explicitly first has the benefit
2614 * that the logging subsystem knows about it,
2615 * and is thus ready to be reopened should we
2616 * need it again. Note that the other fds
2617 * closed here are at least the locking and
2620 (void) fdset_close_others(fds
);
2622 if (arg_start_mode
== START_BOOT
) {
2626 /* Automatically search for the init system */
2628 m
= 1 + strv_length(arg_parameters
);
2629 a
= newa(char*, m
+ 1);
2630 if (strv_isempty(arg_parameters
))
2633 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
2635 a
[0] = (char*) "/usr/lib/systemd/systemd";
2636 execve(a
[0], a
, env_use
);
2638 a
[0] = (char*) "/lib/systemd/systemd";
2639 execve(a
[0], a
, env_use
);
2641 a
[0] = (char*) "/sbin/init";
2642 execve(a
[0], a
, env_use
);
2643 } else if (!strv_isempty(arg_parameters
))
2644 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2647 chdir(home
?: "/root");
2649 execle("/bin/bash", "-bash", NULL
, env_use
);
2650 execle("/bin/sh", "-sh", NULL
, env_use
);
2655 return log_error_errno(r
, "execv() failed: %m");
2658 static int outer_child(
2660 const char *directory
,
2661 const char *console
,
2662 const char *root_device
, bool root_device_rw
,
2663 const char *home_device
, bool home_device_rw
,
2664 const char *srv_device
, bool srv_device_rw
,
2670 int uid_shift_socket
,
2680 assert(pid_socket
>= 0);
2681 assert(kmsg_socket
>= 0);
2685 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2686 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
2689 close_nointr(STDIN_FILENO
);
2690 close_nointr(STDOUT_FILENO
);
2691 close_nointr(STDERR_FILENO
);
2693 r
= open_terminal(console
, O_RDWR
);
2694 if (r
!= STDIN_FILENO
) {
2700 return log_error_errno(r
, "Failed to open console: %m");
2703 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2704 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2705 return log_error_errno(errno
, "Failed to duplicate console: %m");
2708 r
= reset_audit_loginuid();
2712 /* Mark everything as slave, so that we still
2713 * receive mounts from the real root, but don't
2714 * propagate mounts to the real root. */
2715 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2716 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2718 r
= mount_devices(directory
,
2719 root_device
, root_device_rw
,
2720 home_device
, home_device_rw
,
2721 srv_device
, srv_device_rw
);
2725 r
= determine_uid_shift(directory
);
2730 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2732 return log_error_errno(errno
, "Failed to send UID shift: %m");
2733 if (l
!= sizeof(arg_uid_shift
)) {
2734 log_error("Short write while sending UID shift.");
2739 /* Turn directory into bind mount */
2740 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2741 return log_error_errno(errno
, "Failed to make bind mount: %m");
2743 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2747 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2751 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2755 if (arg_read_only
) {
2756 r
= bind_remount_recursive(directory
, true);
2758 return log_error_errno(r
, "Failed to make tree read-only: %m");
2761 r
= mount_all(directory
, arg_userns
, false, arg_private_network
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2765 r
= copy_devnodes(directory
);
2769 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2771 r
= setup_pts(directory
);
2775 r
= setup_propagate(directory
);
2779 r
= setup_dev_console(directory
, console
);
2783 r
= setup_seccomp();
2787 r
= setup_timezone(directory
);
2791 r
= setup_resolv_conf(directory
);
2795 r
= setup_journal(directory
);
2799 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2803 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2807 r
= mount_move_root(directory
);
2809 return log_error_errno(r
, "Failed to move root directory: %m");
2811 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2812 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2813 (arg_private_network
? CLONE_NEWNET
: 0) |
2814 (arg_userns
? CLONE_NEWUSER
: 0),
2817 return log_error_errno(errno
, "Failed to fork inner child: %m");
2819 pid_socket
= safe_close(pid_socket
);
2820 uid_shift_socket
= safe_close(uid_shift_socket
);
2822 /* The inner child has all namespaces that are
2823 * requested, so that we all are owned by the user if
2824 * user namespaces are turned on. */
2826 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2828 _exit(EXIT_FAILURE
);
2830 _exit(EXIT_SUCCESS
);
2833 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2835 return log_error_errno(errno
, "Failed to send PID: %m");
2836 if (l
!= sizeof(pid
)) {
2837 log_error("Short write while sending PID.");
2841 pid_socket
= safe_close(pid_socket
);
2842 kmsg_socket
= safe_close(kmsg_socket
);
2843 rtnl_socket
= safe_close(rtnl_socket
);
2848 static int setup_uid_map(pid_t pid
) {
2849 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2854 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2855 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2856 r
= write_string_file(uid_map
, line
, 0);
2858 return log_error_errno(r
, "Failed to write UID map: %m");
2860 /* We always assign the same UID and GID ranges */
2861 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2862 r
= write_string_file(uid_map
, line
, 0);
2864 return log_error_errno(r
, "Failed to write GID map: %m");
/*
 * load_settings: locate the per-machine settings file and merge it into the
 * global arg_* configuration.
 *
 * The file name is arg_machine with the suffix .nspawn. It is searched for
 * in /etc/systemd/nspawn and /run/systemd/nspawn first, then next to
 * arg_image or arg_directory. Unless arg_settings_trusted was already set
 * (it is only assigned here while it is negative), files from the first two
 * directories are trusted and a file found next to the image or directory
 * is not.
 *
 * After settings_load() each group of settings is copied into the matching
 * arg_* variable, but only if its SETTING_* bit is not set in
 * arg_settings_mask and the file actually provides a value. Pointer valued
 * settings are moved: the old arg_* value is freed, the pointer is taken
 * over and the field in `settings` is reset to NULL.
 * NOTE(review): presumably the reset keeps the settings_freep cleanup from
 * freeing memory now owned by arg_* - confirm against settings_free().
 *
 * For an untrusted file several groups only produce an `Ignoring ...`
 * warning. NOTE(review): the else branches, the declarations of fn, i, r
 * and plus, and the return statements are not visible in this listing.
 */
2869 static int load_settings(void) {
2870 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2871 _cleanup_fclose_
FILE *f
= NULL
;
2872 _cleanup_free_
char *p
= NULL
;
2876 /* If all settings are masked, there's no point in looking for
2877 * the settings file */
2878 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* The settings file is named after the machine: <arg_machine>.nspawn */
2881 fn
= strjoina(arg_machine
, ".nspawn");
2883 /* We first look in the admin's directories in /etc and /run */
2884 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2885 _cleanup_free_
char *j
= NULL
;
2887 j
= strjoin(i
, "/", fn
, NULL
);
2896 /* By default, we trust configuration from /etc and /run */
2897 if (arg_settings_trusted
< 0)
2898 arg_settings_trusted
= true;
/* A missing file (ENOENT) is not an error; any other errno is reported. */
2903 if (errno
!= ENOENT
)
2904 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2908 /* After that, let's look for a file next to the
2909 * actual image we shall boot. */
2912 p
= file_in_same_dir(arg_image
, fn
);
2915 } else if (arg_directory
) {
2916 p
= file_in_same_dir(arg_directory
, fn
);
2923 if (!f
&& errno
!= ENOENT
)
2924 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2926 /* By default, we do not trust configuration from /var/lib/machines */
2927 if (arg_settings_trusted
< 0)
2928 arg_settings_trusted
= false;
2935 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
2937 r
= settings_load(f
, p
, &settings
);
2941 /* Copy over bits from the settings, unless they have been
2942 * explicitly masked by command line switches. */
/* Start mode: the parameter list is taken over together with the mode. */
2944 if ((arg_settings_mask
& SETTING_START_MODE
) == 0 &&
2945 settings
->start_mode
>= 0) {
2946 arg_start_mode
= settings
->start_mode
;
2948 strv_free(arg_parameters
);
2949 arg_parameters
= settings
->parameters
;
2950 settings
->parameters
= NULL
;
2953 if ((arg_settings_mask
& SETTING_WORKING_DIRECTORY
) == 0 &&
2954 settings
->working_directory
) {
2956 arg_chdir
= settings
->working_directory
;
2957 settings
->working_directory
= NULL
;
2960 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2961 settings
->environment
) {
2962 strv_free(arg_setenv
);
2963 arg_setenv
= settings
->environment
;
2964 settings
->environment
= NULL
;
2967 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2970 arg_user
= settings
->user
;
2971 settings
->user
= NULL
;
/* Capabilities: `plus` starts as the capability set from the file, and
 * CAP_NET_ADMIN is added when the settings ask for a private network.
 * For an untrusted file with a non-empty `plus`, the warning is only
 * printed when the file set capabilities explicitly. The dropped
 * capabilities are cleared from arg_retain. */
2974 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
2977 plus
= settings
->capability
;
2978 if (settings_private_network(settings
))
2979 plus
|= (1ULL << CAP_NET_ADMIN
);
2981 if (!arg_settings_trusted
&& plus
!= 0) {
2982 if (settings
->capability
!= 0)
2983 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
2987 arg_retain
&= ~settings
->drop_capability
;
2990 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2991 settings
->kill_signal
> 0)
2992 arg_kill_signal
= settings
->kill_signal
;
2994 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2995 settings
->personality
!= PERSONALITY_INVALID
)
2996 arg_personality
= settings
->personality
;
2998 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2999 !sd_id128_is_null(settings
->machine_id
)) {
3001 if (!arg_settings_trusted
)
3002 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
3004 arg_uuid
= settings
->machine_id
;
3007 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
3008 settings
->read_only
>= 0)
3009 arg_read_only
= settings
->read_only
;
3011 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
3012 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
3013 arg_volatile_mode
= settings
->volatile_mode
;
/* Custom mounts: the array and its element count are moved as a pair. */
3015 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
3016 settings
->n_custom_mounts
> 0) {
3018 if (!arg_settings_trusted
)
3019 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
3021 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3022 arg_custom_mounts
= settings
->custom_mounts
;
3023 arg_n_custom_mounts
= settings
->n_custom_mounts
;
3025 settings
->custom_mounts
= NULL
;
3026 settings
->n_custom_mounts
= 0;
/* Network: all network settings are handled as one group. The group is
 * applied as soon as any single network field is set in the file. */
3030 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
3031 (settings
->private_network
>= 0 ||
3032 settings
->network_veth
>= 0 ||
3033 settings
->network_bridge
||
3034 settings
->network_interfaces
||
3035 settings
->network_macvlan
||
3036 settings
->network_ipvlan
||
3037 settings
->network_veth_extra
)) {
3039 if (!arg_settings_trusted
)
3040 log_warning("Ignoring network settings, file %s is not trusted.", p
);
3042 arg_network_veth
= settings_network_veth(settings
);
3043 arg_private_network
= settings_private_network(settings
);
3045 strv_free(arg_network_interfaces
);
3046 arg_network_interfaces
= settings
->network_interfaces
;
3047 settings
->network_interfaces
= NULL
;
3049 strv_free(arg_network_macvlan
);
3050 arg_network_macvlan
= settings
->network_macvlan
;
3051 settings
->network_macvlan
= NULL
;
3053 strv_free(arg_network_ipvlan
);
3054 arg_network_ipvlan
= settings
->network_ipvlan
;
3055 settings
->network_ipvlan
= NULL
;
3057 strv_free(arg_network_veth_extra
);
3058 arg_network_veth_extra
= settings
->network_veth_extra
;
3059 settings
->network_veth_extra
= NULL
;
3061 free(arg_network_bridge
);
3062 arg_network_bridge
= settings
->network_bridge
;
3063 settings
->network_bridge
= NULL
;
3067 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3068 settings
->expose_ports
) {
3070 if (!arg_settings_trusted
)
3071 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3073 expose_port_free_all(arg_expose_ports
);
3074 arg_expose_ports
= settings
->expose_ports
;
3075 settings
->expose_ports
= NULL
;
/*
 * main: program entry point.
 *
 * Visible sequence of work:
 *   1. parse_argv(), root check, determine_names(), load_settings(),
 *      verify_arguments(), collection of file descriptors reported by
 *      sd_listen_fds().
 *   2. Preparation of the container root: an existing directory, an
 *      ephemeral snapshot of it, a directory populated from arg_template,
 *      or a disk image that is set up and dissected.
 *   3. Allocation of a pseudo terminal and blocking of the handled signals.
 *   4. raw_clone() of the outer child, which runs outer_child(); the parent
 *      waits for it, receives the PID of the inner child over
 *      pid_socket_pair and synchronizes with it through `barrier`.
 *   5. Parent side setup: UID map, network interfaces, machine
 *      registration, cgroups; then an sd_event loop that forwards the pty.
 *   6. wait_for_container(), port flushing and final cleanup of the arg_*
 *      globals.
 *
 * Returns EXIT_FAILURE when r is negative at the end, otherwise `ret`.
 *
 * NOTE(review): this listing omits many lines of the function (for example
 * the error checks, the goto or return statements after most log calls,
 * several declarations and the loop construct implied by the comment
 * CONTAINER_REBOOTED further down). Conditions that are not visible are
 * not described here.
 */
3082 int main(int argc
, char *argv
[]) {
3084 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3085 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3086 _cleanup_close_
int master
= -1, image_fd
= -1;
3087 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3088 int r
, n_fd_passed
, loop_nr
= -1;
3089 char veth_name
[IFNAMSIZ
];
3090 bool secondary
= false, remove_subvol
= false;
3093 int ret
= EXIT_SUCCESS
;
3094 union in_addr_union exposed
= {};
3095 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3098 log_parse_environment();
3101 /* Make sure rename_process() in the stub init process can work */
/* Command line first, then machine names, then the settings file, and
 * finally validation of the combined configuration. */
3105 r
= parse_argv(argc
, argv
);
3109 if (geteuid() != 0) {
3110 log_error("Need to be root.");
3114 r
= determine_names();
3118 r
= load_settings();
3122 r
= verify_arguments();
/* Gather the file descriptors reported by sd_listen_fds() into `fds`. */
3126 n_fd_passed
= sd_listen_fds(false);
3127 if (n_fd_passed
> 0) {
3128 r
= fdset_new_listen_fds(&fds
, false);
3130 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory based container. */
3135 if (arg_directory
) {
3138 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3139 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3144 if (arg_ephemeral
) {
3145 _cleanup_free_
char *np
= NULL
;
3147 /* If the specified path is a mount point we
3148 * generate the new snapshot immediately
3149 * inside it under a random name. However if
3150 * the specified path is not a mount point we
3151 * create the new snapshot in the parent
3152 * directory, just next to it. */
3153 r
= path_is_mount_point(arg_directory
, 0);
3155 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3159 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3161 r
= tempfn_random(arg_directory
, "machine.", &np
);
3163 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* The lock is shared for a read-only tree and exclusive otherwise, and
 * is always requested non-blocking. */
3167 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3169 log_error_errno(r
, "Failed to lock %s: %m", np
);
3173 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3175 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3179 free(arg_directory
);
/* remove_subvol makes the cleanup at the end of main remove the tree. */
3183 remove_subvol
= true;
3186 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3188 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3192 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate the directory from arg_template. */
3197 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3200 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3202 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3206 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity checks on the tree: boot mode requires path_is_os_tree() to
 * succeed, the other visible check requires <arg_directory>/usr/. */
3211 if (arg_start_mode
== START_BOOT
) {
3212 if (path_is_os_tree(arg_directory
) <= 0) {
3213 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3220 p
= strjoina(arg_directory
, "/usr/");
3221 if (laccess(p
, F_OK
) < 0) {
3222 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
/* Image based container: the image is locked, a temporary directory made
 * with mkdtemp() becomes arg_directory, and the image is set up and
 * dissected into root, home and srv devices. */
3229 char template[] = "/tmp/nspawn-root-XXXXXX";
3232 assert(!arg_template
);
3234 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3236 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3240 r
= log_error_errno(r
, "Failed to create image lock: %m");
3244 if (!mkdtemp(template)) {
3245 log_error_errno(errno
, "Failed to create temporary directory: %m");
3250 arg_directory
= strdup(template);
3251 if (!arg_directory
) {
3256 image_fd
= setup_image(&device_path
, &loop_nr
);
3262 r
= dissect_image(image_fd
,
3263 &root_device
, &root_device_rw
,
3264 &home_device
, &home_device_rw
,
3265 &srv_device
, &srv_device_rw
,
3271 r
= custom_mounts_prepare();
3276 isatty(STDIN_FILENO
) > 0 &&
3277 isatty(STDOUT_FILENO
) > 0;
/* Allocate the pseudo terminal master, look up the name of its slave side
 * into `console` and unlock it. */
3279 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3281 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3285 r
= ptsname_malloc(master
, &console
);
3287 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3291 if (unlockpt(master
) < 0) {
3292 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3297 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3298 arg_machine
, arg_image
?: arg_directory
);
/* Block SIGCHLD, SIGWINCH, SIGTERM and SIGINT; mask_chld holds SIGCHLD
 * alone so that it can be unblocked and blocked again separately below. */
3300 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3302 assert_se(sigemptyset(&mask_chld
) == 0);
3303 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3305 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3306 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* State for one container run. In each socket pair, index 0 is kept by
 * the parent and index 1 is handed to the child side, as the close calls
 * after raw_clone() below show. */
3311 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 }, uid_shift_socket_pair
[2] = { -1, -1 };
3312 ContainerStatus container_status
;
3313 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3314 static const struct sigaction sa
= {
3315 .sa_handler
= nop_signal_handler
,
3316 .sa_flags
= SA_NOCLDSTOP
,
3320 _cleanup_(sd_event_unrefp
) sd_event
*event
= NULL
;
3321 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3322 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
3325 r
= barrier_create(&barrier
);
3327 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3331 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3332 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3336 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3337 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3341 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3342 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3347 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3348 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3352 /* Child can be killed before execv(), so handle SIGCHLD
3353 * in order to interrupt parent's blocking calls and
3354 * give it a chance to call wait() and terminate. */
3355 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3357 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3361 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3363 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* Create the outer child in a new mount namespace only. */
3367 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3369 if (errno
== EINVAL
)
3370 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3372 r
= log_error_errno(errno
, "clone() failed: %m");
3378 /* The outer child only has a file system namespace. */
3379 barrier_set_role(&barrier
, BARRIER_CHILD
);
/* Child side: drop the pty master and the parent ends of the sockets. */
3381 master
= safe_close(master
);
3383 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3384 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3385 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3386 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3388 (void) reset_all_signal_handlers();
3389 (void) reset_signal_mask();
3391 r
= outer_child(&barrier
,
3394 root_device
, root_device_rw
,
3395 home_device
, home_device_rw
,
3396 srv_device
, srv_device_rw
,
3400 kmsg_socket_pair
[1],
3401 rtnl_socket_pair
[1],
3402 uid_shift_socket_pair
[1],
3405 _exit(EXIT_FAILURE
);
3407 _exit(EXIT_SUCCESS
);
/* Parent side: release the passed file descriptors and the child ends of
 * the sockets. */
3410 barrier_set_role(&barrier
, BARRIER_PARENT
);
3412 fds
= fdset_free(fds
);
3414 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3415 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3416 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3417 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3419 /* Wait for the outer child. */
3420 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3429 /* And now retrieve the PID of the inner child. */
/* From here on `pid` refers to the inner child. */
3430 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3432 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3435 if (l
!= sizeof(pid
)) {
3436 log_error("Short read while reading inner child PID.");
3441 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3444 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3445 log_error("Child died too early.");
/* The UID shift is received from the child side, then the ID maps of the
 * inner child are written. */
3450 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3452 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3455 if (l
!= sizeof(arg_uid_shift
)) {
3456 log_error("Short read while reading UID shift.");
3461 r
= setup_uid_map(pid
);
3465 (void) barrier_place(&barrier
); /* #2 */
/* Network setup, keyed on the PID of the inner child. */
3468 if (arg_private_network
) {
3470 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3474 if (arg_network_veth
) {
3475 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3481 if (arg_network_bridge
) {
3482 r
= setup_bridge(veth_name
, arg_network_bridge
);
3490 r
= setup_veth_extra(arg_machine
, pid
, arg_network_veth_extra
);
3494 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3498 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
3504 r
= register_machine(
3511 arg_custom_mounts
, arg_n_custom_mounts
,
3515 arg_container_service_name
);
3520 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3524 if (arg_keep_unit
) {
3525 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3530 r
= chown_cgroup(pid
, arg_uid_shift
);
3534 /* Notify the child that the parent is ready with all
3535 * its setup (including cgroup-ification), and that
3536 * the child can now hand over control to the code to
3537 * run inside the container. */
3538 (void) barrier_place(&barrier
); /* #3 */
3540 /* Block SIGCHLD here, before notifying child.
3541 * process_pty() will handle it with the other signals. */
3542 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3544 /* Reset signal to default */
3545 r
= default_signals(SIGCHLD
, -1);
3547 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3551 /* Let the child know that we are ready and wait until the child is completely ready now. */
3552 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3553 log_error("Child died too early.");
3560 "STATUS=Container running.\n"
3561 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
/* Event loop setup: signal sources, optional port exposure and the pty
 * forwarder. The return values of the sd_event_add_signal() calls are not
 * checked in the visible code. */
3563 r
= sd_event_new(&event
);
3565 log_error_errno(r
, "Failed to get default event source: %m");
3569 if (arg_kill_signal
> 0) {
3570 /* Try to kill the init system on SIGINT or SIGTERM */
3571 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3572 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3574 /* Immediately exit */
3575 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3576 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3579 /* simply exit on sigchld */
3580 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3582 if (arg_expose_ports
) {
3583 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3587 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3590 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* The forwarder is read-only when the session is not interactive. */
3592 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
3594 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3598 r
= sd_event_loop(event
);
3600 log_error_errno(r
, "Failed to run event loop: %m");
3604 pty_forward_get_last_char(forward
, &last_char
);
3606 forward
= pty_forward_free(forward
);
3608 if (!arg_quiet
&& last_char
!= '\n')
3611 /* Kill if it is not dead yet anyway */
3612 if (arg_register
&& !arg_keep_unit
)
3613 terminate_machine(pid
);
3615 /* Normally redundant, but better safe than sorry */
3618 r
= wait_for_container(pid
, &container_status
);
3622 /* We failed to wait for the container, or the
3623 * container exited abnormally */
3625 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3626 /* The container exited with a non-zero
3627 * status, or with zero status and no reboot
3633 /* CONTAINER_REBOOTED, loop again */
3635 if (arg_keep_unit
) {
3636 /* Special handling if we are running as a
3637 * service: instead of simply restarting the
3638 * machine we want to restart the entire
3639 * service, so let's inform systemd about this
3640 * with the special exit code 133. The service
3641 * file uses RestartForceExitStatus=133 so
3642 * that this results in a full nspawn
3643 * restart. This is necessary since we might
3644 * have cgroup parameters set we want to have
3651 expose_port_flush(arg_expose_ports
, &exposed
);
3657 "STATUS=Terminating...");
3662 /* Try to flush whatever is still queued in the pty */
3664 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
/* Final cleanup: detach the loop device, remove an ephemeral subvolume
 * (failure is only warned about), delete the propagation directory of
 * the machine and flush the exposed ports. */
3666 loop_remove(loop_nr
, &image_fd
);
3668 if (remove_subvol
&& arg_directory
) {
3671 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
3673 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
3679 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3680 (void) rm_rf(p
, REMOVE_ROOT
);
3683 expose_port_flush(arg_expose_ports
, &exposed
);
/* Release the heap allocated arg_* globals. */
3685 free(arg_directory
);
3691 strv_free(arg_setenv
);
3692 free(arg_network_bridge
);
3693 strv_free(arg_network_interfaces
);
3694 strv_free(arg_network_macvlan
);
3695 strv_free(arg_network_ipvlan
);
3696 strv_free(arg_network_veth_extra
);
3697 strv_free(arg_parameters
);
3698 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3699 expose_port_free_all(arg_expose_ports
);
3701 return r
< 0 ? EXIT_FAILURE
: ret
;