2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21 #include <blkid/blkid.h>
25 #include <linux/loop.h>
31 #include <selinux/selinux.h>
38 #include <sys/mount.h>
39 #include <sys/personality.h>
40 #include <sys/prctl.h>
41 #include <sys/types.h>
44 #include "sd-daemon.h"
47 #include "alloc-util.h"
49 #include "base-filesystem.h"
50 #include "blkid-util.h"
51 #include "btrfs-util.h"
53 #include "capability-util.h"
54 #include "cgroup-util.h"
56 #include "dev-setup.h"
61 #include "formats-util.h"
64 #include "hostname-util.h"
66 #include "loopback-setup.h"
67 #include "machine-image.h"
71 #include "mount-util.h"
72 #include "netlink-util.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-expose-ports.h"
75 #include "nspawn-mount.h"
76 #include "nspawn-network.h"
77 #include "nspawn-register.h"
78 #include "nspawn-settings.h"
79 #include "nspawn-setuid.h"
80 #include "nspawn-stub-pid1.h"
81 #include "parse-util.h"
82 #include "path-util.h"
83 #include "process-util.h"
85 #include "random-util.h"
88 #include "seccomp-util.h"
90 #include "selinux-util.h"
91 #include "signal-util.h"
92 #include "socket-util.h"
93 #include "stat-util.h"
94 #include "stdio-util.h"
95 #include "string-util.h"
97 #include "terminal-util.h"
98 #include "udev-util.h"
99 #include "umask-util.h"
100 #include "user-util.h"
/* File-scope types and option state.
 *
 * Every arg_* variable below holds one setting together with its built-in
 * default; parse_argv() further down overwrites them from the command line.
 *
 * NOTE(review): only the first lines of the two typedef'd enums are visible
 * in this view, and the arg_retain initializer ends on a dangling '|' --
 * confirm both against the complete file. */
103 typedef enum ContainerStatus
{
104 CONTAINER_TERMINATED
,
108 typedef enum LinkJournal
{
115 static char *arg_directory
= NULL
;
116 static char *arg_template
= NULL
;
117 static char *arg_chdir
= NULL
;
118 static char *arg_user
= NULL
;
119 static sd_id128_t arg_uuid
= {};
120 static char *arg_machine
= NULL
;
121 static const char *arg_selinux_context
= NULL
;
122 static const char *arg_selinux_apifs_context
= NULL
;
123 static const char *arg_slice
= NULL
;
124 static bool arg_private_network
= false;
125 static bool arg_read_only
= false;
126 static StartMode arg_start_mode
= START_PID1
;
127 static bool arg_ephemeral
= false;
128 static LinkJournal arg_link_journal
= LINK_AUTO
;
129 static bool arg_link_journal_try
= false;
/* Bit mask (one bit per CAP_* number) of the capabilities retained by
 * default. parse_argv() ORs in the --capability= bits (and CAP_NET_ADMIN
 * when private networking is on) and clears the --drop-capability= bits. */
130 static uint64_t arg_retain
=
131 (1ULL << CAP_CHOWN
) |
132 (1ULL << CAP_DAC_OVERRIDE
) |
133 (1ULL << CAP_DAC_READ_SEARCH
) |
134 (1ULL << CAP_FOWNER
) |
135 (1ULL << CAP_FSETID
) |
136 (1ULL << CAP_IPC_OWNER
) |
138 (1ULL << CAP_LEASE
) |
139 (1ULL << CAP_LINUX_IMMUTABLE
) |
140 (1ULL << CAP_NET_BIND_SERVICE
) |
141 (1ULL << CAP_NET_BROADCAST
) |
142 (1ULL << CAP_NET_RAW
) |
143 (1ULL << CAP_SETGID
) |
144 (1ULL << CAP_SETFCAP
) |
145 (1ULL << CAP_SETPCAP
) |
146 (1ULL << CAP_SETUID
) |
147 (1ULL << CAP_SYS_ADMIN
) |
148 (1ULL << CAP_SYS_CHROOT
) |
149 (1ULL << CAP_SYS_NICE
) |
150 (1ULL << CAP_SYS_PTRACE
) |
151 (1ULL << CAP_SYS_TTY_CONFIG
) |
152 (1ULL << CAP_SYS_RESOURCE
) |
153 (1ULL << CAP_SYS_BOOT
) |
154 (1ULL << CAP_AUDIT_WRITE
) |
155 (1ULL << CAP_AUDIT_CONTROL
) |
157 static CustomMount
*arg_custom_mounts
= NULL
;
158 static unsigned arg_n_custom_mounts
= 0;
159 static char **arg_setenv
= NULL
;
160 static bool arg_quiet
= false;
161 static bool arg_share_system
= false;
162 static bool arg_register
= true;
163 static bool arg_keep_unit
= false;
164 static char **arg_network_interfaces
= NULL
;
165 static char **arg_network_macvlan
= NULL
;
166 static char **arg_network_ipvlan
= NULL
;
167 static bool arg_network_veth
= false;
168 static char **arg_network_veth_extra
= NULL
;
169 static char *arg_network_bridge
= NULL
;
170 static unsigned long arg_personality
= PERSONALITY_INVALID
;
171 static char *arg_image
= NULL
;
172 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
173 static ExposePort
*arg_expose_ports
= NULL
;
174 static char **arg_property
= NULL
;
175 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
176 static bool arg_userns
= false;
177 static int arg_kill_signal
= 0;
178 static bool arg_unified_cgroup_hierarchy
= false;
/* Bit set of SETTING_* flags; parse_argv() sets a bit for each setting that
 * was given explicitly on the command line. */
179 static SettingsMask arg_settings_mask
= 0;
/* Tri-state: initialised to -1, set to true/false by --settings=. */
180 static int arg_settings_trusted
= -1;
181 static char **arg_parameters
= NULL
;
182 static const char *arg_container_service_name
= "systemd-nspawn";
/* Print the command-line usage summary to stdout.
 *
 * The whole text is a single printf() format string; its only conversion is
 * the leading %s, filled in with program_invocation_short_name.
 *
 * NOTE(review): some continuation lines of the option text are not visible
 * in this view.
 * NOTE(review): "--bind-ro=PATH[:PATH[:OPTIONS]" has an unbalanced '[' and
 * "--tmpfs=PATH:[OPTIONS]" places the colon outside the optional part, unlike
 * the --bind= line. Both look like typos in the user-visible text -- confirm
 * against the option parsers before changing the strings. */
184 static void help(void) {
185 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
186 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
187 " -h --help Show this help\n"
188 " --version Print version string\n"
189 " -q --quiet Do not show status information\n"
190 " -D --directory=PATH Root directory for the container\n"
191 " --template=PATH Initialize root directory from template directory,\n"
193 " -x --ephemeral Run container with snapshot of root directory, and\n"
194 " remove it after exit\n"
195 " -i --image=PATH File system device or disk image for the container\n"
196 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
197 " -b --boot Boot up full system (i.e. invoke init)\n"
198 " --chdir=PATH Set working directory in the container\n"
199 " -u --user=USER Run the command under specified user or uid\n"
200 " -M --machine=NAME Set the machine name for the container\n"
201 " --uuid=UUID Set a specific machine UUID for the container\n"
202 " -S --slice=SLICE Place the container in the specified slice\n"
203 " --property=NAME=VALUE Set scope unit property\n"
204 " --private-users[=UIDBASE[:NUIDS]]\n"
205 " Run within user namespace\n"
206 " --private-network Disable network in container\n"
207 " --network-interface=INTERFACE\n"
208 " Assign an existing network interface to the\n"
210 " --network-macvlan=INTERFACE\n"
211 " Create a macvlan network interface based on an\n"
212 " existing network interface to the container\n"
213 " --network-ipvlan=INTERFACE\n"
214 " Create a ipvlan network interface based on an\n"
215 " existing network interface to the container\n"
216 " -n --network-veth Add a virtual Ethernet connection between host\n"
218 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
219 " Add an additional virtual Ethernet link between\n"
220 " host and container\n"
221 " --network-bridge=INTERFACE\n"
222 " Add a virtual Ethernet connection between host\n"
223 " and container and add it to an existing bridge on\n"
225 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
226 " Expose a container IP port on the host\n"
227 " -Z --selinux-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " processes in the container\n"
230 " -L --selinux-apifs-context=SECLABEL\n"
231 " Set the SELinux security context to be used by\n"
232 " API/tmpfs file systems in the container\n"
233 " --capability=CAP In addition to the default, retain specified\n"
235 " --drop-capability=CAP Drop the specified capability from the default set\n"
236 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
237 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
238 " host, try-guest, try-host\n"
239 " -j Equivalent to --link-journal=try-guest\n"
240 " --read-only Mount the root directory read-only\n"
241 " --bind=PATH[:PATH[:OPTIONS]]\n"
242 " Bind mount a file or directory from the host into\n"
244 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
245 " Similar, but creates a read-only bind mount\n"
246 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
247 " --overlay=PATH[:PATH...]:PATH\n"
248 " Create an overlay mount from the host to \n"
250 " --overlay-ro=PATH[:PATH...]:PATH\n"
251 " Similar, but creates a read-only overlay mount\n"
252 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
253 " --share-system Share system namespaces with host\n"
254 " --register=BOOLEAN Register container as machine\n"
255 " --keep-unit Do not register a scope for the machine, reuse\n"
256 " the service unit nspawn is running in\n"
257 " --volatile[=MODE] Run the system in volatile mode\n"
258 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
259 , program_invocation_short_name
);
/* Prepare the arg_custom_mounts array for use.
 *
 * Sorts the array with custom_mount_compare, then walks every entry:
 *  - a mount whose destination is "/" is rejected with an error when
 *    arg_userns is set and arg_uid_shift is still UID_INVALID;
 *  - for CUSTOM_MOUNT_OVERLAY entries a work directory name is derived from
 *    m->source via tempfn_random() and stored in m->work_dir.
 *
 * A tempfn_random() failure is reported through log_error_errno() and that
 * value is returned.
 *
 * NOTE(review): the statements following the error message and the type
 * check (presumably a return and a "continue"), as well as the final return,
 * are not visible in this view. */
263 static int custom_mounts_prepare(void) {
267 /* Ensure the mounts are applied prefix first. */
268 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
270 /* Allocate working directories for the overlay file systems that need it */
271 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
272 CustomMount
*m
= &arg_custom_mounts
[i
];
274 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
275 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
279 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
288 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
290 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Decide the value of arg_unified_cgroup_hierarchy.
 *
 * The $UNIFIED_CGROUP_HIERARCHY environment variable is read with getenv()
 * and parsed with parse_boolean(); the parsed value is stored in the global.
 * Otherwise the host's setting is inherited (the call that queries the host
 * is not visible in this view). Failures are logged with log_error_errno()
 * and that value is returned.
 *
 * NOTE(review): the conditions guarding each step are not visible here. */
296 static int detect_unified_cgroup_hierarchy(void) {
300 /* Allow the user to control whether the unified hierarchy is used */
301 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
303 r
= parse_boolean(e
);
305 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
307 arg_unified_cgroup_hierarchy
= r
;
311 /* Otherwise inherit the default from the host system */
314 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
316 arg_unified_cgroup_hierarchy
= r
;
/* Parse the command line into the file-scope arg_* variables.
 *
 * Options are read with getopt_long() using the short-option string
 * "+hD:u:abL:M:jS:Z:qi:xp:n" and the long-option table below. Most handlers
 * store the parsed value in the matching arg_* global and set the
 * corresponding SETTING_* bit in arg_settings_mask.
 *
 * --capability= and --drop-capability= accumulate into the local "plus" and
 * "minus" masks ("all" sets every bit). After the loop arg_retain becomes
 * (arg_retain | plus | CAP_NET_ADMIN-if-private-network) & ~minus.
 *
 * --settings= drives mask_all_settings / mask_no_settings and
 * arg_settings_trusted. After the loop, mask_no_settings resets
 * arg_settings_mask to 0 and mask_all_settings sets it to
 * _SETTINGS_MASK_ALL.
 *
 * Once the loop is done, mutually exclusive option combinations are rejected
 * with an error message, the remaining argv entries are copied into
 * arg_parameters, detect_unified_cgroup_hierarchy() is called and
 * $SYSTEMD_NSPAWN_CONTAINER_SERVICE is read into arg_container_service_name.
 *
 * NOTE(review): many lines (case labels, error returns, the closing of the
 * switch) are not visible in this view; the points below need checking
 * against the complete function:
 *  - in the capability handler, log_error_errno(r, "Failed to parse
 *    capability %s.", t) follows extract_first_word() directly; if that is
 *    the failure path, t may still be NULL when formatted with %s;
 *  - the "Failed to parse UID: %s" message prints optarg although the value
 *    handed to parse_uid() is "shift";
 *  - "--boot and --share-system may not be combined." is guarded by
 *    arg_start_mode != START_PID1, which also matches START_PID2;
 *  - arg_selinux_context, arg_selinux_apifs_context and
 *    arg_container_service_name are assigned optarg / getenv() pointers
 *    directly, i.e. they are borrowed, not copied. */
320 static int parse_argv(int argc
, char *argv
[]) {
339 ARG_NETWORK_INTERFACE
,
343 ARG_NETWORK_VETH_EXTRA
,
354 static const struct option options
[] = {
355 { "help", no_argument
, NULL
, 'h' },
356 { "version", no_argument
, NULL
, ARG_VERSION
},
357 { "directory", required_argument
, NULL
, 'D' },
358 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
359 { "ephemeral", no_argument
, NULL
, 'x' },
360 { "user", required_argument
, NULL
, 'u' },
361 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
362 { "as-pid2", no_argument
, NULL
, 'a' },
363 { "boot", no_argument
, NULL
, 'b' },
364 { "uuid", required_argument
, NULL
, ARG_UUID
},
365 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
366 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
367 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
368 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
369 { "bind", required_argument
, NULL
, ARG_BIND
},
370 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
371 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
372 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
373 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
374 { "machine", required_argument
, NULL
, 'M' },
375 { "slice", required_argument
, NULL
, 'S' },
376 { "setenv", required_argument
, NULL
, ARG_SETENV
},
377 { "selinux-context", required_argument
, NULL
, 'Z' },
378 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
379 { "quiet", no_argument
, NULL
, 'q' },
380 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
381 { "register", required_argument
, NULL
, ARG_REGISTER
},
382 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
383 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
384 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
385 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
386 { "network-veth", no_argument
, NULL
, 'n' },
387 { "network-veth-extra", required_argument
, NULL
, ARG_NETWORK_VETH_EXTRA
},
388 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
389 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
390 { "image", required_argument
, NULL
, 'i' },
391 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
392 { "port", required_argument
, NULL
, 'p' },
393 { "property", required_argument
, NULL
, ARG_PROPERTY
},
394 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
395 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
396 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
397 { "chdir", required_argument
, NULL
, ARG_CHDIR
},
403 uint64_t plus
= 0, minus
= 0;
404 bool mask_all_settings
= false, mask_no_settings
= false;
409 while ((c
= getopt_long(argc
, argv
, "+hD:u:abL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
421 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
427 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
433 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
439 arg_ephemeral
= true;
443 r
= free_and_strdup(&arg_user
, optarg
);
447 arg_settings_mask
|= SETTING_USER
;
450 case ARG_NETWORK_BRIDGE
:
451 r
= free_and_strdup(&arg_network_bridge
, optarg
);
458 arg_network_veth
= true;
459 arg_private_network
= true;
460 arg_settings_mask
|= SETTING_NETWORK
;
463 case ARG_NETWORK_VETH_EXTRA
:
464 r
= veth_extra_parse(&arg_network_veth_extra
, optarg
);
466 return log_error_errno(r
, "Failed to parse --network-veth-extra= parameter: %s", optarg
);
468 arg_private_network
= true;
469 arg_settings_mask
|= SETTING_NETWORK
;
472 case ARG_NETWORK_INTERFACE
:
473 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
476 arg_private_network
= true;
477 arg_settings_mask
|= SETTING_NETWORK
;
480 case ARG_NETWORK_MACVLAN
:
481 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
484 arg_private_network
= true;
485 arg_settings_mask
|= SETTING_NETWORK
;
488 case ARG_NETWORK_IPVLAN
:
489 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
494 case ARG_PRIVATE_NETWORK
:
495 arg_private_network
= true;
496 arg_settings_mask
|= SETTING_NETWORK
;
500 if (arg_start_mode
== START_PID2
) {
501 log_error("--boot and --as-pid2 may not be combined.");
505 arg_start_mode
= START_BOOT
;
506 arg_settings_mask
|= SETTING_START_MODE
;
510 if (arg_start_mode
== START_BOOT
) {
511 log_error("--boot and --as-pid2 may not be combined.");
515 arg_start_mode
= START_PID2
;
516 arg_settings_mask
|= SETTING_START_MODE
;
520 r
= sd_id128_from_string(optarg
, &arg_uuid
);
522 log_error("Invalid UUID: %s", optarg
);
526 arg_settings_mask
|= SETTING_MACHINE_ID
;
535 arg_machine
= mfree(arg_machine
);
537 if (!machine_name_is_valid(optarg
)) {
538 log_error("Invalid machine name: %s", optarg
);
542 r
= free_and_strdup(&arg_machine
, optarg
);
550 arg_selinux_context
= optarg
;
554 arg_selinux_apifs_context
= optarg
;
558 arg_read_only
= true;
559 arg_settings_mask
|= SETTING_READ_ONLY
;
563 case ARG_DROP_CAPABILITY
: {
566 _cleanup_free_
char *t
= NULL
;
568 r
= extract_first_word(&p
, &t
, ",", 0);
570 return log_error_errno(r
, "Failed to parse capability %s.", t
);
575 if (streq(t
, "all")) {
576 if (c
== ARG_CAPABILITY
)
577 plus
= (uint64_t) -1;
579 minus
= (uint64_t) -1;
583 cap
= capability_from_name(t
);
585 log_error("Failed to parse capability %s.", t
);
589 if (c
== ARG_CAPABILITY
)
590 plus
|= 1ULL << (uint64_t) cap
;
592 minus
|= 1ULL << (uint64_t) cap
;
596 arg_settings_mask
|= SETTING_CAPABILITY
;
601 arg_link_journal
= LINK_GUEST
;
602 arg_link_journal_try
= true;
605 case ARG_LINK_JOURNAL
:
606 if (streq(optarg
, "auto")) {
607 arg_link_journal
= LINK_AUTO
;
608 arg_link_journal_try
= false;
609 } else if (streq(optarg
, "no")) {
610 arg_link_journal
= LINK_NO
;
611 arg_link_journal_try
= false;
612 } else if (streq(optarg
, "guest")) {
613 arg_link_journal
= LINK_GUEST
;
614 arg_link_journal_try
= false;
615 } else if (streq(optarg
, "host")) {
616 arg_link_journal
= LINK_HOST
;
617 arg_link_journal_try
= false;
618 } else if (streq(optarg
, "try-guest")) {
619 arg_link_journal
= LINK_GUEST
;
620 arg_link_journal_try
= true;
621 } else if (streq(optarg
, "try-host")) {
622 arg_link_journal
= LINK_HOST
;
623 arg_link_journal_try
= true;
625 log_error("Failed to parse link journal mode %s", optarg
);
633 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
635 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
637 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
641 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
643 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
645 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
649 case ARG_OVERLAY_RO
: {
650 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
651 _cleanup_strv_free_
char **lower
= NULL
;
656 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
660 log_error("Invalid overlay specification: %s", optarg
);
664 STRV_FOREACH(i
, lower
) {
665 if (!path_is_absolute(*i
)) {
666 log_error("Overlay path %s is not absolute.", *i
);
674 log_error("--overlay= needs at least two colon-separated directories specified.");
679 /* If two parameters are specified,
680 * the first one is the lower, the
681 * second one the upper directory. And
682 * we'll also define the destination
683 * mount point the same as the upper. */
687 destination
= strdup(upper
);
692 upper
= lower
[n
- 2];
693 destination
= lower
[n
- 1];
697 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
701 m
->destination
= destination
;
704 m
->read_only
= c
== ARG_OVERLAY_RO
;
706 upper
= destination
= NULL
;
709 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
716 if (!env_assignment_is_valid(optarg
)) {
717 log_error("Environment variable assignment '%s' is not valid.", optarg
);
721 n
= strv_env_set(arg_setenv
, optarg
);
725 strv_free(arg_setenv
);
728 arg_settings_mask
|= SETTING_ENVIRONMENT
;
736 case ARG_SHARE_SYSTEM
:
737 arg_share_system
= true;
741 r
= parse_boolean(optarg
);
743 log_error("Failed to parse --register= argument: %s", optarg
);
751 arg_keep_unit
= true;
754 case ARG_PERSONALITY
:
756 arg_personality
= personality_from_string(optarg
);
757 if (arg_personality
== PERSONALITY_INVALID
) {
758 log_error("Unknown or unsupported personality '%s'.", optarg
);
762 arg_settings_mask
|= SETTING_PERSONALITY
;
768 arg_volatile_mode
= VOLATILE_YES
;
772 m
= volatile_mode_from_string(optarg
);
774 log_error("Failed to parse --volatile= argument: %s", optarg
);
777 arg_volatile_mode
= m
;
780 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
784 r
= expose_port_parse(&arg_expose_ports
, optarg
);
786 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
788 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
790 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
794 if (strv_extend(&arg_property
, optarg
) < 0)
799 case ARG_PRIVATE_USERS
:
801 _cleanup_free_
char *buffer
= NULL
;
802 const char *range
, *shift
;
804 range
= strchr(optarg
, ':');
806 buffer
= strndup(optarg
, range
- optarg
);
812 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
813 log_error("Failed to parse UID range: %s", range
);
819 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
820 log_error("Failed to parse UID: %s", optarg
);
828 case ARG_KILL_SIGNAL
:
829 arg_kill_signal
= signal_from_string_try_harder(optarg
);
830 if (arg_kill_signal
< 0) {
831 log_error("Cannot parse signal: %s", optarg
);
835 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
840 /* no → do not read files
841 * yes → read files, do not override cmdline, trust only subset
842 * override → read files, override cmdline, trust only subset
843 * trusted → read files, do not override cmdline, trust all
846 r
= parse_boolean(optarg
);
848 if (streq(optarg
, "trusted")) {
849 mask_all_settings
= false;
850 mask_no_settings
= false;
851 arg_settings_trusted
= true;
853 } else if (streq(optarg
, "override")) {
854 mask_all_settings
= false;
855 mask_no_settings
= true;
856 arg_settings_trusted
= -1;
858 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
861 mask_all_settings
= false;
862 mask_no_settings
= false;
863 arg_settings_trusted
= -1;
866 mask_all_settings
= true;
867 mask_no_settings
= false;
868 arg_settings_trusted
= false;
874 if (!path_is_absolute(optarg
)) {
875 log_error("Working directory %s is not an absolute path.", optarg
);
879 r
= free_and_strdup(&arg_chdir
, optarg
);
883 arg_settings_mask
|= SETTING_WORKING_DIRECTORY
;
890 assert_not_reached("Unhandled option");
893 if (arg_share_system
)
894 arg_register
= false;
896 if (arg_start_mode
!= START_PID1
&& arg_share_system
) {
897 log_error("--boot and --share-system may not be combined.");
901 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
902 log_error("--keep-unit may not be used when invoked from a user session.");
906 if (arg_directory
&& arg_image
) {
907 log_error("--directory= and --image= may not be combined.");
911 if (arg_template
&& arg_image
) {
912 log_error("--template= and --image= may not be combined.");
916 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
917 log_error("--template= needs --directory= or --machine=.");
921 if (arg_ephemeral
&& arg_template
) {
922 log_error("--ephemeral and --template= may not be combined.");
926 if (arg_ephemeral
&& arg_image
) {
927 log_error("--ephemeral and --image= may not be combined.");
931 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
932 log_error("--ephemeral and --link-journal= may not be combined.");
936 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
937 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
940 arg_parameters
= strv_copy(argv
+ optind
);
944 arg_settings_mask
|= SETTING_START_MODE
;
947 /* Load all settings from .nspawn files */
948 if (mask_no_settings
)
949 arg_settings_mask
= 0;
951 /* Don't load any settings from .nspawn files */
952 if (mask_all_settings
)
953 arg_settings_mask
= _SETTINGS_MASK_ALL
;
955 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
957 r
= detect_unified_cgroup_hierarchy();
961 e
= getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
963 arg_container_service_name
= e
;
/* Cross-check option combinations held in the arg_* globals.
 *
 * Logs an error when:
 *  - a volatile mode other than VOLATILE_NO is combined with arg_read_only;
 *  - ports are exposed (arg_expose_ports) without arg_private_network;
 *  - ports are exposed in a build without libiptc support.
 *    NOTE(review): the preprocessor guard around this check is not visible
 *    in this view.
 *
 * When booting a full system (START_BOOT) and no positive kill signal has
 * been configured, arg_kill_signal defaults to SIGRTMIN+3.
 *
 * NOTE(review): the return statements are not visible in this view. */
968 static int verify_arguments(void) {
970 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
971 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
975 if (arg_expose_ports
&& !arg_private_network
) {
976 log_error("Cannot use --port= without private networking.");
981 if (arg_expose_ports
) {
982 log_error("--port= is not supported, compiled without libiptc support.");
987 if (arg_start_mode
== START_BOOT
&& arg_kill_signal
<= 0)
988 arg_kill_signal
= SIGRTMIN
+3;
/* lchown() the path p after shifting the given IDs by arg_uid_shift.
 *
 * p:   path to change; lchown() is used on it
 * uid: owner to set, or UID_INVALID
 * gid: group to set, or GID_INVALID
 *
 * Each ID that is not the INVALID marker has arg_uid_shift added and is
 * then checked against [arg_uid_shift, arg_uid_shift + arg_uid_range); a
 * result below the shift means the addition wrapped around. The group ID is
 * shifted by the same arg_uid_shift / arg_uid_range values as the user ID.
 *
 * NOTE(review): the statements executed when both IDs are invalid, when a
 * range check fails, or when lchown() fails are not visible in this view. */
993 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
999 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1002 if (uid
!= UID_INVALID
) {
1003 uid
+= arg_uid_shift
;
1005 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1009 if (gid
!= GID_INVALID
) {
1010 gid
+= (gid_t
) arg_uid_shift
;
1012 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1016 if (lchown(p
, uid
, gid
) < 0)
/* Create the directory "path" below "root" and hand it to uid/gid.
 *
 * The full path is built with prefix_roota(root, path), created with
 * mkdir(q, mode) and then passed to userns_lchown(), whose result is
 * returned.
 *
 * NOTE(review): mkdir() failures are distinguished by errno == EEXIST, but
 * what happens in either case is not visible in this view. */
1022 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
1025 q
= prefix_roota(root
, path
);
1026 if (mkdir(q
, mode
) < 0) {
1027 if (errno
== EEXIST
)
1032 return userns_lchown(q
, uid
, gid
);
/* Point the container's /etc/localtime at the same zone as the host's.
 *
 * dest: root directory of the container tree
 *
 * The host's /etc/localtime symlink is read and matched against the
 * "../usr/share/zoneinfo/" and "/usr/share/zoneinfo/" prefixes to obtain the
 * zone name z; the container's existing link is resolved the same way into
 * y. If both name the same zone nothing is done. Otherwise the zone file is
 * checked for existence inside the container, a relative symlink
 * "../usr/share/zoneinfo/<zone>" is created at <dest>/etc/localtime and its
 * ownership is fixed up with userns_lchown(where, 0, 0).
 *
 * Problems on the host side and a missing zone file are only warned about.
 *
 * NOTE(review): the conditions between the visible statements, the removal
 * of the old link and the return values are not visible in this view. */
1035 static int setup_timezone(const char *dest
) {
1036 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1037 const char *where
, *check
, *what
;
1043 /* Fix the timezone, if possible */
1044 r
= readlink_malloc("/etc/localtime", &p
);
1046 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1050 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1052 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1054 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1058 where
= prefix_roota(dest
, "/etc/localtime");
1059 r
= readlink_malloc(where
, &q
);
1061 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1063 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1065 /* Already pointing to the right place? Then do nothing .. */
1066 if (y
&& streq(y
, z
))
1070 check
= strjoina("/usr/share/zoneinfo/", z
);
1071 check
= prefix_roota(dest
, check
);
1072 if (laccess(check
, F_OK
) < 0) {
1073 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1078 if (r
< 0 && errno
!= ENOENT
) {
1079 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1083 what
= strjoina("../usr/share/zoneinfo/", z
);
1084 if (symlink(what
, where
) < 0) {
1085 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1089 r
= userns_lchown(where
, 0, 0);
1091 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
/* Copy the host's /etc/resolv.conf into the container tree.
 *
 * dest: root directory of the container tree
 *
 * The arg_private_network flag is checked first (the statement it guards is
 * not visible in this view; presumably the function returns early).
 * Otherwise /etc/resolv.conf is copied to <dest>/etc/resolv.conf with
 * O_TRUNC|O_NOFOLLOW and mode 0644. A copy failure is logged at LOG_DEBUG
 * for -ELOOP and -EROFS and at LOG_WARNING for anything else. Afterwards
 * ownership is adjusted with userns_lchown(where, 0, 0); a failure there is
 * logged as a warning.
 *
 * NOTE(review): the return statements are not visible in this view. */
1096 static int setup_resolv_conf(const char *dest
) {
1097 const char *where
= NULL
;
1102 if (arg_private_network
)
1105 /* Fix resolv.conf, if possible */
1106 where
= prefix_roota(dest
, "/etc/resolv.conf");
1108 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1110 /* If the file already exists as symlink, let's
1111 * suppress the warning, under the assumption that
1112 * resolved or something similar runs inside and the
1113 * symlink points there.
1115 * If the disk image is read-only, there's also no
1116 * point in complaining.
1118 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1119 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1123 r
= userns_lchown(where
, 0, 0);
1125 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
/* Format a 128-bit ID in the dashed UUID layout.
 *
 * id: the ID to format
 * s:  caller-provided buffer declared as char[37]
 *
 * The format string produces 32 lowercase hex digits grouped 8-4-4-4-12
 * with four dashes, i.e. 36 characters.
 *
 * NOTE(review): the formatting call itself and the return statement are not
 * visible in this view. */
1130 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1134 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1135 SD_ID128_FORMAT_VAL(id
));
/* Give the container its own random boot ID.
 *
 * dest: root directory of the container tree
 *
 * A random 128-bit ID is generated with sd_id128_randomize(), formatted as a
 * UUID and written to <dest>/run/proc-sys-kernel-random-boot-id. That file
 * is then bind mounted over <dest>/proc/sys/kernel/random/boot_id and the
 * bind mount is remounted read-only (with MS_NOSUID|MS_NODEV). A failing
 * read-only remount is only warned about; a failing bind mount is recorded
 * as an error in r.
 *
 * The arg_share_system flag is checked first (the statement it guards is not
 * visible in this view).
 *
 * NOTE(review): the declaration of as_uuid, any cleanup of the temporary
 * file and the final return are not visible in this view. */
1140 static int setup_boot_id(const char *dest
) {
1141 const char *from
, *to
;
1142 sd_id128_t rnd
= {};
1146 if (arg_share_system
)
1149 /* Generate a new randomized boot ID, so that each boot-up of
1150 * the container gets a new one */
1152 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1153 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1155 r
= sd_id128_randomize(&rnd
);
1157 return log_error_errno(r
, "Failed to generate random boot id: %m");
1159 id128_format_as_uuid(rnd
, as_uuid
);
1161 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1163 return log_error_errno(r
, "Failed to write boot id: %m");
1165 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1166 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1167 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1168 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
/* Replicate a fixed list of host device nodes inside the container's /dev.
 *
 * dest: root directory of the container tree
 *
 * /dev/net is created first. Then, for every name in the NUL-separated
 * "devnodes" list, the host node /dev/<name> is stat()ed; it must be a
 * character or block device. The node is recreated under dest with mknod()
 * using the host's st_mode and st_rdev. If mknod() is not permitted the
 * code falls back to creating a placeholder and bind mounting the host node
 * over it. Finally ownership is adjusted with userns_lchown(to, 0, 0).
 *
 * NOTE(review): the contents of the devnodes list, the handling of ENOENT
 * from stat(), the errno test selecting the bind-mount fallback and the
 * final return are not visible in this view. */
1174 static int copy_devnodes(const char *dest
) {
1176 static const char devnodes
[] =
1187 _cleanup_umask_ mode_t u
;
1193 /* Create /dev/net, so that we can create /dev/net/tun in it */
/* NOTE(review): the result of userns_mkdir() is not stored in r in the
 * visible lines, yet r is what gets passed to log_error_errno() -- confirm
 * that r carries the real error here. */
1194 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1195 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1197 NULSTR_FOREACH(d
, devnodes
) {
1198 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1201 from
= strappend("/dev/", d
);
1202 to
= prefix_root(dest
, from
);
1204 if (stat(from
, &st
) < 0) {
1206 if (errno
!= ENOENT
)
1207 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1209 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1211 log_error("%s is not a char or block device, cannot copy.", from
);
1215 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1217 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1219 /* Some systems abusively restrict mknod but
1220 * allow bind mounts. */
1223 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1224 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1225 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1228 r
= userns_lchown(to
, 0, 0);
1230 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
/* Mount a private devpts instance in the container and wire up /dev/ptmx.
 *
 * dest: root directory of the container tree
 *
 * Steps, in order:
 *  1. build the devpts mount options ("newinstance,ptmxmode=0666,mode=620,
 *     gid=<arg_uid_shift + TTY_GID>"), adding a context="..." option when
 *     arg_selinux_apifs_context is set;
 *  2. create <dest>/dev/pts, mount devpts there with MS_NOSUID|MS_NOEXEC
 *     and chown it via userns_lchown(p, 0, 0);
 *  3. create the <dest>/dev/ptmx -> pts/ptmx symlink and chown it;
 *  4. chown <dest>/dev/pts/ptmx.
 *
 * Each failing step is logged with log_error_errno() and that value is
 * returned.
 *
 * NOTE(review): the return value of asprintf() is discarded; whether
 * "options" is checked for NULL afterwards is not visible in this view. */
1237 static int setup_pts(const char *dest
) {
1238 _cleanup_free_
char *options
= NULL
;
1243 if (arg_selinux_apifs_context
)
1244 (void) asprintf(&options
,
1245 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1246 arg_uid_shift
+ TTY_GID
,
1247 arg_selinux_apifs_context
);
1250 (void) asprintf(&options
,
1251 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1252 arg_uid_shift
+ TTY_GID
);
1257 /* Mount /dev/pts itself */
1258 p
= prefix_roota(dest
, "/dev/pts");
1259 if (mkdir(p
, 0755) < 0)
1260 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1261 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1262 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1263 r
= userns_lchown(p
, 0, 0);
1265 return log_error_errno(r
, "Failed to chown /dev/pts: %m");
1267 /* Create /dev/ptmx symlink */
1268 p
= prefix_roota(dest
, "/dev/ptmx");
1269 if (symlink("pts/ptmx", p
) < 0)
1270 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1271 r
= userns_lchown(p
, 0, 0);
1273 return log_error_errno(r
, "Failed to chown /dev/ptmx: %m");
1275 /* And fix /dev/pts/ptmx ownership */
1276 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1277 r
= userns_lchown(p
, 0, 0);
1279 return log_error_errno(r
, "Failed to chown /dev/pts/ptmx: %m");
/* Expose the given TTY as /dev/console inside the container.
 *
 * dest:    root directory of the container tree
 * console: path of the TTY on the host side
 *
 * The TTY is first set to mode 0600 with owner and group arg_uid_shift via
 * chmod_and_chown(). Then a placeholder is created at <dest>/dev/console
 * and the TTY is bind mounted over it. Each failure is logged with
 * log_error_errno() and that value is returned.
 *
 * NOTE(review): the call that creates the placeholder file and the final
 * return are not visible in this view. */
1284 static int setup_dev_console(const char *dest
, const char *console
) {
1285 _cleanup_umask_ mode_t u
;
1294 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1296 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1298 /* We need to bind mount the right tty to /dev/console since
1299 * ptys can only exist on pts file systems. To have something
1300 * to bind mount things on we create a empty regular file. */
1302 to
= prefix_roota(dest
, "/dev/console");
1305 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1307 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1308 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
/* Provide a FIFO-backed /proc/kmsg inside the container.
 *
 * dest:        root directory of the container tree
 * kmsg_socket: socket over which the FIFO's file descriptor is handed off;
 *              asserted to be >= 0
 *
 * A FIFO is created at <dest>/run/kmsg with mode 0600 and bind mounted onto
 * <dest>/proc/kmsg. It is then opened O_RDWR|O_NDELAY|O_CLOEXEC and the
 * descriptor is sent through kmsg_socket with send_one_fd() so that it stays
 * open while the child runs. Finally the /run/kmsg name is unlinked. Each
 * failing step is logged with log_error_errno() and that value is returned.
 *
 * NOTE(review): closing of the local fd and the final return are not
 * visible in this view. */
1313 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1314 const char *from
, *to
;
1315 _cleanup_umask_ mode_t u
;
1318 assert(kmsg_socket
>= 0);
1322 /* We create the kmsg FIFO as /run/kmsg, but immediately
1323 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1324 * on the reading side behave very similar to /proc/kmsg,
1325 * their writing side behaves differently from /dev/kmsg in
1326 * that writing blocks when nothing is reading. In order to
1327 * avoid any problems with containers deadlocking due to this
1328 * we simply make /dev/kmsg unavailable to the container. */
1329 from
= prefix_roota(dest
, "/run/kmsg");
1330 to
= prefix_roota(dest
, "/proc/kmsg");
1332 if (mkfifo(from
, 0600) < 0)
1333 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1334 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1335 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1337 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1339 return log_error_errno(errno
, "Failed to open fifo: %m");
1341 /* Store away the fd in the socket, so that it stays open as
1342 * long as we run the child */
1343 r
= send_one_fd(kmsg_socket
, fd
, 0);
1347 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1349 /* And now make the FIFO unavailable as /run/kmsg... */
1350 (void) unlink(from
);
/* Netlink message callback that re-applies the exposed-port configuration.
 *
 * rtnl:     the netlink connection, passed on to expose_port_execute()
 * m:        the received message (not used in the visible lines)
 * userdata: interpreted as a union in_addr_union pointer ("exposed") and
 *           passed on to expose_port_execute() together with
 *           arg_expose_ports
 *
 * NOTE(review): any assertions and the return statement are not visible in
 * this view. */
1355 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1356 union in_addr_union
*exposed
= userdata
;
1362 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
/* Set the hostname to arg_machine via sethostname_idempotent().
 *
 * The arg_share_system flag is checked first.
 *
 * NOTE(review): the statements guarded by both conditions, and the return
 * values, are not visible in this view. */
1366 static int setup_hostname(void) {
1368 if (arg_share_system
)
1371 if (sethostname_idempotent(arg_machine
) < 0)
1377 static int setup_journal(const char *directory
) {
1378 sd_id128_t machine_id
, this_id
;
1379 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1380 const char *etc_machine_id
, *p
, *q
;
1385 /* Don't link journals in ephemeral mode */
1389 if (arg_link_journal
== LINK_NO
)
1392 try = arg_link_journal_try
|| arg_link_journal
== LINK_AUTO
;
1394 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1396 r
= read_one_line_file(etc_machine_id
, &b
);
1397 if (r
== -ENOENT
&& try)
1400 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1403 if (isempty(id
) && try)
1406 /* Verify validity */
1407 r
= sd_id128_from_string(id
, &machine_id
);
1409 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1411 r
= sd_id128_get_machine(&this_id
);
1413 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1415 if (sd_id128_equal(machine_id
, this_id
)) {
1416 log_full(try ? LOG_WARNING
: LOG_ERR
,
1417 "Host and machine ids are equal (%s): refusing to link journals", id
);
1423 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1425 return log_error_errno(r
, "Failed to create /var: %m");
1427 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1429 return log_error_errno(r
, "Failed to create /var/log: %m");
1431 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1433 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1435 p
= strjoina("/var/log/journal/", id
);
1436 q
= prefix_roota(directory
, p
);
1438 if (path_is_mount_point(p
, 0) > 0) {
1442 log_error("%s: already a mount point, refusing to use for journal", p
);
1446 if (path_is_mount_point(q
, 0) > 0) {
1450 log_error("%s: already a mount point, refusing to use for journal", q
);
1454 r
= readlink_and_make_absolute(p
, &d
);
1456 if ((arg_link_journal
== LINK_GUEST
||
1457 arg_link_journal
== LINK_AUTO
) &&
1460 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1462 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1467 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1468 } else if (r
== -EINVAL
) {
1470 if (arg_link_journal
== LINK_GUEST
&&
1473 if (errno
== ENOTDIR
) {
1474 log_error("%s already exists and is neither a symlink nor a directory", p
);
1477 return log_error_errno(errno
, "Failed to remove %s: %m", p
);
1479 } else if (r
!= -ENOENT
)
1480 return log_error_errno(r
, "readlink(%s) failed: %m", p
);
1482 if (arg_link_journal
== LINK_GUEST
) {
1484 if (symlink(q
, p
) < 0) {
1486 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1489 return log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1492 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1494 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1498 if (arg_link_journal
== LINK_HOST
) {
1499 /* don't create parents here -- if the host doesn't have
1500 * a permanent journal set up, don't force it here */
1502 if (mkdir(p
, 0755) < 0 && errno
!= EEXIST
) {
1504 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1507 return log_error_errno(errno
, "Failed to create %s: %m", p
);
1510 } else if (access(p
, F_OK
) < 0)
1513 if (dir_is_empty(q
) == 0)
1514 log_warning("%s is not empty, proceeding anyway.", q
);
1516 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1518 return log_error_errno(r
, "Failed to create %s: %m", q
);
1520 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1521 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1526 static int drop_capabilities(void) {
1527 return capability_bounding_set_drop(arg_retain
, false);
1530 static int reset_audit_loginuid(void) {
1531 _cleanup_free_
char *p
= NULL
;
1534 if (arg_share_system
)
1537 r
= read_one_line_file("/proc/self/loginuid", &p
);
1541 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1543 /* Already reset? */
1544 if (streq(p
, "4294967295"))
1547 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1550 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1551 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1552 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1553 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1554 "using systemd-nspawn. Sleeping for 5s... (%m)");
1562 static int setup_seccomp(void) {
1565 static const struct {
1566 uint64_t capability
;
1569 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1570 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1571 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1572 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1573 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1574 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1575 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1576 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1577 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1578 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1581 scmp_filter_ctx seccomp
;
1585 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1589 r
= seccomp_add_secondary_archs(seccomp
);
1591 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
1595 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1596 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1599 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1601 continue; /* unknown syscall */
1603 log_error_errno(r
, "Failed to block syscall: %m");
1610 Audit is broken in containers, much of the userspace audit
1611 hookup will fail if running inside a container. We don't
1612 care and just turn off creation of audit sockets.
1614 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1615 with EAFNOSUPPORT which audit userspace uses as indication
1616 that audit is disabled in the kernel.
1619 r
= seccomp_rule_add(
1621 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1624 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1625 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1627 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1631 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1633 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1637 r
= seccomp_load(seccomp
);
1639 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1644 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1649 seccomp_release(seccomp
);
1657 static int setup_propagate(const char *root
) {
1661 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1662 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1663 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1664 (void) mkdir_p(p
, 0600);
1666 r
= userns_mkdir(root
, "/run/systemd", 0755, 0, 0);
1668 return log_error_errno(r
, "Failed to create /run/systemd: %m");
1670 r
= userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0);
1672 return log_error_errno(r
, "Failed to create /run/systemd/nspawn: %m");
1674 r
= userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1676 return log_error_errno(r
, "Failed to create /run/systemd/nspawn/incoming: %m");
1678 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1679 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1680 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1682 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1683 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1688 static int setup_image(char **device_path
, int *loop_nr
) {
1689 struct loop_info64 info
= {
1690 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1692 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1693 _cleanup_free_
char* loopdev
= NULL
;
1697 assert(device_path
);
1701 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1703 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1705 if (fstat(fd
, &st
) < 0)
1706 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1708 if (S_ISBLK(st
.st_mode
)) {
1711 p
= strdup(arg_image
);
1725 if (!S_ISREG(st
.st_mode
)) {
1726 log_error("%s is not a regular file or block device.", arg_image
);
1730 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1732 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1734 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1736 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1738 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1741 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1743 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1745 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1746 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1749 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1751 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1752 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1754 *device_path
= loopdev
;
1765 #define PARTITION_TABLE_BLURB \
1766 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1767 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1768 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1769 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1770 "to be bootable with systemd-nspawn."
1772 static int dissect_image(
1774 char **root_device
, bool *root_device_rw
,
1775 char **home_device
, bool *home_device_rw
,
1776 char **srv_device
, bool *srv_device_rw
,
1780 int home_nr
= -1, srv_nr
= -1;
1781 #ifdef GPT_ROOT_NATIVE
1784 #ifdef GPT_ROOT_SECONDARY
1785 int secondary_root_nr
= -1;
1787 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1788 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1789 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1790 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1791 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1792 struct udev_list_entry
*first
, *item
;
1793 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1794 bool is_gpt
, is_mbr
, multiple_generic
= false;
1795 const char *pttype
= NULL
;
1802 assert(root_device
);
1803 assert(home_device
);
1808 b
= blkid_new_probe();
1813 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1818 return log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1821 blkid_probe_enable_partitions(b
, 1);
1822 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1825 r
= blkid_do_safeprobe(b
);
1826 if (r
== -2 || r
== 1) {
1827 log_error("Failed to identify any partition table on\n"
1829 PARTITION_TABLE_BLURB
, arg_image
);
1831 } else if (r
!= 0) {
1834 return log_error_errno(errno
, "Failed to probe: %m");
1837 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1839 is_gpt
= streq_ptr(pttype
, "gpt");
1840 is_mbr
= streq_ptr(pttype
, "dos");
1842 if (!is_gpt
&& !is_mbr
) {
1843 log_error("No GPT or MBR partition table discovered on\n"
1845 PARTITION_TABLE_BLURB
, arg_image
);
1850 pl
= blkid_probe_get_partitions(b
);
1855 log_error("Failed to list partitions of %s", arg_image
);
1863 if (fstat(fd
, &st
) < 0)
1864 return log_error_errno(errno
, "Failed to stat block device: %m");
1866 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1874 log_error("Kernel partitions never appeared.");
1878 e
= udev_enumerate_new(udev
);
1882 r
= udev_enumerate_add_match_parent(e
, d
);
1886 r
= udev_enumerate_scan_devices(e
);
1888 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1890 /* Count the partitions enumerated by the kernel */
1892 first
= udev_enumerate_get_list_entry(e
);
1893 udev_list_entry_foreach(item
, first
)
1896 /* Count the partitions enumerated by blkid */
1897 m
= blkid_partlist_numof_partitions(pl
);
1901 log_error("blkid and kernel partition list do not match.");
1907 /* The kernel has probed fewer partitions than
1908 * blkid? Maybe the kernel prober is still
1909 * running or it got EBUSY because udev
1910 * already opened the device. Let's reprobe
1911 * the device, which is a synchronous call
1912 * that waits until probing is complete. */
1914 for (j
= 0; j
< 20; j
++) {
1916 r
= ioctl(fd
, BLKRRPART
, 0);
1919 if (r
>= 0 || r
!= -EBUSY
)
1922 /* If something else has the device
1923 * open, such as a udev rule, the
1924 * ioctl will return EBUSY. Since
1925 * there's no way to wait until it
1926 * isn't busy anymore, let's just wait
1927 * a bit, and try again.
1929 * This is really something they
1930 * should fix in the kernel! */
1932 usleep(50 * USEC_PER_MSEC
);
1936 return log_error_errno(r
, "Failed to reread partition table: %m");
1939 e
= udev_enumerate_unref(e
);
1942 first
= udev_enumerate_get_list_entry(e
);
1943 udev_list_entry_foreach(item
, first
) {
1944 _cleanup_udev_device_unref_
struct udev_device
*q
;
1946 unsigned long long flags
;
1952 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1957 return log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1960 qn
= udev_device_get_devnum(q
);
1964 if (st
.st_rdev
== qn
)
1967 node
= udev_device_get_devnode(q
);
1971 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1975 flags
= blkid_partition_get_flags(pp
);
1977 nr
= blkid_partition_get_partno(pp
);
1985 if (flags
& GPT_FLAG_NO_AUTO
)
1988 stype
= blkid_partition_get_type_string(pp
);
1992 if (sd_id128_from_string(stype
, &type_id
) < 0)
1995 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1997 if (home
&& nr
>= home_nr
)
2001 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2003 r
= free_and_strdup(&home
, node
);
2007 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
2009 if (srv
&& nr
>= srv_nr
)
2013 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2015 r
= free_and_strdup(&srv
, node
);
2019 #ifdef GPT_ROOT_NATIVE
2020 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
2022 if (root
&& nr
>= root_nr
)
2026 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2028 r
= free_and_strdup(&root
, node
);
2033 #ifdef GPT_ROOT_SECONDARY
2034 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
2036 if (secondary_root
&& nr
>= secondary_root_nr
)
2039 secondary_root_nr
= nr
;
2040 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2042 r
= free_and_strdup(&secondary_root
, node
);
2047 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2050 multiple_generic
= true;
2052 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2054 r
= free_and_strdup(&generic
, node
);
2060 } else if (is_mbr
) {
2063 if (flags
!= 0x80) /* Bootable flag */
2066 type
= blkid_partition_get_type(pp
);
2067 if (type
!= 0x83) /* Linux partition */
2071 multiple_generic
= true;
2075 r
= free_and_strdup(&root
, node
);
2083 *root_device
= root
;
2086 *root_device_rw
= root_rw
;
2088 } else if (secondary_root
) {
2089 *root_device
= secondary_root
;
2090 secondary_root
= NULL
;
2092 *root_device_rw
= secondary_root_rw
;
2094 } else if (generic
) {
2096 /* There were no partitions with precise meanings
2097 * around, but we found generic partitions. In this
2098 * case, if there's only one, we can go ahead and boot
2099 * it, otherwise we bail out, because we really cannot
2100 * make any sense of it. */
2102 if (multiple_generic
) {
2103 log_error("Identified multiple bootable Linux partitions on\n"
2105 PARTITION_TABLE_BLURB
, arg_image
);
2109 *root_device
= generic
;
2112 *root_device_rw
= generic_rw
;
2115 log_error("Failed to identify root partition in disk image\n"
2117 PARTITION_TABLE_BLURB
, arg_image
);
2122 *home_device
= home
;
2125 *home_device_rw
= home_rw
;
2132 *srv_device_rw
= srv_rw
;
2137 log_error("--image= is not supported, compiled without blkid support.");
2142 static int mount_device(const char *what
, const char *where
, const char *directory
, bool rw
) {
2144 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
2145 const char *fstype
, *p
;
2155 p
= strjoina(where
, directory
);
2160 b
= blkid_new_probe_from_filename(what
);
2164 return log_error_errno(errno
, "Failed to allocate prober for %s: %m", what
);
2167 blkid_probe_enable_superblocks(b
, 1);
2168 blkid_probe_set_superblocks_flags(b
, BLKID_SUBLKS_TYPE
);
2171 r
= blkid_do_safeprobe(b
);
2172 if (r
== -1 || r
== 1) {
2173 log_error("Cannot determine file system type of %s", what
);
2175 } else if (r
!= 0) {
2178 return log_error_errno(errno
, "Failed to probe %s: %m", what
);
2182 if (blkid_probe_lookup_value(b
, "TYPE", &fstype
, NULL
) < 0) {
2185 log_error("Failed to determine file system type of %s", what
);
2189 if (streq(fstype
, "crypto_LUKS")) {
2190 log_error("nspawn currently does not support LUKS disk images.");
2194 if (mount(what
, p
, fstype
, MS_NODEV
|(rw
? 0 : MS_RDONLY
), NULL
) < 0)
2195 return log_error_errno(errno
, "Failed to mount %s: %m", what
);
2199 log_error("--image= is not supported, compiled without blkid support.");
2204 static int mount_devices(
2206 const char *root_device
, bool root_device_rw
,
2207 const char *home_device
, bool home_device_rw
,
2208 const char *srv_device
, bool srv_device_rw
) {
2214 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2216 return log_error_errno(r
, "Failed to mount root directory: %m");
2220 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2222 return log_error_errno(r
, "Failed to mount home directory: %m");
2226 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2228 return log_error_errno(r
, "Failed to mount server data directory: %m");
2234 static void loop_remove(int nr
, int *image_fd
) {
2235 _cleanup_close_
int control
= -1;
2241 if (image_fd
&& *image_fd
>= 0) {
2242 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2244 log_debug_errno(errno
, "Failed to close loop image: %m");
2245 *image_fd
= safe_close(*image_fd
);
2248 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2250 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2254 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2256 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2261 * < 0 : wait_for_terminate() failed to get the state of the
2262 * container, the container was terminated by a signal, or
2263 * failed for an unknown reason. No change is made to the
2264 * container argument.
2265 * > 0 : The program executed in the container terminated with an
2266 * error. The exit code of the program executed in the
2267 * container is returned. The container argument has been set
2268 * to CONTAINER_TERMINATED.
2269 * 0 : The container is being rebooted, has been shut down or exited
2270 * successfully. The container argument has been set to either
2271 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2273 * That is, success is indicated by a return value of zero, and an
2274 * error is indicated by a non-zero value.
2276 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2280 r
= wait_for_terminate(pid
, &status
);
2282 return log_warning_errno(r
, "Failed to wait for container: %m");
2284 switch (status
.si_code
) {
2287 if (status
.si_status
== 0) {
2288 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2291 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2293 *container
= CONTAINER_TERMINATED
;
2294 return status
.si_status
;
2297 if (status
.si_status
== SIGINT
) {
2299 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2300 *container
= CONTAINER_TERMINATED
;
2303 } else if (status
.si_status
== SIGHUP
) {
2305 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2306 *container
= CONTAINER_REBOOTED
;
2310 /* CLD_KILLED fallthrough */
2313 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2317 log_error("Container %s failed due to unknown reason.", arg_machine
);
2324 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2327 pid
= PTR_TO_PID(userdata
);
2329 if (kill(pid
, arg_kill_signal
) >= 0) {
2330 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2331 sd_event_source_set_userdata(s
, NULL
);
2336 sd_event_exit(sd_event_source_get_event(s
), 0);
2340 static int determine_names(void) {
2343 if (arg_template
&& !arg_directory
&& arg_machine
) {
2345 /* If --template= was specified then we should not
2346 * search for a machine, but instead create a new one
2347 * in /var/lib/machines. */
2349 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2354 if (!arg_image
&& !arg_directory
) {
2356 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2358 r
= image_find(arg_machine
, &i
);
2360 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2362 log_error("No image for machine '%s': %m", arg_machine
);
2366 if (i
->type
== IMAGE_RAW
)
2367 r
= free_and_strdup(&arg_image
, i
->path
);
2369 r
= free_and_strdup(&arg_directory
, i
->path
);
2371 return log_error_errno(r
, "Invalid image directory: %m");
2374 arg_read_only
= arg_read_only
|| i
->read_only
;
2376 arg_directory
= get_current_dir_name();
2378 if (!arg_directory
&& !arg_machine
) {
2379 log_error("Failed to determine path, please use -D or -i.");
2385 if (arg_directory
&& path_equal(arg_directory
, "/"))
2386 arg_machine
= gethostname_malloc();
2388 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2393 hostname_cleanup(arg_machine
);
2394 if (!machine_name_is_valid(arg_machine
)) {
2395 log_error("Failed to determine machine name automatically, please use -M.");
2399 if (arg_ephemeral
) {
2402 /* Add a random suffix when this is an
2403 * ephemeral machine, so that we can run many
2404 * instances at once without manually having
2405 * to specify -M each time. */
2407 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
2418 static int determine_uid_shift(const char *directory
) {
2426 if (arg_uid_shift
== UID_INVALID
) {
2429 r
= stat(directory
, &st
);
2431 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2433 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2435 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2436 log_error("UID and GID base of %s don't match.", directory
);
2440 arg_uid_range
= UINT32_C(0x10000);
2443 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2444 log_error("UID base too high for UID range.");
2448 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
2452 static int inner_child(
2454 const char *directory
,
2460 _cleanup_free_
char *home
= NULL
;
2462 const char *envp
[] = {
2463 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2464 NULL
, /* container */
2469 NULL
, /* container_uuid */
2470 NULL
, /* LISTEN_FDS */
2471 NULL
, /* LISTEN_PID */
2475 _cleanup_strv_free_
char **env_use
= NULL
;
2480 assert(kmsg_socket
>= 0);
2485 /* Tell the parent that it can now write the UID map. */
2486 (void) barrier_place(barrier
); /* #1 */
2488 /* Wait until the parent has written the UID map */
2489 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2490 log_error("Parent died too early");
2495 r
= mount_all(NULL
, arg_userns
, true, arg_uid_shift
, arg_private_network
, arg_uid_range
, arg_selinux_apifs_context
);
2499 r
= mount_sysfs(NULL
);
2503 /* Wait until we are cgroup-ified, so that we
2504 * can mount the right cgroup path writable */
2505 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2506 log_error("Parent died too early");
2510 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2514 r
= reset_uid_gid();
2516 return log_error_errno(r
, "Couldn't become new root: %m");
2518 r
= setup_boot_id(NULL
);
2522 r
= setup_kmsg(NULL
, kmsg_socket
);
2525 kmsg_socket
= safe_close(kmsg_socket
);
2530 return log_error_errno(errno
, "setsid() failed: %m");
2532 if (arg_private_network
)
2535 if (arg_expose_ports
) {
2536 r
= expose_port_send_rtnl(rtnl_socket
);
2539 rtnl_socket
= safe_close(rtnl_socket
);
2542 r
= drop_capabilities();
2544 return log_error_errno(r
, "drop_capabilities() failed: %m");
2548 if (arg_personality
!= PERSONALITY_INVALID
) {
2549 if (personality(arg_personality
) < 0)
2550 return log_error_errno(errno
, "personality() failed: %m");
2551 } else if (secondary
) {
2552 if (personality(PER_LINUX32
) < 0)
2553 return log_error_errno(errno
, "personality() failed: %m");
2557 if (arg_selinux_context
)
2558 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2559 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2562 r
= change_uid_gid(arg_user
, &home
);
2566 /* LXC sets container=lxc, so follow the scheme here */
2567 envp
[n_env
++] = strjoina("container=", arg_container_service_name
);
2569 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2573 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2574 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2575 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2578 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2581 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2585 if (fdset_size(fds
) > 0) {
2586 r
= fdset_cloexec(fds
, false);
2588 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2590 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2591 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2595 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2599 /* Let the parent know that we are ready and
2600 * wait until the parent is ready with the
2602 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2603 log_error("Parent died too early");
2608 if (chdir(arg_chdir
) < 0)
2609 return log_error_errno(errno
, "Failed to change to specified working directory %s: %m", arg_chdir
);
2611 if (arg_start_mode
== START_PID2
) {
2617 /* Now, explicitly close the log, so that we
2618 * then can close all remaining fds. Closing
2619 * the log explicitly first has the benefit
2620 * that the logging subsystem knows about it,
2621 * and is thus ready to be reopened should we
2622 * need it again. Note that the other fds
2623 * closed here are at least the locking and
2626 (void) fdset_close_others(fds
);
2628 if (arg_start_mode
== START_BOOT
) {
2632 /* Automatically search for the init system */
2634 m
= strv_length(arg_parameters
);
2635 a
= newa(char*, m
+ 2);
2636 memcpy_safe(a
+ 1, arg_parameters
, m
* sizeof(char*));
2639 a
[0] = (char*) "/usr/lib/systemd/systemd";
2640 execve(a
[0], a
, env_use
);
2642 a
[0] = (char*) "/lib/systemd/systemd";
2643 execve(a
[0], a
, env_use
);
2645 a
[0] = (char*) "/sbin/init";
2646 execve(a
[0], a
, env_use
);
2647 } else if (!strv_isempty(arg_parameters
))
2648 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2651 chdir(home
?: "/root");
2653 execle("/bin/bash", "-bash", NULL
, env_use
);
2654 execle("/bin/sh", "-sh", NULL
, env_use
);
2659 return log_error_errno(r
, "execv() failed: %m");
2662 static int outer_child(
2664 const char *directory
,
2665 const char *console
,
2666 const char *root_device
, bool root_device_rw
,
2667 const char *home_device
, bool home_device_rw
,
2668 const char *srv_device
, bool srv_device_rw
,
2674 int uid_shift_socket
,
2684 assert(pid_socket
>= 0);
2685 assert(kmsg_socket
>= 0);
2689 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2690 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
2693 close_nointr(STDIN_FILENO
);
2694 close_nointr(STDOUT_FILENO
);
2695 close_nointr(STDERR_FILENO
);
2697 r
= open_terminal(console
, O_RDWR
);
2698 if (r
!= STDIN_FILENO
) {
2704 return log_error_errno(r
, "Failed to open console: %m");
2707 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2708 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2709 return log_error_errno(errno
, "Failed to duplicate console: %m");
2712 r
= reset_audit_loginuid();
2716 /* Mark everything as slave, so that we still
2717 * receive mounts from the real root, but don't
2718 * propagate mounts to the real root. */
2719 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2720 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2722 r
= mount_devices(directory
,
2723 root_device
, root_device_rw
,
2724 home_device
, home_device_rw
,
2725 srv_device
, srv_device_rw
);
2729 r
= determine_uid_shift(directory
);
2734 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2736 return log_error_errno(errno
, "Failed to send UID shift: %m");
2737 if (l
!= sizeof(arg_uid_shift
)) {
2738 log_error("Short write while sending UID shift.");
2743 /* Turn directory into bind mount */
2744 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2745 return log_error_errno(errno
, "Failed to make bind mount: %m");
2747 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2751 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2755 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2759 if (arg_read_only
) {
2760 r
= bind_remount_recursive(directory
, true);
2762 return log_error_errno(r
, "Failed to make tree read-only: %m");
2765 r
= mount_all(directory
, arg_userns
, false, arg_private_network
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2769 r
= copy_devnodes(directory
);
2773 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2775 r
= setup_pts(directory
);
2779 r
= setup_propagate(directory
);
2783 r
= setup_dev_console(directory
, console
);
2787 r
= setup_seccomp();
2791 r
= setup_timezone(directory
);
2795 r
= setup_resolv_conf(directory
);
2799 r
= setup_journal(directory
);
2803 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2807 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2811 r
= mount_move_root(directory
);
2813 return log_error_errno(r
, "Failed to move root directory: %m");
2815 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2816 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2817 (arg_private_network
? CLONE_NEWNET
: 0) |
2818 (arg_userns
? CLONE_NEWUSER
: 0),
2821 return log_error_errno(errno
, "Failed to fork inner child: %m");
2823 pid_socket
= safe_close(pid_socket
);
2824 uid_shift_socket
= safe_close(uid_shift_socket
);
2826 /* The inner child has all namespaces that are
2827 * requested, so that we all are owned by the user if
2828 * user namespaces are turned on. */
2830 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2832 _exit(EXIT_FAILURE
);
2834 _exit(EXIT_SUCCESS
);
2837 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2839 return log_error_errno(errno
, "Failed to send PID: %m");
2840 if (l
!= sizeof(pid
)) {
2841 log_error("Short write while sending PID.");
2845 pid_socket
= safe_close(pid_socket
);
2846 kmsg_socket
= safe_close(kmsg_socket
);
2847 rtnl_socket
= safe_close(rtnl_socket
);
2852 static int setup_uid_map(pid_t pid
) {
2853 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2858 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2859 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2860 r
= write_string_file(uid_map
, line
, 0);
2862 return log_error_errno(r
, "Failed to write UID map: %m");
2864 /* We always assign the same UID and GID ranges */
2865 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2866 r
= write_string_file(uid_map
, line
, 0);
2868 return log_error_errno(r
, "Failed to write GID map: %m");
/* Locates the settings file named "<arg_machine>.nspawn", parses it with
 * settings_load() and copies its values into the arg_* globals. A setting is
 * only taken over when its bit is not set in arg_settings_mask.
 *
 * Search order: /etc/systemd/nspawn and /run/systemd/nspawn first, then a
 * file in the same directory as arg_image or arg_directory. While
 * arg_settings_trusted is still negative it is set to true for the first two
 * locations and to false for a file found next to the image/directory.
 *
 * NOTE(review): several branch lines (early returns, the else-arms that
 * follow the "not trusted" warnings, closing braces) are not visible in this
 * extract; confirm the exact control flow against the complete source. */
2873 static int load_settings(void) {
2874 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2875 _cleanup_fclose_
FILE *f
= NULL
;
2876 _cleanup_free_
char *p
= NULL
;
2880 /* If all settings are masked, there's no point in looking for
2881 * the settings file */
2882 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* fn is the bare file name that is looked up in each candidate directory. */
2885 fn
= strjoina(arg_machine
, ".nspawn");
2887 /* We first look in the admin's directories in /etc and /run */
2888 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2889 _cleanup_free_
char *j
= NULL
;
2891 j
= strjoin(i
, "/", fn
, NULL
);
2900 /* By default, we trust configuration from /etc and /run */
2901 if (arg_settings_trusted
< 0)
2902 arg_settings_trusted
= true;
/* A missing file (ENOENT) is not treated as an error here; other errno values are reported. */
2907 if (errno
!= ENOENT
)
2908 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2912 /* After that, let's look for a file next to the
2913 * actual image we shall boot. */
2916 p
= file_in_same_dir(arg_image
, fn
);
2919 } else if (arg_directory
) {
2920 p
= file_in_same_dir(arg_directory
, fn
);
2927 if (!f
&& errno
!= ENOENT
)
2928 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2930 /* By default, we do not trust configuration from /var/lib/machines */
2931 if (arg_settings_trusted
< 0)
2932 arg_settings_trusted
= false;
2939 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
2941 r
= settings_load(f
, p
, &settings
);
2945 /* Copy over bits from the settings, unless they have been
2946 * explicitly masked by command line switches. */
2948 if ((arg_settings_mask
& SETTING_START_MODE
) == 0 &&
2949 settings
->start_mode
>= 0) {
2950 arg_start_mode
= settings
->start_mode
;
/* The parameter list is handed over to arg_parameters: the old list is
 * freed and the settings field is cleared after the pointer is copied. */
2952 strv_free(arg_parameters
);
2953 arg_parameters
= settings
->parameters
;
2954 settings
->parameters
= NULL
;
2957 if ((arg_settings_mask
& SETTING_WORKING_DIRECTORY
) == 0 &&
2958 settings
->working_directory
) {
2960 arg_chdir
= settings
->working_directory
;
2961 settings
->working_directory
= NULL
;
2964 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2965 settings
->environment
) {
2966 strv_free(arg_setenv
);
2967 arg_setenv
= settings
->environment
;
2968 settings
->environment
= NULL
;
2971 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2974 arg_user
= settings
->user
;
2975 settings
->user
= NULL
;
2978 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
/* plus starts as the capability mask from the file; the CAP_NET_ADMIN bit
 * is added when settings_private_network() returns true for the settings. */
2981 plus
= settings
->capability
;
2982 if (settings_private_network(settings
))
2983 plus
|= (1ULL << CAP_NET_ADMIN
);
2985 if (!arg_settings_trusted
&& plus
!= 0) {
2986 if (settings
->capability
!= 0)
2987 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
/* Bits listed in drop_capability are cleared from the retained set. */
2991 arg_retain
&= ~settings
->drop_capability
;
2994 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2995 settings
->kill_signal
> 0)
2996 arg_kill_signal
= settings
->kill_signal
;
2998 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2999 settings
->personality
!= PERSONALITY_INVALID
)
3000 arg_personality
= settings
->personality
;
3002 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
3003 !sd_id128_is_null(settings
->machine_id
)) {
3005 if (!arg_settings_trusted
)
3006 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
3008 arg_uuid
= settings
->machine_id
;
3011 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
3012 settings
->read_only
>= 0)
3013 arg_read_only
= settings
->read_only
;
3015 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
3016 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
3017 arg_volatile_mode
= settings
->volatile_mode
;
3019 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
3020 settings
->n_custom_mounts
> 0) {
3022 if (!arg_settings_trusted
)
3023 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
/* The existing custom mounts are released, then array and count are taken
 * from the settings object, whose fields are reset to NULL and 0. */
3025 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3026 arg_custom_mounts
= settings
->custom_mounts
;
3027 arg_n_custom_mounts
= settings
->n_custom_mounts
;
3029 settings
->custom_mounts
= NULL
;
3030 settings
->n_custom_mounts
= 0;
/* The network block is entered when at least one network-related field of
 * the settings object is set. */
3034 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
3035 (settings
->private_network
>= 0 ||
3036 settings
->network_veth
>= 0 ||
3037 settings
->network_bridge
||
3038 settings
->network_interfaces
||
3039 settings
->network_macvlan
||
3040 settings
->network_ipvlan
||
3041 settings
->network_veth_extra
)) {
3043 if (!arg_settings_trusted
)
3044 log_warning("Ignoring network settings, file %s is not trusted.", p
);
3046 arg_network_veth
= settings_network_veth(settings
);
3047 arg_private_network
= settings_private_network(settings
);
/* Each list below follows the same pattern: free the current arg_* value,
 * copy the pointer from the settings object, clear the settings field. */
3049 strv_free(arg_network_interfaces
);
3050 arg_network_interfaces
= settings
->network_interfaces
;
3051 settings
->network_interfaces
= NULL
;
3053 strv_free(arg_network_macvlan
);
3054 arg_network_macvlan
= settings
->network_macvlan
;
3055 settings
->network_macvlan
= NULL
;
3057 strv_free(arg_network_ipvlan
);
3058 arg_network_ipvlan
= settings
->network_ipvlan
;
3059 settings
->network_ipvlan
= NULL
;
3061 strv_free(arg_network_veth_extra
);
3062 arg_network_veth_extra
= settings
->network_veth_extra
;
3063 settings
->network_veth_extra
= NULL
;
3065 free(arg_network_bridge
);
3066 arg_network_bridge
= settings
->network_bridge
;
3067 settings
->network_bridge
= NULL
;
3071 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3072 settings
->expose_ports
) {
3074 if (!arg_settings_trusted
)
3075 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3077 expose_port_free_all(arg_expose_ports
);
3078 arg_expose_ports
= settings
->expose_ports
;
3079 settings
->expose_ports
= NULL
;
/* Program entry point. The visible steps, in order: parse the command line,
 * require root, determine names, load the settings file, verify arguments,
 * prepare the container tree (directory, ephemeral snapshot, template or disk
 * image), allocate a pseudo tty, clone the outer child, receive the inner
 * child's PID (and the UID shift) over socket pairs, set up networking,
 * machine registration and cgroups, run the event loop with PTY forwarding,
 * wait for the container, and finally release all resources.
 * Returns EXIT_FAILURE when r is negative at the end, otherwise ret.
 * NOTE(review): most branch, goto and loop lines are not visible in this
 * extract; confirm the control flow against the complete source. */
3086 int main(int argc
, char *argv
[]) {
3088 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3089 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3090 _cleanup_close_
int master
= -1, image_fd
= -1;
3091 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3092 int r
, n_fd_passed
, loop_nr
= -1;
3093 char veth_name
[IFNAMSIZ
];
3094 bool secondary
= false, remove_subvol
= false;
3097 int ret
= EXIT_SUCCESS
;
3098 union in_addr_union exposed
= {};
3099 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3102 log_parse_environment();
3105 /* Make sure rename_process() in the stub init process can work */
3109 r
= parse_argv(argc
, argv
);
3113 if (geteuid() != 0) {
3114 log_error("Need to be root.");
3118 r
= determine_names();
3122 r
= load_settings();
3126 r
= verify_arguments();
/* When sd_listen_fds() reports passed-in file descriptors, gather them in 'fds'. */
3130 n_fd_passed
= sd_listen_fds(false);
3131 if (n_fd_passed
> 0) {
3132 r
= fdset_new_listen_fds(&fds
, false);
3134 log_error_errno(r
, "Failed to collect file descriptors: %m");
3139 if (arg_directory
) {
3142 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3143 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3148 if (arg_ephemeral
) {
3149 _cleanup_free_
char *np
= NULL
;
3151 /* If the specified path is a mount point we
3152 * generate the new snapshot immediately
3153 * inside it under a random name. However, if
3154 * the specified path is not a mount point we
3155 * create the new snapshot in the parent
3156 * directory, just next to it. */
3157 r
= path_is_mount_point(arg_directory
, 0);
3159 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3163 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3165 r
= tempfn_random(arg_directory
, "machine.", &np
);
3167 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* The lock is shared for read-only use and exclusive otherwise, always non-blocking. */
3171 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3173 log_error_errno(r
, "Failed to lock %s: %m", np
);
3177 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3179 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3183 free(arg_directory
);
/* Flag consulted during cleanup, where btrfs_subvol_remove() is called on arg_directory. */
3187 remove_subvol
= true;
3190 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3192 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3196 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
3201 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3204 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3206 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3210 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
3215 if (arg_start_mode
== START_BOOT
) {
3216 if (path_is_os_tree(arg_directory
) <= 0) {
3217 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3224 p
= strjoina(arg_directory
, "/usr/");
3225 if (laccess(p
, F_OK
) < 0) {
3226 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
/* Name pattern for the temporary directory created with mkdtemp() below;
 * a copy of the resulting path becomes arg_directory. */
3233 char template[] = "/tmp/nspawn-root-XXXXXX";
3236 assert(!arg_template
);
3238 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3240 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3244 r
= log_error_errno(r
, "Failed to create image lock: %m");
3248 if (!mkdtemp(template)) {
3249 log_error_errno(errno
, "Failed to create temporary directory: %m");
3254 arg_directory
= strdup(template);
3255 if (!arg_directory
) {
3260 image_fd
= setup_image(&device_path
, &loop_nr
);
3266 r
= dissect_image(image_fd
,
3267 &root_device
, &root_device_rw
,
3268 &home_device
, &home_device_rw
,
3269 &srv_device
, &srv_device_rw
,
3275 r
= custom_mounts_prepare();
3280 isatty(STDIN_FILENO
) > 0 &&
3281 isatty(STDOUT_FILENO
) > 0;
/* Allocate the pseudo tty master, look up the name of its slave side and unlock it. */
3283 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3285 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3289 r
= ptsname_malloc(master
, &console
);
3291 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3295 if (arg_selinux_apifs_context
) {
3296 r
= mac_selinux_apply(console
, arg_selinux_apifs_context
);
3301 if (unlockpt(master
) < 0) {
3302 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3307 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3308 arg_machine
, arg_image
?: arg_directory
);
3310 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3312 assert_se(sigemptyset(&mask_chld
) == 0);
3313 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3315 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3316 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* One socket pair per kind of data exchanged with the child processes:
 * the child side closes index 0, the parent side closes index 1 (see below). */
3321 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 }, uid_shift_socket_pair
[2] = { -1, -1 };
3322 ContainerStatus container_status
;
3323 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3324 static const struct sigaction sa
= {
3325 .sa_handler
= nop_signal_handler
,
3326 .sa_flags
= SA_NOCLDSTOP
,
3330 _cleanup_(sd_event_unrefp
) sd_event
*event
= NULL
;
3331 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3332 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
3335 r
= barrier_create(&barrier
);
3337 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3341 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3342 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3346 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3347 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3351 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3352 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3357 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3358 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3362 /* Child can be killed before execv(), so handle SIGCHLD
3363 * in order to interrupt parent's blocking calls and
3364 * give it a chance to call wait() and terminate. */
3365 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3367 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3371 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3373 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
3377 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3379 if (errno
== EINVAL
)
3380 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3382 r
= log_error_errno(errno
, "clone() failed: %m");
3388 /* The outer child only has a file system namespace. */
3389 barrier_set_role(&barrier
, BARRIER_CHILD
);
3391 master
= safe_close(master
);
3393 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3394 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3395 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3396 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3398 (void) reset_all_signal_handlers();
3399 (void) reset_signal_mask();
3401 r
= outer_child(&barrier
,
3404 root_device
, root_device_rw
,
3405 home_device
, home_device_rw
,
3406 srv_device
, srv_device_rw
,
3410 kmsg_socket_pair
[1],
3411 rtnl_socket_pair
[1],
3412 uid_shift_socket_pair
[1],
3415 _exit(EXIT_FAILURE
);
3417 _exit(EXIT_SUCCESS
);
/* From here on the code runs with the parent role of the barrier. */
3420 barrier_set_role(&barrier
, BARRIER_PARENT
);
3422 fds
= fdset_free(fds
);
3424 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3425 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3426 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3427 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3429 /* Wait for the outer child. */
3430 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3439 /* And now retrieve the PID of the inner child. */
3440 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3442 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3445 if (l
!= sizeof(pid
)) {
3446 log_error("Short read while reading inner child PID.");
3451 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3454 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3455 log_error("Child died too early.");
3460 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3462 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3465 if (l
!= sizeof(arg_uid_shift
)) {
3466 log_error("Short read while reading UID shift.");
3471 r
= setup_uid_map(pid
);
3475 (void) barrier_place(&barrier
); /* #2 */
3478 if (arg_private_network
) {
3480 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3484 if (arg_network_veth
) {
3485 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3491 if (arg_network_bridge
) {
3492 r
= setup_bridge(veth_name
, arg_network_bridge
);
3500 r
= setup_veth_extra(arg_machine
, pid
, arg_network_veth_extra
);
3504 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3508 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
3514 r
= register_machine(
3521 arg_custom_mounts
, arg_n_custom_mounts
,
3525 arg_container_service_name
);
3530 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3534 if (arg_keep_unit
) {
3535 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3540 r
= chown_cgroup(pid
, arg_uid_shift
);
3544 /* Notify the child that the parent is ready with all
3545 * its setup (including cgroup-ification), and that
3546 * the child can now hand over control to the code to
3547 * run inside the container. */
3548 (void) barrier_place(&barrier
); /* #3 */
3550 /* Block SIGCHLD here, before notifying child.
3551 * process_pty() will handle it with the other signals. */
3552 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3554 /* Reset signal to default */
3555 r
= default_signals(SIGCHLD
, -1);
3557 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3561 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3562 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3563 log_error("Child died too early.");
3570 "STATUS=Container running.\n"
3571 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
3573 r
= sd_event_new(&event
);
3575 log_error_errno(r
, "Failed to get default event source: %m");
3579 if (arg_kill_signal
> 0) {
3580 /* Try to kill the init system on SIGINT or SIGTERM */
3581 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3582 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3584 /* Immediately exit */
3585 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3586 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3589 /* simply exit on sigchld */
3590 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3592 if (arg_expose_ports
) {
3593 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3597 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3600 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* The forwarder is created read-only unless 'interactive' is set. */
3602 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
3604 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3608 r
= sd_event_loop(event
);
3610 log_error_errno(r
, "Failed to run event loop: %m");
3614 pty_forward_get_last_char(forward
, &last_char
);
3616 forward
= pty_forward_free(forward
);
3618 if (!arg_quiet
&& last_char
!= '\n')
3621 /* Kill if it is not dead yet anyway */
3622 if (arg_register
&& !arg_keep_unit
)
3623 terminate_machine(pid
);
3625 /* Normally redundant, but better safe than sorry */
3628 r
= wait_for_container(pid
, &container_status
);
3632 /* We failed to wait for the container, or the
3633 * container exited abnormally */
3635 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
) {
3636 /* The container exited with a non-zero
3637 * status, or with zero status and no reboot
3643 /* CONTAINER_REBOOTED, loop again */
3645 if (arg_keep_unit
) {
3646 /* Special handling if we are running as a
3647 * service: instead of simply restarting the
3648 * machine we want to restart the entire
3649 * service, so let's inform systemd about this
3650 * with the special exit code 133. The service
3651 * file uses RestartForceExitStatus=133 so
3652 * that this results in a full nspawn
3653 * restart. This is necessary since we might
3654 * have cgroup parameters set we want to have
3661 expose_port_flush(arg_expose_ports
, &exposed
);
3667 "STATUS=Terminating...");
3672 /* Try to flush whatever is still queued in the pty */
3674 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
3676 loop_remove(loop_nr
, &image_fd
);
3678 if (remove_subvol
&& arg_directory
) {
3681 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
3683 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
/* Best-effort removal of the per-machine propagation directory; the result is ignored. */
3689 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3690 (void) rm_rf(p
, REMOVE_ROOT
);
3693 expose_port_flush(arg_expose_ports
, &exposed
);
/* Release the storage held by the arg_* globals. */
3695 free(arg_directory
);
3701 strv_free(arg_setenv
);
3702 free(arg_network_bridge
);
3703 strv_free(arg_network_interfaces
);
3704 strv_free(arg_network_macvlan
);
3705 strv_free(arg_network_ipvlan
);
3706 strv_free(arg_network_veth_extra
);
3707 strv_free(arg_parameters
);
3708 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3709 expose_port_free_all(arg_expose_ports
);
/* A negative r always maps to EXIT_FAILURE; otherwise the stored 'ret' is the exit status. */
3711 return r
< 0 ? EXIT_FAILURE
: ret
;