2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21 #include <blkid/blkid.h>
25 #include <linux/loop.h>
31 #include <selinux/selinux.h>
38 #include <sys/mount.h>
39 #include <sys/personality.h>
40 #include <sys/prctl.h>
41 #include <sys/types.h>
44 #include "sd-daemon.h"
47 #include "alloc-util.h"
49 #include "base-filesystem.h"
50 #include "blkid-util.h"
51 #include "btrfs-util.h"
53 #include "capability-util.h"
54 #include "cgroup-util.h"
56 #include "dev-setup.h"
61 #include "formats-util.h"
64 #include "hostname-util.h"
66 #include "loopback-setup.h"
67 #include "machine-image.h"
71 #include "mount-util.h"
72 #include "netlink-util.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-expose-ports.h"
75 #include "nspawn-mount.h"
76 #include "nspawn-network.h"
77 #include "nspawn-register.h"
78 #include "nspawn-settings.h"
79 #include "nspawn-setuid.h"
80 #include "nspawn-stub-pid1.h"
81 #include "parse-util.h"
82 #include "path-util.h"
83 #include "process-util.h"
85 #include "random-util.h"
88 #include "seccomp-util.h"
90 #include "signal-util.h"
91 #include "socket-util.h"
92 #include "stat-util.h"
93 #include "stdio-util.h"
94 #include "string-util.h"
96 #include "terminal-util.h"
97 #include "udev-util.h"
98 #include "umask-util.h"
99 #include "user-util.h"
/* Container status enumeration. Only the CONTAINER_TERMINATED member is
 * visible in this listing; the remaining members and the closing brace are
 * not shown here. */
102 typedef enum ContainerStatus
{
103 CONTAINER_TERMINATED
,
/* Journal linking mode enumeration. The members are not visible in this
 * listing; the code below uses LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST
 * as values of this type. */
107 typedef enum LinkJournal
{
/* File-scope option state. Each arg_* variable starts at the default given
 * here; parse_argv() below overwrites the ones that are set on the command
 * line. */
114 static char *arg_directory
= NULL
;
115 static char *arg_template
= NULL
;
116 static char *arg_chdir
= NULL
;
117 static char *arg_user
= NULL
;
118 static sd_id128_t arg_uuid
= {};
119 static char *arg_machine
= NULL
;
120 static const char *arg_selinux_context
= NULL
;
121 static const char *arg_selinux_apifs_context
= NULL
;
122 static const char *arg_slice
= NULL
;
123 static bool arg_private_network
= false;
124 static bool arg_read_only
= false;
125 static StartMode arg_start_mode
= START_PID1
;
126 static bool arg_ephemeral
= false;
127 static LinkJournal arg_link_journal
= LINK_AUTO
;
128 static bool arg_link_journal_try
= false;
/* Bit mask of capabilities (one bit per CAP_* number) retained by default.
 * parse_argv() later ORs in the --capability= set, adds CAP_NET_ADMIN when
 * private networking is enabled, and masks out the --drop-capability= set.
 * NOTE(review): the end of this initializer is not visible in this listing. */
129 static uint64_t arg_retain
=
130 (1ULL << CAP_CHOWN
) |
131 (1ULL << CAP_DAC_OVERRIDE
) |
132 (1ULL << CAP_DAC_READ_SEARCH
) |
133 (1ULL << CAP_FOWNER
) |
134 (1ULL << CAP_FSETID
) |
135 (1ULL << CAP_IPC_OWNER
) |
137 (1ULL << CAP_LEASE
) |
138 (1ULL << CAP_LINUX_IMMUTABLE
) |
139 (1ULL << CAP_NET_BIND_SERVICE
) |
140 (1ULL << CAP_NET_BROADCAST
) |
141 (1ULL << CAP_NET_RAW
) |
142 (1ULL << CAP_SETGID
) |
143 (1ULL << CAP_SETFCAP
) |
144 (1ULL << CAP_SETPCAP
) |
145 (1ULL << CAP_SETUID
) |
146 (1ULL << CAP_SYS_ADMIN
) |
147 (1ULL << CAP_SYS_CHROOT
) |
148 (1ULL << CAP_SYS_NICE
) |
149 (1ULL << CAP_SYS_PTRACE
) |
150 (1ULL << CAP_SYS_TTY_CONFIG
) |
151 (1ULL << CAP_SYS_RESOURCE
) |
152 (1ULL << CAP_SYS_BOOT
) |
153 (1ULL << CAP_AUDIT_WRITE
) |
154 (1ULL << CAP_AUDIT_CONTROL
) |
156 static CustomMount
*arg_custom_mounts
= NULL
;
157 static unsigned arg_n_custom_mounts
= 0;
158 static char **arg_setenv
= NULL
;
159 static bool arg_quiet
= false;
160 static bool arg_share_system
= false;
161 static bool arg_register
= true;
162 static bool arg_keep_unit
= false;
163 static char **arg_network_interfaces
= NULL
;
164 static char **arg_network_macvlan
= NULL
;
165 static char **arg_network_ipvlan
= NULL
;
166 static bool arg_network_veth
= false;
167 static char **arg_network_veth_extra
= NULL
;
168 static char *arg_network_bridge
= NULL
;
169 static unsigned long arg_personality
= PERSONALITY_INVALID
;
170 static char *arg_image
= NULL
;
171 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
172 static ExposePort
*arg_expose_ports
= NULL
;
173 static char **arg_property
= NULL
;
/* UID_INVALID as the shift means no explicit UID base was given; the default
 * range is 0x10000 IDs. */
174 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
175 static bool arg_userns
= false;
176 static int arg_kill_signal
= 0;
177 static bool arg_unified_cgroup_hierarchy
= false;
/* Bit mask of the settings that were given on the command line; parse_argv()
 * ORs a SETTING_* flag into it for each such option. */
178 static SettingsMask arg_settings_mask
= 0;
/* Tri-state: -1 here, and parse_argv() stores true, false or -1 depending on
 * the --settings= argument. */
179 static int arg_settings_trusted
= -1;
180 static char **arg_parameters
= NULL
;
/* Default service name; parse_argv() replaces it with the value of
 * $SYSTEMD_NSPAWN_CONTAINER_SERVICE (the guarding condition is not visible). */
181 static const char *arg_container_service_name
= "systemd-nspawn";
/* Print the usage text with printf(); the leading %s is filled in with
 * program_invocation_short_name.
 * NOTE(review): the "--bind-ro=PATH[:PATH[:OPTIONS]" line has an unbalanced
 * bracket, and two lines end in a trailing space before the newline. Several
 * continuation lines of the text are not visible in this listing. */
183 static void help(void) {
184 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
185 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
186 " -h --help Show this help\n"
187 " --version Print version string\n"
188 " -q --quiet Do not show status information\n"
189 " -D --directory=PATH Root directory for the container\n"
190 " --template=PATH Initialize root directory from template directory,\n"
192 " -x --ephemeral Run container with snapshot of root directory, and\n"
193 " remove it after exit\n"
194 " -i --image=PATH File system device or disk image for the container\n"
195 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
196 " -b --boot Boot up full system (i.e. invoke init)\n"
197 " --chdir=PATH Set working directory in the container\n"
198 " -u --user=USER Run the command under specified user or uid\n"
199 " -M --machine=NAME Set the machine name for the container\n"
200 " --uuid=UUID Set a specific machine UUID for the container\n"
201 " -S --slice=SLICE Place the container in the specified slice\n"
202 " --property=NAME=VALUE Set scope unit property\n"
203 " --private-users[=UIDBASE[:NUIDS]]\n"
204 " Run within user namespace\n"
205 " --private-network Disable network in container\n"
206 " --network-interface=INTERFACE\n"
207 " Assign an existing network interface to the\n"
209 " --network-macvlan=INTERFACE\n"
210 " Create a macvlan network interface based on an\n"
211 " existing network interface to the container\n"
212 " --network-ipvlan=INTERFACE\n"
213 " Create a ipvlan network interface based on an\n"
214 " existing network interface to the container\n"
215 " -n --network-veth Add a virtual Ethernet connection between host\n"
217 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
218 " Add an additional virtual Ethernet link between\n"
219 " host and container\n"
220 " --network-bridge=INTERFACE\n"
221 " Add a virtual Ethernet connection between host\n"
222 " and container and add it to an existing bridge on\n"
224 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
225 " Expose a container IP port on the host\n"
226 " -Z --selinux-context=SECLABEL\n"
227 " Set the SELinux security context to be used by\n"
228 " processes in the container\n"
229 " -L --selinux-apifs-context=SECLABEL\n"
230 " Set the SELinux security context to be used by\n"
231 " API/tmpfs file systems in the container\n"
232 " --capability=CAP In addition to the default, retain specified\n"
234 " --drop-capability=CAP Drop the specified capability from the default set\n"
235 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
236 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
237 " host, try-guest, try-host\n"
238 " -j Equivalent to --link-journal=try-guest\n"
239 " --read-only Mount the root directory read-only\n"
240 " --bind=PATH[:PATH[:OPTIONS]]\n"
241 " Bind mount a file or directory from the host into\n"
243 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
244 " Similar, but creates a read-only bind mount\n"
245 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
246 " --overlay=PATH[:PATH...]:PATH\n"
247 " Create an overlay mount from the host to \n"
249 " --overlay-ro=PATH[:PATH...]:PATH\n"
250 " Similar, but creates a read-only overlay mount\n"
251 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
252 " --share-system Share system namespaces with host\n"
253 " --register=BOOLEAN Register container as machine\n"
254 " --keep-unit Do not register a scope for the machine, reuse\n"
255 " the service unit nspawn is running in\n"
256 " --volatile[=MODE] Run the system in volatile mode\n"
257 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
258 , program_invocation_short_name
);
/* Prepare the arg_custom_mounts table for use: sort it with
 * custom_mount_compare, then walk every entry. A mount whose destination is
 * "/" is reported as an error when user namespacing is on and no explicit UID
 * shift was given. A work directory name derived from the mount source is
 * generated with tempfn_random() and stored in m->work_dir; the type check
 * suggests this only happens for CUSTOM_MOUNT_OVERLAY entries.
 * NOTE(review): the branch bodies and return statements are not visible in
 * this listing, so confirm which entries skip the work directory step. */
262 static int custom_mounts_prepare(void) {
266 /* Ensure the mounts are applied prefix first. */
267 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
269 /* Allocate working directories for the overlay file systems that need it */
270 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
271 CustomMount
*m
= &arg_custom_mounts
[i
];
273 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
274 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
278 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
287 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
289 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Set arg_unified_cgroup_hierarchy. The environment variable
 * $UNIFIED_CGROUP_HIERARCHY is read and parsed with parse_boolean(); per the
 * comments below, the host default is used otherwise. Parse and detection
 * failures are logged and returned through log_error_errno().
 * NOTE(review): the conditions selecting between the two paths and the call
 * that queries the host are not visible in this listing. */
295 static int detect_unified_cgroup_hierarchy(void) {
299 /* Allow the user to control whether the unified hierarchy is used */
300 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
302 r
= parse_boolean(e
);
304 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
306 arg_unified_cgroup_hierarchy
= r
;
310 /* Otherwise inherit the default from the host system */
313 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
315 arg_unified_cgroup_hierarchy
= r
;
/* Parse the command line into the arg_* globals.
 *
 * The options are read with getopt_long() using the long option table below
 * and the short option string "+hD:u:abL:M:jS:Z:qi:xp:n". For most options
 * a SETTING_* bit is also ORed into arg_settings_mask to record that the value
 * came from the command line.
 *
 * After the option loop the function rejects conflicting combinations (see the
 * log_error() messages), copies the remaining arguments (argv + optind) into
 * arg_parameters, applies the --settings= masking, folds the collected
 * capability sets into arg_retain, calls detect_unified_cgroup_hierarchy() and
 * reads $SYSTEMD_NSPAWN_CONTAINER_SERVICE.
 *
 * NOTE(review): most case labels, error returns and closing braces are not
 * visible in this listing. */
319 static int parse_argv(int argc
, char *argv
[]) {
338 ARG_NETWORK_INTERFACE
,
342 ARG_NETWORK_VETH_EXTRA
,
/* Long option table handed to getopt_long() */
353 static const struct option options
[] = {
354 { "help", no_argument
, NULL
, 'h' },
355 { "version", no_argument
, NULL
, ARG_VERSION
},
356 { "directory", required_argument
, NULL
, 'D' },
357 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
358 { "ephemeral", no_argument
, NULL
, 'x' },
359 { "user", required_argument
, NULL
, 'u' },
360 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
361 { "as-pid2", no_argument
, NULL
, 'a' },
362 { "boot", no_argument
, NULL
, 'b' },
363 { "uuid", required_argument
, NULL
, ARG_UUID
},
364 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
365 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
366 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
367 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
368 { "bind", required_argument
, NULL
, ARG_BIND
},
369 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
370 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
371 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
372 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
373 { "machine", required_argument
, NULL
, 'M' },
374 { "slice", required_argument
, NULL
, 'S' },
375 { "setenv", required_argument
, NULL
, ARG_SETENV
},
376 { "selinux-context", required_argument
, NULL
, 'Z' },
377 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
378 { "quiet", no_argument
, NULL
, 'q' },
379 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
380 { "register", required_argument
, NULL
, ARG_REGISTER
},
381 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
382 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
383 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
384 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
385 { "network-veth", no_argument
, NULL
, 'n' },
386 { "network-veth-extra", required_argument
, NULL
, ARG_NETWORK_VETH_EXTRA
},
387 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
388 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
389 { "image", required_argument
, NULL
, 'i' },
390 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
391 { "port", required_argument
, NULL
, 'p' },
392 { "property", required_argument
, NULL
, ARG_PROPERTY
},
393 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
394 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
395 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
396 { "chdir", required_argument
, NULL
, ARG_CHDIR
},
/* Capability bits collected from --capability= (plus) and --drop-capability=
 * (minus); merged into arg_retain after the option loop. */
402 uint64_t plus
= 0, minus
= 0;
/* Set by --settings=; they decide whether arg_settings_mask is reset to 0 or
 * to _SETTINGS_MASK_ALL after the option loop. */
403 bool mask_all_settings
= false, mask_no_settings
= false;
408 while ((c
= getopt_long(argc
, argv
, "+hD:u:abL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
420 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
426 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
432 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
438 arg_ephemeral
= true;
442 r
= free_and_strdup(&arg_user
, optarg
);
446 arg_settings_mask
|= SETTING_USER
;
/* The network options below also switch on arg_private_network, except that
 * this is not visible for the ipvlan case in this listing. */
449 case ARG_NETWORK_BRIDGE
:
450 r
= free_and_strdup(&arg_network_bridge
, optarg
);
457 arg_network_veth
= true;
458 arg_private_network
= true;
459 arg_settings_mask
|= SETTING_NETWORK
;
462 case ARG_NETWORK_VETH_EXTRA
:
463 r
= veth_extra_parse(&arg_network_veth_extra
, optarg
);
465 return log_error_errno(r
, "Failed to parse --network-veth-extra= parameter: %s", optarg
);
467 arg_private_network
= true;
468 arg_settings_mask
|= SETTING_NETWORK
;
471 case ARG_NETWORK_INTERFACE
:
472 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
475 arg_private_network
= true;
476 arg_settings_mask
|= SETTING_NETWORK
;
479 case ARG_NETWORK_MACVLAN
:
480 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
483 arg_private_network
= true;
484 arg_settings_mask
|= SETTING_NETWORK
;
487 case ARG_NETWORK_IPVLAN
:
488 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
493 case ARG_PRIVATE_NETWORK
:
494 arg_private_network
= true;
495 arg_settings_mask
|= SETTING_NETWORK
;
/* --boot and --as-pid2 are mutually exclusive: each checks whether the other
 * start mode was already selected. */
499 if (arg_start_mode
== START_PID2
) {
500 log_error("--boot and --as-pid2 may not be combined.");
504 arg_start_mode
= START_BOOT
;
505 arg_settings_mask
|= SETTING_START_MODE
;
509 if (arg_start_mode
== START_BOOT
) {
510 log_error("--boot and --as-pid2 may not be combined.");
514 arg_start_mode
= START_PID2
;
515 arg_settings_mask
|= SETTING_START_MODE
;
519 r
= sd_id128_from_string(optarg
, &arg_uuid
);
521 log_error("Invalid UUID: %s", optarg
);
525 arg_settings_mask
|= SETTING_MACHINE_ID
;
534 arg_machine
= mfree(arg_machine
);
536 if (!machine_name_is_valid(optarg
)) {
537 log_error("Invalid machine name: %s", optarg
);
541 r
= free_and_strdup(&arg_machine
, optarg
);
549 arg_selinux_context
= optarg
;
553 arg_selinux_apifs_context
= optarg
;
557 arg_read_only
= true;
558 arg_settings_mask
|= SETTING_READ_ONLY
;
/* Capability lists are split on "," with extract_first_word(); the word "all"
 * selects every bit, anything else goes through capability_from_name(). */
562 case ARG_DROP_CAPABILITY
: {
565 _cleanup_free_
char *t
= NULL
;
567 r
= extract_first_word(&p
, &t
, ",", 0);
569 return log_error_errno(r
, "Failed to parse capability %s.", t
);
574 if (streq(t
, "all")) {
575 if (c
== ARG_CAPABILITY
)
576 plus
= (uint64_t) -1;
578 minus
= (uint64_t) -1;
582 cap
= capability_from_name(t
);
584 log_error("Failed to parse capability %s.", t
);
588 if (c
== ARG_CAPABILITY
)
589 plus
|= 1ULL << (uint64_t) cap
;
591 minus
|= 1ULL << (uint64_t) cap
;
595 arg_settings_mask
|= SETTING_CAPABILITY
;
600 arg_link_journal
= LINK_GUEST
;
601 arg_link_journal_try
= true;
/* --link-journal=MODE: the "try-" prefixed modes set arg_link_journal_try */
604 case ARG_LINK_JOURNAL
:
605 if (streq(optarg
, "auto")) {
606 arg_link_journal
= LINK_AUTO
;
607 arg_link_journal_try
= false;
608 } else if (streq(optarg
, "no")) {
609 arg_link_journal
= LINK_NO
;
610 arg_link_journal_try
= false;
611 } else if (streq(optarg
, "guest")) {
612 arg_link_journal
= LINK_GUEST
;
613 arg_link_journal_try
= false;
614 } else if (streq(optarg
, "host")) {
615 arg_link_journal
= LINK_HOST
;
616 arg_link_journal_try
= false;
617 } else if (streq(optarg
, "try-guest")) {
618 arg_link_journal
= LINK_GUEST
;
619 arg_link_journal_try
= true;
620 } else if (streq(optarg
, "try-host")) {
621 arg_link_journal
= LINK_HOST
;
622 arg_link_journal_try
= true;
624 log_error("Failed to parse link journal mode %s", optarg
);
632 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
634 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
636 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
640 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
642 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
644 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
/* Overlay specifications are split on ":" into the lower list; every entry
 * must be an absolute path. */
648 case ARG_OVERLAY_RO
: {
649 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
650 _cleanup_strv_free_
char **lower
= NULL
;
655 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
659 log_error("Invalid overlay specification: %s", optarg
);
663 STRV_FOREACH(i
, lower
) {
664 if (!path_is_absolute(*i
)) {
665 log_error("Overlay path %s is not absolute.", *i
);
673 log_error("--overlay= needs at least two colon-separated directories specified.");
678 /* If two parameters are specified,
679 * the first one is the lower, the
680 * second one the upper directory. And
681 * we'll also define the destination
682 * mount point the same as the upper. */
686 destination
= strdup(upper
);
691 upper
= lower
[n
- 2];
692 destination
= lower
[n
- 1];
696 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
700 m
->destination
= destination
;
703 m
->read_only
= c
== ARG_OVERLAY_RO
;
/* Clear the local pointers, presumably so the _cleanup_free_ handlers do not
 * free strings now referenced by the mount entry (the other assignments into
 * m are not visible here). */
705 upper
= destination
= NULL
;
708 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
715 if (!env_assignment_is_valid(optarg
)) {
716 log_error("Environment variable assignment '%s' is not valid.", optarg
);
720 n
= strv_env_set(arg_setenv
, optarg
);
724 strv_free(arg_setenv
);
727 arg_settings_mask
|= SETTING_ENVIRONMENT
;
735 case ARG_SHARE_SYSTEM
:
736 arg_share_system
= true;
740 r
= parse_boolean(optarg
);
742 log_error("Failed to parse --register= argument: %s", optarg
);
750 arg_keep_unit
= true;
753 case ARG_PERSONALITY
:
755 arg_personality
= personality_from_string(optarg
);
756 if (arg_personality
== PERSONALITY_INVALID
) {
757 log_error("Unknown or unsupported personality '%s'.", optarg
);
761 arg_settings_mask
|= SETTING_PERSONALITY
;
767 arg_volatile_mode
= VOLATILE_YES
;
771 m
= volatile_mode_from_string(optarg
);
773 log_error("Failed to parse --volatile= argument: %s", optarg
);
776 arg_volatile_mode
= m
;
779 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
783 r
= expose_port_parse(&arg_expose_ports
, optarg
);
785 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
787 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
789 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
793 if (strv_extend(&arg_property
, optarg
) < 0)
/* --private-users[=UIDBASE[:NUIDS]]: the part after ':' is parsed as the UID
 * range, which must be non-zero; the shift is parsed with parse_uid(). */
798 case ARG_PRIVATE_USERS
:
800 _cleanup_free_
char *buffer
= NULL
;
801 const char *range
, *shift
;
803 range
= strchr(optarg
, ':');
805 buffer
= strndup(optarg
, range
- optarg
);
811 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
812 log_error("Failed to parse UID range: %s", range
);
818 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
819 log_error("Failed to parse UID: %s", optarg
);
827 case ARG_KILL_SIGNAL
:
828 arg_kill_signal
= signal_from_string_try_harder(optarg
);
829 if (arg_kill_signal
< 0) {
830 log_error("Cannot parse signal: %s", optarg
);
834 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
839 /* no → do not read files
840 * yes → read files, do not override cmdline, trust only subset
841 * override → read files, override cmdline, trust only subset
842 * trusted → read files, do not override cmdline, trust all
845 r
= parse_boolean(optarg
);
847 if (streq(optarg
, "trusted")) {
848 mask_all_settings
= false;
849 mask_no_settings
= false;
850 arg_settings_trusted
= true;
852 } else if (streq(optarg
, "override")) {
853 mask_all_settings
= false;
854 mask_no_settings
= true;
855 arg_settings_trusted
= -1;
857 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
860 mask_all_settings
= false;
861 mask_no_settings
= false;
862 arg_settings_trusted
= -1;
865 mask_all_settings
= true;
866 mask_no_settings
= false;
867 arg_settings_trusted
= false;
873 if (!path_is_absolute(optarg
)) {
874 log_error("Working directory %s is not an absolute path.", optarg
);
878 r
= free_and_strdup(&arg_chdir
, optarg
);
882 arg_settings_mask
|= SETTING_WORKING_DIRECTORY
;
889 assert_not_reached("Unhandled option");
892 if (arg_share_system
)
893 arg_register
= false;
895 if (arg_start_mode
!= START_PID1
&& arg_share_system
) {
896 log_error("--boot and --share-system may not be combined.");
900 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
901 log_error("--keep-unit may not be used when invoked from a user session.");
905 if (arg_directory
&& arg_image
) {
906 log_error("--directory= and --image= may not be combined.");
910 if (arg_template
&& arg_image
) {
911 log_error("--template= and --image= may not be combined.");
915 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
916 log_error("--template= needs --directory= or --machine=.");
920 if (arg_ephemeral
&& arg_template
) {
921 log_error("--ephemeral and --template= may not be combined.");
925 if (arg_ephemeral
&& arg_image
) {
926 log_error("--ephemeral and --image= may not be combined.");
930 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
931 log_error("--ephemeral and --link-journal= may not be combined.");
935 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
936 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
939 arg_parameters
= strv_copy(argv
+ optind
);
943 arg_settings_mask
|= SETTING_START_MODE
;
946 /* Load all settings from .nspawn files */
947 if (mask_no_settings
)
948 arg_settings_mask
= 0;
950 /* Don't load any settings from .nspawn files */
951 if (mask_all_settings
)
952 arg_settings_mask
= _SETTINGS_MASK_ALL
;
/* Final capability set: defaults plus --capability=, plus CAP_NET_ADMIN when
 * private networking is on, minus everything from --drop-capability=. */
954 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
956 r
= detect_unified_cgroup_hierarchy();
960 e
= getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
962 arg_container_service_name
= e
;
/* Cross-check option combinations after parsing. Logs an error when
 * --read-only is combined with a volatile mode, or when ports are exposed
 * without private networking. When booting (START_BOOT) and no positive kill
 * signal is configured, arg_kill_signal defaults to SIGRTMIN+3.
 * NOTE(review): the return statements are not visible in this listing. */
967 static int verify_arguments(void) {
969 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
970 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
974 if (arg_expose_ports
&& !arg_private_network
) {
975 log_error("Cannot use --port= without private networking.");
979 if (arg_start_mode
== START_BOOT
&& arg_kill_signal
<= 0)
980 arg_kill_signal
= SIGRTMIN
+3;
/* lchown() the path p with uid and gid shifted by arg_uid_shift.
 *
 * An ID equal to UID_INVALID / GID_INVALID is not shifted. A shifted ID is
 * tested against the window [arg_uid_shift, arg_uid_shift + arg_uid_range);
 * the "< arg_uid_shift" half of the test catches wrap-around of the unsigned
 * addition.
 * NOTE(review): the return values for each branch are not visible in this
 * listing. */
985 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
991 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
994 if (uid
!= UID_INVALID
) {
995 uid
+= arg_uid_shift
;
997 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1001 if (gid
!= GID_INVALID
) {
1002 gid
+= (gid_t
) arg_uid_shift
;
1004 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1008 if (lchown(p
, uid
, gid
) < 0)
/* Create the directory "path" below "root" with the given mode, then hand the
 * resulting path to userns_lchown() with uid/gid and return its result.
 * A failing mkdir() is checked for EEXIST.
 * NOTE(review): what happens on EEXIST and on other mkdir() errors is not
 * visible in this listing. */
1014 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
1017 q
= prefix_roota(root
, path
);
1018 if (mkdir(q
, mode
) < 0) {
1019 if (errno
== EEXIST
)
1024 return userns_lchown(q
, uid
, gid
);
/* Point the container's /etc/localtime (below dest) at the same zone the
 * host's /etc/localtime symlink names.
 *
 * The host link target is read with readlink_malloc() and must start with
 * "../usr/share/zoneinfo/" or "/usr/share/zoneinfo/"; the remainder is the
 * zone name z. The container's current link is read the same way into y, and
 * the two are compared to detect that nothing needs to be done. The zone file
 * is checked for existence inside the container with laccess() before a new
 * relative symlink "../usr/share/zoneinfo/<z>" is created and chowned through
 * userns_lchown().
 * NOTE(review): the conditions around the warnings, the removal of the old
 * link and the return statements are not visible in this listing. */
1027 static int setup_timezone(const char *dest
) {
1028 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1029 const char *where
, *check
, *what
;
1035 /* Fix the timezone, if possible */
1036 r
= readlink_malloc("/etc/localtime", &p
);
1038 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1042 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1044 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1046 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1050 where
= prefix_roota(dest
, "/etc/localtime");
1051 r
= readlink_malloc(where
, &q
);
1053 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1055 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1057 /* Already pointing to the right place? Then do nothing .. */
1058 if (y
&& streq(y
, z
))
1062 check
= strjoina("/usr/share/zoneinfo/", z
);
1063 check
= prefix_roota(dest
, check
);
1064 if (laccess(check
, F_OK
) < 0) {
1065 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1070 if (r
< 0 && errno
!= ENOENT
) {
1071 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1075 what
= strjoina("../usr/share/zoneinfo/", z
);
1076 if (symlink(what
, where
) < 0) {
1077 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1081 r
= userns_lchown(where
, 0, 0);
1083 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
/* Copy the host's /etc/resolv.conf to /etc/resolv.conf below dest using
 * copy_file() with O_TRUNC|O_NOFOLLOW and mode 0644, then chown the copy
 * through userns_lchown(). A copy failure is logged at debug level for -ELOOP
 * and -EROFS and at warning level otherwise. arg_private_network is checked
 * first.
 * NOTE(review): the effect of the arg_private_network check and the return
 * statements are not visible in this listing. */
1088 static int setup_resolv_conf(const char *dest
) {
1089 const char *where
= NULL
;
1094 if (arg_private_network
)
1097 /* Fix resolv.conf, if possible */
1098 where
= prefix_roota(dest
, "/etc/resolv.conf");
1100 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1102 /* If the file already exists as symlink, let's
1103 * suppress the warning, under the assumption that
1104 * resolved or something similar runs inside and the
1105 * symlink points there.
1107 * If the disk image is read-only, there's also no
1108 * point in complaining.
1110 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1111 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1115 r
= userns_lchown(where
, 0, 0);
1117 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
/* Format the 128-bit id as hyphenated lowercase hex in 8-4-4-4-12 grouping
 * (16 bytes, two hex digits each). The caller supplies a 37 byte buffer s,
 * which fits the 36 character text plus a terminating NUL.
 * NOTE(review): the formatting call and the return statement are not visible
 * in this listing; presumably the text is written into s and s is returned. */
1122 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1126 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1127 SD_ID128_FORMAT_VAL(id
));
/* Give the container its own boot ID. A random 128-bit ID is generated,
 * formatted as a UUID and written to /run/proc-sys-kernel-random-boot-id
 * below dest. That file is then bind mounted over
 * /proc/sys/kernel/random/boot_id below dest and remounted read-only; a
 * failure of the read-only remount is only logged as a warning.
 * arg_share_system is checked first.
 * NOTE(review): the effect of the arg_share_system check, the cleanup of the
 * temporary file and the return statements are not visible in this listing. */
1132 static int setup_boot_id(const char *dest
) {
1133 const char *from
, *to
;
1134 sd_id128_t rnd
= {};
1138 if (arg_share_system
)
1141 /* Generate a new randomized boot ID, so that each boot-up of
1142 * the container gets a new one */
1144 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1145 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1147 r
= sd_id128_randomize(&rnd
);
1149 return log_error_errno(r
, "Failed to generate random boot id: %m");
1151 id128_format_as_uuid(rnd
, as_uuid
);
1153 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1155 return log_error_errno(r
, "Failed to write boot id: %m");
1157 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1158 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1159 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1160 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
/* Populate /dev below dest with the device nodes named in the devnodes list.
 *
 * For each name the host node /dev/<name> is stat()ed; it must be a character
 * or block device. The node is recreated in the container with mknod() using
 * the host's mode and device number. Per the comment below, a bind mount of
 * the host node is used as a fallback when mknod is restricted. The result is
 * chowned through userns_lchown().
 *
 * NOTE(review): the /dev/net failure path passes r to log_error_errno(), but
 * the result of userns_mkdir() is not stored in r in the visible code, so the
 * logged and returned error may not reflect the actual failure. Confirm.
 * NOTE(review): the contents of the devnodes list, the ENOENT handling and
 * the return statements are not visible in this listing. */
1166 static int copy_devnodes(const char *dest
) {
1168 static const char devnodes
[] =
1179 _cleanup_umask_ mode_t u
;
1185 /* Create /dev/net, so that we can create /dev/net/tun in it */
1186 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1187 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1189 NULSTR_FOREACH(d
, devnodes
) {
1190 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1193 from
= strappend("/dev/", d
);
1194 to
= prefix_root(dest
, from
);
1196 if (stat(from
, &st
) < 0) {
1198 if (errno
!= ENOENT
)
1199 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1201 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1203 log_error("%s is not a char or block device, cannot copy.", from
);
1207 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1209 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1211 /* Some systems abusively restrict mknod but
1212 * allow bind mounts. */
1215 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1216 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1217 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1220 r
= userns_lchown(to
, 0, 0);
1222 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
/* Set up the pseudo terminal file system for the container below dest.
 *
 * The devpts mount options are built with asprintf(): a new instance with
 * ptmxmode=0666, mode=620 and gid set to arg_uid_shift + TTY_GID, plus a
 * context= option when arg_selinux_apifs_context is set. Then:
 *   1. /dev/pts is created, mounted as devpts (nosuid, noexec) and chowned,
 *   2. /dev/ptmx is created as a symlink to pts/ptmx and chowned,
 *   3. /dev/pts/ptmx is chowned.
 * All ownership changes go through userns_lchown() with uid 0 and gid 0.
 * NOTE(review): the asprintf() results are cast to void; the check that
 * options was actually allocated is not visible in this listing. */
1229 static int setup_pts(const char *dest
) {
1230 _cleanup_free_
char *options
= NULL
;
1235 if (arg_selinux_apifs_context
)
1236 (void) asprintf(&options
,
1237 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1238 arg_uid_shift
+ TTY_GID
,
1239 arg_selinux_apifs_context
);
1242 (void) asprintf(&options
,
1243 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1244 arg_uid_shift
+ TTY_GID
);
1249 /* Mount /dev/pts itself */
1250 p
= prefix_roota(dest
, "/dev/pts");
1251 if (mkdir(p
, 0755) < 0)
1252 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1253 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1254 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1255 r
= userns_lchown(p
, 0, 0);
1257 return log_error_errno(r
, "Failed to chown /dev/pts: %m");
1259 /* Create /dev/ptmx symlink */
1260 p
= prefix_roota(dest
, "/dev/ptmx");
1261 if (symlink("pts/ptmx", p
) < 0)
1262 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1263 r
= userns_lchown(p
, 0, 0);
1265 return log_error_errno(r
, "Failed to chown /dev/ptmx: %m");
1267 /* And fix /dev/pts/ptmx ownership */
1268 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1269 r
= userns_lchown(p
, 0, 0);
1271 return log_error_errno(r
, "Failed to chown /dev/pts/ptmx: %m");
/* Make the TTY named by console available as /dev/console below dest.
 * The TTY is first set to mode 0600 and owned by arg_uid_shift (as both UID
 * and GID) via chmod_and_chown(); it is then bind mounted onto
 * /dev/console in the container.
 * NOTE(review): the call that creates the mount target file and the return
 * statements are not visible in this listing. */
1276 static int setup_dev_console(const char *dest
, const char *console
) {
1277 _cleanup_umask_ mode_t u
;
1286 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1288 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1290 /* We need to bind mount the right tty to /dev/console since
1291 * ptys can only exist on pts file systems. To have something
1292 * to bind mount things on we create an empty regular file. */
1294 to
= prefix_roota(dest
, "/dev/console");
1297 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1299 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1300 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
/* Provide a /proc/kmsg replacement inside the container below dest.
 *
 * A FIFO is created at /run/kmsg (mode 0600), bind mounted over /proc/kmsg,
 * opened read-write and non-blocking, and its file descriptor is sent over
 * kmsg_socket with send_one_fd(). Finally the /run/kmsg name is unlinked; the
 * unlink result is deliberately ignored.
 *
 * kmsg_socket must be a valid descriptor (asserted >= 0).
 * NOTE(review): the closing of fd and the return statements are not visible
 * in this listing. */
1305 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1306 const char *from
, *to
;
1307 _cleanup_umask_ mode_t u
;
1310 assert(kmsg_socket
>= 0);
1314 /* We create the kmsg FIFO as /run/kmsg, but immediately
1315 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1316 * on the reading side behave very similar to /proc/kmsg,
1317 * their writing side behaves differently from /dev/kmsg in
1318 * that writing blocks when nothing is reading. In order to
1319 * avoid any problems with containers deadlocking due to this
1320 * we simply make /dev/kmsg unavailable to the container. */
1321 from
= prefix_roota(dest
, "/run/kmsg");
1322 to
= prefix_roota(dest
, "/proc/kmsg");
1324 if (mkfifo(from
, 0600) < 0)
1325 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1326 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1327 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1329 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1331 return log_error_errno(errno
, "Failed to open fifo: %m");
1333 /* Store away the fd in the socket, so that it stays open as
1334 * long as we run the child */
1335 r
= send_one_fd(kmsg_socket
, fd
, 0);
1339 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1341 /* And now make the FIFO unavailable as /run/kmsg... */
1342 (void) unlink(from
);
/* Netlink message callback: treats userdata as a pointer to the currently
 * exposed address (union in_addr_union) and calls expose_port_execute() for
 * arg_expose_ports with it. The message m is not used in the visible code.
 * NOTE(review): assertions and the return statement are not visible in this
 * listing. */
1347 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1348 union in_addr_union
*exposed
= userdata
;
1354 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
/* Set the hostname to arg_machine via sethostname_idempotent().
 * arg_share_system is checked first.
 * NOTE(review): the effect of the arg_share_system check and the return
 * statements are not visible in this listing. */
1358 static int setup_hostname(void) {
1360 if (arg_share_system
)
1363 if (sethostname_idempotent(arg_machine
) < 0)
1369 static int setup_journal(const char *directory
) {
1370 sd_id128_t machine_id
, this_id
;
1371 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1372 const char *etc_machine_id
, *p
, *q
;
1377 /* Don't link journals in ephemeral mode */
1381 if (arg_link_journal
== LINK_NO
)
1384 try = arg_link_journal_try
|| arg_link_journal
== LINK_AUTO
;
1386 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1388 r
= read_one_line_file(etc_machine_id
, &b
);
1389 if (r
== -ENOENT
&& try)
1392 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1395 if (isempty(id
) && try)
1398 /* Verify validity */
1399 r
= sd_id128_from_string(id
, &machine_id
);
1401 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1403 r
= sd_id128_get_machine(&this_id
);
1405 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1407 if (sd_id128_equal(machine_id
, this_id
)) {
1408 log_full(try ? LOG_WARNING
: LOG_ERR
,
1409 "Host and machine ids are equal (%s): refusing to link journals", id
);
1415 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1417 return log_error_errno(r
, "Failed to create /var: %m");
1419 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1421 return log_error_errno(r
, "Failed to create /var/log: %m");
1423 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1425 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1427 p
= strjoina("/var/log/journal/", id
);
1428 q
= prefix_roota(directory
, p
);
1430 if (path_is_mount_point(p
, 0) > 0) {
1434 log_error("%s: already a mount point, refusing to use for journal", p
);
1438 if (path_is_mount_point(q
, 0) > 0) {
1442 log_error("%s: already a mount point, refusing to use for journal", q
);
1446 r
= readlink_and_make_absolute(p
, &d
);
1448 if ((arg_link_journal
== LINK_GUEST
||
1449 arg_link_journal
== LINK_AUTO
) &&
1452 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1454 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1459 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1460 } else if (r
== -EINVAL
) {
1462 if (arg_link_journal
== LINK_GUEST
&&
1465 if (errno
== ENOTDIR
) {
1466 log_error("%s already exists and is neither a symlink nor a directory", p
);
1469 return log_error_errno(errno
, "Failed to remove %s: %m", p
);
1471 } else if (r
!= -ENOENT
)
1472 return log_error_errno(r
, "readlink(%s) failed: %m", p
);
1474 if (arg_link_journal
== LINK_GUEST
) {
1476 if (symlink(q
, p
) < 0) {
1478 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1481 return log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1484 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1486 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1490 if (arg_link_journal
== LINK_HOST
) {
1491 /* don't create parents here -- if the host doesn't have
1492 * permanent journal set up, don't force it here */
1494 if (mkdir(p
, 0755) < 0 && errno
!= EEXIST
) {
1496 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1499 return log_error_errno(errno
, "Failed to create %s: %m", p
);
1502 } else if (access(p
, F_OK
) < 0)
1505 if (dir_is_empty(q
) == 0)
1506 log_warning("%s is not empty, proceeding anyway.", q
);
1508 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1510 return log_error_errno(r
, "Failed to create %s: %m", q
);
1512 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1513 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1518 static int drop_capabilities(void) {
1519 return capability_bounding_set_drop(arg_retain
, false);
1522 static int reset_audit_loginuid(void) {
1523 _cleanup_free_
char *p
= NULL
;
1526 if (arg_share_system
)
1529 r
= read_one_line_file("/proc/self/loginuid", &p
);
1533 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1535 /* Already reset? */
1536 if (streq(p
, "4294967295"))
1539 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1542 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1543 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1544 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1545 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1546 "using systemd-nspawn. Sleeping for 5s... (%m)");
1554 static int setup_seccomp(void) {
1557 static const struct {
1558 uint64_t capability
;
1561 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1562 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1563 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1564 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1565 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1566 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1567 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1568 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1569 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1570 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1573 scmp_filter_ctx seccomp
;
1577 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1581 r
= seccomp_add_secondary_archs(seccomp
);
1583 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
1587 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1588 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1591 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1593 continue; /* unknown syscall */
1595 log_error_errno(r
, "Failed to block syscall: %m");
1602 Audit is broken in containers, much of the userspace audit
1603 hookup will fail if running inside a container. We don't
1604 care and just turn off creation of audit sockets.
1606 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1607 with EAFNOSUPPORT which audit userspace uses as indication
1608 that audit is disabled in the kernel.
1611 r
= seccomp_rule_add(
1613 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1616 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1617 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1619 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1623 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1625 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1629 r
= seccomp_load(seccomp
);
1631 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1636 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1641 seccomp_release(seccomp
);
1649 static int setup_propagate(const char *root
) {
1653 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1654 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1655 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1656 (void) mkdir_p(p
, 0600);
1658 r
= userns_mkdir(root
, "/run/systemd", 0755, 0, 0);
1660 return log_error_errno(r
, "Failed to create /run/systemd: %m");
1662 r
= userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0);
1664 return log_error_errno(r
, "Failed to create /run/systemd/nspawn: %m");
1666 r
= userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1668 return log_error_errno(r
, "Failed to create /run/systemd/nspawn/incoming: %m");
1670 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1671 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1672 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1674 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1675 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1680 static int setup_image(char **device_path
, int *loop_nr
) {
1681 struct loop_info64 info
= {
1682 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1684 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1685 _cleanup_free_
char* loopdev
= NULL
;
1689 assert(device_path
);
1693 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1695 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1697 if (fstat(fd
, &st
) < 0)
1698 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1700 if (S_ISBLK(st
.st_mode
)) {
1703 p
= strdup(arg_image
);
1717 if (!S_ISREG(st
.st_mode
)) {
1718 log_error("%s is not a regular file or block device.", arg_image
);
1722 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1724 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1726 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1728 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1730 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1733 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1735 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1737 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1738 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1741 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1743 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1744 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1746 *device_path
= loopdev
;
1757 #define PARTITION_TABLE_BLURB \
1758 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1759 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1760 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1761 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1762 "to be bootable with systemd-nspawn."
1764 static int dissect_image(
1766 char **root_device
, bool *root_device_rw
,
1767 char **home_device
, bool *home_device_rw
,
1768 char **srv_device
, bool *srv_device_rw
,
1772 int home_nr
= -1, srv_nr
= -1;
1773 #ifdef GPT_ROOT_NATIVE
1776 #ifdef GPT_ROOT_SECONDARY
1777 int secondary_root_nr
= -1;
1779 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1780 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1781 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1782 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1783 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1784 struct udev_list_entry
*first
, *item
;
1785 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1786 bool is_gpt
, is_mbr
, multiple_generic
= false;
1787 const char *pttype
= NULL
;
1794 assert(root_device
);
1795 assert(home_device
);
1800 b
= blkid_new_probe();
1805 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1810 return log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1813 blkid_probe_enable_partitions(b
, 1);
1814 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1817 r
= blkid_do_safeprobe(b
);
1818 if (r
== -2 || r
== 1) {
1819 log_error("Failed to identify any partition table on\n"
1821 PARTITION_TABLE_BLURB
, arg_image
);
1823 } else if (r
!= 0) {
1826 return log_error_errno(errno
, "Failed to probe: %m");
1829 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1831 is_gpt
= streq_ptr(pttype
, "gpt");
1832 is_mbr
= streq_ptr(pttype
, "dos");
1834 if (!is_gpt
&& !is_mbr
) {
1835 log_error("No GPT or MBR partition table discovered on\n"
1837 PARTITION_TABLE_BLURB
, arg_image
);
1842 pl
= blkid_probe_get_partitions(b
);
1847 log_error("Failed to list partitions of %s", arg_image
);
1855 if (fstat(fd
, &st
) < 0)
1856 return log_error_errno(errno
, "Failed to stat block device: %m");
1858 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1866 log_error("Kernel partitions never appeared.");
1870 e
= udev_enumerate_new(udev
);
1874 r
= udev_enumerate_add_match_parent(e
, d
);
1878 r
= udev_enumerate_scan_devices(e
);
1880 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1882 /* Count the partitions enumerated by the kernel */
1884 first
= udev_enumerate_get_list_entry(e
);
1885 udev_list_entry_foreach(item
, first
)
1888 /* Count the partitions enumerated by blkid */
1889 m
= blkid_partlist_numof_partitions(pl
);
1893 log_error("blkid and kernel partition list do not match.");
1899 /* The kernel has probed fewer partitions than
1900 * blkid? Maybe the kernel prober is still
1901 * running or it got EBUSY because udev
1902 * already opened the device. Let's reprobe
1903 * the device, which is a synchronous call
1904 * that waits until probing is complete. */
1906 for (j
= 0; j
< 20; j
++) {
1908 r
= ioctl(fd
, BLKRRPART
, 0);
1911 if (r
>= 0 || r
!= -EBUSY
)
1914 /* If something else has the device
1915 * open, such as an udev rule, the
1916 * ioctl will return EBUSY. Since
1917 * there's no way to wait until it
1918 * isn't busy anymore, let's just wait
1919 * a bit, and try again.
1921 * This is really something they
1922 * should fix in the kernel! */
1924 usleep(50 * USEC_PER_MSEC
);
1928 return log_error_errno(r
, "Failed to reread partition table: %m");
1931 e
= udev_enumerate_unref(e
);
1934 first
= udev_enumerate_get_list_entry(e
);
1935 udev_list_entry_foreach(item
, first
) {
1936 _cleanup_udev_device_unref_
struct udev_device
*q
;
1938 unsigned long long flags
;
1944 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
1949 return log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
1952 qn
= udev_device_get_devnum(q
);
1956 if (st
.st_rdev
== qn
)
1959 node
= udev_device_get_devnode(q
);
1963 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
1967 flags
= blkid_partition_get_flags(pp
);
1969 nr
= blkid_partition_get_partno(pp
);
1977 if (flags
& GPT_FLAG_NO_AUTO
)
1980 stype
= blkid_partition_get_type_string(pp
);
1984 if (sd_id128_from_string(stype
, &type_id
) < 0)
1987 if (sd_id128_equal(type_id
, GPT_HOME
)) {
1989 if (home
&& nr
>= home_nr
)
1993 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
1995 r
= free_and_strdup(&home
, node
);
1999 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
2001 if (srv
&& nr
>= srv_nr
)
2005 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2007 r
= free_and_strdup(&srv
, node
);
2011 #ifdef GPT_ROOT_NATIVE
2012 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
2014 if (root
&& nr
>= root_nr
)
2018 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2020 r
= free_and_strdup(&root
, node
);
2025 #ifdef GPT_ROOT_SECONDARY
2026 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
2028 if (secondary_root
&& nr
>= secondary_root_nr
)
2031 secondary_root_nr
= nr
;
2032 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2034 r
= free_and_strdup(&secondary_root
, node
);
2039 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2042 multiple_generic
= true;
2044 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2046 r
= free_and_strdup(&generic
, node
);
2052 } else if (is_mbr
) {
2055 if (flags
!= 0x80) /* Bootable flag */
2058 type
= blkid_partition_get_type(pp
);
2059 if (type
!= 0x83) /* Linux partition */
2063 multiple_generic
= true;
2067 r
= free_and_strdup(&root
, node
);
2075 *root_device
= root
;
2078 *root_device_rw
= root_rw
;
2080 } else if (secondary_root
) {
2081 *root_device
= secondary_root
;
2082 secondary_root
= NULL
;
2084 *root_device_rw
= secondary_root_rw
;
2086 } else if (generic
) {
2088 /* There were no partitions with precise meanings
2089 * around, but we found generic partitions. In this
2090 * case, if there's only one, we can go ahead and boot
2091 * it, otherwise we bail out, because we really cannot
2092 * make any sense of it. */
2094 if (multiple_generic
) {
2095 log_error("Identified multiple bootable Linux partitions on\n"
2097 PARTITION_TABLE_BLURB
, arg_image
);
2101 *root_device
= generic
;
2104 *root_device_rw
= generic_rw
;
2107 log_error("Failed to identify root partition in disk image\n"
2109 PARTITION_TABLE_BLURB
, arg_image
);
2114 *home_device
= home
;
2117 *home_device_rw
= home_rw
;
2124 *srv_device_rw
= srv_rw
;
2129 log_error("--image= is not supported, compiled without blkid support.");
2134 static int mount_device(const char *what
, const char *where
, const char *directory
, bool rw
) {
2136 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
2137 const char *fstype
, *p
;
2147 p
= strjoina(where
, directory
);
2152 b
= blkid_new_probe_from_filename(what
);
2156 return log_error_errno(errno
, "Failed to allocate prober for %s: %m", what
);
2159 blkid_probe_enable_superblocks(b
, 1);
2160 blkid_probe_set_superblocks_flags(b
, BLKID_SUBLKS_TYPE
);
2163 r
= blkid_do_safeprobe(b
);
2164 if (r
== -1 || r
== 1) {
2165 log_error("Cannot determine file system type of %s", what
);
2167 } else if (r
!= 0) {
2170 return log_error_errno(errno
, "Failed to probe %s: %m", what
);
2174 if (blkid_probe_lookup_value(b
, "TYPE", &fstype
, NULL
) < 0) {
2177 log_error("Failed to determine file system type of %s", what
);
2181 if (streq(fstype
, "crypto_LUKS")) {
2182 log_error("nspawn currently does not support LUKS disk images.");
2186 if (mount(what
, p
, fstype
, MS_NODEV
|(rw
? 0 : MS_RDONLY
), NULL
) < 0)
2187 return log_error_errno(errno
, "Failed to mount %s: %m", what
);
2191 log_error("--image= is not supported, compiled without blkid support.");
2196 static int mount_devices(
2198 const char *root_device
, bool root_device_rw
,
2199 const char *home_device
, bool home_device_rw
,
2200 const char *srv_device
, bool srv_device_rw
) {
2206 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2208 return log_error_errno(r
, "Failed to mount root directory: %m");
2212 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2214 return log_error_errno(r
, "Failed to mount home directory: %m");
2218 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2220 return log_error_errno(r
, "Failed to mount server data directory: %m");
2226 static void loop_remove(int nr
, int *image_fd
) {
2227 _cleanup_close_
int control
= -1;
2233 if (image_fd
&& *image_fd
>= 0) {
2234 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2236 log_debug_errno(errno
, "Failed to close loop image: %m");
2237 *image_fd
= safe_close(*image_fd
);
2240 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2242 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2246 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2248 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2253 * < 0 : wait_for_terminate() failed to get the state of the
2254 * container, the container was terminated by a signal, or
2255 * failed for an unknown reason. No change is made to the
2256 * container argument.
2257 * > 0 : The program executed in the container terminated with an
2258 * error. The exit code of the program executed in the
2259 * container is returned. The container argument has been set
2260 * to CONTAINER_TERMINATED.
2261 * 0 : The container is being rebooted, has been shut down or exited
2262 * successfully. The container argument has been set to either
2263 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2265 * That is, success is indicated by a return value of zero, and an
2266 * error is indicated by a non-zero value.
2268 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2272 r
= wait_for_terminate(pid
, &status
);
2274 return log_warning_errno(r
, "Failed to wait for container: %m");
2276 switch (status
.si_code
) {
2279 if (status
.si_status
== 0) {
2280 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2283 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2285 *container
= CONTAINER_TERMINATED
;
2286 return status
.si_status
;
2289 if (status
.si_status
== SIGINT
) {
2291 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2292 *container
= CONTAINER_TERMINATED
;
2295 } else if (status
.si_status
== SIGHUP
) {
2297 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2298 *container
= CONTAINER_REBOOTED
;
2302 /* CLD_KILLED fallthrough */
2305 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2309 log_error("Container %s failed due to unknown reason.", arg_machine
);
2316 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2319 pid
= PTR_TO_PID(userdata
);
2321 if (kill(pid
, arg_kill_signal
) >= 0) {
2322 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2323 sd_event_source_set_userdata(s
, NULL
);
2328 sd_event_exit(sd_event_source_get_event(s
), 0);
2332 static int determine_names(void) {
2335 if (arg_template
&& !arg_directory
&& arg_machine
) {
2337 /* If --template= was specified then we should not
2338 * search for a machine, but instead create a new one
2339 * in /var/lib/machine. */
2341 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2346 if (!arg_image
&& !arg_directory
) {
2348 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2350 r
= image_find(arg_machine
, &i
);
2352 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2354 log_error("No image for machine '%s': %m", arg_machine
);
2358 if (i
->type
== IMAGE_RAW
)
2359 r
= free_and_strdup(&arg_image
, i
->path
);
2361 r
= free_and_strdup(&arg_directory
, i
->path
);
2363 return log_error_errno(r
, "Invalid image directory: %m");
2366 arg_read_only
= arg_read_only
|| i
->read_only
;
2368 arg_directory
= get_current_dir_name();
2370 if (!arg_directory
&& !arg_machine
) {
2371 log_error("Failed to determine path, please use -D or -i.");
2377 if (arg_directory
&& path_equal(arg_directory
, "/"))
2378 arg_machine
= gethostname_malloc();
2380 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2385 hostname_cleanup(arg_machine
);
2386 if (!machine_name_is_valid(arg_machine
)) {
2387 log_error("Failed to determine machine name automatically, please use -M.");
2391 if (arg_ephemeral
) {
2394 /* Add a random suffix when this is an
2395 * ephemeral machine, so that we can run many
2396 * instances at once without manually having
2397 * to specify -M each time. */
2399 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
2410 static int determine_uid_shift(const char *directory
) {
2418 if (arg_uid_shift
== UID_INVALID
) {
2421 r
= stat(directory
, &st
);
2423 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2425 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2427 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2428 log_error("UID and GID base of %s don't match.", directory
);
2432 arg_uid_range
= UINT32_C(0x10000);
2435 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2436 log_error("UID base too high for UID range.");
2440 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
2444 static int inner_child(
2446 const char *directory
,
2452 _cleanup_free_
char *home
= NULL
;
2454 const char *envp
[] = {
2455 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2456 NULL
, /* container */
2461 NULL
, /* container_uuid */
2462 NULL
, /* LISTEN_FDS */
2463 NULL
, /* LISTEN_PID */
2467 _cleanup_strv_free_
char **env_use
= NULL
;
2472 assert(kmsg_socket
>= 0);
2477 /* Tell the parent, that it now can write the UID map. */
2478 (void) barrier_place(barrier
); /* #1 */
2480 /* Wait until the parent wrote the UID map */
2481 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2482 log_error("Parent died too early");
2487 r
= mount_all(NULL
, arg_userns
, true, arg_uid_shift
, arg_private_network
, arg_uid_range
, arg_selinux_apifs_context
);
2491 r
= mount_sysfs(NULL
);
2495 /* Wait until we are cgroup-ified, so that we
2496 * can mount the right cgroup path writable */
2497 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2498 log_error("Parent died too early");
2502 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2506 r
= reset_uid_gid();
2508 return log_error_errno(r
, "Couldn't become new root: %m");
2510 r
= setup_boot_id(NULL
);
2514 r
= setup_kmsg(NULL
, kmsg_socket
);
2517 kmsg_socket
= safe_close(kmsg_socket
);
2522 return log_error_errno(errno
, "setsid() failed: %m");
2524 if (arg_private_network
)
2527 if (arg_expose_ports
) {
2528 r
= expose_port_send_rtnl(rtnl_socket
);
2531 rtnl_socket
= safe_close(rtnl_socket
);
2534 r
= drop_capabilities();
2536 return log_error_errno(r
, "drop_capabilities() failed: %m");
2540 if (arg_personality
!= PERSONALITY_INVALID
) {
2541 if (personality(arg_personality
) < 0)
2542 return log_error_errno(errno
, "personality() failed: %m");
2543 } else if (secondary
) {
2544 if (personality(PER_LINUX32
) < 0)
2545 return log_error_errno(errno
, "personality() failed: %m");
2549 if (arg_selinux_context
)
2550 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2551 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2554 r
= change_uid_gid(arg_user
, &home
);
2558 /* LXC sets container=lxc, so follow the scheme here */
2559 envp
[n_env
++] = strjoina("container=", arg_container_service_name
);
2561 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2565 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2566 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2567 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2570 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
2573 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2577 if (fdset_size(fds
) > 0) {
2578 r
= fdset_cloexec(fds
, false);
2580 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2582 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2583 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2587 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2591 /* Let the parent know that we are ready and
2592 * wait until the parent is ready with the
2594 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2595 log_error("Parent died too early");
2600 if (chdir(arg_chdir
) < 0)
2601 return log_error_errno(errno
, "Failed to change to specified working directory %s: %m", arg_chdir
);
2603 if (arg_start_mode
== START_PID2
) {
2609 /* Now, explicitly close the log, so that we
2610 * then can close all remaining fds. Closing
2611 * the log explicitly first has the benefit
2612 * that the logging subsystem knows about it,
2613 * and is thus ready to be reopened should we
2614 * need it again. Note that the other fds
2615 * closed here are at least the locking and
2618 (void) fdset_close_others(fds
);
2620 if (arg_start_mode
== START_BOOT
) {
2624 /* Automatically search for the init system */
2626 m
= strv_length(arg_parameters
);
2627 a
= newa(char*, m
+ 2);
2628 memcpy_safe(a
+ 1, arg_parameters
, m
* sizeof(char*));
2631 a
[0] = (char*) "/usr/lib/systemd/systemd";
2632 execve(a
[0], a
, env_use
);
2634 a
[0] = (char*) "/lib/systemd/systemd";
2635 execve(a
[0], a
, env_use
);
2637 a
[0] = (char*) "/sbin/init";
2638 execve(a
[0], a
, env_use
);
2639 } else if (!strv_isempty(arg_parameters
))
2640 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2643 chdir(home
?: "/root");
2645 execle("/bin/bash", "-bash", NULL
, env_use
);
2646 execle("/bin/sh", "-sh", NULL
, env_use
);
2651 return log_error_errno(r
, "execv() failed: %m");
2654 static int outer_child(
2656 const char *directory
,
2657 const char *console
,
2658 const char *root_device
, bool root_device_rw
,
2659 const char *home_device
, bool home_device_rw
,
2660 const char *srv_device
, bool srv_device_rw
,
2666 int uid_shift_socket
,
2676 assert(pid_socket
>= 0);
2677 assert(kmsg_socket
>= 0);
2681 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2682 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
2685 close_nointr(STDIN_FILENO
);
2686 close_nointr(STDOUT_FILENO
);
2687 close_nointr(STDERR_FILENO
);
2689 r
= open_terminal(console
, O_RDWR
);
2690 if (r
!= STDIN_FILENO
) {
2696 return log_error_errno(r
, "Failed to open console: %m");
2699 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2700 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2701 return log_error_errno(errno
, "Failed to duplicate console: %m");
2704 r
= reset_audit_loginuid();
2708 /* Mark everything as slave, so that we still
2709 * receive mounts from the real root, but don't
2710 * propagate mounts to the real root. */
2711 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2712 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2714 r
= mount_devices(directory
,
2715 root_device
, root_device_rw
,
2716 home_device
, home_device_rw
,
2717 srv_device
, srv_device_rw
);
2721 r
= determine_uid_shift(directory
);
2726 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2728 return log_error_errno(errno
, "Failed to send UID shift: %m");
2729 if (l
!= sizeof(arg_uid_shift
)) {
2730 log_error("Short write while sending UID shift.");
2735 /* Turn directory into bind mount */
2736 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2737 return log_error_errno(errno
, "Failed to make bind mount: %m");
2739 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2743 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
2747 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2751 if (arg_read_only
) {
2752 r
= bind_remount_recursive(directory
, true);
2754 return log_error_errno(r
, "Failed to make tree read-only: %m");
2757 r
= mount_all(directory
, arg_userns
, false, arg_private_network
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2761 r
= copy_devnodes(directory
);
2765 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2767 r
= setup_pts(directory
);
2771 r
= setup_propagate(directory
);
2775 r
= setup_dev_console(directory
, console
);
2779 r
= setup_seccomp();
2783 r
= setup_timezone(directory
);
2787 r
= setup_resolv_conf(directory
);
2791 r
= setup_journal(directory
);
2795 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2799 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2803 r
= mount_move_root(directory
);
2805 return log_error_errno(r
, "Failed to move root directory: %m");
2807 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2808 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2809 (arg_private_network
? CLONE_NEWNET
: 0) |
2810 (arg_userns
? CLONE_NEWUSER
: 0),
2813 return log_error_errno(errno
, "Failed to fork inner child: %m");
2815 pid_socket
= safe_close(pid_socket
);
2816 uid_shift_socket
= safe_close(uid_shift_socket
);
2818 /* The inner child has all namespaces that are
2819 * requested, so that we all are owned by the user if
2820 * user namespaces are turned on. */
2822 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
2824 _exit(EXIT_FAILURE
);
2826 _exit(EXIT_SUCCESS
);
2829 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
2831 return log_error_errno(errno
, "Failed to send PID: %m");
2832 if (l
!= sizeof(pid
)) {
2833 log_error("Short write while sending PID.");
2837 pid_socket
= safe_close(pid_socket
);
2838 kmsg_socket
= safe_close(kmsg_socket
);
2839 rtnl_socket
= safe_close(rtnl_socket
);
2844 static int setup_uid_map(pid_t pid
) {
2845 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
2850 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
2851 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
2852 r
= write_string_file(uid_map
, line
, 0);
2854 return log_error_errno(r
, "Failed to write UID map: %m");
2856 /* We always assign the same UID and GID ranges */
2857 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
2858 r
= write_string_file(uid_map
, line
, 0);
2860 return log_error_errno(r
, "Failed to write GID map: %m");
/* load_settings(): looks for the settings file "<arg_machine>.nspawn" and merges what it contains into the
 * arg_* globals.
 *
 * Search order, as far as visible here: /etc/systemd/nspawn and /run/systemd/nspawn first, then a file of
 * the same name next to arg_image or arg_directory. A failure to open a candidate for any reason other than
 * ENOENT is logged and returned.
 *
 * Each group of settings is copied only when its SETTING_* bit is clear in arg_settings_mask and the file
 * actually carries a value for it. Pointer-valued settings are moved: the previous arg_* value is freed, the
 * pointer is taken over, and the field in "settings" is reset to NULL.
 *
 * NOTE(review): this listing keeps the original line numbers as leading tokens and skips a number of
 * original lines (declarations, some conditions, else branches, the final return). Confirm the exact
 * control flow against the upstream file before relying on the notes below. */
2865 static int load_settings(void) {
2866 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
2867 _cleanup_fclose_
FILE *f
= NULL
;
2868 _cleanup_free_
char *p
= NULL
;
2872 /* If all settings are masked, there's no point in looking for
2873 * the settings file */
2874 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* NOTE(review): the statement controlled by the mask check above is not present in this listing. */
/* File name to look for: the machine name with the ".nspawn" suffix. */
2877 fn
= strjoina(arg_machine
, ".nspawn");
2879 /* We first look in the admin's directories in /etc and /run */
2880 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2881 _cleanup_free_
char *j
= NULL
;
/* j is the candidate path "<dir>/<fn>". */
2883 j
= strjoin(i
, "/", fn
, NULL
);
2892 /* By default, we trust configuration from /etc and /run */
/* Only a still-negative arg_settings_trusted is changed here (presumably meaning "not set explicitly"). */
2893 if (arg_settings_trusted
< 0)
2894 arg_settings_trusted
= true;
/* A missing file (ENOENT) is not an error; anything else is reported. */
2899 if (errno
!= ENOENT
)
2900 return log_error_errno(errno
, "Failed to open %s: %m", j
);
2904 /* After that, let's look for a file next to the
2905 * actual image we shall boot. */
2908 p
= file_in_same_dir(arg_image
, fn
);
2911 } else if (arg_directory
) {
2912 p
= file_in_same_dir(arg_directory
, fn
);
2919 if (!f
&& errno
!= ENOENT
)
2920 return log_error_errno(errno
, "Failed to open %s: %m", p
);
2922 /* By default, we do not trust configuration from /var/lib/machines */
2923 if (arg_settings_trusted
< 0)
2924 arg_settings_trusted
= false;
2931 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
/* Parse the opened file f (path p, used for messages) into a newly allocated Settings object. */
2933 r
= settings_load(f
, p
, &settings
);
2937 /* Copy over bits from the settings, unless they have been
2938 * explicitly masked by command line switches. */
/* Start mode together with the parameter list that belongs to it. */
2940 if ((arg_settings_mask
& SETTING_START_MODE
) == 0 &&
2941 settings
->start_mode
>= 0) {
2942 arg_start_mode
= settings
->start_mode
;
2944 strv_free(arg_parameters
);
2945 arg_parameters
= settings
->parameters
;
2946 settings
->parameters
= NULL
;
/* Working directory: the string is taken over by arg_chdir. */
2949 if ((arg_settings_mask
& SETTING_WORKING_DIRECTORY
) == 0 &&
2950 settings
->working_directory
) {
2952 arg_chdir
= settings
->working_directory
;
2953 settings
->working_directory
= NULL
;
/* Environment block: replaces arg_setenv. */
2956 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
2957 settings
->environment
) {
2958 strv_free(arg_setenv
);
2959 arg_setenv
= settings
->environment
;
2960 settings
->environment
= NULL
;
/* User to run as. NOTE(review): the second half of this condition is not present in this listing. */
2963 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
2966 arg_user
= settings
->user
;
2967 settings
->user
= NULL
;
/* Capabilities: "plus" collects the capabilities the file asks to add; private networking adds
 * CAP_NET_ADMIN to that set. For an untrusted file only a warning is visible here. Dropped capabilities
 * are removed from arg_retain.
 * NOTE(review): how "plus" is applied to arg_retain is not visible in this listing. */
2970 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
2973 plus
= settings
->capability
;
2974 if (settings_private_network(settings
))
2975 plus
|= (1ULL << CAP_NET_ADMIN
);
2977 if (!arg_settings_trusted
&& plus
!= 0) {
2978 if (settings
->capability
!= 0)
2979 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
2983 arg_retain
&= ~settings
->drop_capability
;
/* Kill signal: only a positive value overrides the current one. */
2986 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
2987 settings
->kill_signal
> 0)
2988 arg_kill_signal
= settings
->kill_signal
;
/* Personality: copied unless the file left it at PERSONALITY_INVALID. */
2990 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
2991 settings
->personality
!= PERSONALITY_INVALID
)
2992 arg_personality
= settings
->personality
;
/* Machine ID: considered only when the file carries a non-null ID.
 * NOTE(review): the warning text says the setting is ignored for untrusted files, so the assignment below
 * is presumably in an else branch that is missing from this listing. The same applies to the custom
 * mounts, network and port blocks further down. */
2994 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
2995 !sd_id128_is_null(settings
->machine_id
)) {
2997 if (!arg_settings_trusted
)
2998 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
3000 arg_uuid
= settings
->machine_id
;
/* Read-only flag: a negative value in the file leaves the current value alone. */
3003 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
3004 settings
->read_only
>= 0)
3005 arg_read_only
= settings
->read_only
;
/* Volatile mode: copied unless the file left it at _VOLATILE_MODE_INVALID. */
3007 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
3008 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
3009 arg_volatile_mode
= settings
->volatile_mode
;
/* Custom mounts: the existing array is freed, then both the array and its element count are taken over. */
3011 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
3012 settings
->n_custom_mounts
> 0) {
3014 if (!arg_settings_trusted
)
3015 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
3017 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3018 arg_custom_mounts
= settings
->custom_mounts
;
3019 arg_n_custom_mounts
= settings
->n_custom_mounts
;
3021 settings
->custom_mounts
= NULL
;
3022 settings
->n_custom_mounts
= 0;
/* Network: this block is entered as soon as any one network-related field is set in the file; it then
 * replaces all of the network arg_* values together. */
3026 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
3027 (settings
->private_network
>= 0 ||
3028 settings
->network_veth
>= 0 ||
3029 settings
->network_bridge
||
3030 settings
->network_interfaces
||
3031 settings
->network_macvlan
||
3032 settings
->network_ipvlan
||
3033 settings
->network_veth_extra
)) {
3035 if (!arg_settings_trusted
)
3036 log_warning("Ignoring network settings, file %s is not trusted.", p
);
/* The two booleans come from helper functions rather than from the raw fields. */
3038 arg_network_veth
= settings_network_veth(settings
);
3039 arg_private_network
= settings_private_network(settings
);
3041 strv_free(arg_network_interfaces
);
3042 arg_network_interfaces
= settings
->network_interfaces
;
3043 settings
->network_interfaces
= NULL
;
3045 strv_free(arg_network_macvlan
);
3046 arg_network_macvlan
= settings
->network_macvlan
;
3047 settings
->network_macvlan
= NULL
;
3049 strv_free(arg_network_ipvlan
);
3050 arg_network_ipvlan
= settings
->network_ipvlan
;
3051 settings
->network_ipvlan
= NULL
;
3053 strv_free(arg_network_veth_extra
);
3054 arg_network_veth_extra
= settings
->network_veth_extra
;
3055 settings
->network_veth_extra
= NULL
;
3057 free(arg_network_bridge
);
3058 arg_network_bridge
= settings
->network_bridge
;
3059 settings
->network_bridge
= NULL
;
/* Exposed ports: the existing list is freed and replaced by the one from the file. */
3063 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3064 settings
->expose_ports
) {
3066 if (!arg_settings_trusted
)
3067 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3069 expose_port_free_all(arg_expose_ports
);
3070 arg_expose_ports
= settings
->expose_ports
;
3071 settings
->expose_ports
= NULL
;
/* main(): entry point of the container spawner.
 *
 * Visible sequence:
 *   1. Parse the command line, require effective UID 0, then call determine_names(), load_settings() and
 *      verify_arguments().
 *   2. Collect passed-in file descriptors into "fds".
 *   3. Prepare the root: either a directory (optionally an ephemeral snapshot, or populated from
 *      arg_template), or an image that is locked, set up and dissected into root/home/srv devices.
 *   4. Allocate and unlock a pseudo terminal and block SIGCHLD, SIGWINCH, SIGTERM and SIGINT.
 *   5. Per container run: create the barrier and four socket pairs, clone the outer child, read the inner
 *      child's PID and the UID shift from the sockets, write the UID map, set up networking, register the
 *      machine, adjust cgroups, then forward the pty in an event loop and wait for the container.
 *   6. Clean up: loop device, snapshot subvolume, propagation directory, exposed ports and the arg_* values.
 *
 * Returns EXIT_FAILURE when r is negative at the end, otherwise "ret".
 *
 * NOTE(review): this listing keeps the original line numbers as leading tokens and skips many original
 * lines (error checks, goto/else lines, some declarations, closing braces). The notes below describe only
 * what is visible; confirm the control flow against the upstream file. */
3078 int main(int argc
, char *argv
[]) {
3080 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3081 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3082 _cleanup_close_
int master
= -1, image_fd
= -1;
3083 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3084 int r
, n_fd_passed
, loop_nr
= -1;
3085 char veth_name
[IFNAMSIZ
];
3086 bool secondary
= false, remove_subvol
= false;
3089 int ret
= EXIT_SUCCESS
;
3090 union in_addr_union exposed
= {};
3091 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3094 log_parse_environment();
3097 /* Make sure rename_process() in the stub init process can work */
3101 r
= parse_argv(argc
, argv
);
/* Everything below needs root privileges. */
3105 if (geteuid() != 0) {
3106 log_error("Need to be root.");
3110 r
= determine_names();
3114 r
= load_settings();
3118 r
= verify_arguments();
/* File descriptors handed to us via the sd_listen_fds() protocol are gathered in "fds", which is later
 * given to the child. */
3122 n_fd_passed
= sd_listen_fds(false);
3123 if (n_fd_passed
> 0) {
3124 r
= fdset_new_listen_fds(&fds
, false);
3126 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory mode. */
3131 if (arg_directory
) {
/* Running directly on "/" is refused unless an ephemeral snapshot is used. */
3134 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3135 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
/* Ephemeral mode: snapshot arg_directory under a random "machine." name, lock that snapshot, and set
 * remove_subvol so the snapshot is removed again during cleanup. */
3140 if (arg_ephemeral
) {
3141 _cleanup_free_
char *np
= NULL
;
3143 /* If the specified path is a mount point we
3144 * generate the new snapshot immediately
3145 * inside it under a random name. However if
3146 * the specified path is not a mount point we
3147 * create the new snapshot in the parent
3148 * directory, just next to it. */
3149 r
= path_is_mount_point(arg_directory
, 0);
3151 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
/* NOTE(review): the condition choosing between the two name generators below is missing here. */
3155 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3157 r
= tempfn_random(arg_directory
, "machine.", &np
);
3159 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* Shared lock for read-only use, exclusive lock otherwise; LOCK_NB is always added. */
3163 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3165 log_error_errno(r
, "Failed to lock %s: %m", np
);
3169 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3171 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3175 free(arg_directory
);
3179 remove_subvol
= true;
/* Non-ephemeral directory: lock the tree itself. */
3182 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3184 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3188 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate arg_directory from arg_template with the same snapshot flags as above. Judging by the
 * messages, an already existing directory is only reported, not treated as a failure. */
3193 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3196 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3198 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3202 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity checks on the tree: booting requires path_is_os_tree() to succeed; the second check visible
 * here only requires that "<dir>/usr/" is accessible. */
3207 if (arg_start_mode
== START_BOOT
) {
3208 if (path_is_os_tree(arg_directory
) <= 0) {
3209 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3216 p
= strjoina(arg_directory
, "/usr/");
3217 if (laccess(p
, F_OK
) < 0) {
3218 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
/* Image mode: lock the image, create a temporary directory that becomes arg_directory, then set up and
 * dissect the image. */
3225 char template[] = "/tmp/nspawn-root-XXXXXX";
3228 assert(!arg_template
);
3230 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3232 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3236 r
= log_error_errno(r
, "Failed to create image lock: %m");
3240 if (!mkdtemp(template)) {
3241 log_error_errno(errno
, "Failed to create temporary directory: %m");
3246 arg_directory
= strdup(template);
3247 if (!arg_directory
) {
/* setup_image() fills in device_path and loop_nr; loop_nr is handed to loop_remove() during cleanup. */
3252 image_fd
= setup_image(&device_path
, &loop_nr
);
3258 r
= dissect_image(image_fd
,
3259 &root_device
, &root_device_rw
,
3260 &home_device
, &home_device_rw
,
3261 &srv_device
, &srv_device_rw
,
3267 r
= custom_mounts_prepare();
/* NOTE(review): the left-hand side of this expression is missing; judging by the later use of
 * "interactive" it presumably assigns that flag — true only when stdin and stdout are both terminals. */
3272 isatty(STDIN_FILENO
) > 0 &&
3273 isatty(STDOUT_FILENO
) > 0;
/* Allocate the pty master, look up the name of its slave side (console) and unlock it. */
3275 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3277 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3281 r
= ptsname_malloc(master
, &console
);
3283 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3287 if (unlockpt(master
) < 0) {
3288 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3293 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3294 arg_machine
, arg_image
?: arg_directory
);
/* Block the signals that are handled through the event loop later on. */
3296 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
/* mask_chld contains only SIGCHLD; it is used below to unblock and re-block that one signal. */
3298 assert_se(sigemptyset(&mask_chld
) == 0);
3299 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3301 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3302 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* State of one container run. The socket pairs carry, respectively: the kmsg and rtnl sockets handed to
 * the child code, the inner child's PID, and the UID shift. Index 0 is kept by the parent, index 1 by
 * the child (see the close calls on either side of the clone below). */
3307 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 }, uid_shift_socket_pair
[2] = { -1, -1 };
3308 ContainerStatus container_status
;
3309 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3310 static const struct sigaction sa
= {
3311 .sa_handler
= nop_signal_handler
,
3312 .sa_flags
= SA_NOCLDSTOP
,
3316 _cleanup_(sd_event_unrefp
) sd_event
*event
= NULL
;
3317 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3318 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
3321 r
= barrier_create(&barrier
);
3323 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3327 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3328 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3332 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3333 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3337 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3338 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3343 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3344 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3348 /* Child can be killed before execv(), so handle SIGCHLD
3349 * in order to interrupt parent's blocking calls and
3350 * give it a chance to call wait() and terminate. */
3351 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3353 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3357 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3359 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* First clone: the outer child gets only a new mount namespace (CLONE_NEWNS). */
3363 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3365 if (errno
== EINVAL
)
3366 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3368 r
= log_error_errno(errno
, "clone() failed: %m");
3374 /* The outer child only has a file system namespace. */
3375 barrier_set_role(&barrier
, BARRIER_CHILD
);
/* Child side: drop the pty master and the parent's ends of the socket pairs, and restore default signal
 * handling before running outer_child(). */
3377 master
= safe_close(master
);
3379 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3380 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3381 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3382 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3384 (void) reset_all_signal_handlers();
3385 (void) reset_signal_mask();
3387 r
= outer_child(&barrier
,
3390 root_device
, root_device_rw
,
3391 home_device
, home_device_rw
,
3392 srv_device
, srv_device_rw
,
3396 kmsg_socket_pair
[1],
3397 rtnl_socket_pair
[1],
3398 uid_shift_socket_pair
[1],
/* The child never returns into the parent's code path; it always leaves through _exit(). */
3401 _exit(EXIT_FAILURE
);
3403 _exit(EXIT_SUCCESS
);
/* Parent side: take the parent role on the barrier, release the fd set and close the child's ends of
 * the socket pairs. */
3406 barrier_set_role(&barrier
, BARRIER_PARENT
);
3408 fds
= fdset_free(fds
);
3410 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3411 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3412 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3413 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3415 /* Wait for the outer child. */
3416 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3425 /* And now retrieve the PID of the inner child. */
/* From here on "pid" refers to the inner child; a read of any other size than sizeof(pid) is an error. */
3426 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3428 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3431 if (l
!= sizeof(pid
)) {
3432 log_error("Short read while reading inner child PID.");
3437 log_debug("Init process invoked as PID " PID_FMT
, pid
);
/* Barrier points are numbered #1 to #4 in the trailing comments; this side places them in that order. */
3440 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3441 log_error("Child died too early.");
/* The UID shift arrives from the child over its own socket pair and is stored in arg_uid_shift before
 * the UID map is written for the inner child. */
3446 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3448 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3451 if (l
!= sizeof(arg_uid_shift
)) {
3452 log_error("Short read while reading UID shift.");
3457 r
= setup_uid_map(pid
);
3461 (void) barrier_place(&barrier
); /* #2 */
/* Network setup for the inner child: move existing interfaces, create the veth link (optionally
 * attached to a bridge), then extra veth, macvlan and ipvlan links. */
3464 if (arg_private_network
) {
3466 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3470 if (arg_network_veth
) {
3471 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3477 if (arg_network_bridge
) {
3478 r
= setup_bridge(veth_name
, arg_network_bridge
);
3486 r
= setup_veth_extra(arg_machine
, pid
, arg_network_veth_extra
);
3490 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3494 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
/* NOTE(review): most arguments of register_machine() are missing from this listing. */
3500 r
= register_machine(
3507 arg_custom_mounts
, arg_n_custom_mounts
,
3511 arg_container_service_name
);
/* cgroup handling for the inner child. */
3516 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3520 if (arg_keep_unit
) {
3521 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3526 r
= chown_cgroup(pid
, arg_uid_shift
);
3530 /* Notify the child that the parent is ready with all
3531 * its setup (including cgroup-ification), and that
3532 * the child can now hand over control to the code to
3533 * run inside the container. */
3534 (void) barrier_place(&barrier
); /* #3 */
3536 /* Block SIGCHLD here, before notifying child.
3537 * process_pty() will handle it with the other signals. */
3538 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3540 /* Reset signal to default */
3541 r
= default_signals(SIGCHLD
, -1);
3543 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3547 /* Let the child know that we are ready and wait that the child is completely ready now. */
3548 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3549 log_error("Child died too early.");
/* NOTE(review): the call these two format strings belong to is missing from this listing. */
3556 "STATUS=Container running.\n"
3557 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
3559 r
= sd_event_new(&event
);
3561 log_error_errno(r
, "Failed to get default event source: %m");
/* With a kill signal configured, SIGINT and SIGTERM go to on_orderly_shutdown() with the child PID as
 * userdata. The handler-less registrations that follow are presumably the else branch, which is missing
 * from this listing. */
3565 if (arg_kill_signal
> 0) {
3566 /* Try to kill the init system on SIGINT or SIGTERM */
3567 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3568 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3570 /* Immediately exit */
3571 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3572 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3575 /* simply exit on sigchld */
3576 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
/* Port exposure: watch address changes through the rtnl socket received from the child, and apply the
 * port configuration once right away. */
3578 if (arg_expose_ports
) {
3579 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3583 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3586 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* Forward the pty master through the event loop; without an interactive terminal the forwarder is
 * created with PTY_FORWARD_READ_ONLY. */
3588 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
3590 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3594 r
= sd_event_loop(event
);
3596 log_error_errno(r
, "Failed to run event loop: %m");
/* Remember the last forwarded character; it is compared against a newline below when not in quiet mode. */
3600 pty_forward_get_last_char(forward
, &last_char
);
3602 forward
= pty_forward_free(forward
);
3604 if (!arg_quiet
&& last_char
!= '\n')
3607 /* Kill if it is not dead yet anyway */
3608 if (arg_register
&& !arg_keep_unit
)
3609 terminate_machine(pid
);
3611 /* Normally redundant, but better safe than sorry */
3614 r
= wait_for_container(pid
, &container_status
);
3618 /* We failed to wait for the container, or the
3619 * container exited abnormally */
3621 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
3622 /* The container exited with a non-zero
3623 * status, or with zero status and no reboot
3629 /* CONTAINER_REBOOTED, loop again */
3631 if (arg_keep_unit
) {
3632 /* Special handling if we are running as a
3633 * service: instead of simply restarting the
3634 * machine we want to restart the entire
3635 * service, so let's inform systemd about this
3636 * with the special exit code 133. The service
3637 * file uses RestartForceExitStatus=133 so
3638 * that this results in a full nspawn
3639 * restart. This is necessary since we might
3640 * have cgroup parameters set we want to have
3647 expose_port_flush(arg_expose_ports
, &exposed
);
3653 "STATUS=Terminating...");
3658 /* Try to flush whatever is still queued in the pty */
3660 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
/* Cleanup: release the loop device, remove the ephemeral snapshot if one was created (failure there is
 * only a warning), delete the per-machine propagation directory and flush exposed ports. */
3662 loop_remove(loop_nr
, &image_fd
);
3664 if (remove_subvol
&& arg_directory
) {
3667 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
3669 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
3675 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3676 (void) rm_rf(p
, REMOVE_ROOT
);
3679 expose_port_flush(arg_expose_ports
, &exposed
);
/* Free the values held in the arg_* globals. */
3681 free(arg_directory
);
3687 strv_free(arg_setenv
);
3688 free(arg_network_bridge
);
3689 strv_free(arg_network_interfaces
);
3690 strv_free(arg_network_macvlan
);
3691 strv_free(arg_network_ipvlan
);
3692 strv_free(arg_network_veth_extra
);
3693 strv_free(arg_parameters
);
3694 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3695 expose_port_free_all(arg_expose_ports
);
/* A negative r always yields EXIT_FAILURE; otherwise "ret" is the exit status. */
3697 return r
< 0 ? EXIT_FAILURE
: ret
;