2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21 #include <blkid/blkid.h>
26 #include <linux/loop.h>
30 #include <selinux/selinux.h>
37 #include <sys/mount.h>
38 #include <sys/personality.h>
39 #include <sys/prctl.h>
40 #include <sys/types.h>
43 #include "sd-daemon.h"
46 #include "alloc-util.h"
48 #include "base-filesystem.h"
49 #include "blkid-util.h"
50 #include "btrfs-util.h"
52 #include "capability-util.h"
53 #include "cgroup-util.h"
55 #include "dev-setup.h"
60 #include "formats-util.h"
63 #include "hostname-util.h"
65 #include "loopback-setup.h"
66 #include "machine-id-setup.h"
67 #include "machine-image.h"
71 #include "mount-util.h"
72 #include "netlink-util.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-expose-ports.h"
75 #include "nspawn-mount.h"
76 #include "nspawn-network.h"
77 #include "nspawn-patch-uid.h"
78 #include "nspawn-register.h"
79 #include "nspawn-settings.h"
80 #include "nspawn-setuid.h"
81 #include "nspawn-stub-pid1.h"
82 #include "nspawn-seccomp.h"
83 #include "parse-util.h"
84 #include "path-util.h"
85 #include "process-util.h"
87 #include "random-util.h"
88 #include "raw-clone.h"
90 #include "selinux-util.h"
91 #include "signal-util.h"
92 #include "socket-util.h"
93 #include "stat-util.h"
94 #include "stdio-util.h"
95 #include "string-util.h"
97 #include "terminal-util.h"
98 #include "udev-util.h"
99 #include "umask-util.h"
100 #include "user-util.h"
103 /* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
104 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
105 * may have their own allocation ranges too. */
106 #define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
107 #define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
109 /* nspawn listens on the socket whose path is given by the constant NSPAWN_NOTIFY_SOCKET_PATH.
110 * NSPAWN_NOTIFY_SOCKET_PATH is relative to the container.
111 * The init process in the container can send messages to nspawn over it, following the sd_notify(3) protocol. */
112 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
114 typedef enum ContainerStatus
{
115 CONTAINER_TERMINATED
,
119 typedef enum LinkJournal
{
126 static char *arg_directory
= NULL
;
127 static char *arg_template
= NULL
;
128 static char *arg_chdir
= NULL
;
129 static char *arg_user
= NULL
;
130 static sd_id128_t arg_uuid
= {};
131 static char *arg_machine
= NULL
;
132 static const char *arg_selinux_context
= NULL
;
133 static const char *arg_selinux_apifs_context
= NULL
;
134 static const char *arg_slice
= NULL
;
135 static bool arg_private_network
= false;
136 static bool arg_read_only
= false;
137 static StartMode arg_start_mode
= START_PID1
;
138 static bool arg_ephemeral
= false;
139 static LinkJournal arg_link_journal
= LINK_AUTO
;
140 static bool arg_link_journal_try
= false;
141 static uint64_t arg_caps_retain
=
142 (1ULL << CAP_AUDIT_CONTROL
) |
143 (1ULL << CAP_AUDIT_WRITE
) |
144 (1ULL << CAP_CHOWN
) |
145 (1ULL << CAP_DAC_OVERRIDE
) |
146 (1ULL << CAP_DAC_READ_SEARCH
) |
147 (1ULL << CAP_FOWNER
) |
148 (1ULL << CAP_FSETID
) |
149 (1ULL << CAP_IPC_OWNER
) |
151 (1ULL << CAP_LEASE
) |
152 (1ULL << CAP_LINUX_IMMUTABLE
) |
153 (1ULL << CAP_MKNOD
) |
154 (1ULL << CAP_NET_BIND_SERVICE
) |
155 (1ULL << CAP_NET_BROADCAST
) |
156 (1ULL << CAP_NET_RAW
) |
157 (1ULL << CAP_SETFCAP
) |
158 (1ULL << CAP_SETGID
) |
159 (1ULL << CAP_SETPCAP
) |
160 (1ULL << CAP_SETUID
) |
161 (1ULL << CAP_SYS_ADMIN
) |
162 (1ULL << CAP_SYS_BOOT
) |
163 (1ULL << CAP_SYS_CHROOT
) |
164 (1ULL << CAP_SYS_NICE
) |
165 (1ULL << CAP_SYS_PTRACE
) |
166 (1ULL << CAP_SYS_RESOURCE
) |
167 (1ULL << CAP_SYS_TTY_CONFIG
);
168 static CustomMount
*arg_custom_mounts
= NULL
;
169 static unsigned arg_n_custom_mounts
= 0;
170 static char **arg_setenv
= NULL
;
171 static bool arg_quiet
= false;
172 static bool arg_share_system
= false;
173 static bool arg_register
= true;
174 static bool arg_keep_unit
= false;
175 static char **arg_network_interfaces
= NULL
;
176 static char **arg_network_macvlan
= NULL
;
177 static char **arg_network_ipvlan
= NULL
;
178 static bool arg_network_veth
= false;
179 static char **arg_network_veth_extra
= NULL
;
180 static char *arg_network_bridge
= NULL
;
181 static char *arg_network_zone
= NULL
;
182 static unsigned long arg_personality
= PERSONALITY_INVALID
;
183 static char *arg_image
= NULL
;
184 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
185 static ExposePort
*arg_expose_ports
= NULL
;
186 static char **arg_property
= NULL
;
187 static UserNamespaceMode arg_userns_mode
= USER_NAMESPACE_NO
;
188 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
189 static bool arg_userns_chown
= false;
190 static int arg_kill_signal
= 0;
191 static bool arg_unified_cgroup_hierarchy
= false;
192 static SettingsMask arg_settings_mask
= 0;
193 static int arg_settings_trusted
= -1;
194 static char **arg_parameters
= NULL
;
195 static const char *arg_container_service_name
= "systemd-nspawn";
196 static bool arg_notify_ready
= false;
/* Prints the usage summary to standard output. The leading "%s" is filled in with
 * program_invocation_short_name. The option spellings listed here have to agree with the long
 * option names accepted by the command-line parser ("private-users-chown", "bind-ro", ...). */
198 static void help(void) {
199 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
200 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
201 " -h --help Show this help\n"
202 " --version Print version string\n"
203 " -q --quiet Do not show status information\n"
204 " -D --directory=PATH Root directory for the container\n"
205 " --template=PATH Initialize root directory from template directory,\n"
207 " -x --ephemeral Run container with snapshot of root directory, and\n"
208 " remove it after exit\n"
209 " -i --image=PATH File system device or disk image for the container\n"
210 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
211 " -b --boot Boot up full system (i.e. invoke init)\n"
212 " --chdir=PATH Set working directory in the container\n"
213 " -u --user=USER Run the command under specified user or uid\n"
214 " -M --machine=NAME Set the machine name for the container\n"
215 " --uuid=UUID Set a specific machine UUID for the container\n"
216 " -S --slice=SLICE Place the container in the specified slice\n"
217 " --property=NAME=VALUE Set scope unit property\n"
218 " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n"
219 " --private-users[=UIDBASE[:NUIDS]]\n"
220 " Run within user namespace, user configured UID/GID range\n"
221 " --private-users-chown Adjust OS tree file ownership for private UID/GID range\n"
222 " --private-network Disable network in container\n"
223 " --network-interface=INTERFACE\n"
224 " Assign an existing network interface to the\n"
226 " --network-macvlan=INTERFACE\n"
227 " Create a macvlan network interface based on an\n"
228 " existing network interface to the container\n"
229 " --network-ipvlan=INTERFACE\n"
230 " Create a ipvlan network interface based on an\n"
231 " existing network interface to the container\n"
232 " -n --network-veth Add a virtual Ethernet connection between host\n"
234 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
235 " Add an additional virtual Ethernet link between\n"
236 " host and container\n"
237 " --network-bridge=INTERFACE\n"
238 " Add a virtual Ethernet connection between host\n"
239 " and container and add it to an existing bridge on\n"
241 " --network-zone=NAME Add a virtual Ethernet connection to the container,\n"
242 " and add it to an automatically managed bridge interface\n"
243 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
244 " Expose a container IP port on the host\n"
245 " -Z --selinux-context=SECLABEL\n"
246 " Set the SELinux security context to be used by\n"
247 " processes in the container\n"
248 " -L --selinux-apifs-context=SECLABEL\n"
249 " Set the SELinux security context to be used by\n"
250 " API/tmpfs file systems in the container\n"
251 " --capability=CAP In addition to the default, retain specified\n"
253 " --drop-capability=CAP Drop the specified capability from the default set\n"
254 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
255 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
256 " host, try-guest, try-host\n"
257 " -j Equivalent to --link-journal=try-guest\n"
258 " --read-only Mount the root directory read-only\n"
259 " --bind=PATH[:PATH[:OPTIONS]]\n"
260 " Bind mount a file or directory from the host into\n"
262 " --bind-ro=PATH[:PATH[:OPTIONS]]\n"
263 " Similar, but creates a read-only bind mount\n"
264 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
265 " --overlay=PATH[:PATH...]:PATH\n"
266 " Create an overlay mount from the host to \n"
268 " --overlay-ro=PATH[:PATH...]:PATH\n"
269 " Similar, but creates a read-only overlay mount\n"
270 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
271 " --share-system Share system namespaces with host\n"
272 " --register=BOOLEAN Register container as machine\n"
273 " --keep-unit Do not register a scope for the machine, reuse\n"
274 " the service unit nspawn is running in\n"
275 " --volatile[=MODE] Run the system in volatile mode\n"
276 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
277 " --notify-ready=BOOLEAN Receive notifications from the container's init process,\n"
278 " accepted values: yes and no\n"
279 , program_invocation_short_name
);
/* Prepares the custom mounts collected in arg_custom_mounts before they are applied.
 *
 * As far as this excerpt shows, the function
 *   - sorts the array with custom_mount_compare (see the "prefix first" note below),
 *   - logs an error for a custom mount whose destination is "/" when user namespacing is
 *     enabled and either arg_userns_chown is set or arg_uid_shift is still UID_INVALID,
 *   - for overlay mounts, derives a randomized name from the mount source via tempfn_random()
 *     and stores it in m->work_dir; a failure there is reported through log_error_errno() and
 *     its result is returned.
 *
 * NOTE(review): the local declarations, the statements following the two log_error() calls and
 * the action taken for non-overlay mounts are not visible in this excerpt — confirm against the
 * full file. */
282 static int custom_mounts_prepare(void) {
286 /* Ensure the mounts are applied prefix first. */
287 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
289 /* Allocate working directories for the overlay file systems that need it */
290 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
291 CustomMount
*m
= &arg_custom_mounts
[i
];
293 if (path_equal(m
->destination
, "/") && arg_userns_mode
!= USER_NAMESPACE_NO
) {
295 if (arg_userns_chown
) {
296 log_error("--private-users-chown may not be combined with custom root mounts.");
298 } else if (arg_uid_shift
== UID_INVALID
) {
299 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
304 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
313 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
315 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Decides whether the unified cgroup hierarchy is used and records the answer in
 * arg_unified_cgroup_hierarchy.
 *
 * The environment variable $UNIFIED_CGROUP_HIERARCHY is read and parsed with parse_boolean();
 * the parsed value is stored in arg_unified_cgroup_hierarchy. Per the comment below, the host
 * system's default is used otherwise. Parse and detection failures are reported through
 * log_error_errno(), whose result is returned.
 *
 * NOTE(review): the conditions guarding each branch and the call that queries the host are not
 * visible in this excerpt. */
321 static int detect_unified_cgroup_hierarchy(void) {
325 /* Allow the user to control whether the unified hierarchy is used */
326 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
328 r
= parse_boolean(e
);
330 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
332 arg_unified_cgroup_hierarchy
= r
;
336 /* Otherwise inherit the default from the host system */
339 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
341 arg_unified_cgroup_hierarchy
= r
;
345 static int parse_argv(int argc
, char *argv
[]) {
363 ARG_NETWORK_INTERFACE
,
368 ARG_NETWORK_VETH_EXTRA
,
377 ARG_PRIVATE_USERS_CHOWN
,
381 static const struct option options
[] = {
382 { "help", no_argument
, NULL
, 'h' },
383 { "version", no_argument
, NULL
, ARG_VERSION
},
384 { "directory", required_argument
, NULL
, 'D' },
385 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
386 { "ephemeral", no_argument
, NULL
, 'x' },
387 { "user", required_argument
, NULL
, 'u' },
388 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
389 { "as-pid2", no_argument
, NULL
, 'a' },
390 { "boot", no_argument
, NULL
, 'b' },
391 { "uuid", required_argument
, NULL
, ARG_UUID
},
392 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
393 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
394 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
395 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
396 { "bind", required_argument
, NULL
, ARG_BIND
},
397 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
398 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
399 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
400 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
401 { "machine", required_argument
, NULL
, 'M' },
402 { "slice", required_argument
, NULL
, 'S' },
403 { "setenv", required_argument
, NULL
, 'E' },
404 { "selinux-context", required_argument
, NULL
, 'Z' },
405 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
406 { "quiet", no_argument
, NULL
, 'q' },
407 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
408 { "register", required_argument
, NULL
, ARG_REGISTER
},
409 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
410 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
411 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
412 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
413 { "network-veth", no_argument
, NULL
, 'n' },
414 { "network-veth-extra", required_argument
, NULL
, ARG_NETWORK_VETH_EXTRA
},
415 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
416 { "network-zone", required_argument
, NULL
, ARG_NETWORK_ZONE
},
417 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
418 { "image", required_argument
, NULL
, 'i' },
419 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
420 { "port", required_argument
, NULL
, 'p' },
421 { "property", required_argument
, NULL
, ARG_PROPERTY
},
422 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
423 { "private-users-chown", optional_argument
, NULL
, ARG_PRIVATE_USERS_CHOWN
},
424 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
425 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
426 { "chdir", required_argument
, NULL
, ARG_CHDIR
},
427 { "notify-ready", required_argument
, NULL
, ARG_NOTIFY_READY
},
433 uint64_t plus
= 0, minus
= 0;
434 bool mask_all_settings
= false, mask_no_settings
= false;
439 while ((c
= getopt_long(argc
, argv
, "+hD:u:abL:M:jS:Z:qi:xp:nU", options
, NULL
)) >= 0)
451 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
457 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
463 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
469 arg_ephemeral
= true;
473 r
= free_and_strdup(&arg_user
, optarg
);
477 arg_settings_mask
|= SETTING_USER
;
480 case ARG_NETWORK_ZONE
: {
483 j
= strappend("vz-", optarg
);
487 if (!ifname_valid(j
)) {
488 log_error("Network zone name not valid: %s", j
);
493 free(arg_network_zone
);
494 arg_network_zone
= j
;
496 arg_network_veth
= true;
497 arg_private_network
= true;
498 arg_settings_mask
|= SETTING_NETWORK
;
502 case ARG_NETWORK_BRIDGE
:
504 if (!ifname_valid(optarg
)) {
505 log_error("Bridge interface name not valid: %s", optarg
);
509 r
= free_and_strdup(&arg_network_bridge
, optarg
);
516 arg_network_veth
= true;
517 arg_private_network
= true;
518 arg_settings_mask
|= SETTING_NETWORK
;
521 case ARG_NETWORK_VETH_EXTRA
:
522 r
= veth_extra_parse(&arg_network_veth_extra
, optarg
);
524 return log_error_errno(r
, "Failed to parse --network-veth-extra= parameter: %s", optarg
);
526 arg_private_network
= true;
527 arg_settings_mask
|= SETTING_NETWORK
;
530 case ARG_NETWORK_INTERFACE
:
532 if (!ifname_valid(optarg
)) {
533 log_error("Network interface name not valid: %s", optarg
);
537 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
540 arg_private_network
= true;
541 arg_settings_mask
|= SETTING_NETWORK
;
544 case ARG_NETWORK_MACVLAN
:
546 if (!ifname_valid(optarg
)) {
547 log_error("MACVLAN network interface name not valid: %s", optarg
);
551 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
554 arg_private_network
= true;
555 arg_settings_mask
|= SETTING_NETWORK
;
558 case ARG_NETWORK_IPVLAN
:
560 if (!ifname_valid(optarg
)) {
561 log_error("IPVLAN network interface name not valid: %s", optarg
);
565 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
570 case ARG_PRIVATE_NETWORK
:
571 arg_private_network
= true;
572 arg_settings_mask
|= SETTING_NETWORK
;
576 if (arg_start_mode
== START_PID2
) {
577 log_error("--boot and --as-pid2 may not be combined.");
581 arg_start_mode
= START_BOOT
;
582 arg_settings_mask
|= SETTING_START_MODE
;
586 if (arg_start_mode
== START_BOOT
) {
587 log_error("--boot and --as-pid2 may not be combined.");
591 arg_start_mode
= START_PID2
;
592 arg_settings_mask
|= SETTING_START_MODE
;
596 r
= sd_id128_from_string(optarg
, &arg_uuid
);
598 log_error("Invalid UUID: %s", optarg
);
602 arg_settings_mask
|= SETTING_MACHINE_ID
;
611 arg_machine
= mfree(arg_machine
);
613 if (!machine_name_is_valid(optarg
)) {
614 log_error("Invalid machine name: %s", optarg
);
618 r
= free_and_strdup(&arg_machine
, optarg
);
626 arg_selinux_context
= optarg
;
630 arg_selinux_apifs_context
= optarg
;
634 arg_read_only
= true;
635 arg_settings_mask
|= SETTING_READ_ONLY
;
639 case ARG_DROP_CAPABILITY
: {
642 _cleanup_free_
char *t
= NULL
;
644 r
= extract_first_word(&p
, &t
, ",", 0);
646 return log_error_errno(r
, "Failed to parse capability %s.", t
);
651 if (streq(t
, "all")) {
652 if (c
== ARG_CAPABILITY
)
653 plus
= (uint64_t) -1;
655 minus
= (uint64_t) -1;
659 cap
= capability_from_name(t
);
661 log_error("Failed to parse capability %s.", t
);
665 if (c
== ARG_CAPABILITY
)
666 plus
|= 1ULL << (uint64_t) cap
;
668 minus
|= 1ULL << (uint64_t) cap
;
672 arg_settings_mask
|= SETTING_CAPABILITY
;
677 arg_link_journal
= LINK_GUEST
;
678 arg_link_journal_try
= true;
681 case ARG_LINK_JOURNAL
:
682 if (streq(optarg
, "auto")) {
683 arg_link_journal
= LINK_AUTO
;
684 arg_link_journal_try
= false;
685 } else if (streq(optarg
, "no")) {
686 arg_link_journal
= LINK_NO
;
687 arg_link_journal_try
= false;
688 } else if (streq(optarg
, "guest")) {
689 arg_link_journal
= LINK_GUEST
;
690 arg_link_journal_try
= false;
691 } else if (streq(optarg
, "host")) {
692 arg_link_journal
= LINK_HOST
;
693 arg_link_journal_try
= false;
694 } else if (streq(optarg
, "try-guest")) {
695 arg_link_journal
= LINK_GUEST
;
696 arg_link_journal_try
= true;
697 } else if (streq(optarg
, "try-host")) {
698 arg_link_journal
= LINK_HOST
;
699 arg_link_journal_try
= true;
701 log_error("Failed to parse link journal mode %s", optarg
);
709 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
711 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
713 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
717 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
719 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
721 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
725 case ARG_OVERLAY_RO
: {
726 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
727 _cleanup_strv_free_
char **lower
= NULL
;
732 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
736 log_error("Invalid overlay specification: %s", optarg
);
740 STRV_FOREACH(i
, lower
) {
741 if (!path_is_absolute(*i
)) {
742 log_error("Overlay path %s is not absolute.", *i
);
750 log_error("--overlay= needs at least two colon-separated directories specified.");
755 /* If two parameters are specified,
756 * the first one is the lower, the
757 * second one the upper directory. And
758 * we'll also define the destination
759 * mount point the same as the upper. */
763 destination
= strdup(upper
);
768 upper
= lower
[n
- 2];
769 destination
= lower
[n
- 1];
773 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
777 m
->destination
= destination
;
780 m
->read_only
= c
== ARG_OVERLAY_RO
;
782 upper
= destination
= NULL
;
785 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
792 if (!env_assignment_is_valid(optarg
)) {
793 log_error("Environment variable assignment '%s' is not valid.", optarg
);
797 n
= strv_env_set(arg_setenv
, optarg
);
801 strv_free(arg_setenv
);
804 arg_settings_mask
|= SETTING_ENVIRONMENT
;
812 case ARG_SHARE_SYSTEM
:
813 arg_share_system
= true;
817 r
= parse_boolean(optarg
);
819 log_error("Failed to parse --register= argument: %s", optarg
);
827 arg_keep_unit
= true;
830 case ARG_PERSONALITY
:
832 arg_personality
= personality_from_string(optarg
);
833 if (arg_personality
== PERSONALITY_INVALID
) {
834 log_error("Unknown or unsupported personality '%s'.", optarg
);
838 arg_settings_mask
|= SETTING_PERSONALITY
;
844 arg_volatile_mode
= VOLATILE_YES
;
848 m
= volatile_mode_from_string(optarg
);
850 log_error("Failed to parse --volatile= argument: %s", optarg
);
853 arg_volatile_mode
= m
;
856 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
860 r
= expose_port_parse(&arg_expose_ports
, optarg
);
862 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
864 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
866 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
870 if (strv_extend(&arg_property
, optarg
) < 0)
875 case ARG_PRIVATE_USERS
:
877 r
= optarg
? parse_boolean(optarg
) : 1;
879 /* no: User namespacing off */
880 arg_userns_mode
= USER_NAMESPACE_NO
;
881 arg_uid_shift
= UID_INVALID
;
882 arg_uid_range
= UINT32_C(0x10000);
884 /* yes: User namespacing on, UID range is read from root dir */
885 arg_userns_mode
= USER_NAMESPACE_FIXED
;
886 arg_uid_shift
= UID_INVALID
;
887 arg_uid_range
= UINT32_C(0x10000);
888 } else if (streq(optarg
, "pick")) {
889 /* pick: User namespacing on, UID range is picked randomly */
890 arg_userns_mode
= USER_NAMESPACE_PICK
;
891 arg_uid_shift
= UID_INVALID
;
892 arg_uid_range
= UINT32_C(0x10000);
894 _cleanup_free_
char *buffer
= NULL
;
895 const char *range
, *shift
;
897 /* anything else: User namespacing on, UID range is explicitly configured */
899 range
= strchr(optarg
, ':');
901 buffer
= strndup(optarg
, range
- optarg
);
907 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
908 log_error("Failed to parse UID range: %s", range
);
914 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
915 log_error("Failed to parse UID: %s", optarg
);
919 arg_userns_mode
= USER_NAMESPACE_FIXED
;
922 arg_settings_mask
|= SETTING_USERNS
;
926 if (userns_supported()) {
927 arg_userns_mode
= USER_NAMESPACE_PICK
;
928 arg_uid_shift
= UID_INVALID
;
929 arg_uid_range
= UINT32_C(0x10000);
931 arg_settings_mask
|= SETTING_USERNS
;
936 case ARG_PRIVATE_USERS_CHOWN
:
937 arg_userns_chown
= true;
939 arg_settings_mask
|= SETTING_USERNS
;
942 case ARG_KILL_SIGNAL
:
943 arg_kill_signal
= signal_from_string_try_harder(optarg
);
944 if (arg_kill_signal
< 0) {
945 log_error("Cannot parse signal: %s", optarg
);
949 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
954 /* no → do not read files
955 * yes → read files, do not override cmdline, trust only subset
956 * override → read files, override cmdline, trust only subset
957 * trusted → read files, do not override cmdline, trust all
960 r
= parse_boolean(optarg
);
962 if (streq(optarg
, "trusted")) {
963 mask_all_settings
= false;
964 mask_no_settings
= false;
965 arg_settings_trusted
= true;
967 } else if (streq(optarg
, "override")) {
968 mask_all_settings
= false;
969 mask_no_settings
= true;
970 arg_settings_trusted
= -1;
972 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
975 mask_all_settings
= false;
976 mask_no_settings
= false;
977 arg_settings_trusted
= -1;
980 mask_all_settings
= true;
981 mask_no_settings
= false;
982 arg_settings_trusted
= false;
988 if (!path_is_absolute(optarg
)) {
989 log_error("Working directory %s is not an absolute path.", optarg
);
993 r
= free_and_strdup(&arg_chdir
, optarg
);
997 arg_settings_mask
|= SETTING_WORKING_DIRECTORY
;
1000 case ARG_NOTIFY_READY
:
1001 r
= parse_boolean(optarg
);
1003 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg
);
1006 arg_notify_ready
= r
;
1007 arg_settings_mask
|= SETTING_NOTIFY_READY
;
1014 assert_not_reached("Unhandled option");
1017 if (arg_share_system
)
1018 arg_register
= false;
1020 if (arg_userns_mode
== USER_NAMESPACE_PICK
)
1021 arg_userns_chown
= true;
1023 if (arg_start_mode
!= START_PID1
&& arg_share_system
) {
1024 log_error("--boot and --share-system may not be combined.");
1028 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
1029 log_error("--keep-unit may not be used when invoked from a user session.");
1033 if (arg_directory
&& arg_image
) {
1034 log_error("--directory= and --image= may not be combined.");
1038 if (arg_template
&& arg_image
) {
1039 log_error("--template= and --image= may not be combined.");
1043 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
1044 log_error("--template= needs --directory= or --machine=.");
1048 if (arg_ephemeral
&& arg_template
) {
1049 log_error("--ephemeral and --template= may not be combined.");
1053 if (arg_ephemeral
&& arg_image
) {
1054 log_error("--ephemeral and --image= may not be combined.");
1058 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
1059 log_error("--ephemeral and --link-journal= may not be combined.");
1063 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& !userns_supported()) {
1064 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1068 if (arg_userns_chown
&& arg_read_only
) {
1069 log_error("--read-only and --private-users-chown may not be combined.");
1073 if (arg_network_bridge
&& arg_network_zone
) {
1074 log_error("--network-bridge= and --network-zone= may not be combined.");
1078 if (argc
> optind
) {
1079 arg_parameters
= strv_copy(argv
+ optind
);
1080 if (!arg_parameters
)
1083 arg_settings_mask
|= SETTING_START_MODE
;
1086 /* Load all settings from .nspawn files */
1087 if (mask_no_settings
)
1088 arg_settings_mask
= 0;
1090 /* Don't load any settings from .nspawn files */
1091 if (mask_all_settings
)
1092 arg_settings_mask
= _SETTINGS_MASK_ALL
;
1094 arg_caps_retain
= (arg_caps_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
1096 r
= detect_unified_cgroup_hierarchy();
1100 e
= getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1102 arg_container_service_name
= e
;
/* Checks combinations of the arg_* settings that cannot work together and fills in one default.
 *
 * Visible checks:
 *   - a volatile mode other than VOLATILE_NO together with arg_read_only logs an error,
 *   - exposed ports (arg_expose_ports) without arg_private_network logs an error,
 *   - when built without HAVE_LIBIPTC, any exposed port logs an error,
 *   - when arg_start_mode is START_BOOT and arg_kill_signal is <= 0, the kill signal is set to
 *     SIGRTMIN+3.
 *
 * NOTE(review): the return statements after each log_error() are not visible in this excerpt. */
1107 static int verify_arguments(void) {
1109 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
1110 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1114 if (arg_expose_ports
&& !arg_private_network
) {
1115 log_error("Cannot use --port= without private networking.");
1119 #ifndef HAVE_LIBIPTC
1120 if (arg_expose_ports
) {
1121 log_error("--port= is not supported, compiled without libiptc support.");
1126 if (arg_start_mode
== START_BOOT
&& arg_kill_signal
<= 0)
1127 arg_kill_signal
= SIGRTMIN
+3;
/* Changes ownership of p (without following a final symlink, as lchown() is used) to the given
 * uid/gid after shifting them by arg_uid_shift.
 *
 * p:   path to change ownership of.
 * uid: UID to apply, or UID_INVALID to leave it out of the shift.
 * gid: GID to apply, or GID_INVALID to leave it out of the shift.
 *
 * Visible logic: there are dedicated checks for arg_userns_mode == USER_NAMESPACE_NO and for
 * both IDs being invalid. Each valid ID has arg_uid_shift added and is then tested against the
 * window [arg_uid_shift, arg_uid_shift + arg_uid_range); the "< arg_uid_shift" half of that test
 * catches wrap-around of the unsigned addition. Finally lchown() is called with the shifted IDs.
 *
 * NOTE(review): the statements executed when a check matches, and the return values, are not
 * visible in this excerpt. Callers in this file pass the result to log_*_errno(), so it is
 * presumably a negative errno-style code on failure — confirm. */
1132 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
1135 if (arg_userns_mode
== USER_NAMESPACE_NO
)
1138 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1141 if (uid
!= UID_INVALID
) {
1142 uid
+= arg_uid_shift
;
1144 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1148 if (gid
!= GID_INVALID
) {
1149 gid
+= (gid_t
) arg_uid_shift
;
1151 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1155 if (lchown(p
, uid
, gid
) < 0)
/* Creates the directory "path" below "root" with the given mode and then passes it to
 * userns_lchown() with the given uid/gid, returning that call's result.
 *
 * The full path is built with prefix_roota(root, path). A failing mkdir() is examined for
 * errno == EEXIST.
 *
 * NOTE(review): what happens for EEXIST and for other mkdir() errors is not visible in this
 * excerpt. */
1161 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
1164 q
= prefix_roota(root
, path
);
1165 if (mkdir(q
, mode
) < 0) {
1166 if (errno
== EEXIST
)
1171 return userns_lchown(q
, uid
, gid
);
/* Tries to make /etc/localtime inside the tree at "dest" refer to the same time zone as the
 * /etc/localtime symlink outside of it.
 *
 * Steps visible in this excerpt:
 *   - read the target of /etc/localtime into p and take the part after
 *     "../usr/share/zoneinfo/" or "/usr/share/zoneinfo/" as the zone name z; warnings exist for
 *     "not a symlink" and "does not point into /usr/share/zoneinfo/",
 *   - read the target of <dest>/etc/localtime into q and extract its zone name y the same way;
 *     y is compared with z ("already pointing to the right place"),
 *   - check with laccess() that <dest>/usr/share/zoneinfo/<z> exists, warning otherwise,
 *   - create the symlink <dest>/etc/localtime -> "../usr/share/zoneinfo/<z>",
 *   - chown the new symlink via userns_lchown(where, 0, 0); a failure is logged as a warning
 *     and its code returned.
 *
 * NOTE(review): the conditions around the warnings, the call that removes the existing
 * <dest>/etc/localtime before symlink(), and the return values of the non-fatal paths are not
 * visible in this excerpt. */
1174 static int setup_timezone(const char *dest
) {
1175 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1176 const char *where
, *check
, *what
;
1182 /* Fix the timezone, if possible */
1183 r
= readlink_malloc("/etc/localtime", &p
);
1185 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1189 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1191 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1193 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1197 where
= prefix_roota(dest
, "/etc/localtime");
1198 r
= readlink_malloc(where
, &q
);
1200 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1202 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1204 /* Already pointing to the right place? Then do nothing .. */
1205 if (y
&& streq(y
, z
))
1209 check
= strjoina("/usr/share/zoneinfo/", z
);
1210 check
= prefix_roota(dest
, check
);
1211 if (laccess(check
, F_OK
) < 0) {
1212 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1217 if (r
< 0 && errno
!= ENOENT
) {
1218 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1222 what
= strjoina("../usr/share/zoneinfo/", z
);
1223 if (symlink(what
, where
) < 0) {
1224 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1228 r
= userns_lchown(where
, 0, 0);
1230 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
/* Copies /etc/resolv.conf to <dest>/etc/resolv.conf.
 *
 * There is a dedicated check on arg_private_network before any work is done. The copy is made
 * with copy_file() using O_TRUNC|O_NOFOLLOW and mode 0644. A copy failure is logged at
 * LOG_DEBUG when the error is -ELOOP or -EROFS and at LOG_WARNING otherwise (the existing
 * comment below explains why). Afterwards the file is chowned via userns_lchown(where, 0, 0),
 * and a failure there is logged as a warning.
 *
 * NOTE(review): the statement taken when arg_private_network is set and the return values are
 * not visible in this excerpt. */
1235 static int setup_resolv_conf(const char *dest
) {
1236 const char *where
= NULL
;
1241 if (arg_private_network
)
1244 /* Fix resolv.conf, if possible */
1245 where
= prefix_roota(dest
, "/etc/resolv.conf");
1247 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1249 /* If the file already exists as symlink, let's
1250 * suppress the warning, under the assumption that
1251 * resolved or something similar runs inside and the
1252 * symlink points there.
1254 * If the disk image is read-only, there's also no
1255 * point in complaining.
1257 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1258 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1262 r
= userns_lchown(where
, 0, 0);
1264 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
/* Formats the 128-bit ID "id" into the caller-supplied buffer "s" using the 8-4-4-4-12 grouping
 * of two-digit lowercase hex bytes given by the format string below (36 characters, so the
 * 37-byte buffer leaves room for a terminating NUL).
 *
 * NOTE(review): the formatting call itself and the return statement are not visible in this
 * excerpt; presumably s is returned — confirm. */
1269 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1273 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1274 SD_ID128_FORMAT_VAL(id
));
/* Gives the tree at "dest" its own random boot ID.
 *
 * Steps visible in this excerpt:
 *   - there is a dedicated check on arg_share_system before any work is done,
 *   - a random 128-bit ID is generated with sd_id128_randomize() and formatted with
 *     id128_format_as_uuid(),
 *   - the string is written to the temporary file <dest>/run/proc-sys-kernel-random-boot-id,
 *   - that file is bind-mounted over <dest>/proc/sys/kernel/random/boot_id; on success the
 *     mount is remounted read-only (MS_RDONLY|MS_NOSUID|MS_NODEV), and a failure of the remount
 *     is only logged as a warning,
 *   - the temporary file is unlinked again regardless of the mount outcome.
 *
 * Failures of ID generation and of writing the file return the log_error_errno() result; a
 * failed bind mount stores the error in r.
 *
 * NOTE(review): the statement taken when arg_share_system is set, the declaration of as_uuid
 * and the final return are not visible in this excerpt. */
1279 static int setup_boot_id(const char *dest
) {
1280 sd_id128_t rnd
= SD_ID128_NULL
;
1281 const char *from
, *to
;
1285 if (arg_share_system
)
1288 /* Generate a new randomized boot ID, so that each boot-up of
1289 * the container gets a new one */
1291 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1292 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1294 r
= sd_id128_randomize(&rnd
);
1296 return log_error_errno(r
, "Failed to generate random boot id: %m");
1298 id128_format_as_uuid(rnd
, as_uuid
);
1300 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1302 return log_error_errno(r
, "Failed to write boot id: %m");
1304 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1305 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1306 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1307 log_warning_errno(errno
, "Failed to make boot id read-only, ignoring: %m");
1309 (void) unlink(from
);
/* Populates <dest>/dev with the device nodes named in the devnodes list.
 *
 * First <dest>/dev/net is created via userns_mkdir(). Then, for every entry d of the list, the
 * node /dev/<d> is stat()ed: errors other than ENOENT are fatal, and entries that are neither
 * character nor block devices are rejected. The node is re-created below dest with mknod() using
 * the same mode and device number; per the existing comment, a bind mount of the original node
 * is the fallback when mknod() is not permitted. Finally each node is chowned via
 * userns_lchown(to, 0, 0).
 *
 * Returns the log_error_errno() result on the visible failure paths.
 *
 * NOTE(review): the contents of the devnodes list, the declarations of r/d/st, the allocation
 * checks for from/to and the final return are not visible in this excerpt. */
1313 static int copy_devnodes(const char *dest
) {
1315 static const char devnodes
[] =
1326 _cleanup_umask_ mode_t u
;
1332 /* Create /dev/net, so that we can create /dev/net/tun in it */
/* Keep the error code in r: it is what gets logged (%m) and returned below. */
1333 r = userns_mkdir(dest
, "/dev/net", 0755, 0, 0);
if (r < 0)
1334 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1336 NULSTR_FOREACH(d
, devnodes
) {
1337 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1340 from
= strappend("/dev/", d
);
1341 to
= prefix_root(dest
, from
);
1343 if (stat(from
, &st
) < 0) {
1345 if (errno
!= ENOENT
)
1346 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1348 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1350 log_error("%s is not a char or block device, cannot copy.", from
);
1354 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1356 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1358 /* Some systems abusively restrict mknod but
1359 * allow bind mounts. */
1362 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1363 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1364 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1367 r
= userns_lchown(to
, 0, 0);
1369 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1376 static int setup_pts(const char *dest
) {
1377 _cleanup_free_
char *options
= NULL
;
1382 if (arg_selinux_apifs_context
)
1383 (void) asprintf(&options
,
1384 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1385 arg_uid_shift
+ TTY_GID
,
1386 arg_selinux_apifs_context
);
1389 (void) asprintf(&options
,
1390 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1391 arg_uid_shift
+ TTY_GID
);
1396 /* Mount /dev/pts itself */
1397 p
= prefix_roota(dest
, "/dev/pts");
1398 if (mkdir(p
, 0755) < 0)
1399 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1400 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1401 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1402 r
= userns_lchown(p
, 0, 0);
1404 return log_error_errno(r
, "Failed to chown /dev/pts: %m");
1406 /* Create /dev/ptmx symlink */
1407 p
= prefix_roota(dest
, "/dev/ptmx");
1408 if (symlink("pts/ptmx", p
) < 0)
1409 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1410 r
= userns_lchown(p
, 0, 0);
1412 return log_error_errno(r
, "Failed to chown /dev/ptmx: %m");
1414 /* And fix /dev/pts/ptmx ownership */
1415 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1416 r
= userns_lchown(p
, 0, 0);
1418 return log_error_errno(r
, "Failed to chown /dev/pts/ptmx: %m");
1423 static int setup_dev_console(const char *dest
, const char *console
) {
1424 _cleanup_umask_ mode_t u
;
1433 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1435 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1437 /* We need to bind mount the right tty to /dev/console since
1438 * ptys can only exist on pts file systems. To have something
1439 * to bind mount things on we create a empty regular file. */
1441 to
= prefix_roota(dest
, "/dev/console");
1444 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1446 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1447 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
1452 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1453 const char *from
, *to
;
1454 _cleanup_umask_ mode_t u
;
1457 assert(kmsg_socket
>= 0);
1461 /* We create the kmsg FIFO as /run/kmsg, but immediately
1462 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1463 * on the reading side behave very similar to /proc/kmsg,
1464 * their writing side behaves differently from /dev/kmsg in
1465 * that writing blocks when nothing is reading. In order to
1466 * avoid any problems with containers deadlocking due to this
1467 * we simply make /dev/kmsg unavailable to the container. */
1468 from
= prefix_roota(dest
, "/run/kmsg");
1469 to
= prefix_roota(dest
, "/proc/kmsg");
1471 if (mkfifo(from
, 0600) < 0)
1472 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1473 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1474 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1476 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1478 return log_error_errno(errno
, "Failed to open fifo: %m");
1480 /* Store away the fd in the socket, so that it stays open as
1481 * long as we run the child */
1482 r
= send_one_fd(kmsg_socket
, fd
, 0);
1486 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1488 /* And now make the FIFO unavailable as /run/kmsg... */
1489 (void) unlink(from
);
1494 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1495 union in_addr_union
*exposed
= userdata
;
1501 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
1505 static int setup_hostname(void) {
1507 if (arg_share_system
)
1510 if (sethostname_idempotent(arg_machine
) < 0)
1516 static int setup_journal(const char *directory
) {
1518 _cleanup_free_
char *d
= NULL
;
1524 /* Don't link journals in ephemeral mode */
1528 if (arg_link_journal
== LINK_NO
)
1531 try = arg_link_journal_try
|| arg_link_journal
== LINK_AUTO
;
1533 r
= sd_id128_get_machine(&this_id
);
1535 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1537 if (sd_id128_equal(arg_uuid
, this_id
)) {
1538 log_full(try ? LOG_WARNING
: LOG_ERR
,
1539 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid
, id
));
1545 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1547 return log_error_errno(r
, "Failed to create /var: %m");
1549 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1551 return log_error_errno(r
, "Failed to create /var/log: %m");
1553 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1555 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1557 (void) sd_id128_to_string(arg_uuid
, id
);
1559 p
= strjoina("/var/log/journal/", id
);
1560 q
= prefix_roota(directory
, p
);
1562 if (path_is_mount_point(p
, 0) > 0) {
1566 log_error("%s: already a mount point, refusing to use for journal", p
);
1570 if (path_is_mount_point(q
, 0) > 0) {
1574 log_error("%s: already a mount point, refusing to use for journal", q
);
1578 r
= readlink_and_make_absolute(p
, &d
);
1580 if ((arg_link_journal
== LINK_GUEST
||
1581 arg_link_journal
== LINK_AUTO
) &&
1584 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1586 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1591 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1592 } else if (r
== -EINVAL
) {
1594 if (arg_link_journal
== LINK_GUEST
&&
1597 if (errno
== ENOTDIR
) {
1598 log_error("%s already exists and is neither a symlink nor a directory", p
);
1601 return log_error_errno(errno
, "Failed to remove %s: %m", p
);
1603 } else if (r
!= -ENOENT
)
1604 return log_error_errno(r
, "readlink(%s) failed: %m", p
);
1606 if (arg_link_journal
== LINK_GUEST
) {
1608 if (symlink(q
, p
) < 0) {
1610 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1613 return log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1616 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1618 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1622 if (arg_link_journal
== LINK_HOST
) {
1623 /* don't create parents here — if the host doesn't have
1624 * permanent journal set up, don't force it here */
1626 if (mkdir(p
, 0755) < 0 && errno
!= EEXIST
) {
1628 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1631 return log_error_errno(errno
, "Failed to create %s: %m", p
);
1634 } else if (access(p
, F_OK
) < 0)
1637 if (dir_is_empty(q
) == 0)
1638 log_warning("%s is not empty, proceeding anyway.", q
);
1640 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1642 return log_error_errno(r
, "Failed to create %s: %m", q
);
1644 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1645 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1650 static int drop_capabilities(void) {
1651 return capability_bounding_set_drop(arg_caps_retain
, false);
1654 static int reset_audit_loginuid(void) {
1655 _cleanup_free_
char *p
= NULL
;
1658 if (arg_share_system
)
1661 r
= read_one_line_file("/proc/self/loginuid", &p
);
1665 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1667 /* Already reset? */
1668 if (streq(p
, "4294967295"))
1671 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1674 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1675 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1676 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1677 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1678 "using systemd-nspawn. Sleeping for 5s... (%m)");
1687 static int setup_propagate(const char *root
) {
1691 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1692 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1693 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1694 (void) mkdir_p(p
, 0600);
1696 r
= userns_mkdir(root
, "/run/systemd", 0755, 0, 0);
1698 return log_error_errno(r
, "Failed to create /run/systemd: %m");
1700 r
= userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0);
1702 return log_error_errno(r
, "Failed to create /run/systemd/nspawn: %m");
1704 r
= userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1706 return log_error_errno(r
, "Failed to create /run/systemd/nspawn/incoming: %m");
1708 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1709 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1710 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1712 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1713 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1718 static int setup_image(char **device_path
, int *loop_nr
) {
1719 struct loop_info64 info
= {
1720 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1722 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1723 _cleanup_free_
char* loopdev
= NULL
;
1727 assert(device_path
);
1731 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1733 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1735 if (fstat(fd
, &st
) < 0)
1736 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1738 if (S_ISBLK(st
.st_mode
)) {
1741 p
= strdup(arg_image
);
1755 if (!S_ISREG(st
.st_mode
)) {
1756 log_error("%s is not a regular file or block device.", arg_image
);
1760 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1762 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1764 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1766 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1768 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1771 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1773 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1775 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1776 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1779 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1781 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1782 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1784 *device_path
= loopdev
;
/* Explanatory text appended to partition table related error messages. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
/* Identifies the root, /home and /srv partitions of a disk image.
 *
 * fd:             open file descriptor of the block device to inspect.
 * root_device:    receives the root partition device node (caller frees).
 * root_device_rw: receives whether the root partition may be mounted
 *                 writable.
 * home_device, home_device_rw,
 * srv_device, srv_device_rw: same for /home and /srv; only written when
 *                 such a partition was found.
 * secondary:      receives true if the root partition is of the secondary
 *                 architecture type (GPT_ROOT_SECONDARY).
 *
 * The partition table is probed with libblkid and must be GPT or MBR. The
 * kernel's partition devices are enumerated via udev; enumeration is retried
 * (re-reading the partition table in between) until kernel and blkid agree
 * on the number of partitions. On GPT, partitions are picked by type UUID,
 * with the lowest partition number winning for each type, and partitions
 * flagged "no auto" skipped. On MBR, a bootable partition of type 0x83 is
 * used. A generic Linux partition is used as root only if exactly one
 * exists.
 *
 * Returns 0 on success, -EINVAL if no usable partition layout was found,
 * -EOPNOTSUPP when built without blkid support, or another negative
 * errno-style value. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                /* Pass errno through the logging call so that it cannot be
                 * clobbered before it is turned into the return value. */
                return log_error_errno(errno, "Failed to list partitions of %s", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself. */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* Track the candidate in "generic", the same
                         * variable that is tested here, so that a second
                         * bootable Linux partition is actually noticed
                         * and rejected below. */
                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
/* Mounts a partition of the disk image into the container tree.
 *
 * what:      device node to mount.
 * where:     base directory of the container tree.
 * directory: optional sub-path below where to mount on; NULL mounts on
 *            where itself.
 * rw:        whether to mount writable; overridden to read-only when
 *            arg_read_only is set.
 *
 * The file system type is detected with libblkid. LUKS containers are
 * refused.
 *
 * Returns 0 on success, -EINVAL if the file system type cannot be
 * determined, -EOPNOTSUPP for LUKS images or when built without blkid
 * support, or another negative errno-style value. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* -2 means an ambivalent result, 1 means nothing was detected;
         * any other non-zero value is a real probing error. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                /* Pass errno through the logging call so that it cannot be
                 * clobbered before it is turned into the return value. */
                return log_error_errno(errno, "Failed to determine file system type of %s", what);
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2234 static int setup_machine_id(const char *directory
) {
2235 const char *etc_machine_id
, *t
;
2236 _cleanup_free_
char *s
= NULL
;
2239 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
2241 r
= read_one_line_file(etc_machine_id
, &s
);
2243 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
2248 r
= sd_id128_from_string(t
, &arg_uuid
);
2250 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
2252 if (sd_id128_is_null(arg_uuid
)) {
2253 r
= sd_id128_randomize(&arg_uuid
);
2255 return log_error_errno(r
, "Failed to generate random machine ID: %m");
2259 r
= machine_id_setup(directory
, arg_uuid
);
2261 return log_error_errno(r
, "Failed to setup machine ID: %m");
2266 static int recursive_chown(const char *directory
, uid_t shift
, uid_t range
) {
2271 if (arg_userns_mode
== USER_NAMESPACE_NO
|| !arg_userns_chown
)
2274 r
= path_patch_uid(directory
, arg_uid_shift
, arg_uid_range
);
2275 if (r
== -EOPNOTSUPP
)
2276 return log_error_errno(r
, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2278 return log_error_errno(r
, "Upper 16 bits of root directory UID and GID do not match.");
2280 return log_error_errno(r
, "Failed to adjust UID/GID shift of OS tree: %m");
2282 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2284 log_debug("Patched directory tree to match UID/GID range.");
2289 static int mount_devices(
2291 const char *root_device
, bool root_device_rw
,
2292 const char *home_device
, bool home_device_rw
,
2293 const char *srv_device
, bool srv_device_rw
) {
2299 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2301 return log_error_errno(r
, "Failed to mount root directory: %m");
2305 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2307 return log_error_errno(r
, "Failed to mount home directory: %m");
2311 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2313 return log_error_errno(r
, "Failed to mount server data directory: %m");
2319 static void loop_remove(int nr
, int *image_fd
) {
2320 _cleanup_close_
int control
= -1;
2326 if (image_fd
&& *image_fd
>= 0) {
2327 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2329 log_debug_errno(errno
, "Failed to close loop image: %m");
2330 *image_fd
= safe_close(*image_fd
);
2333 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2335 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2339 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2341 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2346 * < 0 : wait_for_terminate() failed to get the state of the
2347 * container, the container was terminated by a signal, or
2348 * failed for an unknown reason. No change is made to the
2349 * container argument.
2350 * > 0 : The program executed in the container terminated with an
2351 * error. The exit code of the program executed in the
2352 * container is returned. The container argument has been set
2353 * to CONTAINER_TERMINATED.
2354 * 0 : The container is being rebooted, has been shut down or exited
2355 * successfully. The container argument has been set to either
2356 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2358 * That is, success is indicated by a return value of zero, and an
2359 * error is indicated by a non-zero value.
2361 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2365 r
= wait_for_terminate(pid
, &status
);
2367 return log_warning_errno(r
, "Failed to wait for container: %m");
2369 switch (status
.si_code
) {
2372 if (status
.si_status
== 0)
2373 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2375 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2377 *container
= CONTAINER_TERMINATED
;
2378 return status
.si_status
;
2381 if (status
.si_status
== SIGINT
) {
2382 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2383 *container
= CONTAINER_TERMINATED
;
2386 } else if (status
.si_status
== SIGHUP
) {
2387 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2388 *container
= CONTAINER_REBOOTED
;
2392 /* CLD_KILLED fallthrough */
2395 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2399 log_error("Container %s failed due to unknown reason.", arg_machine
);
2404 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2407 pid
= PTR_TO_PID(userdata
);
2409 if (kill(pid
, arg_kill_signal
) >= 0) {
2410 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2411 sd_event_source_set_userdata(s
, NULL
);
2416 sd_event_exit(sd_event_source_get_event(s
), 0);
2420 static int determine_names(void) {
2423 if (arg_template
&& !arg_directory
&& arg_machine
) {
2425 /* If --template= was specified then we should not
2426 * search for a machine, but instead create a new one
2427 * in /var/lib/machine. */
2429 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2434 if (!arg_image
&& !arg_directory
) {
2436 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2438 r
= image_find(arg_machine
, &i
);
2440 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2442 log_error("No image for machine '%s': %m", arg_machine
);
2446 if (i
->type
== IMAGE_RAW
)
2447 r
= free_and_strdup(&arg_image
, i
->path
);
2449 r
= free_and_strdup(&arg_directory
, i
->path
);
2451 return log_error_errno(r
, "Invalid image directory: %m");
2454 arg_read_only
= arg_read_only
|| i
->read_only
;
2456 arg_directory
= get_current_dir_name();
2458 if (!arg_directory
&& !arg_machine
) {
2459 log_error("Failed to determine path, please use -D or -i.");
2465 if (arg_directory
&& path_equal(arg_directory
, "/"))
2466 arg_machine
= gethostname_malloc();
2468 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2473 hostname_cleanup(arg_machine
);
2474 if (!machine_name_is_valid(arg_machine
)) {
2475 log_error("Failed to determine machine name automatically, please use -M.");
2479 if (arg_ephemeral
) {
2482 /* Add a random suffix when this is an
2483 * ephemeral machine, so that we can run many
2484 * instances at once without manually having
2485 * to specify -M each time. */
2487 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
2498 static int determine_uid_shift(const char *directory
) {
2501 if (arg_userns_mode
== USER_NAMESPACE_NO
) {
2506 if (arg_uid_shift
== UID_INVALID
) {
2509 r
= stat(directory
, &st
);
2511 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2513 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2515 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2516 log_error("UID and GID base of %s don't match.", directory
);
2520 arg_uid_range
= UINT32_C(0x10000);
2523 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2524 log_error("UID base too high for UID range.");
2531 static int inner_child(
2533 const char *directory
,
2539 _cleanup_free_
char *home
= NULL
;
2542 const char *envp
[] = {
2543 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2544 NULL
, /* container */
2549 NULL
, /* container_uuid */
2550 NULL
, /* LISTEN_FDS */
2551 NULL
, /* LISTEN_PID */
2552 NULL
, /* NOTIFY_SOCKET */
2556 _cleanup_strv_free_
char **env_use
= NULL
;
2561 assert(kmsg_socket
>= 0);
2565 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
2566 /* Tell the parent, that it now can write the UID map. */
2567 (void) barrier_place(barrier
); /* #1 */
2569 /* Wait until the parent wrote the UID map */
2570 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2571 log_error("Parent died too early");
2577 arg_userns_mode
!= USER_NAMESPACE_NO
,
2579 arg_private_network
,
2582 arg_selinux_apifs_context
);
2587 r
= mount_sysfs(NULL
);
2591 /* Wait until we are cgroup-ified, so that we
2592 * can mount the right cgroup path writable */
2593 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2594 log_error("Parent died too early");
2598 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2602 r
= reset_uid_gid();
2604 return log_error_errno(r
, "Couldn't become new root: %m");
2606 r
= setup_boot_id(NULL
);
2610 r
= setup_kmsg(NULL
, kmsg_socket
);
2613 kmsg_socket
= safe_close(kmsg_socket
);
2618 return log_error_errno(errno
, "setsid() failed: %m");
2620 if (arg_private_network
)
2623 if (arg_expose_ports
) {
2624 r
= expose_port_send_rtnl(rtnl_socket
);
2627 rtnl_socket
= safe_close(rtnl_socket
);
2630 r
= drop_capabilities();
2632 return log_error_errno(r
, "drop_capabilities() failed: %m");
2636 if (arg_personality
!= PERSONALITY_INVALID
) {
2637 if (personality(arg_personality
) < 0)
2638 return log_error_errno(errno
, "personality() failed: %m");
2639 } else if (secondary
) {
2640 if (personality(PER_LINUX32
) < 0)
2641 return log_error_errno(errno
, "personality() failed: %m");
2645 if (arg_selinux_context
)
2646 if (setexeccon(arg_selinux_context
) < 0)
2647 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2650 r
= change_uid_gid(arg_user
, &home
);
2654 /* LXC sets container=lxc, so follow the scheme here */
2655 envp
[n_env
++] = strjoina("container=", arg_container_service_name
);
2657 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
2661 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2662 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2663 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2666 assert(!sd_id128_is_null(arg_uuid
));
2668 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
2671 if (fdset_size(fds
) > 0) {
2672 r
= fdset_cloexec(fds
, false);
2674 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2676 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2677 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
2680 if (asprintf((char **)(envp
+ n_env
++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH
) < 0)
2683 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2687 /* Let the parent know that we are ready and
2688 * wait until the parent is ready with the
2690 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2691 log_error("Parent died too early");
2696 if (chdir(arg_chdir
) < 0)
2697 return log_error_errno(errno
, "Failed to change to specified working directory %s: %m", arg_chdir
);
2699 if (arg_start_mode
== START_PID2
) {
2705 /* Now, explicitly close the log, so that we
2706 * then can close all remaining fds. Closing
2707 * the log explicitly first has the benefit
2708 * that the logging subsystem knows about it,
2709 * and is thus ready to be reopened should we
2710 * need it again. Note that the other fds
2711 * closed here are at least the locking and
2714 (void) fdset_close_others(fds
);
2716 if (arg_start_mode
== START_BOOT
) {
2720 /* Automatically search for the init system */
2722 m
= strv_length(arg_parameters
);
2723 a
= newa(char*, m
+ 2);
2724 memcpy_safe(a
+ 1, arg_parameters
, m
* sizeof(char*));
2727 a
[0] = (char*) "/usr/lib/systemd/systemd";
2728 execve(a
[0], a
, env_use
);
2730 a
[0] = (char*) "/lib/systemd/systemd";
2731 execve(a
[0], a
, env_use
);
2733 a
[0] = (char*) "/sbin/init";
2734 execve(a
[0], a
, env_use
);
2735 } else if (!strv_isempty(arg_parameters
))
2736 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2739 /* If we cannot change the directory, we'll end up in /, that is expected. */
2740 (void) chdir(home
?: "/root");
2742 execle("/bin/bash", "-bash", NULL
, env_use
);
2743 execle("/bin/sh", "-sh", NULL
, env_use
);
2748 return log_error_errno(r
, "execv() failed: %m");
2751 static int setup_sd_notify_child(void) {
2752 static const int one
= 1;
2754 union sockaddr_union sa
= {
2755 .sa
.sa_family
= AF_UNIX
,
2759 fd
= socket(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
|SOCK_NONBLOCK
, 0);
2761 return log_error_errno(errno
, "Failed to allocate notification socket: %m");
2763 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH
, 0755);
2764 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH
);
2766 strncpy(sa
.un
.sun_path
, NSPAWN_NOTIFY_SOCKET_PATH
, sizeof(sa
.un
.sun_path
)-1);
2767 r
= bind(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
));
2770 return log_error_errno(errno
, "bind(%s) failed: %m", sa
.un
.sun_path
);
2773 r
= setsockopt(fd
, SOL_SOCKET
, SO_PASSCRED
, &one
, sizeof(one
));
2776 return log_error_errno(errno
, "SO_PASSCRED failed: %m");
/* Body of the outer child, i.e. the process main() clones with only a new
 * mount namespace (SIGCHLD|CLONE_NEWNS). It prepares the container tree and
 * then clones the inner child that carries the remaining namespaces.
 *
 * Visible sequence:
 *   - arm a parent-death SIGKILL, replace stdio with the container console;
 *   - mark the whole mount tree MS_SLAVE|MS_REC, mount the image devices,
 *     determine the UID shift and, when user namespacing is on, exchange it
 *     with the parent over uid_shift_socket;
 *   - bind-mount the directory onto itself, chown it, set up volatile state,
 *     base filesystem, optional read-only remount, API mounts, device nodes,
 *     pts, propagation, console, seccomp, timezone, resolv.conf, machine ID,
 *     journal, custom mounts and cgroups;
 *   - move the root, create the notify socket, clone the inner child;
 *   - report the inner child PID, arg_uuid and the notify fd to the parent and
 *     close the sockets.
 *
 * Every failure that is visible here is logged and returned to the caller.
 *
 * NOTE(review): this is a partial listing. Only some parameters are shown
 * (barrier, pid_socket, uuid_socket, notify_socket, kmsg_socket, rtnl_socket,
 * secondary and fds are used below but their declarations are not visible),
 * and most of the error checks that follow the r = ...() calls are missing. */
2782 static int outer_child(
2784 const char *directory
,
2785 const char *console
,
2786 const char *root_device
, bool root_device_rw
,
2787 const char *home_device
, bool home_device_rw
,
2788 const char *srv_device
, bool srv_device_rw
,
2796 int uid_shift_socket
,
2802 _cleanup_close_
int fd
= -1;
2807 assert(pid_socket
>= 0);
2808 assert(uuid_socket
>= 0);
2809 assert(notify_socket
>= 0);
2810 assert(kmsg_socket
>= 0);
/* Ask the kernel to deliver SIGKILL to this process when its parent dies. */
2814 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2815 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
/* Drop the inherited stdio. With fd 0 closed just before, the console opened
 * next is expected to come back as STDIN_FILENO; anything else is treated as
 * a failure, and fds 1 and 2 are then duplicated from fd 0. */
2818 close_nointr(STDIN_FILENO
);
2819 close_nointr(STDOUT_FILENO
);
2820 close_nointr(STDERR_FILENO
);
2822 r
= open_terminal(console
, O_RDWR
);
2823 if (r
!= STDIN_FILENO
) {
2829 return log_error_errno(r
, "Failed to open console: %m");
2832 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2833 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2834 return log_error_errno(errno
, "Failed to duplicate console: %m");
2837 r
= reset_audit_loginuid();
2841 /* Mark everything as slave, so that we still
2842 * receive mounts from the real root, but don't
2843 * propagate mounts to the real root. */
2844 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2845 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2847 r
= mount_devices(directory
,
2848 root_device
, root_device_rw
,
2849 home_device
, home_device_rw
,
2850 srv_device
, srv_device_rw
);
2854 r
= determine_uid_shift(directory
);
2858 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
2859 /* Let the parent know which UID shift we read from the image */
2860 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2862 return log_error_errno(errno
, "Failed to send UID shift: %m");
2863 if (l
!= sizeof(arg_uid_shift
)) {
2864 log_error("Short write while sending UID shift.");
2868 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
2869 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2870 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2871 * not it will pick a different one, and send it back to us. */
2873 l
= recv(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
2875 return log_error_errno(errno
, "Failed to recv UID shift: %m");
2876 if (l
!= sizeof(arg_uid_shift
)) {
2877 log_error("Short read while receiving UID shift.");
2882 log_info("Selected user namespace base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
2885 /* Turn directory into bind mount */
2886 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2887 return log_error_errno(errno
, "Failed to make bind mount: %m");
2889 r
= recursive_chown(directory
, arg_uid_shift
, arg_uid_range
);
2896 arg_userns_mode
!= USER_NAMESPACE_NO
,
2899 arg_selinux_context
);
2903 r
= setup_volatile_state(
2906 arg_userns_mode
!= USER_NAMESPACE_NO
,
2909 arg_selinux_context
);
/* The UID shift is reused as the GID for the base filesystem (explicit cast). */
2913 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2917 if (arg_read_only
) {
2918 r
= bind_remount_recursive(directory
, true);
2920 return log_error_errno(r
, "Failed to make tree read-only: %m");
2923 r
= mount_all(directory
,
2924 arg_userns_mode
!= USER_NAMESPACE_NO
,
2926 arg_private_network
,
2929 arg_selinux_apifs_context
);
2933 r
= copy_devnodes(directory
);
2937 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2939 r
= setup_pts(directory
);
2943 r
= setup_propagate(directory
);
2947 r
= setup_dev_console(directory
, console
);
2951 r
= setup_seccomp(arg_caps_retain
);
2955 r
= setup_timezone(directory
);
2959 r
= setup_resolv_conf(directory
);
2963 r
= setup_machine_id(directory
);
2967 r
= setup_journal(directory
);
2974 arg_n_custom_mounts
,
2975 arg_userns_mode
!= USER_NAMESPACE_NO
,
2978 arg_selinux_apifs_context
);
2984 arg_unified_cgroup_hierarchy
,
2985 arg_userns_mode
!= USER_NAMESPACE_NO
,
2988 arg_selinux_apifs_context
);
2992 r
= mount_move_root(directory
);
2994 return log_error_errno(r
, "Failed to move root directory: %m");
/* The notify socket is created only after the root has been moved, so
 * presumably its path resolves inside the container tree -- TODO confirm
 * against mount_move_root(). fd is declared _cleanup_close_ above. */
2996 fd
= setup_sd_notify_child();
/* Clone the inner child. Besides a fresh mount namespace it gets IPC, PID and
 * UTS namespaces unless arg_share_system is set, a network namespace when
 * arg_private_network is set, and a user namespace when user namespacing is
 * enabled. */
3000 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
3001 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
3002 (arg_private_network
? CLONE_NEWNET
: 0) |
3003 (arg_userns_mode
!= USER_NAMESPACE_NO
? CLONE_NEWUSER
: 0));
3005 return log_error_errno(errno
, "Failed to fork inner child: %m");
/* NOTE(review): lines 3007-3020 look like the branch executed in the newly
 * cloned child (it drops the sockets meant for the parent, runs inner_child()
 * and leaves through _exit()); the condition that selects this branch is not
 * part of this listing. */
3007 pid_socket
= safe_close(pid_socket
);
3008 uuid_socket
= safe_close(uuid_socket
);
3009 notify_socket
= safe_close(notify_socket
);
3010 uid_shift_socket
= safe_close(uid_shift_socket
);
3012 /* The inner child has all namespaces that are
3013 * requested, so that we all are owned by the user if
3014 * user namespaces are turned on. */
3016 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
3018 _exit(EXIT_FAILURE
);
3020 _exit(EXIT_SUCCESS
);
/* From here on: report to the parent in the order it reads them back in
 * main() -- inner child PID, machine ID, then the notify socket fd. */
3023 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
3025 return log_error_errno(errno
, "Failed to send PID: %m");
3026 if (l
!= sizeof(pid
)) {
3027 log_error("Short write while sending PID.");
3031 l
= send(uuid_socket
, &arg_uuid
, sizeof(arg_uuid
), MSG_NOSIGNAL
);
3033 return log_error_errno(errno
, "Failed to send machine ID: %m");
3034 if (l
!= sizeof(arg_uuid
)) {
3035 log_error("Short write while sending machine ID.");
/* NOTE(review): the result of send_one_fd() is stored in l, yet the failure
 * is logged with errno. Confirm which of the two send_one_fd() uses to report
 * errors; if it returns a negative errno-style code, the message below would
 * print an unrelated error. */
3039 l
= send_one_fd(notify_socket
, fd
, 0);
3041 return log_error_errno(errno
, "Failed to send notify fd: %m");
3043 pid_socket
= safe_close(pid_socket
);
3044 uuid_socket
= safe_close(uuid_socket
);
3045 notify_socket
= safe_close(notify_socket
);
3046 kmsg_socket
= safe_close(kmsg_socket
);
3047 rtnl_socket
= safe_close(rtnl_socket
);
/* Picks a UID/GID base for USER_NAMESPACE_PICK mode and locks it.
 *
 * Preconditions (asserted): ret_lock_file is set, arg_userns_mode is
 * USER_NAMESPACE_PICK and arg_uid_range is exactly 0x10000.
 *
 * A candidate is acceptable when it lies within
 * [UID_SHIFT_PICK_MIN, UID_SHIFT_PICK_MAX], has its low 16 bits clear, its
 * lock file /run/systemd/nspawn-uid/<candidate> can be taken with
 * LOCK_EX|LOCK_NB, and neither the candidate nor candidate+0xFFFE is known to
 * getpwuid()/getgrgid(). On acceptance the lock is handed over to
 * *ret_lock_file and the local lf is reset so the cleanup handler does not
 * release it.
 *
 * Otherwise a new candidate is drawn at random (see the end of the block).
 *
 * NOTE(review): the loop header, the use of n_tries (initialised to 100), the
 * reads/writes of *shift and the bodies of the rejecting if statements are not
 * part of this listing; presumably a rejected candidate jumps to the random
 * re-pick and the loop gives up after n_tries attempts -- TODO confirm. */
3052 static int uid_shift_pick(uid_t
*shift
, LockFile
*ret_lock_file
) {
3053 unsigned n_tries
= 100;
3058 assert(ret_lock_file
);
3059 assert(arg_userns_mode
== USER_NAMESPACE_PICK
);
3060 assert(arg_uid_range
== 0x10000U
);
/* Best-effort: the result of mkdir() is discarded. */
3064 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3067 char lock_path
[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t
) + 1];
3068 _cleanup_release_lock_file_ LockFile lf
= LOCK_FILE_INIT
;
3073 if (candidate
< UID_SHIFT_PICK_MIN
|| candidate
> UID_SHIFT_PICK_MAX
)
/* Only bases with the low 16 bits clear (multiples of 0x10000) qualify. */
3075 if ((candidate
& UINT32_C(0xFFFF)) != 0)
3078 xsprintf(lock_path
, "/run/systemd/nspawn-uid/" UID_FMT
, candidate
);
3079 r
= make_lock_file(lock_path
, LOCK_EX
|LOCK_NB
, &lf
);
3080 if (r
== -EBUSY
) /* Range already taken by another nspawn instance */
3085 /* Make some superficial checks whether the range is currently known in the user database */
3086 if (getpwuid(candidate
))
3088 if (getpwuid(candidate
+ UINT32_C(0xFFFE)))
3090 if (getgrgid(candidate
))
3092 if (getgrgid(candidate
+ UINT32_C(0xFFFE)))
/* Success: transfer the held lock to the caller and neutralise lf. */
3095 *ret_lock_file
= lf
;
3096 lf
= (struct LockFile
) LOCK_FILE_INIT
;
/* Re-pick: take random bytes, fold them into
 * [UID_SHIFT_PICK_MIN, UID_SHIFT_PICK_MAX) and clear the low 16 bits. The
 * masking can move the value below UID_SHIFT_PICK_MIN again; the range test
 * at line 3073 is what catches that on the next pass. */
3101 random_bytes(&candidate
, sizeof(candidate
));
3102 candidate
= (candidate
% (UID_SHIFT_PICK_MAX
- UID_SHIFT_PICK_MIN
)) + UID_SHIFT_PICK_MIN
;
3103 candidate
&= (uid_t
) UINT32_C(0xFFFF0000);
/* Writes the user namespace ID mappings of process pid.
 *
 * One mapping line, "0 <arg_uid_shift> <arg_uid_range>\n", is formatted once
 * and written first to /proc/<pid>/uid_map and then, unchanged, to
 * /proc/<pid>/gid_map. A failing write_string_file() is logged together with
 * the map it concerned and its error is returned.
 *
 * NOTE(review): the declaration of r, the checks guarding the two returns and
 * the final return are not part of this listing. */
3107 static int setup_uid_map(pid_t pid
) {
/* uid_map holds the /proc path and is reused for the gid_map path, which has
 * the same length. line has room for three uid_t values plus two spaces, the
 * newline and a terminator.
 * NOTE(review): the path buffer is sized with DECIMAL_STR_MAX(uid_t) while
 * the value formatted into it is a pid_t (PID_FMT) -- confirm that both types
 * have the same decimal width on all supported targets. */
3108 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
3113 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
3114 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
3115 r
= write_string_file(uid_map
, line
, 0);
3117 return log_error_errno(r
, "Failed to write UID map: %m");
3119 /* We always assign the same UID and GID ranges */
3120 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
3121 r
= write_string_file(uid_map
, line
, 0);
3123 return log_error_errno(r
, "Failed to write GID map: %m");
/* Event loop callback for the notification socket received from the outer
 * child.
 *
 * userdata carries the PID of the inner child encoded as a pointer (decoded
 * with PTR_TO_PID). One datagram is read with recvmsg(); EAGAIN and EINTR are
 * treated separately from other errors, which are logged as a warning. The
 * message is accepted only if it carries an SCM_CREDENTIALS control message
 * of the expected length whose pid equals the inner child PID, and if it does
 * not exceed the buffer. The payload is split on newline / carriage return
 * into tags; READY=1 is forwarded to our own service manager as is, and a
 * STATUS= tag is forwarded with the prefix "Container running: ".
 *
 * NOTE(review): several lines are missing from this listing (the declarations
 * of control, n and p, some initializer members, and the return statements
 * after each warning), so the exact return values cannot be stated from here. */
3128 static int nspawn_dispatch_notify_fd(sd_event_source
*source
, int fd
, uint32_t revents
, void *userdata
) {
/* One byte more than the advertised iov_len below, presumably so the payload
 * can be NUL-terminated before it is split -- TODO confirm. */
3129 char buf
[NOTIFY_BUFFER_MAX
+1];
3131 struct iovec iovec
= {
3133 .iov_len
= sizeof(buf
)-1,
/* Control buffer sized for one struct ucred plus up to NOTIFY_FD_MAX file
 * descriptors. */
3136 struct cmsghdr cmsghdr
;
3137 uint8_t buf
[CMSG_SPACE(sizeof(struct ucred
)) +
3138 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX
)];
3140 struct msghdr msghdr
= {
3143 .msg_control
= &control
,
3144 .msg_controllen
= sizeof(control
),
3146 struct cmsghdr
*cmsg
;
3147 struct ucred
*ucred
= NULL
;
3149 pid_t inner_child_pid
;
3150 _cleanup_strv_free_
char **tags
= NULL
;
3154 inner_child_pid
= PTR_TO_PID(userdata
);
3156 if (revents
!= EPOLLIN
) {
3157 log_warning("Got unexpected poll event for notify fd.");
3161 n
= recvmsg(fd
, &msghdr
, MSG_DONTWAIT
|MSG_CMSG_CLOEXEC
);
3163 if (errno
== EAGAIN
|| errno
== EINTR
)
3166 return log_warning_errno(errno
, "Couldn't read notification socket: %m");
/* Called before the control messages are inspected; presumably it closes any
 * file descriptors that came along, since none are used here -- TODO confirm. */
3168 cmsg_close_all(&msghdr
);
3170 CMSG_FOREACH(cmsg
, &msghdr
) {
3171 if (cmsg
->cmsg_level
== SOL_SOCKET
&&
3172 cmsg
->cmsg_type
== SCM_CREDENTIALS
&&
3173 cmsg
->cmsg_len
== CMSG_LEN(sizeof(struct ucred
))) {
3175 ucred
= (struct ucred
*) CMSG_DATA(cmsg
);
/* Only the inner child itself may speak on this socket. */
3179 if (!ucred
|| ucred
->pid
!= inner_child_pid
) {
3180 log_warning("Received notify message without valid credentials. Ignoring.");
3184 if ((size_t) n
>= sizeof(buf
)) {
3185 log_warning("Received notify message exceeded maximum size. Ignoring.");
3190 tags
= strv_split(buf
, "\n\r");
/* NOTE(review): sd_notifyf() is used here with a constant string and no
 * format arguments; sd_notify() is what main() uses for the same message. */
3194 if (strv_find(tags
, "READY=1"))
3195 sd_notifyf(false, "READY=1\n");
3197 p
= strv_find_startswith(tags
, "STATUS=");
3199 sd_notifyf(false, "STATUS=Container running: %s", p
);
/* Hooks the notification socket into the event loop of the parent.
 *
 * Registers fd as an EPOLLIN I/O source on event, with
 * nspawn_dispatch_notify_fd() as callback and inner_child_pid as its
 * userdata, then labels the new source "nspawn-notify" (best-effort, result
 * discarded). A failure of sd_event_add_io() is logged and its error is
 * returned.
 *
 * NOTE(review): the callback decodes its userdata with PTR_TO_PID() and
 * main() passes PID_TO_PTR(pid), so inner_child_pid is a PID encoded as a
 * pointer and is never dereferenced, despite the pid_t* type. Changing the
 * parameter type would alter the signature, so it is only flagged here.
 *
 * NOTE(review): the declaration of r, the check guarding the error return and
 * the final return are not part of this listing. */
3204 static int setup_sd_notify_parent(sd_event
*event
, int fd
, pid_t
*inner_child_pid
) {
3206 sd_event_source
*notify_event_source
;
/* The address-of operator was lost to a mis-encoded character in this listing
 * and is restored here: the call receives the address of the local source
 * pointer. */
3208 r
= sd_event_add_io(event
, &notify_event_source
, fd
, EPOLLIN
, nspawn_dispatch_notify_fd
, inner_child_pid
);
3210 return log_error_errno(r
, "Failed to allocate notify event source: %m");
3212 (void) sd_event_source_set_description(notify_event_source
, "nspawn-notify");
/* Locates and applies the per-machine settings file "<arg_machine>.nspawn".
 *
 * Lookup order visible here: first /etc/systemd/nspawn and
 * /run/systemd/nspawn, then a file of that name next to arg_image or
 * arg_directory. arg_settings_trusted is treated as a tri-state: when it is
 * still negative it defaults to true for the /etc and /run locations and to
 * false for a file found next to the image or directory. Open errors other
 * than ENOENT are logged and returned.
 *
 * After settings_load(), each group of settings is copied into the matching
 * arg_* global unless its SETTING_* bit is set in arg_settings_mask. Pointer
 * and vector members are moved rather than copied: the old arg_* value is
 * freed, the pointer is taken over, and the member in settings is cleared so
 * the cleanup handler for settings does not free it again.
 *
 * NOTE(review): many lines are missing from this listing, including the
 * fopen() calls, the declarations of fn, i, r and plus, and the branch
 * keywords between each "Ignoring ... not trusted" warning and the
 * assignment that follows it. Presumably the assignments sit in else
 * branches and are skipped for untrusted files -- TODO confirm against the
 * full source. */
3217 static int load_settings(void) {
3218 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
3219 _cleanup_fclose_
FILE *f
= NULL
;
3220 _cleanup_free_
char *p
= NULL
;
3224 /* If all settings are masked, there's no point in looking for
3225 * the settings file */
3226 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
3229 fn
= strjoina(arg_machine
, ".nspawn");
3231 /* We first look in the admin's directories in /etc and /run */
3232 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3233 _cleanup_free_
char *j
= NULL
;
3235 j
= strjoin(i
, "/", fn
, NULL
);
3244 /* By default, we trust configuration from /etc and /run */
3245 if (arg_settings_trusted
< 0)
3246 arg_settings_trusted
= true;
3251 if (errno
!= ENOENT
)
3252 return log_error_errno(errno
, "Failed to open %s: %m", j
);
3256 /* After that, let's look for a file next to the
3257 * actual image we shall boot. */
3260 p
= file_in_same_dir(arg_image
, fn
);
3263 } else if (arg_directory
) {
3264 p
= file_in_same_dir(arg_directory
, fn
);
3271 if (!f
&& errno
!= ENOENT
)
3272 return log_error_errno(errno
, "Failed to open %s: %m", p
);
3274 /* By default, we do not trust configuration from /var/lib/machines */
3275 if (arg_settings_trusted
< 0)
3276 arg_settings_trusted
= false;
3283 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
3285 r
= settings_load(f
, p
, &settings
);
3289 /* Copy over bits from the settings, unless they have been
3290 * explicitly masked by command line switches. */
/* Start mode: taken over only when the file sets one (>= 0); the parameter
 * vector moves to arg_parameters along with it. */
3292 if ((arg_settings_mask
& SETTING_START_MODE
) == 0 &&
3293 settings
->start_mode
>= 0) {
3294 arg_start_mode
= settings
->start_mode
;
3296 strv_free(arg_parameters
);
3297 arg_parameters
= settings
->parameters
;
3298 settings
->parameters
= NULL
;
3301 if ((arg_settings_mask
& SETTING_WORKING_DIRECTORY
) == 0 &&
3302 settings
->working_directory
) {
3304 arg_chdir
= settings
->working_directory
;
3305 settings
->working_directory
= NULL
;
3308 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
3309 settings
->environment
) {
3310 strv_free(arg_setenv
);
3311 arg_setenv
= settings
->environment
;
3312 settings
->environment
= NULL
;
3315 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
3318 arg_user
= settings
->user
;
3319 settings
->user
= NULL
;
/* Capabilities: plus collects what the file wants to add, including
 * CAP_NET_ADMIN whenever settings_private_network() reports a private
 * network. For an untrusted file the warning is printed only if the file set
 * capabilities explicitly. The bits in settings->drop_capability are cleared
 * from arg_caps_retain at line 3335. */
3322 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
3325 plus
= settings
->capability
;
3326 if (settings_private_network(settings
))
3327 plus
|= (1ULL << CAP_NET_ADMIN
);
3329 if (!arg_settings_trusted
&& plus
!= 0) {
3330 if (settings
->capability
!= 0)
3331 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
3333 arg_caps_retain
|= plus
;
3335 arg_caps_retain
&= ~settings
->drop_capability
;
3338 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
3339 settings
->kill_signal
> 0)
3340 arg_kill_signal
= settings
->kill_signal
;
3342 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
3343 settings
->personality
!= PERSONALITY_INVALID
)
3344 arg_personality
= settings
->personality
;
3346 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
3347 !sd_id128_is_null(settings
->machine_id
)) {
3349 if (!arg_settings_trusted
)
3350 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
3352 arg_uuid
= settings
->machine_id
;
3355 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
3356 settings
->read_only
>= 0)
3357 arg_read_only
= settings
->read_only
;
3359 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
3360 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
3361 arg_volatile_mode
= settings
->volatile_mode
;
/* Custom mounts: the array and its element count are moved together. */
3363 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
3364 settings
->n_custom_mounts
> 0) {
3366 if (!arg_settings_trusted
)
3367 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
3369 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3370 arg_custom_mounts
= settings
->custom_mounts
;
3371 arg_n_custom_mounts
= settings
->n_custom_mounts
;
3373 settings
->custom_mounts
= NULL
;
3374 settings
->n_custom_mounts
= 0;
/* Network: handled as one unit. If the file sets any network-related member,
 * all network arg_* values are replaced, including those the file left
 * unset. */
3378 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
3379 (settings
->private_network
>= 0 ||
3380 settings
->network_veth
>= 0 ||
3381 settings
->network_bridge
||
3382 settings
->network_zone
||
3383 settings
->network_interfaces
||
3384 settings
->network_macvlan
||
3385 settings
->network_ipvlan
||
3386 settings
->network_veth_extra
)) {
3388 if (!arg_settings_trusted
)
3389 log_warning("Ignoring network settings, file %s is not trusted.", p
);
3391 arg_network_veth
= settings_network_veth(settings
);
3392 arg_private_network
= settings_private_network(settings
);
3394 strv_free(arg_network_interfaces
);
3395 arg_network_interfaces
= settings
->network_interfaces
;
3396 settings
->network_interfaces
= NULL
;
3398 strv_free(arg_network_macvlan
);
3399 arg_network_macvlan
= settings
->network_macvlan
;
3400 settings
->network_macvlan
= NULL
;
3402 strv_free(arg_network_ipvlan
);
3403 arg_network_ipvlan
= settings
->network_ipvlan
;
3404 settings
->network_ipvlan
= NULL
;
3406 strv_free(arg_network_veth_extra
);
3407 arg_network_veth_extra
= settings
->network_veth_extra
;
3408 settings
->network_veth_extra
= NULL
;
3410 free(arg_network_bridge
);
3411 arg_network_bridge
= settings
->network_bridge
;
3412 settings
->network_bridge
= NULL
;
3414 free(arg_network_zone
);
3415 arg_network_zone
= settings
->network_zone
;
3416 settings
->network_zone
= NULL
;
3420 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3421 settings
->expose_ports
) {
3423 if (!arg_settings_trusted
)
3424 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3426 expose_port_free_all(arg_expose_ports
);
3427 arg_expose_ports
= settings
->expose_ports
;
3428 settings
->expose_ports
= NULL
;
/* User namespacing: mode, shift, range and chown flag are taken over as a
 * group. */
3432 if ((arg_settings_mask
& SETTING_USERNS
) == 0 &&
3433 settings
->userns_mode
!= _USER_NAMESPACE_MODE_INVALID
) {
3435 if (!arg_settings_trusted
)
3436 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p
);
3438 arg_userns_mode
= settings
->userns_mode
;
3439 arg_uid_shift
= settings
->uid_shift
;
3440 arg_uid_range
= settings
->uid_range
;
3441 arg_userns_chown
= settings
->userns_chown
;
/* Unlike the groups above, notify_ready is copied whenever it is not masked;
 * there is no "is it set in the file" test on this one. */
3445 if ((arg_settings_mask
& SETTING_NOTIFY_READY
) == 0)
3446 arg_notify_ready
= settings
->notify_ready
;
3451 int main(int argc
, char *argv
[]) {
3453 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3454 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3455 _cleanup_close_
int master
= -1, image_fd
= -1;
3456 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3457 int r
, n_fd_passed
, loop_nr
= -1;
3458 char veth_name
[IFNAMSIZ
] = "";
3459 bool secondary
= false, remove_subvol
= false;
3462 int ret
= EXIT_SUCCESS
;
3463 union in_addr_union exposed
= {};
3464 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3465 bool interactive
, veth_created
= false;
3467 log_parse_environment();
3470 /* Make sure rename_process() in the stub init process can work */
3474 r
= parse_argv(argc
, argv
);
3478 if (geteuid() != 0) {
3479 log_error("Need to be root.");
3483 r
= determine_names();
3487 r
= load_settings();
3491 r
= verify_arguments();
3495 n_fd_passed
= sd_listen_fds(false);
3496 if (n_fd_passed
> 0) {
3497 r
= fdset_new_listen_fds(&fds
, false);
3499 log_error_errno(r
, "Failed to collect file descriptors: %m");
3504 if (arg_directory
) {
3507 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3508 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3513 if (arg_ephemeral
) {
3514 _cleanup_free_
char *np
= NULL
;
3516 /* If the specified path is a mount point we
3517 * generate the new snapshot immediately
3518 * inside it under a random name. However if
3519 * the specified is not a mount point we
3520 * create the new snapshot in the parent
3521 * directory, just next to it. */
3522 r
= path_is_mount_point(arg_directory
, 0);
3524 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3528 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3530 r
= tempfn_random(arg_directory
, "machine.", &np
);
3532 log_error_errno(r
, "Failed to generate name for snapshot: %m");
3536 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3538 log_error_errno(r
, "Failed to lock %s: %m", np
);
3542 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3544 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3548 free(arg_directory
);
3552 remove_subvol
= true;
3555 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3557 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3561 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
3566 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3569 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3571 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3575 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
3580 if (arg_start_mode
== START_BOOT
) {
3581 if (path_is_os_tree(arg_directory
) <= 0) {
3582 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3589 p
= strjoina(arg_directory
, "/usr/");
3590 if (laccess(p
, F_OK
) < 0) {
3591 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
3598 char template[] = "/tmp/nspawn-root-XXXXXX";
3601 assert(!arg_template
);
3603 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3605 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3609 r
= log_error_errno(r
, "Failed to create image lock: %m");
3613 if (!mkdtemp(template)) {
3614 log_error_errno(errno
, "Failed to create temporary directory: %m");
3619 arg_directory
= strdup(template);
3620 if (!arg_directory
) {
3625 image_fd
= setup_image(&device_path
, &loop_nr
);
3631 r
= dissect_image(image_fd
,
3632 &root_device
, &root_device_rw
,
3633 &home_device
, &home_device_rw
,
3634 &srv_device
, &srv_device_rw
,
3640 r
= custom_mounts_prepare();
3645 isatty(STDIN_FILENO
) > 0 &&
3646 isatty(STDOUT_FILENO
) > 0;
3648 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3650 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3654 r
= ptsname_malloc(master
, &console
);
3656 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3660 if (arg_selinux_apifs_context
) {
3661 r
= mac_selinux_apply(console
, arg_selinux_apifs_context
);
3666 if (unlockpt(master
) < 0) {
3667 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3672 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3673 arg_machine
, arg_image
?: arg_directory
);
3675 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3677 assert_se(sigemptyset(&mask_chld
) == 0);
3678 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3680 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3681 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
3686 static const struct sigaction sa
= {
3687 .sa_handler
= nop_signal_handler
,
3688 .sa_flags
= SA_NOCLDSTOP
,
3691 _cleanup_release_lock_file_ LockFile uid_shift_lock
= LOCK_FILE_INIT
;
3692 _cleanup_close_
int etc_passwd_lock
= -1;
3693 _cleanup_close_pair_
int
3694 kmsg_socket_pair
[2] = { -1, -1 },
3695 rtnl_socket_pair
[2] = { -1, -1 },
3696 pid_socket_pair
[2] = { -1, -1 },
3697 uuid_socket_pair
[2] = { -1, -1 },
3698 notify_socket_pair
[2] = { -1, -1 },
3699 uid_shift_socket_pair
[2] = { -1, -1 };
3700 _cleanup_close_
int notify_socket
= -1;
3701 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3702 _cleanup_(sd_event_unrefp
) sd_event
*event
= NULL
;
3703 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3704 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
3705 ContainerStatus container_status
;
3710 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
3711 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3712 * check with getpwuid() if the specific user already exists. Note that /etc might be
3713 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3714 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3715 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3718 etc_passwd_lock
= take_etc_passwd_lock(NULL
);
3719 if (etc_passwd_lock
< 0 && etc_passwd_lock
!= -EROFS
) {
3720 log_error_errno(r
, "Failed to take /etc/passwd lock: %m");
3725 r
= barrier_create(&barrier
);
3727 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3731 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3732 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3736 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3737 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3741 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3742 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3746 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uuid_socket_pair
) < 0) {
3747 r
= log_error_errno(errno
, "Failed to create id socket pair: %m");
3751 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, notify_socket_pair
) < 0) {
3752 r
= log_error_errno(errno
, "Failed to create notify socket pair: %m");
3756 if (arg_userns_mode
!= USER_NAMESPACE_NO
)
3757 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3758 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3762 /* Child can be killed before execv(), so handle SIGCHLD
3763 * in order to interrupt parent's blocking calls and
3764 * give it a chance to call wait() and terminate. */
3765 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3767 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3771 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3773 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
3777 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
);
3779 if (errno
== EINVAL
)
3780 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3782 r
= log_error_errno(errno
, "clone() failed: %m");
3788 /* The outer child only has a file system namespace. */
3789 barrier_set_role(&barrier
, BARRIER_CHILD
);
3791 master
= safe_close(master
);
3793 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3794 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3795 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3796 uuid_socket_pair
[0] = safe_close(uuid_socket_pair
[0]);
3797 notify_socket_pair
[0] = safe_close(notify_socket_pair
[0]);
3798 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3800 (void) reset_all_signal_handlers();
3801 (void) reset_signal_mask();
3803 r
= outer_child(&barrier
,
3806 root_device
, root_device_rw
,
3807 home_device
, home_device_rw
,
3808 srv_device
, srv_device_rw
,
3812 uuid_socket_pair
[1],
3813 notify_socket_pair
[1],
3814 kmsg_socket_pair
[1],
3815 rtnl_socket_pair
[1],
3816 uid_shift_socket_pair
[1],
3819 _exit(EXIT_FAILURE
);
3821 _exit(EXIT_SUCCESS
);
3824 barrier_set_role(&barrier
, BARRIER_PARENT
);
3826 fds
= fdset_free(fds
);
3828 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3829 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3830 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3831 uuid_socket_pair
[1] = safe_close(uuid_socket_pair
[1]);
3832 notify_socket_pair
[1] = safe_close(notify_socket_pair
[1]);
3833 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3835 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
3836 /* The child just let us know the UID shift it might have read from the image. */
3837 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3839 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3842 if (l
!= sizeof(arg_uid_shift
)) {
3843 log_error("Short read while reading UID shift.");
3848 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
3849 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3850 * image, but if that's already in use, pick a new one, and report back to the child,
3851 * which one we now picked. */
3853 r
= uid_shift_pick(&arg_uid_shift
, &uid_shift_lock
);
3855 log_error_errno(r
, "Failed to pick suitable UID/GID range: %m");
3859 l
= send(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
3861 r
= log_error_errno(errno
, "Failed to send UID shift: %m");
3864 if (l
!= sizeof(arg_uid_shift
)) {
3865 log_error("Short write while writing UID shift.");
3872 /* Wait for the outer child. */
3873 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3882 /* And now retrieve the PID of the inner child. */
3883 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3885 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3888 if (l
!= sizeof(pid
)) {
3889 log_error("Short read while reading inner child PID.");
3894 /* We also retrieve container UUID in case it was generated by outer child */
3895 l
= recv(uuid_socket_pair
[0], &arg_uuid
, sizeof(arg_uuid
), 0);
3897 r
= log_error_errno(errno
, "Failed to read container machine ID: %m");
3900 if (l
!= sizeof(arg_uuid
)) {
3901 log_error("Short read while reading container machined ID.");
3906 /* We also retrieve the socket used for notifications generated by outer child */
3907 notify_socket
= receive_one_fd(notify_socket_pair
[0], 0);
3908 if (notify_socket
< 0) {
3909 r
= log_error_errno(errno
, "Failed to receive notification socket from the outer child: %m");
3913 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3915 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
3916 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3917 log_error("Child died too early.");
3922 r
= setup_uid_map(pid
);
3926 (void) barrier_place(&barrier
); /* #2 */
3929 if (arg_private_network
) {
3931 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3935 if (arg_network_veth
) {
3936 r
= setup_veth(arg_machine
, pid
, veth_name
,
3937 arg_network_bridge
|| arg_network_zone
);
3943 if (arg_network_bridge
) {
3944 /* Add the interface to a bridge */
3945 r
= setup_bridge(veth_name
, arg_network_bridge
, false);
3950 } else if (arg_network_zone
) {
3951 /* Add the interface to a bridge, possibly creating it */
3952 r
= setup_bridge(veth_name
, arg_network_zone
, true);
3960 r
= setup_veth_extra(arg_machine
, pid
, arg_network_veth_extra
);
3964 /* We created the primary and extra veth links now; let's remember this, so that we know to
3965 remove them later on. Note that we don't bother with removing veth links that were created
3966 here when their setup failed half-way, because in that case the kernel should be able to
3967 remove them on its own, since they cannot be referenced by anything yet. */
3968 veth_created
= true;
3970 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3974 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
3980 r
= register_machine(
3987 arg_custom_mounts
, arg_n_custom_mounts
,
3991 arg_container_service_name
);
3996 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
4000 if (arg_keep_unit
) {
4001 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
4006 r
= chown_cgroup(pid
, arg_uid_shift
);
4010 /* Notify the child that the parent is ready with all
4011 * its setup (including cgroup-ification), and that
4012 * the child can now hand over control to the code to
4013 * run inside the container. */
4014 (void) barrier_place(&barrier
); /* #3 */
4016 /* Block SIGCHLD here, before notifying child.
4017 * process_pty() will handle it with the other signals. */
4018 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
4020 /* Reset signal to default */
4021 r
= default_signals(SIGCHLD
, -1);
4023 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
4027 r
= sd_event_new(&event
);
4029 log_error_errno(r
, "Failed to get default event source: %m");
4033 r
= setup_sd_notify_parent(event
, notify_socket
, PID_TO_PTR(pid
));
4037 /* Let the child know that we are ready and wait that the child is completely ready now. */
4038 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
4039 log_error("Child died too early.");
4044 /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
4045 * in getpwuid(), thus we can release the /etc/passwd lock. */
4046 etc_passwd_lock
= safe_close(etc_passwd_lock
);
4049 "STATUS=Container running.\n"
4050 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
4051 if (!arg_notify_ready
)
4052 sd_notify(false, "READY=1\n");
4054 if (arg_kill_signal
> 0) {
4055 /* Try to kill the init system on SIGINT or SIGTERM */
4056 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, PID_TO_PTR(pid
));
4057 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, PID_TO_PTR(pid
));
4059 /* Immediately exit */
4060 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
4061 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
4064 /* simply exit on sigchld */
4065 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
4067 if (arg_expose_ports
) {
4068 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
4072 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
4075 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
4077 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
4079 log_error_errno(r
, "Failed to create PTY forwarder: %m");
4083 r
= sd_event_loop(event
);
4085 log_error_errno(r
, "Failed to run event loop: %m");
4089 pty_forward_get_last_char(forward
, &last_char
);
4091 forward
= pty_forward_free(forward
);
4093 if (!arg_quiet
&& last_char
!= '\n')
4096 /* Kill if it is not dead yet anyway */
4097 if (arg_register
&& !arg_keep_unit
)
4098 terminate_machine(pid
);
4100 /* Normally redundant, but better safe than sorry */
4103 r
= wait_for_container(pid
, &container_status
);
4107 /* We failed to wait for the container, or the
4108 * container exited abnormally */
4110 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
) {
4111 /* The container exited with a non-zero
4112 * status, or with zero status and no reboot
4118 /* CONTAINER_REBOOTED, loop again */
4120 if (arg_keep_unit
) {
4121 /* Special handling if we are running as a
4122 * service: instead of simply restarting the
4123 * machine we want to restart the entire
4124 * service, so let's inform systemd about this
4125 * with the special exit code 133. The service
4126 * file uses RestartForceExitStatus=133 so
4127 * that this results in a full nspawn
4128 * restart. This is necessary since we might
4129 * have cgroup parameters set we want to have
4136 expose_port_flush(arg_expose_ports
, &exposed
);
4138 (void) remove_veth_links(veth_name
, arg_network_veth_extra
);
4139 veth_created
= false;
4145 "STATUS=Terminating...");
4150 /* Try to flush whatever is still queued in the pty */
4152 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
4154 loop_remove(loop_nr
, &image_fd
);
4156 if (remove_subvol
&& arg_directory
) {
4159 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
4161 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
4167 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
4168 (void) rm_rf(p
, REMOVE_ROOT
);
4171 expose_port_flush(arg_expose_ports
, &exposed
);
4174 (void) remove_veth_links(veth_name
, arg_network_veth_extra
);
4175 (void) remove_bridge(arg_network_zone
);
4177 free(arg_directory
);
4183 strv_free(arg_setenv
);
4184 free(arg_network_bridge
);
4185 strv_free(arg_network_interfaces
);
4186 strv_free(arg_network_macvlan
);
4187 strv_free(arg_network_ipvlan
);
4188 strv_free(arg_network_veth_extra
);
4189 strv_free(arg_parameters
);
4190 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
4191 expose_port_free_all(arg_expose_ports
);
4193 return r
< 0 ? EXIT_FAILURE
: ret
;