2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21 #include <blkid/blkid.h>
26 #include <linux/loop.h>
33 #include <selinux/selinux.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
46 #include "sd-daemon.h"
49 #include "alloc-util.h"
51 #include "base-filesystem.h"
52 #include "blkid-util.h"
53 #include "btrfs-util.h"
55 #include "capability-util.h"
56 #include "cgroup-util.h"
58 #include "dev-setup.h"
63 #include "formats-util.h"
66 #include "hostname-util.h"
68 #include "loopback-setup.h"
69 #include "machine-id-setup.h"
70 #include "machine-image.h"
74 #include "mount-util.h"
75 #include "netlink-util.h"
76 #include "nspawn-cgroup.h"
77 #include "nspawn-expose-ports.h"
78 #include "nspawn-mount.h"
79 #include "nspawn-network.h"
80 #include "nspawn-patch-uid.h"
81 #include "nspawn-register.h"
82 #include "nspawn-settings.h"
83 #include "nspawn-setuid.h"
84 #include "nspawn-stub-pid1.h"
85 #include "parse-util.h"
86 #include "path-util.h"
87 #include "process-util.h"
89 #include "random-util.h"
92 #include "seccomp-util.h"
94 #include "selinux-util.h"
95 #include "signal-util.h"
96 #include "socket-util.h"
97 #include "stat-util.h"
98 #include "stdio-util.h"
99 #include "string-util.h"
101 #include "terminal-util.h"
102 #include "udev-util.h"
103 #include "umask-util.h"
104 #include "user-util.h"
107 /* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit UID range here. */
109 #define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
110 #define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
112 typedef enum ContainerStatus
{
113 CONTAINER_TERMINATED
,
117 typedef enum LinkJournal
{
/* File-scope option state. Each variable starts at the default given by its
 * initializer; parse_argv() assigns these from the command line. */
124 static char *arg_directory
= NULL
;
125 static char *arg_template
= NULL
;
126 static char *arg_chdir
= NULL
;
127 static char *arg_user
= NULL
;
128 static sd_id128_t arg_uuid
= {};
129 static char *arg_machine
= NULL
;
130 static const char *arg_selinux_context
= NULL
;
131 static const char *arg_selinux_apifs_context
= NULL
;
132 static const char *arg_slice
= NULL
;
133 static bool arg_private_network
= false;
134 static bool arg_read_only
= false;
135 static StartMode arg_start_mode
= START_PID1
;
136 static bool arg_ephemeral
= false;
137 static LinkJournal arg_link_journal
= LINK_AUTO
;
138 static bool arg_link_journal_try
= false;
/* Default capability bitmask, one bit per CAP_* constant. parse_argv() later
 * ORs in the --capability= bits (and CAP_NET_ADMIN when private networking is
 * enabled) and masks out the --drop-capability= bits.
 * NOTE(review): the last visible term ends in '|' and no terminating ';' is
 * visible in this view -- the initializer appears truncated here. */
139 static uint64_t arg_retain
=
140 (1ULL << CAP_CHOWN
) |
141 (1ULL << CAP_DAC_OVERRIDE
) |
142 (1ULL << CAP_DAC_READ_SEARCH
) |
143 (1ULL << CAP_FOWNER
) |
144 (1ULL << CAP_FSETID
) |
145 (1ULL << CAP_IPC_OWNER
) |
147 (1ULL << CAP_LEASE
) |
148 (1ULL << CAP_LINUX_IMMUTABLE
) |
149 (1ULL << CAP_NET_BIND_SERVICE
) |
150 (1ULL << CAP_NET_BROADCAST
) |
151 (1ULL << CAP_NET_RAW
) |
152 (1ULL << CAP_SETGID
) |
153 (1ULL << CAP_SETFCAP
) |
154 (1ULL << CAP_SETPCAP
) |
155 (1ULL << CAP_SETUID
) |
156 (1ULL << CAP_SYS_ADMIN
) |
157 (1ULL << CAP_SYS_CHROOT
) |
158 (1ULL << CAP_SYS_NICE
) |
159 (1ULL << CAP_SYS_PTRACE
) |
160 (1ULL << CAP_SYS_TTY_CONFIG
) |
161 (1ULL << CAP_SYS_RESOURCE
) |
162 (1ULL << CAP_SYS_BOOT
) |
163 (1ULL << CAP_AUDIT_WRITE
) |
164 (1ULL << CAP_AUDIT_CONTROL
) |
/* Array of custom mounts and its element count; filled by the --bind=,
 * --tmpfs= and --overlay= handlers in parse_argv(). */
166 static CustomMount
*arg_custom_mounts
= NULL
;
167 static unsigned arg_n_custom_mounts
= 0;
168 static char **arg_setenv
= NULL
;
169 static bool arg_quiet
= false;
170 static bool arg_share_system
= false;
171 static bool arg_register
= true;
172 static bool arg_keep_unit
= false;
173 static char **arg_network_interfaces
= NULL
;
174 static char **arg_network_macvlan
= NULL
;
175 static char **arg_network_ipvlan
= NULL
;
176 static bool arg_network_veth
= false;
177 static char **arg_network_veth_extra
= NULL
;
178 static char *arg_network_bridge
= NULL
;
179 static unsigned long arg_personality
= PERSONALITY_INVALID
;
180 static char *arg_image
= NULL
;
181 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
182 static ExposePort
*arg_expose_ports
= NULL
;
183 static char **arg_property
= NULL
;
184 static UserNamespaceMode arg_userns_mode
= USER_NAMESPACE_NO
;
/* UID shift and range used for user namespacing; the shift stays UID_INVALID
 * unless --private-users= supplies an explicit base. */
185 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
186 static bool arg_userns_chown
= false;
187 static int arg_kill_signal
= 0;
188 static bool arg_unified_cgroup_hierarchy
= false;
/* Bitmask of SETTING_* flags; parse_argv() sets a flag for each option that
 * was given explicitly on the command line. */
189 static SettingsMask arg_settings_mask
= 0;
/* Set by --settings=: true for "trusted", -1 for "override"; presumably -1
 * means "not decided" -- TODO confirm against the consumer of this value. */
190 static int arg_settings_trusted
= -1;
191 static char **arg_parameters
= NULL
;
/* Default service name; parse_argv() may replace it with the value of
 * $SYSTEMD_NSPAWN_CONTAINER_SERVICE. */
192 static const char *arg_container_service_name
= "systemd-nspawn";
/* Prints the usage summary to stdout with a single printf(); the leading %s
 * is filled in with program_invocation_short_name.
 * NOTE(review): the text lists "--private-user-chown", while the option table
 * in parse_argv() spells the option "private-users-chown" -- confirm and align.
 * NOTE(review): "--bind-ro=PATH[:PATH[:OPTIONS]" has one closing bracket fewer
 * than the matching "--bind=" entry.
 * NOTE(review): some continuation lines of the help text and the closing
 * brace of the function are not visible in this view. */
194 static void help(void) {
195 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
196 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
197 " -h --help Show this help\n"
198 " --version Print version string\n"
199 " -q --quiet Do not show status information\n"
200 " -D --directory=PATH Root directory for the container\n"
201 " --template=PATH Initialize root directory from template directory,\n"
203 " -x --ephemeral Run container with snapshot of root directory, and\n"
204 " remove it after exit\n"
205 " -i --image=PATH File system device or disk image for the container\n"
206 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
207 " -b --boot Boot up full system (i.e. invoke init)\n"
208 " --chdir=PATH Set working directory in the container\n"
209 " -u --user=USER Run the command under specified user or uid\n"
210 " -M --machine=NAME Set the machine name for the container\n"
211 " --uuid=UUID Set a specific machine UUID for the container\n"
212 " -S --slice=SLICE Place the container in the specified slice\n"
213 " --property=NAME=VALUE Set scope unit property\n"
214 " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n"
215 " --private-users[=UIDBASE[:NUIDS]]\n"
216 " Run within user namespace, user configured UID/GID range\n"
217 " --private-user-chown Adjust OS tree file ownership for private UID/GID range\n"
218 " --private-network Disable network in container\n"
219 " --network-interface=INTERFACE\n"
220 " Assign an existing network interface to the\n"
222 " --network-macvlan=INTERFACE\n"
223 " Create a macvlan network interface based on an\n"
224 " existing network interface to the container\n"
225 " --network-ipvlan=INTERFACE\n"
226 " Create a ipvlan network interface based on an\n"
227 " existing network interface to the container\n"
228 " -n --network-veth Add a virtual Ethernet connection between host\n"
230 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
231 " Add an additional virtual Ethernet link between\n"
232 " host and container\n"
233 " --network-bridge=INTERFACE\n"
234 " Add a virtual Ethernet connection between host\n"
235 " and container and add it to an existing bridge on\n"
237 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
238 " Expose a container IP port on the host\n"
239 " -Z --selinux-context=SECLABEL\n"
240 " Set the SELinux security context to be used by\n"
241 " processes in the container\n"
242 " -L --selinux-apifs-context=SECLABEL\n"
243 " Set the SELinux security context to be used by\n"
244 " API/tmpfs file systems in the container\n"
245 " --capability=CAP In addition to the default, retain specified\n"
247 " --drop-capability=CAP Drop the specified capability from the default set\n"
248 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
249 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
250 " host, try-guest, try-host\n"
251 " -j Equivalent to --link-journal=try-guest\n"
252 " --read-only Mount the root directory read-only\n"
253 " --bind=PATH[:PATH[:OPTIONS]]\n"
254 " Bind mount a file or directory from the host into\n"
256 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
257 " Similar, but creates a read-only bind mount\n"
258 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
259 " --overlay=PATH[:PATH...]:PATH\n"
260 " Create an overlay mount from the host to \n"
262 " --overlay-ro=PATH[:PATH...]:PATH\n"
263 " Similar, but creates a read-only overlay mount\n"
264 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
265 " --share-system Share system namespaces with host\n"
266 " --register=BOOLEAN Register container as machine\n"
267 " --keep-unit Do not register a scope for the machine, reuse\n"
268 " the service unit nspawn is running in\n"
269 " --volatile[=MODE] Run the system in volatile mode\n"
270 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
271 , program_invocation_short_name
);
/* Prepares the entries of arg_custom_mounts: sorts them with
 * custom_mount_compare, rejects a custom mount on "/" in the user-namespace
 * configurations named in the error messages below, and generates a work
 * directory name (m->work_dir) from m->source via tempfn_random().
 * NOTE(review): local declarations, the error returns and the statement
 * guarded by the CUSTOM_MOUNT_OVERLAY check are not visible in this view. */
275 static int custom_mounts_prepare(void) {
279 /* Ensure the mounts are applied prefix first. */
280 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
282 /* Allocate working directories for the overlay file systems that need it */
283 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
284 CustomMount
*m
= &arg_custom_mounts
[i
];
/* A custom mount replacing "/" is only checked when user namespacing is on. */
286 if (path_equal(m
->destination
, "/") && arg_userns_mode
!= USER_NAMESPACE_NO
) {
288 if (arg_userns_chown
) {
289 log_error("--private-users-chown may not be combined with custom root mounts.");
291 } else if (arg_uid_shift
== UID_INVALID
) {
292 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
/* Presumably non-overlay mounts are skipped here -- the guarded statement is
 * not visible; TODO confirm. */
297 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
306 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
308 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Decides the value of arg_unified_cgroup_hierarchy. The environment variable
 * $UNIFIED_CGROUP_HIERARCHY is read and parsed with parse_boolean(); a second
 * path stores a value described below as the host default.
 * NOTE(review): the conditions, the call that produces r on the second path
 * and the return statements are not visible in this view.
 * NOTE(review): the first error message has no %m although it goes through
 * log_error_errno() -- confirm whether that is intended. */
314 static int detect_unified_cgroup_hierarchy(void) {
318 /* Allow the user to control whether the unified hierarchy is used */
319 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
321 r
= parse_boolean(e
);
323 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
325 arg_unified_cgroup_hierarchy
= r
;
329 /* Otherwise inherit the default from the host system */
332 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
334 arg_unified_cgroup_hierarchy
= r
;
/* Parses the command line with getopt_long() and stores the results in the
 * file-scope arg_* variables. Each explicitly given option also sets its
 * SETTING_* flag in arg_settings_mask. The local masks `plus` and `minus`
 * collect the --capability= and --drop-capability= bits and are folded into
 * arg_retain after the loop. Once the loop is done, option combinations that
 * the error messages below describe as not combinable are rejected, the
 * remaining argv entries are copied into arg_parameters, and the cgroup
 * hierarchy and container service name are determined.
 * NOTE(review): many lines of this function (most case labels, error returns,
 * break statements, the closing brace) are not visible in this view.
 * NOTE(review): the check `arg_start_mode != START_PID1 && arg_share_system`
 * also matches START_PID2, yet its message only names --boot -- confirm the
 * wording. */
338 static int parse_argv(int argc
, char *argv
[]) {
356 ARG_NETWORK_INTERFACE
,
360 ARG_NETWORK_VETH_EXTRA
,
369 ARG_PRIVATE_USERS_CHOWN
,
/* Long option table passed to getopt_long().
 * NOTE(review): "private-users-chown" is declared optional_argument, but the
 * visible handler only sets arg_userns_chown = true and never reads optarg,
 * and help() lists it without an argument -- confirm the intended kind. */
372 static const struct option options
[] = {
373 { "help", no_argument
, NULL
, 'h' },
374 { "version", no_argument
, NULL
, ARG_VERSION
},
375 { "directory", required_argument
, NULL
, 'D' },
376 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
377 { "ephemeral", no_argument
, NULL
, 'x' },
378 { "user", required_argument
, NULL
, 'u' },
379 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
380 { "as-pid2", no_argument
, NULL
, 'a' },
381 { "boot", no_argument
, NULL
, 'b' },
382 { "uuid", required_argument
, NULL
, ARG_UUID
},
383 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
384 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
385 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
386 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
387 { "bind", required_argument
, NULL
, ARG_BIND
},
388 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
389 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
390 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
391 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
392 { "machine", required_argument
, NULL
, 'M' },
393 { "slice", required_argument
, NULL
, 'S' },
394 { "setenv", required_argument
, NULL
, 'E' },
395 { "selinux-context", required_argument
, NULL
, 'Z' },
396 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
397 { "quiet", no_argument
, NULL
, 'q' },
398 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
399 { "register", required_argument
, NULL
, ARG_REGISTER
},
400 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
401 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
402 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
403 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
404 { "network-veth", no_argument
, NULL
, 'n' },
405 { "network-veth-extra", required_argument
, NULL
, ARG_NETWORK_VETH_EXTRA
},
406 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
407 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
408 { "image", required_argument
, NULL
, 'i' },
409 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
410 { "port", required_argument
, NULL
, 'p' },
411 { "property", required_argument
, NULL
, ARG_PROPERTY
},
412 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
413 { "private-users-chown", optional_argument
, NULL
, ARG_PRIVATE_USERS_CHOWN
},
414 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
415 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
416 { "chdir", required_argument
, NULL
, ARG_CHDIR
},
/* Capability bits to add to / remove from arg_retain, applied after the loop. */
422 uint64_t plus
= 0, minus
= 0;
/* Set by --settings=; they decide below whether arg_settings_mask is reset to
 * 0 or forced to _SETTINGS_MASK_ALL. */
423 bool mask_all_settings
= false, mask_no_settings
= false;
/* Main option loop; runs until getopt_long() returns a negative value. */
428 while ((c
= getopt_long(argc
, argv
, "+hD:u:abL:M:jS:Z:qi:xp:nU", options
, NULL
)) >= 0)
440 r
= parse_path_argument_and_warn(optarg
, false, &arg_directory
);
446 r
= parse_path_argument_and_warn(optarg
, false, &arg_template
);
452 r
= parse_path_argument_and_warn(optarg
, false, &arg_image
);
458 arg_ephemeral
= true;
462 r
= free_and_strdup(&arg_user
, optarg
);
466 arg_settings_mask
|= SETTING_USER
;
/* The network options below all turn on arg_private_network as well. */
469 case ARG_NETWORK_BRIDGE
:
470 r
= free_and_strdup(&arg_network_bridge
, optarg
);
477 arg_network_veth
= true;
478 arg_private_network
= true;
479 arg_settings_mask
|= SETTING_NETWORK
;
482 case ARG_NETWORK_VETH_EXTRA
:
483 r
= veth_extra_parse(&arg_network_veth_extra
, optarg
);
485 return log_error_errno(r
, "Failed to parse --network-veth-extra= parameter: %s", optarg
);
487 arg_private_network
= true;
488 arg_settings_mask
|= SETTING_NETWORK
;
491 case ARG_NETWORK_INTERFACE
:
492 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
495 arg_private_network
= true;
496 arg_settings_mask
|= SETTING_NETWORK
;
499 case ARG_NETWORK_MACVLAN
:
500 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
503 arg_private_network
= true;
504 arg_settings_mask
|= SETTING_NETWORK
;
507 case ARG_NETWORK_IPVLAN
:
508 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
513 case ARG_PRIVATE_NETWORK
:
514 arg_private_network
= true;
515 arg_settings_mask
|= SETTING_NETWORK
;
/* --boot and --as-pid2 each refuse to run if the other was already selected. */
519 if (arg_start_mode
== START_PID2
) {
520 log_error("--boot and --as-pid2 may not be combined.");
524 arg_start_mode
= START_BOOT
;
525 arg_settings_mask
|= SETTING_START_MODE
;
529 if (arg_start_mode
== START_BOOT
) {
530 log_error("--boot and --as-pid2 may not be combined.");
534 arg_start_mode
= START_PID2
;
535 arg_settings_mask
|= SETTING_START_MODE
;
539 r
= sd_id128_from_string(optarg
, &arg_uuid
);
541 log_error("Invalid UUID: %s", optarg
);
545 arg_settings_mask
|= SETTING_MACHINE_ID
;
554 arg_machine
= mfree(arg_machine
);
556 if (!machine_name_is_valid(optarg
)) {
557 log_error("Invalid machine name: %s", optarg
);
561 r
= free_and_strdup(&arg_machine
, optarg
);
/* The SELinux contexts keep pointing at optarg itself; no copy is made. */
569 arg_selinux_context
= optarg
;
573 arg_selinux_apifs_context
= optarg
;
577 arg_read_only
= true;
578 arg_settings_mask
|= SETTING_READ_ONLY
;
/* --capability= / --drop-capability=: the argument is split on ',' with
 * extract_first_word(); "all" sets every bit of plus or minus, any other word
 * is resolved with capability_from_name(). */
582 case ARG_DROP_CAPABILITY
: {
585 _cleanup_free_
char *t
= NULL
;
587 r
= extract_first_word(&p
, &t
, ",", 0);
589 return log_error_errno(r
, "Failed to parse capability %s.", t
);
594 if (streq(t
, "all")) {
595 if (c
== ARG_CAPABILITY
)
596 plus
= (uint64_t) -1;
598 minus
= (uint64_t) -1;
602 cap
= capability_from_name(t
);
604 log_error("Failed to parse capability %s.", t
);
608 if (c
== ARG_CAPABILITY
)
609 plus
|= 1ULL << (uint64_t) cap
;
611 minus
|= 1ULL << (uint64_t) cap
;
615 arg_settings_mask
|= SETTING_CAPABILITY
;
620 arg_link_journal
= LINK_GUEST
;
621 arg_link_journal_try
= true;
/* --link-journal=MODE: the "try-" variants select the same LINK_* value as the
 * plain mode and additionally set arg_link_journal_try. */
624 case ARG_LINK_JOURNAL
:
625 if (streq(optarg
, "auto")) {
626 arg_link_journal
= LINK_AUTO
;
627 arg_link_journal_try
= false;
628 } else if (streq(optarg
, "no")) {
629 arg_link_journal
= LINK_NO
;
630 arg_link_journal_try
= false;
631 } else if (streq(optarg
, "guest")) {
632 arg_link_journal
= LINK_GUEST
;
633 arg_link_journal_try
= false;
634 } else if (streq(optarg
, "host")) {
635 arg_link_journal
= LINK_HOST
;
636 arg_link_journal_try
= false;
637 } else if (streq(optarg
, "try-guest")) {
638 arg_link_journal
= LINK_GUEST
;
639 arg_link_journal_try
= true;
640 } else if (streq(optarg
, "try-host")) {
641 arg_link_journal
= LINK_HOST
;
642 arg_link_journal_try
= true;
644 log_error("Failed to parse link journal mode %s", optarg
);
652 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
654 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
656 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
660 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
662 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
664 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
/* --overlay= / --overlay-ro=: the argument is split on ':' into `lower`; every
 * entry must be an absolute path. */
668 case ARG_OVERLAY_RO
: {
669 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
670 _cleanup_strv_free_
char **lower
= NULL
;
675 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
679 log_error("Invalid overlay specification: %s", optarg
);
683 STRV_FOREACH(i
, lower
) {
684 if (!path_is_absolute(*i
)) {
685 log_error("Overlay path %s is not absolute.", *i
);
693 log_error("--overlay= needs at least two colon-separated directories specified.");
698 /* If two parameters are specified,
699 * the first one is the lower, the
700 * second one the upper directory. And
701 * we'll also define the destination
702 * mount point the same as the upper. */
706 destination
= strdup(upper
);
711 upper
= lower
[n
- 2];
712 destination
= lower
[n
- 1];
716 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
720 m
->destination
= destination
;
723 m
->read_only
= c
== ARG_OVERLAY_RO
;
/* Clear the locals so their _cleanup_free_ handlers have nothing to release;
 * presumably the strings now belong to the mount entry -- TODO confirm. */
725 upper
= destination
= NULL
;
728 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
735 if (!env_assignment_is_valid(optarg
)) {
736 log_error("Environment variable assignment '%s' is not valid.", optarg
);
740 n
= strv_env_set(arg_setenv
, optarg
);
744 strv_free(arg_setenv
);
747 arg_settings_mask
|= SETTING_ENVIRONMENT
;
755 case ARG_SHARE_SYSTEM
:
756 arg_share_system
= true;
760 r
= parse_boolean(optarg
);
762 log_error("Failed to parse --register= argument: %s", optarg
);
770 arg_keep_unit
= true;
773 case ARG_PERSONALITY
:
775 arg_personality
= personality_from_string(optarg
);
776 if (arg_personality
== PERSONALITY_INVALID
) {
777 log_error("Unknown or unsupported personality '%s'.", optarg
);
781 arg_settings_mask
|= SETTING_PERSONALITY
;
787 arg_volatile_mode
= VOLATILE_YES
;
791 m
= volatile_mode_from_string(optarg
);
793 log_error("Failed to parse --volatile= argument: %s", optarg
);
796 arg_volatile_mode
= m
;
799 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
803 r
= expose_port_parse(&arg_expose_ports
, optarg
);
805 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
807 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
809 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
813 if (strv_extend(&arg_property
, optarg
) < 0)
/* --private-users[=...]: a missing argument counts as "yes" (r = 1); otherwise
 * the argument is tried as a boolean, then as "pick", then as UIDBASE[:NUIDS]. */
818 case ARG_PRIVATE_USERS
:
820 r
= optarg
? parse_boolean(optarg
) : 1;
822 /* no: User namespacing off */
823 arg_userns_mode
= USER_NAMESPACE_NO
;
824 arg_uid_shift
= UID_INVALID
;
825 arg_uid_range
= UINT32_C(0x10000);
827 /* yes: User namespacing on, UID range is read from root dir */
828 arg_userns_mode
= USER_NAMESPACE_FIXED
;
829 arg_uid_shift
= UID_INVALID
;
830 arg_uid_range
= UINT32_C(0x10000);
831 } else if (streq(optarg
, "pick")) {
832 /* pick: User namespacing on, UID range is picked randomly */
833 arg_userns_mode
= USER_NAMESPACE_PICK
;
834 arg_uid_shift
= UID_INVALID
;
835 arg_uid_range
= UINT32_C(0x10000);
837 _cleanup_free_
char *buffer
= NULL
;
838 const char *range
, *shift
;
840 /* anything else: User namespacing on, UID range is explicitly configured */
842 range
= strchr(optarg
, ':');
844 buffer
= strndup(optarg
, range
- optarg
);
850 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
851 log_error("Failed to parse UID range: %s", range
);
857 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
858 log_error("Failed to parse UID: %s", optarg
);
862 arg_userns_mode
= USER_NAMESPACE_FIXED
;
865 arg_settings_mask
|= SETTING_USERNS
;
/* The pick mode is only selected here when userns_supported() reports support. */
869 if (userns_supported()) {
870 arg_userns_mode
= USER_NAMESPACE_PICK
;
871 arg_uid_shift
= UID_INVALID
;
872 arg_uid_range
= UINT32_C(0x10000);
874 arg_settings_mask
|= SETTING_USERNS
;
879 case ARG_PRIVATE_USERS_CHOWN
:
880 arg_userns_chown
= true;
882 arg_settings_mask
|= SETTING_USERNS
;
885 case ARG_KILL_SIGNAL
:
886 arg_kill_signal
= signal_from_string_try_harder(optarg
);
887 if (arg_kill_signal
< 0) {
888 log_error("Cannot parse signal: %s", optarg
);
892 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
897 /* no → do not read files
898 * yes → read files, do not override cmdline, trust only subset
899 * override → read files, override cmdline, trust only subset
900 * trusted → read files, do not override cmdline, trust all
903 r
= parse_boolean(optarg
);
905 if (streq(optarg
, "trusted")) {
906 mask_all_settings
= false;
907 mask_no_settings
= false;
908 arg_settings_trusted
= true;
910 } else if (streq(optarg
, "override")) {
911 mask_all_settings
= false;
912 mask_no_settings
= true;
913 arg_settings_trusted
= -1;
915 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
918 mask_all_settings
= false;
919 mask_no_settings
= false;
920 arg_settings_trusted
= -1;
923 mask_all_settings
= true;
924 mask_no_settings
= false;
925 arg_settings_trusted
= false;
931 if (!path_is_absolute(optarg
)) {
932 log_error("Working directory %s is not an absolute path.", optarg
);
936 r
= free_and_strdup(&arg_chdir
, optarg
);
940 arg_settings_mask
|= SETTING_WORKING_DIRECTORY
;
947 assert_not_reached("Unhandled option");
950 if (arg_share_system
)
951 arg_register
= false;
953 if (arg_userns_mode
== USER_NAMESPACE_PICK
)
954 arg_userns_chown
= true;
956 if (arg_start_mode
!= START_PID1
&& arg_share_system
) {
957 log_error("--boot and --share-system may not be combined.");
961 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
962 log_error("--keep-unit may not be used when invoked from a user session.");
966 if (arg_directory
&& arg_image
) {
967 log_error("--directory= and --image= may not be combined.");
971 if (arg_template
&& arg_image
) {
972 log_error("--template= and --image= may not be combined.");
976 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
977 log_error("--template= needs --directory= or --machine=.");
981 if (arg_ephemeral
&& arg_template
) {
982 log_error("--ephemeral and --template= may not be combined.");
986 if (arg_ephemeral
&& arg_image
) {
987 log_error("--ephemeral and --image= may not be combined.");
991 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
992 log_error("--ephemeral and --link-journal= may not be combined.");
996 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& !userns_supported()) {
997 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1001 if (arg_userns_chown
&& arg_read_only
) {
1002 log_error("--read-only and --private-users-chown may not be combined.");
1006 if (argc
> optind
) {
1007 arg_parameters
= strv_copy(argv
+ optind
);
1008 if (!arg_parameters
)
1011 arg_settings_mask
|= SETTING_START_MODE
;
1014 /* Load all settings from .nspawn files */
1015 if (mask_no_settings
)
1016 arg_settings_mask
= 0;
1018 /* Don't load any settings from .nspawn files */
1019 if (mask_all_settings
)
1020 arg_settings_mask
= _SETTINGS_MASK_ALL
;
/* Final capability set: the defaults plus the --capability= bits, plus
 * CAP_NET_ADMIN when private networking is on, minus the --drop-capability=
 * bits. */
1022 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
1024 r
= detect_unified_cgroup_hierarchy();
/* The service name may be taken from the environment; the check on `e` before
 * the assignment is not visible in this view. */
1028 e
= getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1030 arg_container_service_name
= e
;
/* Checks the final arg_* values for combinations that the messages below
 * report as unsupported, and picks a default kill signal for boot mode.
 * NOTE(review): the error returns, the #endif matching the #ifndef below and
 * the final return are not visible in this view. */
1035 static int verify_arguments(void) {
1037 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
1038 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1042 if (arg_expose_ports
&& !arg_private_network
) {
1043 log_error("Cannot use --port= without private networking.");
/* Only compiled when HAVE_LIBIPTC is not defined. */
1047 #ifndef HAVE_LIBIPTC
1048 if (arg_expose_ports
) {
1049 log_error("--port= is not supported, compiled without libiptc support.");
/* In boot mode with no kill signal configured (value <= 0), use SIGRTMIN+3. */
1054 if (arg_start_mode
== START_BOOT
&& arg_kill_signal
<= 0)
1055 arg_kill_signal
= SIGRTMIN
+3;
/* lchown()s `p` to uid/gid after adding arg_uid_shift to each value that is
 * not UID_INVALID / GID_INVALID. The shifted value is compared against the
 * bounds arg_uid_shift and arg_uid_shift + arg_uid_range.
 * NOTE(review): the statements guarded by the checks below (the early and
 * error returns) are not visible in this view. */
1060 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
1063 if (arg_userns_mode
== USER_NAMESPACE_NO
)
1066 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1069 if (uid
!= UID_INVALID
) {
1070 uid
+= arg_uid_shift
;
/* `uid < arg_uid_shift` can only be true if the unsigned addition above
 * wrapped around. */
1072 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1076 if (gid
!= GID_INVALID
) {
1077 gid
+= (gid_t
) arg_uid_shift
;
1079 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1083 if (lchown(p
, uid
, gid
) < 0)
/* Creates the directory `path` below `root` with mkdir(q, mode), where q is
 * the path joined by prefix_roota(), and returns the result of
 * userns_lchown(q, uid, gid).
 * NOTE(review): what happens on mkdir() failure, including the EEXIST case, is
 * not visible in this view. */
1089 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
1092 q
= prefix_roota(root
, path
);
1093 if (mkdir(q
, mode
) < 0) {
1094 if (errno
== EEXIST
)
1099 return userns_lchown(q
, uid
, gid
);
/* Tries to make /etc/localtime below `dest` point at the same zone as the
 * host's /etc/localtime. The host link is read with readlink_malloc() and its
 * zone name `z` is the part after a "../usr/share/zoneinfo/" or
 * "/usr/share/zoneinfo/" prefix. If the container's link already names the
 * same zone nothing is changed; otherwise a relative symlink
 * "../usr/share/zoneinfo/<z>" is created and chowned with userns_lchown().
 * NOTE(review): the conditions around most calls, the removal of the old file
 * and the return statements are not visible in this view. */
1102 static int setup_timezone(const char *dest
) {
1103 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1104 const char *where
, *check
, *what
;
1110 /* Fix the timezone, if possible */
1111 r
= readlink_malloc("/etc/localtime", &p
);
1113 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1117 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1119 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1121 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1125 where
= prefix_roota(dest
, "/etc/localtime");
1126 r
= readlink_malloc(where
, &q
);
1128 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1130 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1132 /* Already pointing to the right place? Then do nothing .. */
1133 if (y
&& streq(y
, z
))
/* Check that the zone file exists inside the container tree before linking. */
1137 check
= strjoina("/usr/share/zoneinfo/", z
);
1138 check
= prefix_roota(dest
, check
);
1139 if (laccess(check
, F_OK
) < 0) {
1140 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
/* NOTE(review): errno is tested together with r, but the call that assigns r
 * here is not visible in this view; presumably it removes the existing file
 * at `where` -- TODO confirm. */
1145 if (r
< 0 && errno
!= ENOENT
) {
1146 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1150 what
= strjoina("../usr/share/zoneinfo/", z
);
1151 if (symlink(what
, where
) < 0) {
1152 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1156 r
= userns_lchown(where
, 0, 0);
1158 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
/* Copies the host's /etc/resolv.conf to /etc/resolv.conf below `dest` with
 * copy_file() (O_TRUNC|O_NOFOLLOW, mode 0644) and chowns the copy with
 * userns_lchown(). A copy failure is logged at LOG_DEBUG for -ELOOP and
 * -EROFS and at LOG_WARNING otherwise.
 * NOTE(review): the statement guarded by the arg_private_network check and
 * the return statements are not visible in this view; presumably the function
 * does nothing when private networking is enabled -- TODO confirm. */
1163 static int setup_resolv_conf(const char *dest
) {
1164 const char *where
= NULL
;
1169 if (arg_private_network
)
1172 /* Fix resolv.conf, if possible */
1173 where
= prefix_roota(dest
, "/etc/resolv.conf");
1175 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1177 /* If the file already exists as symlink, let's
1178 * suppress the warning, under the assumption that
1179 * resolved or something similar runs inside and the
1180 * symlink points there.
1182 * If the disk image is read-only, there's also no
1183 * point in complaining.
1185 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1186 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1190 r
= userns_lchown(where
, 0, 0);
1192 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
/* Formats `id` into the caller-provided buffer `s` using the hex layout of the
 * format string below: groups of 8-4-4-4-12 hex digits separated by '-', which
 * is 36 characters; `s` is declared as char[37].
 * NOTE(review): the formatting call that takes these arguments and the return
 * statement are not visible in this view. */
1197 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1201 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1202 SD_ID128_FORMAT_VAL(id
));
/* Gives the container its own boot ID: a random 128-bit ID is generated with
 * sd_id128_randomize(), formatted with id128_format_as_uuid(), written to
 * /run/proc-sys-kernel-random-boot-id below `dest`, and that file is bind
 * mounted over /proc/sys/kernel/random/boot_id below `dest`. The bind mount is
 * then remounted read-only; a failure of the remount is only logged as a
 * warning.
 * NOTE(review): the statement guarded by the arg_share_system check, the
 * declaration of as_uuid and the tail of the function are not visible in this
 * view. */
1207 static int setup_boot_id(const char *dest
) {
1208 const char *from
, *to
;
1209 sd_id128_t rnd
= {};
1213 if (arg_share_system
)
1216 /* Generate a new randomized boot ID, so that each boot-up of
1217 * the container gets a new one */
1219 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1220 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1222 r
= sd_id128_randomize(&rnd
);
1224 return log_error_errno(r
, "Failed to generate random boot id: %m");
1226 id128_format_as_uuid(rnd
, as_uuid
);
1228 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1230 return log_error_errno(r
, "Failed to write boot id: %m");
/* First a plain bind mount; only if that succeeded, remount it read-only. */
1232 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1233 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1234 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1235 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
/* Populates /dev below `dest`: for every name in the NUL-separated list
 * `devnodes`, the host node /dev/<name> is stat()ed and, if it is a character
 * or block device, recreated at the same path below `dest` with mknod(). A
 * bind mount of the host node is the fallback shown after the mknod() failure
 * path. Each created node is chowned with userns_lchown().
 * NOTE(review): the contents of `devnodes`, the local declarations, the umask
 * setup and several conditions/returns are not visible in this view. */
1241 static int copy_devnodes(const char *dest
) {
1243 static const char devnodes
[] =
1254 _cleanup_umask_ mode_t u
;
1260 /* Create /dev/net, so that we can create /dev/net/tun in it */
/* NOTE(review): the result of userns_mkdir() is only compared against 0 and
 * never stored in r, yet r is what gets passed to log_error_errno() -- the
 * logged/returned error is therefore not the one from userns_mkdir(). Likely
 * intended: r = userns_mkdir(...); if (r < 0) return log_error_errno(r, ...). */
1261 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1262 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1264 NULSTR_FOREACH(d
, devnodes
) {
1265 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1268 from
= strappend("/dev/", d
);
1269 to
= prefix_root(dest
, from
);
/* A host node that does not exist (ENOENT) is not treated as an error here. */
1271 if (stat(from
, &st
) < 0) {
1273 if (errno
!= ENOENT
)
1274 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1276 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1278 log_error("%s is not a char or block device, cannot copy.", from
);
1282 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1284 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1286 /* Some systems abusively restrict mknod but
1287 * allow bind mounts. */
1290 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1291 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1292 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1295 r
= userns_lchown(to
, 0, 0);
1297 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
/* Sets up the pseudo-terminal nodes below `dest`: builds the devpts mount
 * option string (gid = arg_uid_shift + TTY_GID, plus an SELinux context when
 * arg_selinux_apifs_context is set), creates and mounts /dev/pts, creates the
 * /dev/ptmx -> pts/ptmx symlink, and chowns /dev/pts, /dev/ptmx and
 * /dev/pts/ptmx with userns_lchown().
 * NOTE(review): the asprintf() results are discarded with (void); a check of
 * `options` afterwards is not visible in this view -- confirm that allocation
 * failure is handled before `options` is passed to mount(). */
1304 static int setup_pts(const char *dest
) {
1305 _cleanup_free_
char *options
= NULL
;
1310 if (arg_selinux_apifs_context
)
1311 (void) asprintf(&options
,
1312 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1313 arg_uid_shift
+ TTY_GID
,
1314 arg_selinux_apifs_context
);
1317 (void) asprintf(&options
,
1318 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1319 arg_uid_shift
+ TTY_GID
);
1324 /* Mount /dev/pts itself */
1325 p
= prefix_roota(dest
, "/dev/pts");
1326 if (mkdir(p
, 0755) < 0)
1327 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1328 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1329 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1330 r
= userns_lchown(p
, 0, 0);
1332 return log_error_errno(r
, "Failed to chown /dev/pts: %m");
1334 /* Create /dev/ptmx symlink */
1335 p
= prefix_roota(dest
, "/dev/ptmx");
1336 if (symlink("pts/ptmx", p
) < 0)
1337 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1338 r
= userns_lchown(p
, 0, 0);
1340 return log_error_errno(r
, "Failed to chown /dev/ptmx: %m");
1342 /* And fix /dev/pts/ptmx ownership */
1343 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1344 r
= userns_lchown(p
, 0, 0);
1346 return log_error_errno(r
, "Failed to chown /dev/pts/ptmx: %m");
/* Makes the TTY named by `console` available as <dest>/dev/console.
 *
 * The TTY is first adjusted with chmod_and_chown() to mode 0600 and to
 * arg_uid_shift for both the uid and gid arguments; then `console` is bind
 * mounted onto <dest>/dev/console. The "touch()" error message indicates
 * that the mount target is created as a file beforehand (that call is not
 * visible in this listing).
 *
 * Failures are logged and returned via log_error_errno(). */
1351 static int setup_dev_console(const char *dest
, const char *console
) {
1352 _cleanup_umask_ mode_t u
;
1361 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1363 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1365 /* We need to bind mount the right tty to /dev/console since
1366 * ptys can only exist on pts file systems. To have something
1367 * to bind mount things on we create an empty regular file. */
1369 to
= prefix_roota(dest
, "/dev/console");
1372 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1374 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1375 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
/* Provides a stand-in for /proc/kmsg inside the container rooted at `dest`.
 *
 * A FIFO is created at <dest>/run/kmsg (mode 0600) and bind mounted onto
 * <dest>/proc/kmsg. The FIFO is then opened O_RDWR|O_NDELAY|O_CLOEXEC and the
 * descriptor is handed over through `kmsg_socket` with send_one_fd(), so it
 * stays referenced while the child runs. Finally the /run/kmsg path is
 * unlinked (best effort, result ignored).
 *
 * `kmsg_socket` must be a valid descriptor (asserted >= 0). Failures are
 * logged and returned via log_error_errno(). */
1380 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1381 const char *from
, *to
;
1382 _cleanup_umask_ mode_t u
;
1385 assert(kmsg_socket
>= 0);
1389 /* We create the kmsg FIFO as /run/kmsg, but immediately
1390 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1391 * on the reading side behave very similar to /proc/kmsg,
1392 * their writing side behaves differently from /dev/kmsg in
1393 * that writing blocks when nothing is reading. In order to
1394 * avoid any problems with containers deadlocking due to this
1395 * we simply make /dev/kmsg unavailable to the container. */
1396 from
= prefix_roota(dest
, "/run/kmsg");
1397 to
= prefix_roota(dest
, "/proc/kmsg");
1399 if (mkfifo(from
, 0600) < 0)
1400 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1401 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1402 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1404 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1406 return log_error_errno(errno
, "Failed to open fifo: %m");
1408 /* Store away the fd in the socket, so that it stays open as
1409 * long as we run the child */
1410 r
= send_one_fd(kmsg_socket
, fd
, 0);
1414 return log_error_errno(r
, "Failed to send FIFO fd: %m");
1416 /* And now make the FIFO unavailable as /run/kmsg... */
1417 (void) unlink(from
);
/* Netlink message callback: `userdata` is interpreted as a
 * union in_addr_union pointer (`exposed`) and passed, together with `rtnl`
 * and the global arg_expose_ports, to expose_port_execute(). The message `m`
 * is not used in the lines visible here, and the return value is not visible
 * in this listing. */
1422 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1423 union in_addr_union
*exposed
= userdata
;
1429 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
/* Sets the hostname to arg_machine via sethostname_idempotent(). The global
 * arg_share_system is tested first; the bodies of both conditionals are not
 * visible in this listing (presumably an early return when the system is
 * shared -- confirm). */
1433 static int setup_hostname(void) {
1435 if (arg_share_system
)
1438 if (sethostname_idempotent(arg_machine
) < 0)
/* Connects the journal directory of the container rooted at `directory` with
 * the host, as selected by the global arg_link_journal.
 *
 * Two paths are derived from the string form of arg_uuid:
 *   p = /var/log/journal/<id>           (host side)
 *   q = <directory>/var/log/journal/<id> (guest side)
 *
 * Visible behaviour:
 *   - LINK_NO is tested up front (the action taken is not visible here).
 *   - `try` (arg_link_journal_try or LINK_AUTO) downgrades some failures from
 *     LOG_ERR to LOG_WARNING / debug messages.
 *   - Linking is refused when arg_uuid equals the host machine ID, or when
 *     p or q is already a mount point.
 *   - /var, /var/log and /var/log/journal are created inside the container.
 *   - An existing p is inspected with readlink_and_make_absolute() and, per
 *     the error messages, removed in some cases.
 *   - LINK_GUEST: symlink(q, p), then the guest directory is created.
 *   - LINK_HOST: mkdir(p) without creating parents; finally p is bind mounted
 *     onto q.
 *
 * NOTE(review): many guards and returns are missing from this listing (the
 * embedded line numbers have gaps); the summary above is limited to what the
 * visible calls and messages show. */
1444 static int setup_journal(const char *directory
) {
1446 _cleanup_free_
char *d
= NULL
;
1452 /* Don't link journals in ephemeral mode */
1456 if (arg_link_journal
== LINK_NO
)
1459 try = arg_link_journal_try
|| arg_link_journal
== LINK_AUTO
;
1461 r
= sd_id128_get_machine(&this_id
);
1463 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1465 if (sd_id128_equal(arg_uuid
, this_id
)) {
/* NOTE(review): `id` is printed here, but the only visible place that fills
 * it is the sd_id128_to_string(arg_uuid, id) call further down (embedded line
 * 1485). Unless a line hidden from this listing initializes it earlier, this
 * logs an uninitialized buffer -- confirm. */
1466 log_full(try ? LOG_WARNING
: LOG_ERR
,
1467 "Host and machine ids are equal (%s): refusing to link journals", id
);
1473 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1475 return log_error_errno(r
, "Failed to create /var: %m");
1477 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1479 return log_error_errno(r
, "Failed to create /var/log: %m");
1481 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1483 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1485 (void) sd_id128_to_string(arg_uuid
, id
);
/* p is the host-side journal path, q the same path below `directory`. */
1487 p
= strjoina("/var/log/journal/", id
);
1488 q
= prefix_roota(directory
, p
);
1490 if (path_is_mount_point(p
, 0) > 0) {
1494 log_error("%s: already a mount point, refusing to use for journal", p
);
1498 if (path_is_mount_point(q
, 0) > 0) {
1502 log_error("%s: already a mount point, refusing to use for journal", q
);
/* Find out whether p already exists as a symlink; the result in r selects
 * among the branches below (-EINVAL and -ENOENT are handled explicitly). */
1506 r
= readlink_and_make_absolute(p
, &d
);
1508 if ((arg_link_journal
== LINK_GUEST
||
1509 arg_link_journal
== LINK_AUTO
) &&
1512 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1514 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1519 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1520 } else if (r
== -EINVAL
) {
1522 if (arg_link_journal
== LINK_GUEST
&&
1525 if (errno
== ENOTDIR
) {
1526 log_error("%s already exists and is neither a symlink nor a directory", p
);
1529 return log_error_errno(errno
, "Failed to remove %s: %m", p
);
1531 } else if (r
!= -ENOENT
)
1532 return log_error_errno(r
, "readlink(%s) failed: %m", p
);
1534 if (arg_link_journal
== LINK_GUEST
) {
/* Host path p becomes a symlink to the guest's journal directory q. */
1536 if (symlink(q
, p
) < 0) {
1538 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1541 return log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1544 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1546 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
1550 if (arg_link_journal
== LINK_HOST
) {
1551 /* don't create parents here — if the host doesn't have
1552 * permanent journal set up, don't force it here */
1554 if (mkdir(p
, 0755) < 0 && errno
!= EEXIST
) {
1556 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1559 return log_error_errno(errno
, "Failed to create %s: %m", p
);
1562 } else if (access(p
, F_OK
) < 0)
/* A non-empty guest directory only triggers a warning; setup continues. */
1565 if (dir_is_empty(q
) == 0)
1566 log_warning("%s is not empty, proceeding anyway.", q
);
1568 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1570 return log_error_errno(r
, "Failed to create %s: %m", q
);
1572 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1573 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
/* Thin wrapper: returns the result of
 * capability_bounding_set_drop(arg_retain, false), i.e. the bounding set is
 * dropped with the global arg_retain mask as the argument. */
1578 static int drop_capabilities(void) {
1579 return capability_bounding_set_drop(arg_retain
, false);
/* Resets the audit login UID of the current process.
 *
 * Reads /proc/self/loginuid; if it already holds "4294967295" (the "Already
 * reset?" check) nothing more is needed. Otherwise that value is written back
 * with write_string_file(). When the write fails, a long explanatory message
 * about old kernels and audit is logged.
 *
 * arg_share_system is tested first; the action taken is not visible in this
 * listing.
 *
 * NOTE(review): the message text reads "or to off auditing", which appears to
 * be missing a verb ("turn off"). It also announces "Sleeping for 5s", but
 * the corresponding sleep call is not visible here -- confirm. */
1582 static int reset_audit_loginuid(void) {
1583 _cleanup_free_
char *p
= NULL
;
1586 if (arg_share_system
)
1589 r
= read_one_line_file("/proc/self/loginuid", &p
);
1593 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1595 /* Already reset? */
1596 if (streq(p
, "4294967295"))
1599 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1602 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1603 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1604 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1605 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1606 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Installs the seccomp filter for the container payload.
 *
 * Visible steps:
 *   1. seccomp_init(SCMP_ACT_ALLOW): everything is allowed by default.
 *   2. seccomp_add_secondary_archs() extends the filter to secondary archs.
 *   3. For each `blacklist` entry (capability -> syscall), the capability bit
 *      is tested against arg_retain and a rule returning EPERM for the
 *      syscall is added. Presumably the rule is skipped when the capability
 *      is retained -- the branch body is not visible here. Unknown syscalls
 *      are skipped ("continue").
 *   4. A rule is added that fails the syscall with EAFNOSUPPORT when its
 *      first argument equals AF_NETLINK and its third equals NETLINK_AUDIT
 *      (the socket() case described in the comment below).
 *   5. SCMP_FLTATR_CTL_NNP is set to 0 so loading the filter does not set
 *      NO_NEW_PRIVS.
 *   6. seccomp_load(); one failure mode is only logged at debug level
 *      ("Kernel is probably not configured with CONFIG_SECCOMP").
 *   7. seccomp_release() frees the filter context.
 *
 * NOTE(review): the error paths here call log_error_errno() without a visible
 * `return`; how control reaches seccomp_release() is not visible in this
 * listing. */
1614 static int setup_seccomp(void) {
1617 static const struct {
1618 uint64_t capability
;
1621 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1622 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1623 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1624 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1625 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1626 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1627 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1628 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1629 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1630 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1633 scmp_filter_ctx seccomp
;
1637 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1641 r
= seccomp_add_secondary_archs(seccomp
);
1643 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
1647 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1648 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1651 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1653 continue; /* unknown syscall */
1655 log_error_errno(r
, "Failed to block syscall: %m");
1662 Audit is broken in containers, much of the userspace audit
1663 hookup will fail if running inside a container. We don't
1664 care and just turn off creation of audit sockets.
1666 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1667 with EAFNOSUPPORT which audit userspace uses as indication
1668 that audit is disabled in the kernel.
1671 r
= seccomp_rule_add(
1673 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1676 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1677 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1679 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1683 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1685 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1689 r
= seccomp_load(seccomp
);
1691 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1696 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1701 seccomp_release(seccomp
);
/* Sets up the mount propagation directory between host and container.
 *
 * On the host, /run/systemd/nspawn/propagate/<arg_machine> is created with
 * mkdir_p() (best effort, results ignored). Inside `root`, /run/systemd,
 * /run/systemd/nspawn and /run/systemd/nspawn/incoming are created with
 * userns_mkdir(). The host directory is then bind mounted onto the
 * container's "incoming" directory and that bind mount is remounted
 * read-only.
 *
 * NOTE(review): both mount() failure messages below pass errno but contain
 * no "%m", unlike the other messages in this function, so the error text is
 * not printed -- looks unintended.
 * NOTE(review): several directories are created with mode 0600, i.e. without
 * the search (x) bit -- confirm this is deliberate. */
1709 static int setup_propagate(const char *root
) {
1713 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1714 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1715 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1716 (void) mkdir_p(p
, 0600);
1718 r
= userns_mkdir(root
, "/run/systemd", 0755, 0, 0);
1720 return log_error_errno(r
, "Failed to create /run/systemd: %m");
1722 r
= userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0);
1724 return log_error_errno(r
, "Failed to create /run/systemd/nspawn: %m");
1726 r
= userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1728 return log_error_errno(r
, "Failed to create /run/systemd/nspawn/incoming: %m");
/* p: host-side propagate directory; q: the container's incoming directory. */
1730 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1731 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1732 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1734 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1735 return log_error_errno(errno
, "Failed to make propagation mount read-only");
/* Turns the image named by the global arg_image into a block device path.
 *
 * arg_image is opened (read-only or read-write depending on arg_read_only)
 * and fstat()ed:
 *   - block device: its path is duplicated with strdup() (what happens to the
 *     copy is not visible in this listing);
 *   - anything that is not a regular file: rejected with an error;
 *   - regular file: a free loop device number is obtained from
 *     /dev/loop-control (LOOP_CTL_GET_FREE), /dev/loop<nr> is opened, attached
 *     to the image with LOOP_SET_FD and configured with LOOP_SET_STATUS64
 *     using LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN, plus LO_FLAGS_READ_ONLY
 *     (the condition guarding that flag is not visible here).
 *
 * On the loop path the device name is stored in *device_path. `loop_nr` is
 * not referenced in the visible lines.
 *
 * NOTE(review): `loopdev` is declared _cleanup_free_ and assigned to
 * *device_path; presumably it is reset to NULL right after so the caller
 * keeps ownership -- that line is not visible, confirm. */
1740 static int setup_image(char **device_path
, int *loop_nr
) {
1741 struct loop_info64 info
= {
1742 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1744 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1745 _cleanup_free_
char* loopdev
= NULL
;
1749 assert(device_path
);
1753 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1755 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1757 if (fstat(fd
, &st
) < 0)
1758 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1760 if (S_ISBLK(st
.st_mode
)) {
1763 p
= strdup(arg_image
);
1777 if (!S_ISREG(st
.st_mode
)) {
1778 log_error("%s is not a regular file or block device.", arg_image
);
1782 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1784 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
/* Ask the loop control device for the number of an unused loop device. */
1786 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1788 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1790 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1793 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1795 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
/* Attach the image file descriptor as the loop device's backing file. */
1797 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1798 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1801 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1803 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1804 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1806 *device_path
= loopdev
;
/* Explanatory text appended to the dissect_image() error messages: it spells
 * out which partition layouts are accepted (a single bootable MBR partition
 * of type 0x83, a single GPT partition of the given type UUID, or a layout
 * following the Discoverable Partitions Specification). */
1817 #define PARTITION_TABLE_BLURB \
1818 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1819 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1820 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1821 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1822 "to be bootable with systemd-nspawn."
/* Inspects the partition table of the image open on `fd` and reports which
 * partition devices to use as root, /home and /srv, each with a flag saying
 * whether it may be mounted read-write.
 *
 * Visible steps:
 *   1. Probe with libblkid (partition probing enabled, entry details
 *      requested). A blkid_do_safeprobe() result of -2 or 1 means no
 *      partition table could be identified; other non-zero results are
 *      reported as probe failures.
 *   2. PTTYPE must be "gpt" or "dos" (MBR); anything else is rejected.
 *   3. The block device is looked up in udev and its child devices are
 *      enumerated. The kernel's view is compared with
 *      blkid_partlist_numof_partitions(); on a mismatch the partition table
 *      is re-read with BLKRRPART, retrying up to 20 times with a 50 ms sleep
 *      while the result is -EBUSY.
 *   4. Each enumerated partition is mapped back to its blkid entry by device
 *      number and classified:
 *        - GPT: entries flagged GPT_FLAG_NO_AUTO are tested for (presumably
 *          skipped); the type UUID selects home, srv, native root, secondary
 *          root or generic Linux. For each kind the partition number is
 *          compared with the one already recorded (`nr >= home_nr` etc.),
 *          apparently so that the lowest-numbered candidate wins. The rw flag
 *          is the inverse of GPT_FLAG_READ_ONLY.
 *        - MBR: flags are compared with 0x80 (bootable) and the type with
 *          0x83 (Linux).
 *        - A second generic/MBR candidate sets `multiple_generic`.
 *   5. Root selection order: native root, then secondary root, then the
 *      generic partition -- the latter only when exactly one was found.
 *      Otherwise an error including PARTITION_TABLE_BLURB is logged.
 *   6. Home and srv results are stored in the output parameters.
 *
 * When built without blkid support the function only logs that --image= is
 * unsupported.
 *
 * NOTE(review): the parameter list and many guards/returns are not fully
 * visible in this listing (the embedded line numbers have gaps). */
1824 static int dissect_image(
1826 char **root_device
, bool *root_device_rw
,
1827 char **home_device
, bool *home_device_rw
,
1828 char **srv_device
, bool *srv_device_rw
,
1832 int home_nr
= -1, srv_nr
= -1;
1833 #ifdef GPT_ROOT_NATIVE
1836 #ifdef GPT_ROOT_SECONDARY
1837 int secondary_root_nr
= -1;
1839 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1840 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1841 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1842 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1843 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1844 struct udev_list_entry
*first
, *item
;
1845 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1846 bool is_gpt
, is_mbr
, multiple_generic
= false;
1847 const char *pttype
= NULL
;
1854 assert(root_device
);
1855 assert(home_device
);
1860 b
= blkid_new_probe();
1865 r
= blkid_probe_set_device(b
, fd
, 0, 0);
1870 return log_error_errno(errno
, "Failed to set device on blkid probe: %m");
1873 blkid_probe_enable_partitions(b
, 1);
1874 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
1877 r
= blkid_do_safeprobe(b
);
1878 if (r
== -2 || r
== 1) {
1879 log_error("Failed to identify any partition table on\n"
1881 PARTITION_TABLE_BLURB
, arg_image
);
1883 } else if (r
!= 0) {
1886 return log_error_errno(errno
, "Failed to probe: %m");
1889 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
1891 is_gpt
= streq_ptr(pttype
, "gpt");
1892 is_mbr
= streq_ptr(pttype
, "dos");
1894 if (!is_gpt
&& !is_mbr
) {
1895 log_error("No GPT or MBR partition table discovered on\n"
1897 PARTITION_TABLE_BLURB
, arg_image
);
1902 pl
= blkid_probe_get_partitions(b
);
1907 log_error("Failed to list partitions of %s", arg_image
);
1915 if (fstat(fd
, &st
) < 0)
1916 return log_error_errno(errno
, "Failed to stat block device: %m");
1918 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
1926 log_error("Kernel partitions never appeared.");
1930 e
= udev_enumerate_new(udev
);
1934 r
= udev_enumerate_add_match_parent(e
, d
);
1938 r
= udev_enumerate_scan_devices(e
);
1940 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
1942 /* Count the partitions enumerated by the kernel */
1944 first
= udev_enumerate_get_list_entry(e
);
1945 udev_list_entry_foreach(item
, first
)
1948 /* Count the partitions enumerated by blkid */
1949 m
= blkid_partlist_numof_partitions(pl
);
1953 log_error("blkid and kernel partition list do not match.");
1959 /* The kernel has probed fewer partitions than
1960 * blkid? Maybe the kernel prober is still
1961 * running or it got EBUSY because udev
1962 * already opened the device. Let's reprobe
1963 * the device, which is a synchronous call
1964 * that waits until probing is complete. */
1966 for (j
= 0; j
< 20; j
++) {
1968 r
= ioctl(fd
, BLKRRPART
, 0);
1971 if (r
>= 0 || r
!= -EBUSY
)
1974 /* If something else has the device
1975 * open, such as an udev rule, the
1976 * ioctl will return EBUSY. Since
1977 * there's no way to wait until it
1978 * isn't busy anymore, let's just wait
1979 * a bit, and try again.
1981 * This is really something they
1982 * should fix in the kernel! */
1984 usleep(50 * USEC_PER_MSEC
);
1988 return log_error_errno(r
, "Failed to reread partition table: %m");
1991 e
= udev_enumerate_unref(e
);
1994 first
= udev_enumerate_get_list_entry(e
);
1995 udev_list_entry_foreach(item
, first
) {
1996 _cleanup_udev_device_unref_
struct udev_device
*q
;
1998 unsigned long long flags
;
2004 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
2009 return log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
2012 qn
= udev_device_get_devnum(q
);
2016 if (st
.st_rdev
== qn
)
2019 node
= udev_device_get_devnode(q
);
2023 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
2027 flags
= blkid_partition_get_flags(pp
);
2029 nr
= blkid_partition_get_partno(pp
);
2037 if (flags
& GPT_FLAG_NO_AUTO
)
2040 stype
= blkid_partition_get_type_string(pp
);
2044 if (sd_id128_from_string(stype
, &type_id
) < 0)
2047 if (sd_id128_equal(type_id
, GPT_HOME
)) {
2049 if (home
&& nr
>= home_nr
)
2053 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2055 r
= free_and_strdup(&home
, node
);
2059 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
2061 if (srv
&& nr
>= srv_nr
)
2065 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2067 r
= free_and_strdup(&srv
, node
);
2071 #ifdef GPT_ROOT_NATIVE
2072 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
2074 if (root
&& nr
>= root_nr
)
2078 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2080 r
= free_and_strdup(&root
, node
);
2085 #ifdef GPT_ROOT_SECONDARY
2086 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
2088 if (secondary_root
&& nr
>= secondary_root_nr
)
2091 secondary_root_nr
= nr
;
2092 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2094 r
= free_and_strdup(&secondary_root
, node
);
2099 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2102 multiple_generic
= true;
2104 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2106 r
= free_and_strdup(&generic
, node
);
2112 } else if (is_mbr
) {
2115 if (flags
!= 0x80) /* Bootable flag */
2118 type
= blkid_partition_get_type(pp
);
2119 if (type
!= 0x83) /* Linux partition */
2123 multiple_generic
= true;
2127 r
= free_and_strdup(&root
, node
);
2135 *root_device
= root
;
2138 *root_device_rw
= root_rw
;
2140 } else if (secondary_root
) {
2141 *root_device
= secondary_root
;
2142 secondary_root
= NULL
;
2144 *root_device_rw
= secondary_root_rw
;
2146 } else if (generic
) {
2148 /* There were no partitions with precise meanings
2149 * around, but we found generic partitions. In this
2150 * case, if there's only one, we can go ahead and boot
2151 * it, otherwise we bail out, because we really cannot
2152 * make any sense of it. */
2154 if (multiple_generic
) {
2155 log_error("Identified multiple bootable Linux partitions on\n"
2157 PARTITION_TABLE_BLURB
, arg_image
);
2161 *root_device
= generic
;
2164 *root_device_rw
= generic_rw
;
2167 log_error("Failed to identify root partition in disk image\n"
2169 PARTITION_TABLE_BLURB
, arg_image
);
2174 *home_device
= home
;
2177 *home_device_rw
= home_rw
;
2184 *srv_device_rw
= srv_rw
;
2189 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device `what` at the path formed by joining `where` and
 * `directory` (strjoina(where, directory)).
 *
 * The file system type is detected with a libblkid superblock probe
 * (BLKID_SUBLKS_TYPE) and read from the "TYPE" value. Devices of type
 * "crypto_LUKS" are rejected. The mount uses MS_NODEV, plus MS_RDONLY when
 * `rw` is false.
 *
 * When built without blkid support the function only logs that --image= is
 * unsupported.
 *
 * NOTE(review): the safeprobe result check below tests `r == -1 || r == 1`,
 * whereas dissect_image() in this file tests `r == -2 || r == 1` for the same
 * call. libblkid documents -2 as "ambivalent result" and -1 as "error", so
 * the -1 here looks like it was meant to be -2 -- confirm against the
 * libblkid reference. */
2194 static int mount_device(const char *what
, const char *where
, const char *directory
, bool rw
) {
2196 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
2197 const char *fstype
, *p
;
2207 p
= strjoina(where
, directory
);
2212 b
= blkid_new_probe_from_filename(what
);
2216 return log_error_errno(errno
, "Failed to allocate prober for %s: %m", what
);
2219 blkid_probe_enable_superblocks(b
, 1);
2220 blkid_probe_set_superblocks_flags(b
, BLKID_SUBLKS_TYPE
);
2223 r
= blkid_do_safeprobe(b
);
2224 if (r
== -1 || r
== 1) {
2225 log_error("Cannot determine file system type of %s", what
);
2227 } else if (r
!= 0) {
2230 return log_error_errno(errno
, "Failed to probe %s: %m", what
);
2234 if (blkid_probe_lookup_value(b
, "TYPE", &fstype
, NULL
) < 0) {
2237 log_error("Failed to determine file system type of %s", what
);
2241 if (streq(fstype
, "crypto_LUKS")) {
2242 log_error("nspawn currently does not support LUKS disk images.");
2246 if (mount(what
, p
, fstype
, MS_NODEV
|(rw
? 0 : MS_RDONLY
), NULL
) < 0)
2247 return log_error_errno(errno
, "Failed to mount %s: %m", what
);
2251 log_error("--image= is not supported, compiled without blkid support.");
/* Determines the container's machine ID and stores it in the global arg_uuid.
 *
 * <directory>/etc/machine-id is read with read_one_line_file() and parsed
 * with sd_id128_from_string() into arg_uuid. If arg_uuid is the null ID
 * afterwards, a random one is generated with sd_id128_randomize(). Finally
 * machine_id_setup(directory, arg_uuid) is called.
 *
 * NOTE(review): how `t` is derived from the line read into `s`, and which
 * read errors are tolerated, is not visible in this listing. */
2256 static int setup_machine_id(const char *directory
) {
2258 const char *etc_machine_id
, *t
;
2259 _cleanup_free_
char *s
= NULL
;
2261 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
2263 r
= read_one_line_file(etc_machine_id
, &s
);
2265 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
2270 r
= sd_id128_from_string(t
, &arg_uuid
);
2272 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
2274 if (sd_id128_is_null(arg_uuid
)) {
2275 r
= sd_id128_randomize(&arg_uuid
);
2277 return log_error_errno(r
, "Failed to generate random machine ID: %m");
2281 r
= machine_id_setup(directory
, arg_uuid
);
2283 return log_error_errno(r
, "Failed to setup machine ID: %m");
/* Adjusts file ownership below `directory` to the container's UID/GID range
 * by calling path_patch_uid().
 *
 * The work is conditional on user namespacing being enabled
 * (arg_userns_mode != USER_NAMESPACE_NO) and arg_userns_chown being set; the
 * action taken otherwise is not visible in this listing. -EOPNOTSUPP from
 * path_patch_uid() is reported as an unsupported UID/GID range; two further
 * error messages and two debug messages (tree already correct / tree patched)
 * follow, whose conditions are not visible here.
 *
 * NOTE(review): the `shift` and `range` parameters are not referenced in the
 * visible lines; path_patch_uid() receives the globals arg_uid_shift and
 * arg_uid_range instead -- confirm whether the parameters are meant to be
 * used. */
2288 static int recursive_chown(const char *directory
, uid_t shift
, uid_t range
) {
2293 if (arg_userns_mode
== USER_NAMESPACE_NO
|| !arg_userns_chown
)
2296 r
= path_patch_uid(directory
, arg_uid_shift
, arg_uid_range
);
2297 if (r
== -EOPNOTSUPP
)
2298 return log_error_errno(r
, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2300 return log_error_errno(r
, "Upper 16 bits of root directory UID and GID do not match.");
2302 return log_error_errno(r
, "Failed to adjust UID/GID shift of OS tree: %m");
2304 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2306 log_debug("Patched directory tree to match UID/GID range.");
/* Mounts the partitions found for an image below the global arg_directory:
 * the root device at arg_directory itself (sub-directory NULL), the home
 * device at "/home" and the srv device at "/srv", each through
 * mount_device() with its read-write flag. Failures are logged and returned
 * via log_error_errno(). The conditions guarding each mount (e.g. whether a
 * device was found) are not visible in this listing. */
2311 static int mount_devices(
2313 const char *root_device
, bool root_device_rw
,
2314 const char *home_device
, bool home_device_rw
,
2315 const char *srv_device
, bool srv_device_rw
) {
2321 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2323 return log_error_errno(r
, "Failed to mount root directory: %m");
2327 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2329 return log_error_errno(r
, "Failed to mount home directory: %m");
2333 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2335 return log_error_errno(r
, "Failed to mount server data directory: %m");
/* Tears down a loop device; all failures are only logged.
 *
 * If `image_fd` points at a valid descriptor, LOOP_CLR_FD is issued on it and
 * the descriptor is closed with safe_close(), whose result is stored back
 * into *image_fd. Then /dev/loop-control is opened and loop device number
 * `nr` is removed with LOOP_CTL_REMOVE.
 *
 * Returns nothing. Any early-exit check on `nr` is not visible in this
 * listing. */
2341 static void loop_remove(int nr
, int *image_fd
) {
2342 _cleanup_close_
int control
= -1;
2348 if (image_fd
&& *image_fd
>= 0) {
2349 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2351 log_debug_errno(errno
, "Failed to close loop image: %m");
2352 *image_fd
= safe_close(*image_fd
);
2355 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2357 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2361 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2363 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2368 * < 0 : wait_for_terminate() failed to get the state of the
2369 * container, the container was terminated by a signal, or
2370 * failed for an unknown reason. No change is made to the
2371 * container argument.
2372 * > 0 : The program executed in the container terminated with an
2373 * error. The exit code of the program executed in the
2374 * container is returned. The container argument has been set
2375 * to CONTAINER_TERMINATED.
2376 * 0 : The container is being rebooted, has been shut down or exited
2377 * successfully. The container argument has been set to either
2378 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2380 * That is, success is indicated by a return value of zero, and an
2381 * error is indicated by a non-zero value.
/* Waits for the container process `pid` with wait_for_terminate() and
 * translates its termination status into a return value and a
 * ContainerStatus written to *container.
 *
 * Visible handling, keyed on status.si_code (the case labels themselves are
 * not visible in this listing):
 *   - exit with status 0: logs "exited successfully"; a non-zero status logs
 *     "failed with error code"; *container becomes CONTAINER_TERMINATED and
 *     status.si_status is returned.
 *   - SIGINT: logs "has been shut down", *container = CONTAINER_TERMINATED.
 *   - SIGHUP: logs "is being rebooted", *container = CONTAINER_REBOOTED.
 *   - other signals: logs "terminated by signal <name>".
 *   - anything else: logs "failed due to unknown reason".
 *
 * The informational messages use LOG_DEBUG instead of LOG_INFO when
 * arg_quiet is set. */
2383 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2387 r
= wait_for_terminate(pid
, &status
);
2389 return log_warning_errno(r
, "Failed to wait for container: %m");
2391 switch (status
.si_code
) {
2394 if (status
.si_status
== 0) {
2395 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2398 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2400 *container
= CONTAINER_TERMINATED
;
2401 return status
.si_status
;
2404 if (status
.si_status
== SIGINT
) {
2406 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2407 *container
= CONTAINER_TERMINATED
;
2410 } else if (status
.si_status
== SIGHUP
) {
2412 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2413 *container
= CONTAINER_REBOOTED
;
2417 /* CLD_KILLED fallthrough */
2420 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2424 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Signal event handler that asks the container to shut down.
 *
 * `userdata` carries the container PID (decoded with PTR_TO_PID). If sending
 * arg_kill_signal to that PID succeeds, a hint is logged and the event
 * source's userdata is cleared with sd_event_source_set_userdata(s, NULL).
 * Otherwise -- the exact branch structure is not visible in this listing --
 * the event loop is asked to exit with code 0.
 *
 * NOTE(review): the log text names SIGTERM although the signal sent is
 * arg_kill_signal; this handler is presumably registered for SIGTERM, which
 * is not visible here. */
2431 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2434 pid
= PTR_TO_PID(userdata
);
2436 if (kill(pid
, arg_kill_signal
) >= 0) {
2437 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2438 sd_event_source_set_userdata(s
, NULL
);
2443 sd_event_exit(sd_event_source_get_event(s
), 0);
/* Fills in whichever of the globals arg_directory, arg_image and arg_machine
 * the user did not specify.
 *
 * Visible rules:
 *   - With a template and a machine name but no directory, the directory
 *     becomes /var/lib/machines/<arg_machine>.
 *   - With neither image nor directory, the machine is looked up with
 *     image_find(); a raw image sets arg_image, anything else sets
 *     arg_directory, and the image's read_only flag is OR-ed into
 *     arg_read_only. The current working directory is also used as a
 *     fallback (the condition is not visible in this listing).
 *   - The machine name defaults to the host name when the directory is "/",
 *     otherwise to the basename of the image or directory. It is passed
 *     through hostname_cleanup() and must satisfy machine_name_is_valid().
 *   - For ephemeral containers a random 64-bit hex suffix is appended to the
 *     name.
 *
 * NOTE(review): the "No image for machine" message uses "%m" in a plain
 * log_error() call, so it prints whatever errno happens to be at that point
 * -- looks unintended. */
2447 static int determine_names(void) {
2450 if (arg_template
&& !arg_directory
&& arg_machine
) {
2452 /* If --template= was specified then we should not
2453 * search for a machine, but instead create a new one
2454 * in /var/lib/machines. */
2456 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2461 if (!arg_image
&& !arg_directory
) {
2463 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2465 r
= image_find(arg_machine
, &i
);
2467 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2469 log_error("No image for machine '%s': %m", arg_machine
);
2473 if (i
->type
== IMAGE_RAW
)
2474 r
= free_and_strdup(&arg_image
, i
->path
);
2476 r
= free_and_strdup(&arg_directory
, i
->path
);
2478 return log_error_errno(r
, "Invalid image directory: %m");
2481 arg_read_only
= arg_read_only
|| i
->read_only
;
2483 arg_directory
= get_current_dir_name();
2485 if (!arg_directory
&& !arg_machine
) {
2486 log_error("Failed to determine path, please use -D or -i.");
2492 if (arg_directory
&& path_equal(arg_directory
, "/"))
2493 arg_machine
= gethostname_malloc();
2495 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2500 hostname_cleanup(arg_machine
);
2501 if (!machine_name_is_valid(arg_machine
)) {
2502 log_error("Failed to determine machine name automatically, please use -M.");
2506 if (arg_ephemeral
) {
2509 /* Add a random suffix when this is an
2510 * ephemeral machine, so that we can run many
2511 * instances at once without manually having
2512 * to specify -M each time. */
2514 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
/* Works out the UID/GID base (arg_uid_shift) and range (arg_uid_range) for
 * user namespacing.
 *
 * arg_userns_mode == USER_NAMESPACE_NO is tested first (the action taken is
 * not visible in this listing). If no shift was given (UID_INVALID),
 * `directory` is stat()ed and the shift is taken from the upper 16 bits of
 * its owner UID; the upper 16 bits of the owner GID must match, otherwise an
 * error is logged. The range is then set to 0x10000.
 *
 * Finally the shift is checked against (uid_t) -1 - arg_uid_range so that
 * shift + range cannot exceed the uid_t value space. */
2525 static int determine_uid_shift(const char *directory
) {
2528 if (arg_userns_mode
== USER_NAMESPACE_NO
) {
2533 if (arg_uid_shift
== UID_INVALID
) {
2536 r
= stat(directory
, &st
);
2538 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2540 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2542 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2543 log_error("UID and GID base of %s don't match.", directory
);
2547 arg_uid_range
= UINT32_C(0x10000);
2550 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2551 log_error("UID base too high for UID range.");
/* Body of the inner child process. Visible steps, in order: synchronize with the
 * parent over `barrier` (sync points #1-#4), mount sysfs and the writable systemd
 * cgroup, reset UID/GID, set up boot id and kmsg, drop capabilities, apply the
 * personality and SELinux exec context, switch to arg_user, build the environment
 * block and finally exec the payload (init, the given command line, or a shell).
 *
 * NOTE(review): this listing omits many original lines (part of the parameter list,
 * error guards, returns, closing braces, and parts of two comments); the comments
 * here describe only what is visible. */
2558 static int inner_child(
2560 const char *directory
,
2566 _cleanup_free_
char *home
= NULL
;
2569 const char *envp
[] = {
2570 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2571 NULL
, /* container */
2576 NULL
, /* container_uuid */
2577 NULL
, /* LISTEN_FDS */
2578 NULL
, /* LISTEN_PID */
2582 _cleanup_strv_free_
char **env_use
= NULL
;
2587 assert(kmsg_socket
>= 0);
2591 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
2592 /* Tell the parent that it can now write the UID map. */
2593 (void) barrier_place(barrier
); /* #1 */
2595 /* Wait until the parent wrote the UID map */
2596 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2597 log_error("Parent died too early");
2603 arg_userns_mode
!= USER_NAMESPACE_NO
,
2605 arg_private_network
,
2608 arg_selinux_apifs_context
);
2613 r
= mount_sysfs(NULL
);
2617 /* Wait until we are cgroup-ified, so that we
2618 * can mount the right cgroup path writable */
2619 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2620 log_error("Parent died too early");
2624 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2628 r
= reset_uid_gid();
2630 return log_error_errno(r
, "Couldn't become new root: %m");
2632 r
= setup_boot_id(NULL
);
2636 r
= setup_kmsg(NULL
, kmsg_socket
);
2639 kmsg_socket
= safe_close(kmsg_socket
);
2644 return log_error_errno(errno
, "setsid() failed: %m");
2646 if (arg_private_network
)
2649 if (arg_expose_ports
) {
2650 r
= expose_port_send_rtnl(rtnl_socket
);
2653 rtnl_socket
= safe_close(rtnl_socket
);
2656 r
= drop_capabilities();
2658 return log_error_errno(r
, "drop_capabilities() failed: %m");
/* An explicitly configured personality wins; otherwise `secondary` selects PER_LINUX32. */
2662 if (arg_personality
!= PERSONALITY_INVALID
) {
2663 if (personality(arg_personality
) < 0)
2664 return log_error_errno(errno
, "personality() failed: %m");
2665 } else if (secondary
) {
2666 if (personality(PER_LINUX32
) < 0)
2667 return log_error_errno(errno
, "personality() failed: %m");
2671 if (arg_selinux_context
)
2672 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2673 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
2676 r
= change_uid_gid(arg_user
, &home
);
2680 /* LXC sets container=lxc, so follow the scheme here */
2681 envp
[n_env
++] = strjoina("container=", arg_container_service_name
);
2683 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
/* Append HOME, USER and LOGNAME, falling back to "/root" and "root" when unset. */
2687 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
2688 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
2689 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
2692 assert(!sd_id128_equal(arg_uuid
, SD_ID128_NULL
));
2694 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
/* When file descriptors are passed along, clear O_CLOEXEC on them and announce them
 * through LISTEN_FDS / LISTEN_PID=1. */
2697 if (fdset_size(fds
) > 0) {
2698 r
= fdset_cloexec(fds
, false);
2700 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
2702 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
2703 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
/* Combine the environment assembled above with the entries from arg_setenv. */
2707 env_use
= strv_env_merge(2, envp
, arg_setenv
);
2711 /* Let the parent know that we are ready and
2712 * wait until the parent is ready with the
2714 if (!barrier_place_and_sync(barrier
)) { /* #4 */
2715 log_error("Parent died too early");
2720 if (chdir(arg_chdir
) < 0)
2721 return log_error_errno(errno
, "Failed to change to specified working directory %s: %m", arg_chdir
);
2723 if (arg_start_mode
== START_PID2
) {
2729 /* Now, explicitly close the log, so that we
2730 * then can close all remaining fds. Closing
2731 * the log explicitly first has the benefit
2732 * that the logging subsystem knows about it,
2733 * and is thus ready to be reopened should we
2734 * need it again. Note that the other fds
2735 * closed here are at least the locking and
2738 (void) fdset_close_others(fds
);
2740 if (arg_start_mode
== START_BOOT
) {
2744 /* Automatically search for the init system */
2746 m
= strv_length(arg_parameters
);
2747 a
= newa(char*, m
+ 2);
2748 memcpy_safe(a
+ 1, arg_parameters
, m
* sizeof(char*));
/* Try each candidate init binary in turn; execve() only returns if it failed. */
2751 a
[0] = (char*) "/usr/lib/systemd/systemd";
2752 execve(a
[0], a
, env_use
);
2754 a
[0] = (char*) "/lib/systemd/systemd";
2755 execve(a
[0], a
, env_use
);
2757 a
[0] = (char*) "/sbin/init";
2758 execve(a
[0], a
, env_use
);
2759 } else if (!strv_isempty(arg_parameters
))
2760 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
2763 /* If we cannot change the directory, we'll end up in /, that is expected. */
2764 (void) chdir(home
?: "/root");
/* Shell fallback: /bin/bash first, then /bin/sh. */
2766 execle("/bin/bash", "-bash", NULL
, env_use
);
2767 execle("/bin/sh", "-sh", NULL
, env_use
);
2772 return log_error_errno(r
, "execv() failed: %m");
/* Body of the outer child process. Visible steps, in order: request SIGKILL on parent
 * death, re-open the console as stdin/stdout/stderr, mark all mounts MS_SLAVE, mount
 * the image devices, determine the UID shift (exchanging it with the parent over
 * uid_shift_socket when user namespaces are in use), prepare the directory tree,
 * move the root, raw_clone() the inner child into the requested namespaces, and
 * finally send the inner child's PID and the machine ID over pid_socket/uuid_socket.
 *
 * NOTE(review): this listing omits many original lines (part of the parameter list,
 * error guards, returns, closing braces); the comments here describe only what is
 * visible. */
2775 static int outer_child(
2777 const char *directory
,
2778 const char *console
,
2779 const char *root_device
, bool root_device_rw
,
2780 const char *home_device
, bool home_device_rw
,
2781 const char *srv_device
, bool srv_device_rw
,
2788 int uid_shift_socket
,
2798 assert(pid_socket
>= 0);
2799 assert(uuid_socket
>= 0);
2800 assert(kmsg_socket
>= 0);
2804 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
2805 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
/* Replace stdin/stdout/stderr with the console: close all three, open the console
 * (which is expected to land on fd 0), then duplicate it onto fds 1 and 2. */
2808 close_nointr(STDIN_FILENO
);
2809 close_nointr(STDOUT_FILENO
);
2810 close_nointr(STDERR_FILENO
);
2812 r
= open_terminal(console
, O_RDWR
);
2813 if (r
!= STDIN_FILENO
) {
2819 return log_error_errno(r
, "Failed to open console: %m");
2822 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
2823 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
2824 return log_error_errno(errno
, "Failed to duplicate console: %m");
2827 r
= reset_audit_loginuid();
2831 /* Mark everything as slave, so that we still
2832 * receive mounts from the real root, but don't
2833 * propagate mounts to the real root. */
2834 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
2835 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
2837 r
= mount_devices(directory
,
2838 root_device
, root_device_rw
,
2839 home_device
, home_device_rw
,
2840 srv_device
, srv_device_rw
);
2844 r
= determine_uid_shift(directory
);
2848 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
2849 /* Let the parent know which UID shift we read from the image */
2850 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
2852 return log_error_errno(errno
, "Failed to send UID shift: %m");
2853 if (l
!= sizeof(arg_uid_shift
)) {
2854 log_error("Short write while sending UID shift.");
2858 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
2859 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2860 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2861 * not it will pick a different one, and send it back to us. */
2863 l
= recv(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
2865 return log_error_errno(errno
, "Failed to recv UID shift: %m");
2866 if (l
!= sizeof(arg_uid_shift
)) {
2867 log_error("Short read while receiving UID shift.");
2872 log_info("Selected user namespace base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
2875 /* Turn directory into bind mount */
2876 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
2877 return log_error_errno(errno
, "Failed to make bind mount: %m");
2879 r
= recursive_chown(directory
, arg_uid_shift
, arg_uid_range
);
2886 arg_userns_mode
!= USER_NAMESPACE_NO
,
2889 arg_selinux_context
);
2893 r
= setup_volatile_state(
2896 arg_userns_mode
!= USER_NAMESPACE_NO
,
2899 arg_selinux_context
);
2903 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
2907 if (arg_read_only
) {
2908 r
= bind_remount_recursive(directory
, true);
2910 return log_error_errno(r
, "Failed to make tree read-only: %m");
2913 r
= mount_all(directory
,
2914 arg_userns_mode
!= USER_NAMESPACE_NO
,
2916 arg_private_network
,
2919 arg_selinux_apifs_context
);
2923 r
= copy_devnodes(directory
);
2927 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
2929 r
= setup_pts(directory
);
2933 r
= setup_propagate(directory
);
2937 r
= setup_dev_console(directory
, console
);
2941 r
= setup_seccomp();
2945 r
= setup_timezone(directory
);
2949 r
= setup_resolv_conf(directory
);
2953 r
= setup_machine_id(directory
);
2957 r
= setup_journal(directory
);
2964 arg_n_custom_mounts
,
2965 arg_userns_mode
!= USER_NAMESPACE_NO
,
2968 arg_selinux_apifs_context
);
2974 arg_unified_cgroup_hierarchy
,
2975 arg_userns_mode
!= USER_NAMESPACE_NO
,
2978 arg_selinux_apifs_context
);
2982 r
= mount_move_root(directory
);
2984 return log_error_errno(r
, "Failed to move root directory: %m");
/* Fork the inner child: always a new mount namespace; IPC/PID/UTS unless
 * arg_share_system is set; network and user namespaces only when requested. */
2986 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
2987 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
2988 (arg_private_network
? CLONE_NEWNET
: 0) |
2989 (arg_userns_mode
!= USER_NAMESPACE_NO
? CLONE_NEWUSER
: 0),
2992 return log_error_errno(errno
, "Failed to fork inner child: %m");
2994 pid_socket
= safe_close(pid_socket
);
2995 uuid_socket
= safe_close(uuid_socket
);
2996 uid_shift_socket
= safe_close(uid_shift_socket
);
2998 /* The inner child has all namespaces that are
2999 * requested, so that we all are owned by the user if
3000 * user namespaces are turned on. */
3002 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
3004 _exit(EXIT_FAILURE
);
3006 _exit(EXIT_SUCCESS
);
/* Hand the inner child's PID and the machine ID over their sockets; a short write
 * is reported as an error. */
3009 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
3011 return log_error_errno(errno
, "Failed to send PID: %m");
3012 if (l
!= sizeof(pid
)) {
3013 log_error("Short write while sending PID.");
3017 l
= send(uuid_socket
, &arg_uuid
, sizeof(arg_uuid
), MSG_NOSIGNAL
);
3019 return log_error_errno(errno
, "Failed to send machine ID: %m");
3020 if (l
!= sizeof(arg_uuid
)) {
3021 log_error("Short write while sending machine ID.");
3025 pid_socket
= safe_close(pid_socket
);
3026 uuid_socket
= safe_close(uuid_socket
);
3027 kmsg_socket
= safe_close(kmsg_socket
);
3028 rtnl_socket
= safe_close(rtnl_socket
);
/* Pick a UID/GID base for USER_NAMESPACE_PICK mode and lock it.
 *
 * shift:         in/out UID base (how it seeds and receives `candidate` is not visible
 *                in this listing).
 * ret_lock_file: receives the lock file guarding the chosen range.
 *
 * A candidate is acceptable when it lies within [UID_SHIFT_PICK_MIN, UID_SHIFT_PICK_MAX],
 * has its lower 16 bits clear, can be locked under /run/systemd/nspawn-uid/ and is not
 * known to the user/group database at its first and at its 0xFFFE-th ID.
 *
 * NOTE(review): this listing omits several original lines (loop header, `goto`/`continue`
 * targets, returns, closing braces); the comments here describe only what is visible. */
3033 static int uid_shift_pick(uid_t
*shift
, LockFile
*ret_lock_file
) {
3034 unsigned n_tries
= 100;
3039 assert(ret_lock_file
);
3040 assert(arg_userns_mode
== USER_NAMESPACE_PICK
);
3041 assert(arg_uid_range
== 0x10000U
);
/* Best effort: the result of mkdir() is deliberately ignored. */
3045 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3048 char lock_path
[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t
) + 1];
3049 _cleanup_release_lock_file_ LockFile lf
= LOCK_FILE_INIT
;
/* Candidate must be inside the pickable window ... */
3054 if (candidate
< UID_SHIFT_PICK_MIN
|| candidate
> UID_SHIFT_PICK_MAX
)
/* ... and a multiple of 0x10000. */
3056 if ((candidate
& UINT32_C(0xFFFF)) != 0)
/* Take an exclusive, non-blocking lock on a file named after the candidate. */
3059 xsprintf(lock_path
, "/run/systemd/nspawn-uid/" UID_FMT
, candidate
);
3060 r
= make_lock_file(lock_path
, LOCK_EX
|LOCK_NB
, &lf
);
3061 if (r
== -EBUSY
) /* Range already taken by another nspawn instance */
3066 /* Make some superficial checks whether the range is currently known in the user database */
3067 if (getpwuid(candidate
))
3069 if (getpwuid(candidate
+ UINT32_C(0xFFFE)))
3071 if (getgrgid(candidate
))
3073 if (getgrgid(candidate
+ UINT32_C(0xFFFE)))
/* Hand the lock to the caller and reset the local copy to its initial value. */
3076 *ret_lock_file
= lf
;
3077 lf
= (struct LockFile
) LOCK_FILE_INIT
;
/* Draw a new random candidate inside the pickable window and clear its lower 16 bits. */
3082 random_bytes(&candidate
, sizeof(candidate
));
3083 candidate
= (candidate
% (UID_SHIFT_PICK_MAX
- UID_SHIFT_PICK_MIN
)) + UID_SHIFT_PICK_MIN
;
3084 candidate
&= (uid_t
) UINT32_C(0xFFFF0000);
/* Write the UID and GID maps of process `pid`.
 *
 * The same single line, "0 <arg_uid_shift> <arg_uid_range>\n", is written first to
 * /proc/<pid>/uid_map and then to /proc/<pid>/gid_map. A failing write is logged and
 * its error returned.
 *
 * NOTE(review): this listing omits several original lines (declarations, error guards,
 * the final return and closing brace); the comments here describe only what is visible. */
3088 static int setup_uid_map(pid_t pid
) {
/* uid_map holds the /proc path (re-used for gid_map, which has the same length);
 * line holds the three decimal numbers plus separators. */
3089 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
3094 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
3095 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
3096 r
= write_string_file(uid_map
, line
, 0);
3098 return log_error_errno(r
, "Failed to write UID map: %m");
3100 /* We always assign the same UID and GID ranges */
3101 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
3102 r
= write_string_file(uid_map
, line
, 0);
3104 return log_error_errno(r
, "Failed to write GID map: %m");
/* Locate and apply the "<arg_machine>.nspawn" settings file.
 *
 * The file is looked up first in /etc/systemd/nspawn and /run/systemd/nspawn (trusted by
 * default), then next to the image or directory being booted (untrusted by default).
 * Each group of settings is copied into the corresponding arg_* global unless the
 * matching bit in arg_settings_mask is set; several groups are ignored with a warning
 * when the file is not trusted.
 *
 * Pointer-valued settings are moved rather than copied: after the assignment the field
 * in `settings` is set to NULL, presumably so that the cleanup handler attached to
 * `settings` does not free what the arg_* global now owns.
 *
 * NOTE(review): this listing omits many original lines (declarations, fopen() calls,
 * `else` branches, returns, closing braces); the comments here describe only what is
 * visible. */
3109 static int load_settings(void) {
3110 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
3111 _cleanup_fclose_
FILE *f
= NULL
;
3112 _cleanup_free_
char *p
= NULL
;
3116 /* If all settings are masked, there's no point in looking for
3117 * the settings file */
3118 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
3121 fn
= strjoina(arg_machine
, ".nspawn");
3123 /* We first look in the admin's directories in /etc and /run */
3124 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3125 _cleanup_free_
char *j
= NULL
;
3127 j
= strjoin(i
, "/", fn
, NULL
);
3136 /* By default, we trust configuration from /etc and /run */
3137 if (arg_settings_trusted
< 0)
3138 arg_settings_trusted
= true;
3143 if (errno
!= ENOENT
)
3144 return log_error_errno(errno
, "Failed to open %s: %m", j
);
3148 /* After that, let's look for a file next to the
3149 * actual image we shall boot. */
3152 p
= file_in_same_dir(arg_image
, fn
);
3155 } else if (arg_directory
) {
3156 p
= file_in_same_dir(arg_directory
, fn
);
3163 if (!f
&& errno
!= ENOENT
)
3164 return log_error_errno(errno
, "Failed to open %s: %m", p
);
3166 /* By default, we do not trust configuration from /var/lib/machines */
3167 if (arg_settings_trusted
< 0)
3168 arg_settings_trusted
= false;
3175 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
3177 r
= settings_load(f
, p
, &settings
);
3181 /* Copy over bits from the settings, unless they have been
3182 * explicitly masked by command line switches. */
3184 if ((arg_settings_mask
& SETTING_START_MODE
) == 0 &&
3185 settings
->start_mode
>= 0) {
3186 arg_start_mode
= settings
->start_mode
;
3188 strv_free(arg_parameters
);
3189 arg_parameters
= settings
->parameters
;
3190 settings
->parameters
= NULL
;
3193 if ((arg_settings_mask
& SETTING_WORKING_DIRECTORY
) == 0 &&
3194 settings
->working_directory
) {
3196 arg_chdir
= settings
->working_directory
;
3197 settings
->working_directory
= NULL
;
3200 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
3201 settings
->environment
) {
3202 strv_free(arg_setenv
);
3203 arg_setenv
= settings
->environment
;
3204 settings
->environment
= NULL
;
3207 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
3210 arg_user
= settings
->user
;
3211 settings
->user
= NULL
;
3214 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
/* Capabilities to add: those from the file, plus CAP_NET_ADMIN when the settings ask
 * for a private network. */
3217 plus
= settings
->capability
;
3218 if (settings_private_network(settings
))
3219 plus
|= (1ULL << CAP_NET_ADMIN
);
3221 if (!arg_settings_trusted
&& plus
!= 0) {
3222 if (settings
->capability
!= 0)
3223 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
3227 arg_retain
&= ~settings
->drop_capability
;
3230 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
3231 settings
->kill_signal
> 0)
3232 arg_kill_signal
= settings
->kill_signal
;
3234 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
3235 settings
->personality
!= PERSONALITY_INVALID
)
3236 arg_personality
= settings
->personality
;
3238 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
3239 !sd_id128_is_null(settings
->machine_id
)) {
3241 if (!arg_settings_trusted
)
3242 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
3244 arg_uuid
= settings
->machine_id
;
3247 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
3248 settings
->read_only
>= 0)
3249 arg_read_only
= settings
->read_only
;
3251 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
3252 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
3253 arg_volatile_mode
= settings
->volatile_mode
;
3255 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
3256 settings
->n_custom_mounts
> 0) {
3258 if (!arg_settings_trusted
)
3259 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
3261 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3262 arg_custom_mounts
= settings
->custom_mounts
;
3263 arg_n_custom_mounts
= settings
->n_custom_mounts
;
3265 settings
->custom_mounts
= NULL
;
3266 settings
->n_custom_mounts
= 0;
/* Network settings are applied as a group as soon as any one of them is set. */
3270 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
3271 (settings
->private_network
>= 0 ||
3272 settings
->network_veth
>= 0 ||
3273 settings
->network_bridge
||
3274 settings
->network_interfaces
||
3275 settings
->network_macvlan
||
3276 settings
->network_ipvlan
||
3277 settings
->network_veth_extra
)) {
3279 if (!arg_settings_trusted
)
3280 log_warning("Ignoring network settings, file %s is not trusted.", p
);
3282 arg_network_veth
= settings_network_veth(settings
);
3283 arg_private_network
= settings_private_network(settings
);
3285 strv_free(arg_network_interfaces
);
3286 arg_network_interfaces
= settings
->network_interfaces
;
3287 settings
->network_interfaces
= NULL
;
3289 strv_free(arg_network_macvlan
);
3290 arg_network_macvlan
= settings
->network_macvlan
;
3291 settings
->network_macvlan
= NULL
;
3293 strv_free(arg_network_ipvlan
);
3294 arg_network_ipvlan
= settings
->network_ipvlan
;
3295 settings
->network_ipvlan
= NULL
;
3297 strv_free(arg_network_veth_extra
);
3298 arg_network_veth_extra
= settings
->network_veth_extra
;
3299 settings
->network_veth_extra
= NULL
;
3301 free(arg_network_bridge
);
3302 arg_network_bridge
= settings
->network_bridge
;
3303 settings
->network_bridge
= NULL
;
3307 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3308 settings
->expose_ports
) {
3310 if (!arg_settings_trusted
)
3311 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3313 expose_port_free_all(arg_expose_ports
);
3314 arg_expose_ports
= settings
->expose_ports
;
3315 settings
->expose_ports
= NULL
;
3319 if ((arg_settings_mask
& SETTING_USERNS
) == 0 &&
3320 settings
->userns_mode
!= _USER_NAMESPACE_MODE_INVALID
) {
3322 if (!arg_settings_trusted
)
3323 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p
);
3325 arg_userns_mode
= settings
->userns_mode
;
3326 arg_uid_shift
= settings
->uid_shift
;
3327 arg_uid_range
= settings
->uid_range
;
3328 arg_userns_chown
= settings
->userns_chown
;
3335 int main(int argc
, char *argv
[]) {
3337 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3338 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3339 _cleanup_close_
int master
= -1, image_fd
= -1;
3340 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3341 int r
, n_fd_passed
, loop_nr
= -1;
3342 char veth_name
[IFNAMSIZ
];
3343 bool secondary
= false, remove_subvol
= false;
3346 int ret
= EXIT_SUCCESS
;
3347 union in_addr_union exposed
= {};
3348 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3351 log_parse_environment();
3354 /* Make sure rename_process() in the stub init process can work */
3358 r
= parse_argv(argc
, argv
);
3362 if (geteuid() != 0) {
3363 log_error("Need to be root.");
3367 r
= determine_names();
3371 r
= load_settings();
3375 r
= verify_arguments();
3379 n_fd_passed
= sd_listen_fds(false);
3380 if (n_fd_passed
> 0) {
3381 r
= fdset_new_listen_fds(&fds
, false);
3383 log_error_errno(r
, "Failed to collect file descriptors: %m");
3388 if (arg_directory
) {
3391 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3392 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3397 if (arg_ephemeral
) {
3398 _cleanup_free_
char *np
= NULL
;
3400 /* If the specified path is a mount point we
3401 * generate the new snapshot immediately
3402 * inside it under a random name. However if
3403 * the specified path is not a mount point we
3404 * create the new snapshot in the parent
3405 * directory, just next to it. */
3406 r
= path_is_mount_point(arg_directory
, 0);
3408 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3412 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3414 r
= tempfn_random(arg_directory
, "machine.", &np
);
3416 log_error_errno(r
, "Failed to generate name for snapshot: %m");
3420 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3422 log_error_errno(r
, "Failed to lock %s: %m", np
);
3426 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3428 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3432 free(arg_directory
);
3436 remove_subvol
= true;
3439 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3441 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3445 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
3450 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
3453 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3455 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3459 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
3464 if (arg_start_mode
== START_BOOT
) {
3465 if (path_is_os_tree(arg_directory
) <= 0) {
3466 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3473 p
= strjoina(arg_directory
, "/usr/");
3474 if (laccess(p
, F_OK
) < 0) {
3475 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory
);
3482 char template[] = "/tmp/nspawn-root-XXXXXX";
3485 assert(!arg_template
);
3487 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3489 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3493 r
= log_error_errno(r
, "Failed to create image lock: %m");
3497 if (!mkdtemp(template)) {
3498 log_error_errno(errno
, "Failed to create temporary directory: %m");
3503 arg_directory
= strdup(template);
3504 if (!arg_directory
) {
3509 image_fd
= setup_image(&device_path
, &loop_nr
);
3515 r
= dissect_image(image_fd
,
3516 &root_device
, &root_device_rw
,
3517 &home_device
, &home_device_rw
,
3518 &srv_device
, &srv_device_rw
,
3524 r
= custom_mounts_prepare();
3529 isatty(STDIN_FILENO
) > 0 &&
3530 isatty(STDOUT_FILENO
) > 0;
3532 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3534 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3538 r
= ptsname_malloc(master
, &console
);
3540 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3544 if (arg_selinux_apifs_context
) {
3545 r
= mac_selinux_apply(console
, arg_selinux_apifs_context
);
3550 if (unlockpt(master
) < 0) {
3551 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3556 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3557 arg_machine
, arg_image
?: arg_directory
);
3559 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3561 assert_se(sigemptyset(&mask_chld
) == 0);
3562 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3564 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3565 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
3570 static const struct sigaction sa
= {
3571 .sa_handler
= nop_signal_handler
,
3572 .sa_flags
= SA_NOCLDSTOP
,
3575 _cleanup_release_lock_file_ LockFile uid_shift_lock
= LOCK_FILE_INIT
;
3576 _cleanup_close_
int etc_passwd_lock
= -1;
3577 _cleanup_close_pair_
int
3578 kmsg_socket_pair
[2] = { -1, -1 },
3579 rtnl_socket_pair
[2] = { -1, -1 },
3580 pid_socket_pair
[2] = { -1, -1 },
3581 uuid_socket_pair
[2] = { -1, -1 },
3582 uid_shift_socket_pair
[2] = { -1, -1 };
3583 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3584 _cleanup_(sd_event_unrefp
) sd_event
*event
= NULL
;
3585 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3586 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
3587 ContainerStatus container_status
;
3592 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
3593 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3594 * check with getpwuid() if the specific user already exists. Note that /etc might be
3595 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3596 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3597 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3600 etc_passwd_lock
= take_etc_passwd_lock(NULL
);
3601 if (etc_passwd_lock
< 0 && etc_passwd_lock
!= -EROFS
) {
/* The error code lives in etc_passwd_lock, not in r (which still holds the result of
 * an earlier, unrelated call). Log the real error and record it in r, as the other
 * failure paths in this function do. */
3602 r
= log_error_errno(etc_passwd_lock
, "Failed to take /etc/passwd lock: %m");
3607 r
= barrier_create(&barrier
);
3609 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3613 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3614 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3618 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3619 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3623 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3624 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3628 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uuid_socket_pair
) < 0) {
3629 r
= log_error_errno(errno
, "Failed to create id socket pair: %m");
3633 if (arg_userns_mode
!= USER_NAMESPACE_NO
)
3634 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3635 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3639 /* Child can be killed before execv(), so handle SIGCHLD
3640 * in order to interrupt parent's blocking calls and
3641 * give it a chance to call wait() and terminate. */
3642 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3644 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3648 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3650 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
3654 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3656 if (errno
== EINVAL
)
3657 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3659 r
= log_error_errno(errno
, "clone() failed: %m");
3665 /* The outer child only has a file system namespace. */
3666 barrier_set_role(&barrier
, BARRIER_CHILD
);
3668 master
= safe_close(master
);
3670 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3671 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3672 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3673 uuid_socket_pair
[0] = safe_close(uuid_socket_pair
[0]);
3674 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3676 (void) reset_all_signal_handlers();
3677 (void) reset_signal_mask();
3679 r
= outer_child(&barrier
,
3682 root_device
, root_device_rw
,
3683 home_device
, home_device_rw
,
3684 srv_device
, srv_device_rw
,
3688 uuid_socket_pair
[1],
3689 kmsg_socket_pair
[1],
3690 rtnl_socket_pair
[1],
3691 uid_shift_socket_pair
[1],
3694 _exit(EXIT_FAILURE
);
3696 _exit(EXIT_SUCCESS
);
3699 barrier_set_role(&barrier
, BARRIER_PARENT
);
3701 fds
= fdset_free(fds
);
3703 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3704 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3705 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3706 uuid_socket_pair
[1] = safe_close(uuid_socket_pair
[1]);
3707 uid_shift_socket_pair
[1] = safe_close(uid_shift_socket_pair
[1]);
3709 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
3710 /* The child just let us know the UID shift it might have read from the image. */
3711 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3713 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3716 if (l
!= sizeof(arg_uid_shift
)) {
3717 log_error("Short read while reading UID shift.");
3722 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
3723 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3724 * image, but if that's already in use, pick a new one, and report back to the child,
3725 * which one we now picked. */
3727 r
= uid_shift_pick(&arg_uid_shift
, &uid_shift_lock
);
3729 log_error_errno(r
, "Failed to pick suitable UID/GID range: %m");
3733 l
= send(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
3735 r
= log_error_errno(errno
, "Failed to send UID shift: %m");
3738 if (l
!= sizeof(arg_uid_shift
)) {
3739 log_error("Short write while writing UID shift.");
3746 /* Wait for the outer child. */
3747 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3756 /* And now retrieve the PID of the inner child. */
3757 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3759 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3762 if (l
!= sizeof(pid
)) {
3763 log_error("Short read while reading inner child PID.");
3768 /* We also retrieve container UUID in case it was generated by outer child */
3769 l
= recv(uuid_socket_pair
[0], &arg_uuid
, sizeof(arg_uuid
), 0);
3771 r
= log_error_errno(errno
, "Failed to read container machine ID: %m");
3774 if (l
!= sizeof(arg_uuid
)) {
3775 log_error("Short read while reading container machined ID.");
3780 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3782 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
3783 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3784 log_error("Child died too early.");
3789 r
= setup_uid_map(pid
);
3793 (void) barrier_place(&barrier
); /* #2 */
3796 if (arg_private_network
) {
3798 r
= move_network_interfaces(pid
, arg_network_interfaces
);
3802 if (arg_network_veth
) {
3803 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
3809 if (arg_network_bridge
) {
3810 r
= setup_bridge(veth_name
, arg_network_bridge
);
3818 r
= setup_veth_extra(arg_machine
, pid
, arg_network_veth_extra
);
3822 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
3826 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
3832 r
= register_machine(
3839 arg_custom_mounts
, arg_n_custom_mounts
,
3843 arg_container_service_name
);
3848 r
= sync_cgroup(pid
, arg_unified_cgroup_hierarchy
);
3852 if (arg_keep_unit
) {
3853 r
= create_subcgroup(pid
, arg_unified_cgroup_hierarchy
);
3858 r
= chown_cgroup(pid
, arg_uid_shift
);
3862 /* Notify the child that the parent is ready with all
3863 * its setup (including cgroup-ification), and that
3864 * the child can now hand over control to the code to
3865 * run inside the container. */
3866 (void) barrier_place(&barrier
); /* #3 */
3868 /* Block SIGCHLD here, before notifying child.
3869 * process_pty() will handle it with the other signals. */
3870 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
3872 /* Reset signal to default */
3873 r
= default_signals(SIGCHLD
, -1);
3875 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
3879 /* Let the child know that we are ready, and wait until the child is completely ready, too. */
3880 if (!barrier_place_and_sync(&barrier
)) { /* #4 */
3881 log_error("Child died too early.");
3886 /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
3887 * in getpwuid(), thus we can release the /etc/passwd lock. */
3888 etc_passwd_lock
= safe_close(etc_passwd_lock
);
3892 "STATUS=Container running.\n"
3893 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
3895 r
= sd_event_new(&event
);
3897 log_error_errno(r
, "Failed to get default event source: %m");
3901 if (arg_kill_signal
> 0) {
3902 /* Try to kill the init system on SIGINT or SIGTERM */
3903 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3904 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, PID_TO_PTR(pid
));
3906 /* Immediately exit */
3907 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
3908 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
3911 /* simply exit on sigchld */
3912 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
3914 if (arg_expose_ports
) {
3915 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
3919 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
3922 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3924 r
= pty_forward_new(event
, master
, PTY_FORWARD_IGNORE_VHANGUP
| (interactive
? 0 : PTY_FORWARD_READ_ONLY
), &forward
);
3926 log_error_errno(r
, "Failed to create PTY forwarder: %m");
3930 r
= sd_event_loop(event
);
3932 log_error_errno(r
, "Failed to run event loop: %m");
3936 pty_forward_get_last_char(forward
, &last_char
);
3938 forward
= pty_forward_free(forward
);
3940 if (!arg_quiet
&& last_char
!= '\n')
3943 /* Kill if it is not dead yet anyway */
3944 if (arg_register
&& !arg_keep_unit
)
3945 terminate_machine(pid
);
3947 /* Normally redundant, but better safe than sorry */
3950 r
= wait_for_container(pid
, &container_status
);
3954 /* We failed to wait for the container, or the
3955 * container exited abnormally */
3957 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
) {
3958 /* The container exited with a non-zero
3959 * status, or with zero status and no reboot
3965 /* CONTAINER_REBOOTED, loop again */
3967 if (arg_keep_unit
) {
3968 /* Special handling if we are running as a
3969 * service: instead of simply restarting the
3970 * machine we want to restart the entire
3971 * service, so let's inform systemd about this
3972 * with the special exit code 133. The service
3973 * file uses RestartForceExitStatus=133 so
3974 * that this results in a full nspawn
3975 * restart. This is necessary since we might
3976 * have cgroup parameters set we want to have
3983 expose_port_flush(arg_expose_ports
, &exposed
);
3984 (void) remove_veth_links(veth_name
, arg_network_veth_extra
);
3990 "STATUS=Terminating...");
3995 /* Try to flush whatever is still queued in the pty */
3997 (void) copy_bytes(master
, STDOUT_FILENO
, (uint64_t) -1, false);
3999 loop_remove(loop_nr
, &image_fd
);
4001 if (remove_subvol
&& arg_directory
) {
4004 k
= btrfs_subvol_remove(arg_directory
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
4006 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
4012 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
4013 (void) rm_rf(p
, REMOVE_ROOT
);
4016 expose_port_flush(arg_expose_ports
, &exposed
);
4017 (void) remove_veth_links(veth_name
, arg_network_veth_extra
);
4019 free(arg_directory
);
4025 strv_free(arg_setenv
);
4026 free(arg_network_bridge
);
4027 strv_free(arg_network_interfaces
);
4028 strv_free(arg_network_macvlan
);
4029 strv_free(arg_network_ipvlan
);
4030 strv_free(arg_network_veth_extra
);
4031 strv_free(arg_parameters
);
4032 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
4033 expose_port_free_all(arg_expose_ports
);
4035 return r
< 0 ? EXIT_FAILURE
: ret
;