]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
tree-wide: use SET_FLAG in more places (#5892)
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #ifdef HAVE_BLKID
21 #include <blkid.h>
22 #endif
23 #include <errno.h>
24 #include <getopt.h>
25 #include <grp.h>
26 #include <linux/loop.h>
27 #include <pwd.h>
28 #include <sched.h>
29 #ifdef HAVE_SELINUX
30 #include <selinux/selinux.h>
31 #endif
32 #include <signal.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <sys/file.h>
37 #include <sys/mount.h>
38 #include <sys/personality.h>
39 #include <sys/prctl.h>
40 #include <sys/types.h>
41 #include <sys/wait.h>
42 #include <unistd.h>
43
44 #include "sd-bus.h"
45 #include "sd-daemon.h"
46 #include "sd-id128.h"
47
48 #include "alloc-util.h"
49 #include "barrier.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
53 #include "bus-util.h"
54 #include "cap-list.h"
55 #include "capability-util.h"
56 #include "cgroup-util.h"
57 #include "copy.h"
58 #include "dev-setup.h"
59 #include "dissect-image.h"
60 #include "env-util.h"
61 #include "fd-util.h"
62 #include "fdset.h"
63 #include "fileio.h"
64 #include "format-util.h"
65 #include "fs-util.h"
66 #include "gpt.h"
67 #include "hexdecoct.h"
68 #include "hostname-util.h"
69 #include "id128-util.h"
70 #include "log.h"
71 #include "loop-util.h"
72 #include "loopback-setup.h"
73 #include "machine-image.h"
74 #include "macro.h"
75 #include "missing.h"
76 #include "mkdir.h"
77 #include "mount-util.h"
78 #include "netlink-util.h"
79 #include "nspawn-cgroup.h"
80 #include "nspawn-expose-ports.h"
81 #include "nspawn-mount.h"
82 #include "nspawn-network.h"
83 #include "nspawn-patch-uid.h"
84 #include "nspawn-register.h"
85 #include "nspawn-seccomp.h"
86 #include "nspawn-settings.h"
87 #include "nspawn-setuid.h"
88 #include "nspawn-stub-pid1.h"
89 #include "parse-util.h"
90 #include "path-util.h"
91 #include "process-util.h"
92 #include "ptyfwd.h"
93 #include "random-util.h"
94 #include "raw-clone.h"
95 #include "rm-rf.h"
96 #include "selinux-util.h"
97 #include "signal-util.h"
98 #include "socket-util.h"
99 #include "stat-util.h"
100 #include "stdio-util.h"
101 #include "string-util.h"
102 #include "strv.h"
103 #include "terminal-util.h"
104 #include "udev-util.h"
105 #include "umask-util.h"
106 #include "user-util.h"
107 #include "util.h"
108
/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
 * may have their own allocation ranges too.
 *
 * NOTE(review): judging by the names these bound the range from which a UID shift is picked automatically
 * (--private-users=pick); the picking code itself is not visible in this part of the file. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))

/* nspawn listens on a socket located at NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to the container's root
 * directory. The init process inside the container can send messages to nspawn through this socket, following the
 * sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"

/* NOTE(review): presumably the exit status with which the container payload requests a forced restart; its users are
 * outside this part of the file — confirm before relying on this. */
#define EXIT_FORCE_RESTART 133
121
/* Final state of a container run.
 * NOTE(review): the code producing and consuming these values is outside this part of the file; the meanings below are
 * inferred from the identifiers only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* presumably: the container exited for good */
        CONTAINER_REBOOTED      /* presumably: the container asked to be started again */
} ContainerStatus;
126
/* Journal linking mode, selected with --link-journal= (or -j) in parse_argv(). The "try-" variants of the option map to
 * the same values here and additionally set arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; also the initial value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
133
/* Global configuration state. Most of these variables are filled in by parse_argv() from the command line; some are
 * additionally adjusted from environment variables (see parse_share_ns_env() and parse_mount_settings_env()) or by
 * detect_unified_cgroup_hierarchy(). The initializers below are the defaults in effect when no option is given. */
static char *arg_directory = NULL;                      /* -D/--directory= */
static char *arg_template = NULL;                       /* --template= */
static char *arg_chdir = NULL;                          /* --chdir=, must be an absolute path */
static char *arg_pivot_root_new = NULL;                 /* --pivot-root=, filled in by pivot_root_parse() */
static char *arg_pivot_root_old = NULL;                 /* --pivot-root=, filled in by pivot_root_parse() */
static char *arg_user = NULL;                           /* -u/--user= */
static sd_id128_t arg_uuid = {};                        /* --uuid=, all-zero means "not set" (a null UUID is refused) */
static char *arg_machine = NULL;                        /* -M/--machine= */
/* The following three point directly into argv (optarg is stored without copying), hence "const" and never freed. */
static const char *arg_selinux_context = NULL;          /* -Z/--selinux-context= */
static const char *arg_selinux_apifs_context = NULL;    /* -L/--selinux-apifs-context= */
static const char *arg_slice = NULL;                    /* -S/--slice= */
static bool arg_private_network = false;                /* --private-network, also implied by all --network-* options */
static bool arg_read_only = false;                      /* --read-only */
static StartMode arg_start_mode = START_PID1;           /* -b/--boot selects START_BOOT, -a/--as-pid2 START_PID2 */
static bool arg_ephemeral = false;                      /* -x/--ephemeral */
static LinkJournal arg_link_journal = LINK_AUTO;        /* --link-journal= / -j */
static bool arg_link_journal_try = false;               /* set for the "try-" modes of --link-journal= and for -j */
/* Default set of capabilities retained, as a bit mask indexed by capability number. help() documents --capability= and
 * --drop-capability= as adding to / dropping from this default. */
static uint64_t arg_caps_retain =
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_MKNOD) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
static CustomMount *arg_custom_mounts = NULL;           /* --bind=, --bind-ro=, --tmpfs=, --overlay=, --overlay-ro= */
static unsigned arg_n_custom_mounts = 0;                /* number of entries in arg_custom_mounts */
static char **arg_setenv = NULL;                        /* -E/--setenv= assignments */
static bool arg_quiet = false;                          /* -q/--quiet */
static bool arg_register = true;                        /* --register=; forced off when PID or UTS namespacing is off */
static bool arg_keep_unit = false;                      /* --keep-unit */
static char **arg_network_interfaces = NULL;            /* --network-interface= */
static char **arg_network_macvlan = NULL;               /* --network-macvlan= */
static char **arg_network_ipvlan = NULL;                /* --network-ipvlan= */
static bool arg_network_veth = false;                   /* -n/--network-veth, implied by --network-bridge=/--network-zone= */
static char **arg_network_veth_extra = NULL;            /* --network-veth-extra= */
static char *arg_network_bridge = NULL;                 /* --network-bridge= */
static char *arg_network_zone = NULL;                   /* --network-zone=, stored with a "vz-" prefix */
static unsigned long arg_personality = PERSONALITY_INVALID; /* --personality= */
static char *arg_image = NULL;                          /* -i/--image= */
static VolatileMode arg_volatile_mode = VOLATILE_NO;    /* --volatile[=MODE] */
static ExposePort *arg_expose_ports = NULL;             /* -p/--port= */
static char **arg_property = NULL;                      /* --property= */
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO; /* --private-users= / -U */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; /* UID base and number of UIDs of --private-users= */
static bool arg_userns_chown = false;                   /* --private-users-chown, also implied by USER_NAMESPACE_PICK */
static int arg_kill_signal = 0;                         /* --kill-signal= */
static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN; /* see detect_unified_cgroup_hierarchy() */
static SettingsMask arg_settings_mask = 0;              /* SETTING_* bits of the settings given on the command line */
static int arg_settings_trusted = -1;                   /* --settings=: true for "trusted", false for "no", -1 otherwise */
static char **arg_parameters = NULL;                    /* NOTE(review): set outside the visible part of the file */
static const char *arg_container_service_name = "systemd-nspawn"; /* NOTE(review): users not visible here */
static bool arg_notify_ready = false;                   /* --notify-ready= */
static bool arg_use_cgns = true;                        /* NOTE(review): presumably cgroup namespacing; set elsewhere */
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS; /* namespaces to unshare */
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO; /* see parse_mount_settings_env() */
static void *arg_root_hash = NULL;                      /* --root-hash=, decoded from hex */
static size_t arg_root_hash_size = 0;                   /* size of arg_root_hash in bytes */
211
212 static void help(void) {
213 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
214 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
215 " -h --help Show this help\n"
216 " --version Print version string\n"
217 " -q --quiet Do not show status information\n"
218 " -D --directory=PATH Root directory for the container\n"
219 " --template=PATH Initialize root directory from template directory,\n"
220 " if missing\n"
221 " -x --ephemeral Run container with snapshot of root directory, and\n"
222 " remove it after exit\n"
223 " -i --image=PATH File system device or disk image for the container\n"
224 " --root-hash=HASH Specify verity root hash\n"
225 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
226 " -b --boot Boot up full system (i.e. invoke init)\n"
227 " --chdir=PATH Set working directory in the container\n"
228 " --pivot-root=PATH[:PATH]\n"
229 " Pivot root to given directory in the container\n"
230 " -u --user=USER Run the command under specified user or uid\n"
231 " -M --machine=NAME Set the machine name for the container\n"
232 " --uuid=UUID Set a specific machine UUID for the container\n"
233 " -S --slice=SLICE Place the container in the specified slice\n"
234 " --property=NAME=VALUE Set scope unit property\n"
235 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
236 " --private-users[=UIDBASE[:NUIDS]]\n"
237 " Similar, but with user configured UID/GID range\n"
238 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
239 " --private-network Disable network in container\n"
240 " --network-interface=INTERFACE\n"
241 " Assign an existing network interface to the\n"
242 " container\n"
243 " --network-macvlan=INTERFACE\n"
244 " Create a macvlan network interface based on an\n"
245 " existing network interface to the container\n"
246 " --network-ipvlan=INTERFACE\n"
247 " Create a ipvlan network interface based on an\n"
248 " existing network interface to the container\n"
249 " -n --network-veth Add a virtual Ethernet connection between host\n"
250 " and container\n"
251 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
252 " Add an additional virtual Ethernet link between\n"
253 " host and container\n"
254 " --network-bridge=INTERFACE\n"
255 " Add a virtual Ethernet connection to the container\n"
256 " and attach it to an existing bridge on the host\n"
257 " --network-zone=NAME Similar, but attach the new interface to an\n"
258 " an automatically managed bridge interface\n"
259 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
260 " Expose a container IP port on the host\n"
261 " -Z --selinux-context=SECLABEL\n"
262 " Set the SELinux security context to be used by\n"
263 " processes in the container\n"
264 " -L --selinux-apifs-context=SECLABEL\n"
265 " Set the SELinux security context to be used by\n"
266 " API/tmpfs file systems in the container\n"
267 " --capability=CAP In addition to the default, retain specified\n"
268 " capability\n"
269 " --drop-capability=CAP Drop the specified capability from the default set\n"
270 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
271 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
272 " host, try-guest, try-host\n"
273 " -j Equivalent to --link-journal=try-guest\n"
274 " --read-only Mount the root directory read-only\n"
275 " --bind=PATH[:PATH[:OPTIONS]]\n"
276 " Bind mount a file or directory from the host into\n"
277 " the container\n"
278 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
279 " Similar, but creates a read-only bind mount\n"
280 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
281 " --overlay=PATH[:PATH...]:PATH\n"
282 " Create an overlay mount from the host to \n"
283 " the container\n"
284 " --overlay-ro=PATH[:PATH...]:PATH\n"
285 " Similar, but creates a read-only overlay mount\n"
286 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
287 " --register=BOOLEAN Register container as machine\n"
288 " --keep-unit Do not register a scope for the machine, reuse\n"
289 " the service unit nspawn is running in\n"
290 " --volatile[=MODE] Run the system in volatile mode\n"
291 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
292 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
293 , program_invocation_short_name);
294 }
295
296 static int custom_mount_check_all(void) {
297 unsigned i;
298
299 for (i = 0; i < arg_n_custom_mounts; i++) {
300 CustomMount *m = &arg_custom_mounts[i];
301
302 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
303
304 if (arg_userns_chown) {
305 log_error("--private-users-chown may not be combined with custom root mounts.");
306 return -EINVAL;
307 } else if (arg_uid_shift == UID_INVALID) {
308 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
309 return -EINVAL;
310 }
311 }
312 }
313
314 return 0;
315 }
316
317 static int detect_unified_cgroup_hierarchy(const char *directory) {
318 const char *e;
319 int r;
320
321 /* Allow the user to control whether the unified hierarchy is used */
322 e = getenv("UNIFIED_CGROUP_HIERARCHY");
323 if (e) {
324 r = parse_boolean(e);
325 if (r < 0)
326 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
327 if (r > 0)
328 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
329 else
330 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
331
332 return 0;
333 }
334
335 /* Otherwise inherit the default from the host system */
336 r = cg_all_unified();
337 if (r < 0)
338 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
339 if (r > 0) {
340 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
341 * routine only detects 231, so we'll have a false negative here for 230. */
342 r = systemd_installation_has_version(directory, 230);
343 if (r < 0)
344 return log_error_errno(r, "Failed to determine systemd version in container: %m");
345 if (r > 0)
346 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
347 else
348 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
349 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
350 /* Mixed cgroup hierarchy support was added in 233 */
351 r = systemd_installation_has_version(directory, 233);
352 if (r < 0)
353 return log_error_errno(r, "Failed to determine systemd version in container: %m");
354 if (r > 0)
355 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
356 else
357 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
358 } else
359 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
360
361 return 0;
362 }
363
364 static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
365 int r;
366
367 r = getenv_bool(name);
368 if (r == -ENXIO)
369 return;
370 if (r < 0)
371 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
372 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
373 }
374
375 static void parse_mount_settings_env(void) {
376 int r;
377 const char *e;
378
379 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
380 if (!e)
381 return;
382
383 if (streq(e, "network")) {
384 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
385 return;
386 }
387
388 r = parse_boolean(e);
389 if (r < 0) {
390 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
391 return;
392 }
393
394 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
395 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
396 }
397
398 static int parse_argv(int argc, char *argv[]) {
399
400 enum {
401 ARG_VERSION = 0x100,
402 ARG_PRIVATE_NETWORK,
403 ARG_UUID,
404 ARG_READ_ONLY,
405 ARG_CAPABILITY,
406 ARG_DROP_CAPABILITY,
407 ARG_LINK_JOURNAL,
408 ARG_BIND,
409 ARG_BIND_RO,
410 ARG_TMPFS,
411 ARG_OVERLAY,
412 ARG_OVERLAY_RO,
413 ARG_SHARE_SYSTEM,
414 ARG_REGISTER,
415 ARG_KEEP_UNIT,
416 ARG_NETWORK_INTERFACE,
417 ARG_NETWORK_MACVLAN,
418 ARG_NETWORK_IPVLAN,
419 ARG_NETWORK_BRIDGE,
420 ARG_NETWORK_ZONE,
421 ARG_NETWORK_VETH_EXTRA,
422 ARG_PERSONALITY,
423 ARG_VOLATILE,
424 ARG_TEMPLATE,
425 ARG_PROPERTY,
426 ARG_PRIVATE_USERS,
427 ARG_KILL_SIGNAL,
428 ARG_SETTINGS,
429 ARG_CHDIR,
430 ARG_PIVOT_ROOT,
431 ARG_PRIVATE_USERS_CHOWN,
432 ARG_NOTIFY_READY,
433 ARG_ROOT_HASH,
434 };
435
436 static const struct option options[] = {
437 { "help", no_argument, NULL, 'h' },
438 { "version", no_argument, NULL, ARG_VERSION },
439 { "directory", required_argument, NULL, 'D' },
440 { "template", required_argument, NULL, ARG_TEMPLATE },
441 { "ephemeral", no_argument, NULL, 'x' },
442 { "user", required_argument, NULL, 'u' },
443 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
444 { "as-pid2", no_argument, NULL, 'a' },
445 { "boot", no_argument, NULL, 'b' },
446 { "uuid", required_argument, NULL, ARG_UUID },
447 { "read-only", no_argument, NULL, ARG_READ_ONLY },
448 { "capability", required_argument, NULL, ARG_CAPABILITY },
449 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
450 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
451 { "bind", required_argument, NULL, ARG_BIND },
452 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
453 { "tmpfs", required_argument, NULL, ARG_TMPFS },
454 { "overlay", required_argument, NULL, ARG_OVERLAY },
455 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
456 { "machine", required_argument, NULL, 'M' },
457 { "slice", required_argument, NULL, 'S' },
458 { "setenv", required_argument, NULL, 'E' },
459 { "selinux-context", required_argument, NULL, 'Z' },
460 { "selinux-apifs-context", required_argument, NULL, 'L' },
461 { "quiet", no_argument, NULL, 'q' },
462 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
463 { "register", required_argument, NULL, ARG_REGISTER },
464 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
465 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
466 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
467 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
468 { "network-veth", no_argument, NULL, 'n' },
469 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
470 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
471 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
472 { "personality", required_argument, NULL, ARG_PERSONALITY },
473 { "image", required_argument, NULL, 'i' },
474 { "volatile", optional_argument, NULL, ARG_VOLATILE },
475 { "port", required_argument, NULL, 'p' },
476 { "property", required_argument, NULL, ARG_PROPERTY },
477 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
478 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
479 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
480 { "settings", required_argument, NULL, ARG_SETTINGS },
481 { "chdir", required_argument, NULL, ARG_CHDIR },
482 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
483 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
484 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
485 {}
486 };
487
488 int c, r;
489 const char *p, *e;
490 uint64_t plus = 0, minus = 0;
491 bool mask_all_settings = false, mask_no_settings = false;
492
493 assert(argc >= 0);
494 assert(argv);
495
496 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
497
498 switch (c) {
499
500 case 'h':
501 help();
502 return 0;
503
504 case ARG_VERSION:
505 return version();
506
507 case 'D':
508 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
509 if (r < 0)
510 return r;
511 break;
512
513 case ARG_TEMPLATE:
514 r = parse_path_argument_and_warn(optarg, false, &arg_template);
515 if (r < 0)
516 return r;
517 break;
518
519 case 'i':
520 r = parse_path_argument_and_warn(optarg, false, &arg_image);
521 if (r < 0)
522 return r;
523 break;
524
525 case 'x':
526 arg_ephemeral = true;
527 break;
528
529 case 'u':
530 r = free_and_strdup(&arg_user, optarg);
531 if (r < 0)
532 return log_oom();
533
534 arg_settings_mask |= SETTING_USER;
535 break;
536
537 case ARG_NETWORK_ZONE: {
538 char *j;
539
540 j = strappend("vz-", optarg);
541 if (!j)
542 return log_oom();
543
544 if (!ifname_valid(j)) {
545 log_error("Network zone name not valid: %s", j);
546 free(j);
547 return -EINVAL;
548 }
549
550 free(arg_network_zone);
551 arg_network_zone = j;
552
553 arg_network_veth = true;
554 arg_private_network = true;
555 arg_settings_mask |= SETTING_NETWORK;
556 break;
557 }
558
559 case ARG_NETWORK_BRIDGE:
560
561 if (!ifname_valid(optarg)) {
562 log_error("Bridge interface name not valid: %s", optarg);
563 return -EINVAL;
564 }
565
566 r = free_and_strdup(&arg_network_bridge, optarg);
567 if (r < 0)
568 return log_oom();
569
570 /* fall through */
571
572 case 'n':
573 arg_network_veth = true;
574 arg_private_network = true;
575 arg_settings_mask |= SETTING_NETWORK;
576 break;
577
578 case ARG_NETWORK_VETH_EXTRA:
579 r = veth_extra_parse(&arg_network_veth_extra, optarg);
580 if (r < 0)
581 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
582
583 arg_private_network = true;
584 arg_settings_mask |= SETTING_NETWORK;
585 break;
586
587 case ARG_NETWORK_INTERFACE:
588
589 if (!ifname_valid(optarg)) {
590 log_error("Network interface name not valid: %s", optarg);
591 return -EINVAL;
592 }
593
594 if (strv_extend(&arg_network_interfaces, optarg) < 0)
595 return log_oom();
596
597 arg_private_network = true;
598 arg_settings_mask |= SETTING_NETWORK;
599 break;
600
601 case ARG_NETWORK_MACVLAN:
602
603 if (!ifname_valid(optarg)) {
604 log_error("MACVLAN network interface name not valid: %s", optarg);
605 return -EINVAL;
606 }
607
608 if (strv_extend(&arg_network_macvlan, optarg) < 0)
609 return log_oom();
610
611 arg_private_network = true;
612 arg_settings_mask |= SETTING_NETWORK;
613 break;
614
615 case ARG_NETWORK_IPVLAN:
616
617 if (!ifname_valid(optarg)) {
618 log_error("IPVLAN network interface name not valid: %s", optarg);
619 return -EINVAL;
620 }
621
622 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
623 return log_oom();
624
625 /* fall through */
626
627 case ARG_PRIVATE_NETWORK:
628 arg_private_network = true;
629 arg_settings_mask |= SETTING_NETWORK;
630 break;
631
632 case 'b':
633 if (arg_start_mode == START_PID2) {
634 log_error("--boot and --as-pid2 may not be combined.");
635 return -EINVAL;
636 }
637
638 arg_start_mode = START_BOOT;
639 arg_settings_mask |= SETTING_START_MODE;
640 break;
641
642 case 'a':
643 if (arg_start_mode == START_BOOT) {
644 log_error("--boot and --as-pid2 may not be combined.");
645 return -EINVAL;
646 }
647
648 arg_start_mode = START_PID2;
649 arg_settings_mask |= SETTING_START_MODE;
650 break;
651
652 case ARG_UUID:
653 r = sd_id128_from_string(optarg, &arg_uuid);
654 if (r < 0)
655 return log_error_errno(r, "Invalid UUID: %s", optarg);
656
657 if (sd_id128_is_null(arg_uuid)) {
658 log_error("Machine UUID may not be all zeroes.");
659 return -EINVAL;
660 }
661
662 arg_settings_mask |= SETTING_MACHINE_ID;
663 break;
664
665 case 'S':
666 arg_slice = optarg;
667 break;
668
669 case 'M':
670 if (isempty(optarg))
671 arg_machine = mfree(arg_machine);
672 else {
673 if (!machine_name_is_valid(optarg)) {
674 log_error("Invalid machine name: %s", optarg);
675 return -EINVAL;
676 }
677
678 r = free_and_strdup(&arg_machine, optarg);
679 if (r < 0)
680 return log_oom();
681 }
682 break;
683
684 case 'Z':
685 arg_selinux_context = optarg;
686 break;
687
688 case 'L':
689 arg_selinux_apifs_context = optarg;
690 break;
691
692 case ARG_READ_ONLY:
693 arg_read_only = true;
694 arg_settings_mask |= SETTING_READ_ONLY;
695 break;
696
697 case ARG_CAPABILITY:
698 case ARG_DROP_CAPABILITY: {
699 p = optarg;
700 for (;;) {
701 _cleanup_free_ char *t = NULL;
702
703 r = extract_first_word(&p, &t, ",", 0);
704 if (r < 0)
705 return log_error_errno(r, "Failed to parse capability %s.", t);
706
707 if (r == 0)
708 break;
709
710 if (streq(t, "all")) {
711 if (c == ARG_CAPABILITY)
712 plus = (uint64_t) -1;
713 else
714 minus = (uint64_t) -1;
715 } else {
716 int cap;
717
718 cap = capability_from_name(t);
719 if (cap < 0) {
720 log_error("Failed to parse capability %s.", t);
721 return -EINVAL;
722 }
723
724 if (c == ARG_CAPABILITY)
725 plus |= 1ULL << (uint64_t) cap;
726 else
727 minus |= 1ULL << (uint64_t) cap;
728 }
729 }
730
731 arg_settings_mask |= SETTING_CAPABILITY;
732 break;
733 }
734
735 case 'j':
736 arg_link_journal = LINK_GUEST;
737 arg_link_journal_try = true;
738 break;
739
740 case ARG_LINK_JOURNAL:
741 if (streq(optarg, "auto")) {
742 arg_link_journal = LINK_AUTO;
743 arg_link_journal_try = false;
744 } else if (streq(optarg, "no")) {
745 arg_link_journal = LINK_NO;
746 arg_link_journal_try = false;
747 } else if (streq(optarg, "guest")) {
748 arg_link_journal = LINK_GUEST;
749 arg_link_journal_try = false;
750 } else if (streq(optarg, "host")) {
751 arg_link_journal = LINK_HOST;
752 arg_link_journal_try = false;
753 } else if (streq(optarg, "try-guest")) {
754 arg_link_journal = LINK_GUEST;
755 arg_link_journal_try = true;
756 } else if (streq(optarg, "try-host")) {
757 arg_link_journal = LINK_HOST;
758 arg_link_journal_try = true;
759 } else {
760 log_error("Failed to parse link journal mode %s", optarg);
761 return -EINVAL;
762 }
763
764 break;
765
766 case ARG_BIND:
767 case ARG_BIND_RO:
768 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
769 if (r < 0)
770 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
771
772 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
773 break;
774
775 case ARG_TMPFS:
776 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
777 if (r < 0)
778 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
779
780 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
781 break;
782
783 case ARG_OVERLAY:
784 case ARG_OVERLAY_RO:
785 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
786 if (r == -EADDRNOTAVAIL)
787 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
788 if (r < 0)
789 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
790
791 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
792 break;
793
794 case 'E': {
795 char **n;
796
797 if (!env_assignment_is_valid(optarg)) {
798 log_error("Environment variable assignment '%s' is not valid.", optarg);
799 return -EINVAL;
800 }
801
802 n = strv_env_set(arg_setenv, optarg);
803 if (!n)
804 return log_oom();
805
806 strv_free(arg_setenv);
807 arg_setenv = n;
808
809 arg_settings_mask |= SETTING_ENVIRONMENT;
810 break;
811 }
812
813 case 'q':
814 arg_quiet = true;
815 break;
816
817 case ARG_SHARE_SYSTEM:
818 /* We don't officially support this anymore, except for compat reasons. People should use the
819 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
820 arg_clone_ns_flags = 0;
821 break;
822
823 case ARG_REGISTER:
824 r = parse_boolean(optarg);
825 if (r < 0) {
826 log_error("Failed to parse --register= argument: %s", optarg);
827 return r;
828 }
829
830 arg_register = r;
831 break;
832
833 case ARG_KEEP_UNIT:
834 arg_keep_unit = true;
835 break;
836
837 case ARG_PERSONALITY:
838
839 arg_personality = personality_from_string(optarg);
840 if (arg_personality == PERSONALITY_INVALID) {
841 log_error("Unknown or unsupported personality '%s'.", optarg);
842 return -EINVAL;
843 }
844
845 arg_settings_mask |= SETTING_PERSONALITY;
846 break;
847
848 case ARG_VOLATILE:
849
850 if (!optarg)
851 arg_volatile_mode = VOLATILE_YES;
852 else {
853 VolatileMode m;
854
855 m = volatile_mode_from_string(optarg);
856 if (m < 0) {
857 log_error("Failed to parse --volatile= argument: %s", optarg);
858 return -EINVAL;
859 } else
860 arg_volatile_mode = m;
861 }
862
863 arg_settings_mask |= SETTING_VOLATILE_MODE;
864 break;
865
866 case 'p':
867 r = expose_port_parse(&arg_expose_ports, optarg);
868 if (r == -EEXIST)
869 return log_error_errno(r, "Duplicate port specification: %s", optarg);
870 if (r < 0)
871 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
872
873 arg_settings_mask |= SETTING_EXPOSE_PORTS;
874 break;
875
876 case ARG_PROPERTY:
877 if (strv_extend(&arg_property, optarg) < 0)
878 return log_oom();
879
880 break;
881
882 case ARG_PRIVATE_USERS: {
883 int boolean = -1;
884
885 if (!optarg)
886 boolean = true;
887 else if (!in_charset(optarg, DIGITS))
888 /* do *not* parse numbers as booleans */
889 boolean = parse_boolean(optarg);
890
891 if (boolean == false) {
892 /* no: User namespacing off */
893 arg_userns_mode = USER_NAMESPACE_NO;
894 arg_uid_shift = UID_INVALID;
895 arg_uid_range = UINT32_C(0x10000);
896 } else if (boolean == true) {
897 /* yes: User namespacing on, UID range is read from root dir */
898 arg_userns_mode = USER_NAMESPACE_FIXED;
899 arg_uid_shift = UID_INVALID;
900 arg_uid_range = UINT32_C(0x10000);
901 } else if (streq(optarg, "pick")) {
902 /* pick: User namespacing on, UID range is picked randomly */
903 arg_userns_mode = USER_NAMESPACE_PICK;
904 arg_uid_shift = UID_INVALID;
905 arg_uid_range = UINT32_C(0x10000);
906 } else {
907 _cleanup_free_ char *buffer = NULL;
908 const char *range, *shift;
909
910 /* anything else: User namespacing on, UID range is explicitly configured */
911
912 range = strchr(optarg, ':');
913 if (range) {
914 buffer = strndup(optarg, range - optarg);
915 if (!buffer)
916 return log_oom();
917 shift = buffer;
918
919 range++;
920 r = safe_atou32(range, &arg_uid_range);
921 if (r < 0)
922 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
923 } else
924 shift = optarg;
925
926 r = parse_uid(shift, &arg_uid_shift);
927 if (r < 0)
928 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
929
930 arg_userns_mode = USER_NAMESPACE_FIXED;
931 }
932
933 if (arg_uid_range <= 0) {
934 log_error("UID range cannot be 0.");
935 return -EINVAL;
936 }
937
938 arg_settings_mask |= SETTING_USERNS;
939 break;
940 }
941
942 case 'U':
943 if (userns_supported()) {
944 arg_userns_mode = USER_NAMESPACE_PICK;
945 arg_uid_shift = UID_INVALID;
946 arg_uid_range = UINT32_C(0x10000);
947
948 arg_settings_mask |= SETTING_USERNS;
949 }
950
951 break;
952
953 case ARG_PRIVATE_USERS_CHOWN:
954 arg_userns_chown = true;
955
956 arg_settings_mask |= SETTING_USERNS;
957 break;
958
959 case ARG_KILL_SIGNAL:
960 arg_kill_signal = signal_from_string_try_harder(optarg);
961 if (arg_kill_signal < 0) {
962 log_error("Cannot parse signal: %s", optarg);
963 return -EINVAL;
964 }
965
966 arg_settings_mask |= SETTING_KILL_SIGNAL;
967 break;
968
969 case ARG_SETTINGS:
970
971 /* no → do not read files
972 * yes → read files, do not override cmdline, trust only subset
973 * override → read files, override cmdline, trust only subset
974 * trusted → read files, do not override cmdline, trust all
975 */
976
977 r = parse_boolean(optarg);
978 if (r < 0) {
979 if (streq(optarg, "trusted")) {
980 mask_all_settings = false;
981 mask_no_settings = false;
982 arg_settings_trusted = true;
983
984 } else if (streq(optarg, "override")) {
985 mask_all_settings = false;
986 mask_no_settings = true;
987 arg_settings_trusted = -1;
988 } else
989 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
990 } else if (r > 0) {
991 /* yes */
992 mask_all_settings = false;
993 mask_no_settings = false;
994 arg_settings_trusted = -1;
995 } else {
996 /* no */
997 mask_all_settings = true;
998 mask_no_settings = false;
999 arg_settings_trusted = false;
1000 }
1001
1002 break;
1003
1004 case ARG_CHDIR:
1005 if (!path_is_absolute(optarg)) {
1006 log_error("Working directory %s is not an absolute path.", optarg);
1007 return -EINVAL;
1008 }
1009
1010 r = free_and_strdup(&arg_chdir, optarg);
1011 if (r < 0)
1012 return log_oom();
1013
1014 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1015 break;
1016
1017 case ARG_PIVOT_ROOT:
1018 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1019 if (r < 0)
1020 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1021
1022 arg_settings_mask |= SETTING_PIVOT_ROOT;
1023 break;
1024
1025 case ARG_NOTIFY_READY:
1026 r = parse_boolean(optarg);
1027 if (r < 0) {
1028 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1029 return -EINVAL;
1030 }
1031 arg_notify_ready = r;
1032 arg_settings_mask |= SETTING_NOTIFY_READY;
1033 break;
1034
1035 case ARG_ROOT_HASH: {
1036 void *k;
1037 size_t l;
1038
1039 r = unhexmem(optarg, strlen(optarg), &k, &l);
1040 if (r < 0)
1041 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1042 if (l < sizeof(sd_id128_t)) {
1043 log_error("Root hash must be at least 128bit long: %s", optarg);
1044 free(k);
1045 return -EINVAL;
1046 }
1047
1048 free(arg_root_hash);
1049 arg_root_hash = k;
1050 arg_root_hash_size = l;
1051 break;
1052 }
1053
1054 case '?':
1055 return -EINVAL;
1056
1057 default:
1058 assert_not_reached("Unhandled option");
1059 }
1060
1061 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1062 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1063 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1064 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
1065
1066 if (arg_userns_mode != USER_NAMESPACE_NO)
1067 arg_mount_settings |= MOUNT_USE_USERNS;
1068
1069 if (arg_private_network)
1070 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1071
1072 parse_mount_settings_env();
1073
1074 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1075 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1076 arg_register = false;
1077 if (arg_start_mode != START_PID1) {
1078 log_error("--boot cannot be used without namespacing.");
1079 return -EINVAL;
1080 }
1081 }
1082
1083 if (arg_userns_mode == USER_NAMESPACE_PICK)
1084 arg_userns_chown = true;
1085
1086 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1087 log_error("--keep-unit may not be used when invoked from a user session.");
1088 return -EINVAL;
1089 }
1090
1091 if (arg_directory && arg_image) {
1092 log_error("--directory= and --image= may not be combined.");
1093 return -EINVAL;
1094 }
1095
1096 if (arg_template && arg_image) {
1097 log_error("--template= and --image= may not be combined.");
1098 return -EINVAL;
1099 }
1100
1101 if (arg_ephemeral && arg_template && !arg_directory) {
1102 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1103 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1104 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1105 * --directory=". */
1106
1107 arg_directory = arg_template;
1108 arg_template = NULL;
1109 }
1110
1111 if (arg_template && !(arg_directory || arg_machine)) {
1112 log_error("--template= needs --directory= or --machine=.");
1113 return -EINVAL;
1114 }
1115
1116 if (arg_ephemeral && arg_template) {
1117 log_error("--ephemeral and --template= may not be combined.");
1118 return -EINVAL;
1119 }
1120
1121 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1122 log_error("--ephemeral and --link-journal= may not be combined.");
1123 return -EINVAL;
1124 }
1125
1126 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
1127 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1128 return -EOPNOTSUPP;
1129 }
1130
1131 if (arg_userns_chown && arg_read_only) {
1132 log_error("--read-only and --private-users-chown may not be combined.");
1133 return -EINVAL;
1134 }
1135
1136 if (arg_network_bridge && arg_network_zone) {
1137 log_error("--network-bridge= and --network-zone= may not be combined.");
1138 return -EINVAL;
1139 }
1140
1141 if (argc > optind) {
1142 arg_parameters = strv_copy(argv + optind);
1143 if (!arg_parameters)
1144 return log_oom();
1145
1146 arg_settings_mask |= SETTING_START_MODE;
1147 }
1148
1149 /* Load all settings from .nspawn files */
1150 if (mask_no_settings)
1151 arg_settings_mask = 0;
1152
1153 /* Don't load any settings from .nspawn files */
1154 if (mask_all_settings)
1155 arg_settings_mask = _SETTINGS_MASK_ALL;
1156
1157 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1158
1159 r = cg_unified_flush();
1160 if (r < 0)
1161 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
1162
1163 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1164 if (e)
1165 arg_container_service_name = e;
1166
1167 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1168 if (r < 0)
1169 arg_use_cgns = cg_ns_supported();
1170 else
1171 arg_use_cgns = r;
1172
1173 r = custom_mount_check_all();
1174 if (r < 0)
1175 return r;
1176
1177 return 1;
1178 }
1179
1180 static int verify_arguments(void) {
1181 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1182 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1183 return -EINVAL;
1184 }
1185
1186 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1187 log_error("Cannot combine --private-users with read-write mounts.");
1188 return -EINVAL;
1189 }
1190
1191 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
1192 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1193 return -EINVAL;
1194 }
1195
1196 if (arg_expose_ports && !arg_private_network) {
1197 log_error("Cannot use --port= without private networking.");
1198 return -EINVAL;
1199 }
1200
1201 #ifndef HAVE_LIBIPTC
1202 if (arg_expose_ports) {
1203 log_error("--port= is not supported, compiled without libiptc support.");
1204 return -EOPNOTSUPP;
1205 }
1206 #endif
1207
1208 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1209 arg_kill_signal = SIGRTMIN+3;
1210
1211 return 0;
1212 }
1213
1214 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1215 assert(p);
1216
1217 if (arg_userns_mode == USER_NAMESPACE_NO)
1218 return 0;
1219
1220 if (uid == UID_INVALID && gid == GID_INVALID)
1221 return 0;
1222
1223 if (uid != UID_INVALID) {
1224 uid += arg_uid_shift;
1225
1226 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1227 return -EOVERFLOW;
1228 }
1229
1230 if (gid != GID_INVALID) {
1231 gid += (gid_t) arg_uid_shift;
1232
1233 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1234 return -EOVERFLOW;
1235 }
1236
1237 if (lchown(p, uid, gid) < 0)
1238 return -errno;
1239
1240 return 0;
1241 }
1242
/* Creates the directory "path" below "root" with the given mode and hands it to the (shifted)
 * uid/gid via userns_lchown(). An already existing directory is left untouched, including its
 * ownership, and reported as success.
 *
 * Returns 0 on success or if the directory exists, negative errno otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1255
1256 static int setup_timezone(const char *dest) {
1257 _cleanup_free_ char *p = NULL, *q = NULL;
1258 const char *where, *check, *what;
1259 char *z, *y;
1260 int r;
1261
1262 assert(dest);
1263
1264 /* Fix the timezone, if possible */
1265 r = readlink_malloc("/etc/localtime", &p);
1266 if (r < 0) {
1267 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1268 /* to handle warning, delete /etc/localtime and replace it
1269 * with a symbolic link to a time zone data file.
1270 *
1271 * Example:
1272 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1273 */
1274 return 0;
1275 }
1276
1277 z = path_startswith(p, "../usr/share/zoneinfo/");
1278 if (!z)
1279 z = path_startswith(p, "/usr/share/zoneinfo/");
1280 if (!z) {
1281 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1282 return 0;
1283 }
1284
1285 where = prefix_roota(dest, "/etc/localtime");
1286 r = readlink_malloc(where, &q);
1287 if (r >= 0) {
1288 y = path_startswith(q, "../usr/share/zoneinfo/");
1289 if (!y)
1290 y = path_startswith(q, "/usr/share/zoneinfo/");
1291
1292 /* Already pointing to the right place? Then do nothing .. */
1293 if (y && streq(y, z))
1294 return 0;
1295 }
1296
1297 check = strjoina("/usr/share/zoneinfo/", z);
1298 check = prefix_roota(dest, check);
1299 if (laccess(check, F_OK) < 0) {
1300 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1301 return 0;
1302 }
1303
1304 if (unlink(where) < 0 && errno != ENOENT) {
1305 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1306 errno,
1307 "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1308 return 0;
1309 }
1310
1311 what = strjoina("../usr/share/zoneinfo/", z);
1312 if (symlink(what, where) < 0) {
1313 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1314 errno,
1315 "Failed to correct timezone of container, ignoring: %m");
1316 return 0;
1317 }
1318
1319 r = userns_lchown(where, 0, 0);
1320 if (r < 0)
1321 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1322
1323 return 0;
1324 }
1325
1326 static int resolved_listening(void) {
1327 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1328 _cleanup_free_ char *dns_stub_listener_mode = NULL;
1329 int r;
1330
1331 /* Check if resolved is listening */
1332
1333 r = sd_bus_open_system(&bus);
1334 if (r < 0)
1335 return r;
1336
1337 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1338 if (r <= 0)
1339 return r;
1340
1341 r = sd_bus_get_property_string(bus,
1342 "org.freedesktop.resolve1",
1343 "/org/freedesktop/resolve1",
1344 "org.freedesktop.resolve1.Manager",
1345 "DNSStubListener",
1346 NULL,
1347 &dns_stub_listener_mode);
1348 if (r < 0)
1349 return r;
1350
1351 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
1352 }
1353
1354 static int setup_resolv_conf(const char *dest) {
1355 _cleanup_free_ char *resolved = NULL, *etc = NULL;
1356 const char *where;
1357 int r, found;
1358
1359 assert(dest);
1360
1361 if (arg_private_network)
1362 return 0;
1363
1364 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1365 if (r < 0) {
1366 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1367 return 0;
1368 }
1369
1370 where = strjoina(etc, "/resolv.conf");
1371 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1372 if (found < 0) {
1373 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1374 return 0;
1375 }
1376
1377 if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0 &&
1378 resolved_listening() > 0) {
1379
1380 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1381 * container, so that the container can use the host's resolver. Given that network namespacing is
1382 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1383 * advantage that the container will be able to follow the host's DNS server configuration changes
1384 * transparently. */
1385
1386 if (found == 0) /* missing? */
1387 (void) touch(resolved);
1388
1389 r = mount_verbose(LOG_DEBUG, "/usr/lib/systemd/resolv.conf", resolved, NULL, MS_BIND, NULL);
1390 if (r >= 0)
1391 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1392 }
1393
1394 /* If that didn't work, let's copy the file */
1395 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
1396 if (r < 0) {
1397 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1398 * resolved or something similar runs inside and the symlink points there.
1399 *
1400 * If the disk image is read-only, there's also no point in complaining.
1401 */
1402 log_full_errno(IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1403 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
1404 return 0;
1405 }
1406
1407 r = userns_lchown(where, 0, 0);
1408 if (r < 0)
1409 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
1410
1411 return 0;
1412 }
1413
1414 static int setup_boot_id(const char *dest) {
1415 sd_id128_t rnd = SD_ID128_NULL;
1416 const char *from, *to;
1417 int r;
1418
1419 /* Generate a new randomized boot ID, so that each boot-up of
1420 * the container gets a new one */
1421
1422 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1423 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1424
1425 r = sd_id128_randomize(&rnd);
1426 if (r < 0)
1427 return log_error_errno(r, "Failed to generate random boot id: %m");
1428
1429 r = id128_write(from, ID128_UUID, rnd, false);
1430 if (r < 0)
1431 return log_error_errno(r, "Failed to write boot id: %m");
1432
1433 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1434 if (r >= 0)
1435 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1436 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1437
1438 (void) unlink(from);
1439 return r;
1440 }
1441
1442 static int copy_devnodes(const char *dest) {
1443
1444 static const char devnodes[] =
1445 "null\0"
1446 "zero\0"
1447 "full\0"
1448 "random\0"
1449 "urandom\0"
1450 "tty\0"
1451 "net/tun\0";
1452
1453 const char *d;
1454 int r = 0;
1455 _cleanup_umask_ mode_t u;
1456
1457 assert(dest);
1458
1459 u = umask(0000);
1460
1461 /* Create /dev/net, so that we can create /dev/net/tun in it */
1462 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1463 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1464
1465 NULSTR_FOREACH(d, devnodes) {
1466 _cleanup_free_ char *from = NULL, *to = NULL;
1467 struct stat st;
1468
1469 from = strappend("/dev/", d);
1470 to = prefix_root(dest, from);
1471
1472 if (stat(from, &st) < 0) {
1473
1474 if (errno != ENOENT)
1475 return log_error_errno(errno, "Failed to stat %s: %m", from);
1476
1477 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1478
1479 log_error("%s is not a char or block device, cannot copy.", from);
1480 return -EIO;
1481
1482 } else {
1483 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1484 /* Explicitly warn the user when /dev is already populated. */
1485 if (errno == EEXIST)
1486 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
1487 if (errno != EPERM)
1488 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1489
1490 /* Some systems abusively restrict mknod but
1491 * allow bind mounts. */
1492 r = touch(to);
1493 if (r < 0)
1494 return log_error_errno(r, "touch (%s) failed: %m", to);
1495 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1496 if (r < 0)
1497 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
1498 }
1499
1500 r = userns_lchown(to, 0, 0);
1501 if (r < 0)
1502 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1503 }
1504 }
1505
1506 return r;
1507 }
1508
1509 static int setup_pts(const char *dest) {
1510 _cleanup_free_ char *options = NULL;
1511 const char *p;
1512 int r;
1513
1514 #ifdef HAVE_SELINUX
1515 if (arg_selinux_apifs_context)
1516 (void) asprintf(&options,
1517 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1518 arg_uid_shift + TTY_GID,
1519 arg_selinux_apifs_context);
1520 else
1521 #endif
1522 (void) asprintf(&options,
1523 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1524 arg_uid_shift + TTY_GID);
1525
1526 if (!options)
1527 return log_oom();
1528
1529 /* Mount /dev/pts itself */
1530 p = prefix_roota(dest, "/dev/pts");
1531 if (mkdir(p, 0755) < 0)
1532 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1533 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1534 if (r < 0)
1535 return r;
1536 r = userns_lchown(p, 0, 0);
1537 if (r < 0)
1538 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1539
1540 /* Create /dev/ptmx symlink */
1541 p = prefix_roota(dest, "/dev/ptmx");
1542 if (symlink("pts/ptmx", p) < 0)
1543 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1544 r = userns_lchown(p, 0, 0);
1545 if (r < 0)
1546 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1547
1548 /* And fix /dev/pts/ptmx ownership */
1549 p = prefix_roota(dest, "/dev/pts/ptmx");
1550 r = userns_lchown(p, 0, 0);
1551 if (r < 0)
1552 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1553
1554 return 0;
1555 }
1556
1557 static int setup_dev_console(const char *dest, const char *console) {
1558 _cleanup_umask_ mode_t u;
1559 const char *to;
1560 int r;
1561
1562 assert(dest);
1563 assert(console);
1564
1565 u = umask(0000);
1566
1567 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1568 if (r < 0)
1569 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1570
1571 /* We need to bind mount the right tty to /dev/console since
1572 * ptys can only exist on pts file systems. To have something
1573 * to bind mount things on we create a empty regular file. */
1574
1575 to = prefix_roota(dest, "/dev/console");
1576 r = touch(to);
1577 if (r < 0)
1578 return log_error_errno(r, "touch() for /dev/console failed: %m");
1579
1580 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
1581 }
1582
1583 static int setup_kmsg(const char *dest, int kmsg_socket) {
1584 const char *from, *to;
1585 _cleanup_umask_ mode_t u;
1586 int fd, r;
1587
1588 assert(kmsg_socket >= 0);
1589
1590 u = umask(0000);
1591
1592 /* We create the kmsg FIFO as /run/kmsg, but immediately
1593 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1594 * on the reading side behave very similar to /proc/kmsg,
1595 * their writing side behaves differently from /dev/kmsg in
1596 * that writing blocks when nothing is reading. In order to
1597 * avoid any problems with containers deadlocking due to this
1598 * we simply make /dev/kmsg unavailable to the container. */
1599 from = prefix_roota(dest, "/run/kmsg");
1600 to = prefix_roota(dest, "/proc/kmsg");
1601
1602 if (mkfifo(from, 0600) < 0)
1603 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1604 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1605 if (r < 0)
1606 return r;
1607
1608 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1609 if (fd < 0)
1610 return log_error_errno(errno, "Failed to open fifo: %m");
1611
1612 /* Store away the fd in the socket, so that it stays open as
1613 * long as we run the child */
1614 r = send_one_fd(kmsg_socket, fd, 0);
1615 safe_close(fd);
1616
1617 if (r < 0)
1618 return log_error_errno(r, "Failed to send FIFO fd: %m");
1619
1620 /* And now make the FIFO unavailable as /run/kmsg... */
1621 (void) unlink(from);
1622
1623 return 0;
1624 }
1625
1626 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1627 union in_addr_union *exposed = userdata;
1628
1629 assert(rtnl);
1630 assert(m);
1631 assert(exposed);
1632
1633 expose_port_execute(rtnl, arg_expose_ports, exposed);
1634 return 0;
1635 }
1636
1637 static int setup_hostname(void) {
1638
1639 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
1640 return 0;
1641
1642 if (sethostname_idempotent(arg_machine) < 0)
1643 return -errno;
1644
1645 return 0;
1646 }
1647
1648 static int setup_journal(const char *directory) {
1649 sd_id128_t this_id;
1650 _cleanup_free_ char *d = NULL;
1651 const char *p, *q;
1652 bool try;
1653 char id[33];
1654 int r;
1655
1656 /* Don't link journals in ephemeral mode */
1657 if (arg_ephemeral)
1658 return 0;
1659
1660 if (arg_link_journal == LINK_NO)
1661 return 0;
1662
1663 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1664
1665 r = sd_id128_get_machine(&this_id);
1666 if (r < 0)
1667 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1668
1669 if (sd_id128_equal(arg_uuid, this_id)) {
1670 log_full(try ? LOG_WARNING : LOG_ERR,
1671 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
1672 if (try)
1673 return 0;
1674 return -EEXIST;
1675 }
1676
1677 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1678 if (r < 0)
1679 return log_error_errno(r, "Failed to create /var: %m");
1680
1681 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1682 if (r < 0)
1683 return log_error_errno(r, "Failed to create /var/log: %m");
1684
1685 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1686 if (r < 0)
1687 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1688
1689 (void) sd_id128_to_string(arg_uuid, id);
1690
1691 p = strjoina("/var/log/journal/", id);
1692 q = prefix_roota(directory, p);
1693
1694 if (path_is_mount_point(p, NULL, 0) > 0) {
1695 if (try)
1696 return 0;
1697
1698 log_error("%s: already a mount point, refusing to use for journal", p);
1699 return -EEXIST;
1700 }
1701
1702 if (path_is_mount_point(q, NULL, 0) > 0) {
1703 if (try)
1704 return 0;
1705
1706 log_error("%s: already a mount point, refusing to use for journal", q);
1707 return -EEXIST;
1708 }
1709
1710 r = readlink_and_make_absolute(p, &d);
1711 if (r >= 0) {
1712 if ((arg_link_journal == LINK_GUEST ||
1713 arg_link_journal == LINK_AUTO) &&
1714 path_equal(d, q)) {
1715
1716 r = userns_mkdir(directory, p, 0755, 0, 0);
1717 if (r < 0)
1718 log_warning_errno(r, "Failed to create directory %s: %m", q);
1719 return 0;
1720 }
1721
1722 if (unlink(p) < 0)
1723 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1724 } else if (r == -EINVAL) {
1725
1726 if (arg_link_journal == LINK_GUEST &&
1727 rmdir(p) < 0) {
1728
1729 if (errno == ENOTDIR) {
1730 log_error("%s already exists and is neither a symlink nor a directory", p);
1731 return r;
1732 } else
1733 return log_error_errno(errno, "Failed to remove %s: %m", p);
1734 }
1735 } else if (r != -ENOENT)
1736 return log_error_errno(r, "readlink(%s) failed: %m", p);
1737
1738 if (arg_link_journal == LINK_GUEST) {
1739
1740 if (symlink(q, p) < 0) {
1741 if (try) {
1742 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1743 return 0;
1744 } else
1745 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1746 }
1747
1748 r = userns_mkdir(directory, p, 0755, 0, 0);
1749 if (r < 0)
1750 log_warning_errno(r, "Failed to create directory %s: %m", q);
1751 return 0;
1752 }
1753
1754 if (arg_link_journal == LINK_HOST) {
1755 /* don't create parents here — if the host doesn't have
1756 * permanent journal set up, don't force it here */
1757
1758 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
1759 if (try) {
1760 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1761 return 0;
1762 } else
1763 return log_error_errno(errno, "Failed to create %s: %m", p);
1764 }
1765
1766 } else if (access(p, F_OK) < 0)
1767 return 0;
1768
1769 if (dir_is_empty(q) == 0)
1770 log_warning("%s is not empty, proceeding anyway.", q);
1771
1772 r = userns_mkdir(directory, p, 0755, 0, 0);
1773 if (r < 0)
1774 return log_error_errno(r, "Failed to create %s: %m", q);
1775
1776 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1777 if (r < 0)
1778 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1779
1780 return 0;
1781 }
1782
1783 static int drop_capabilities(void) {
1784 return capability_bounding_set_drop(arg_caps_retain, false);
1785 }
1786
1787 static int reset_audit_loginuid(void) {
1788 _cleanup_free_ char *p = NULL;
1789 int r;
1790
1791 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
1792 return 0;
1793
1794 r = read_one_line_file("/proc/self/loginuid", &p);
1795 if (r == -ENOENT)
1796 return 0;
1797 if (r < 0)
1798 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1799
1800 /* Already reset? */
1801 if (streq(p, "4294967295"))
1802 return 0;
1803
1804 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1805 if (r < 0) {
1806 log_error_errno(r,
1807 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1808 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1809 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1810 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1811 "using systemd-nspawn. Sleeping for 5s... (%m)");
1812
1813 sleep(5);
1814 }
1815
1816 return 0;
1817 }
1818
1819
1820 static int setup_propagate(const char *root) {
1821 const char *p, *q;
1822 int r;
1823
1824 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1825 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1826 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1827 (void) mkdir_p(p, 0600);
1828
1829 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1830 if (r < 0)
1831 return log_error_errno(r, "Failed to create /run/systemd: %m");
1832
1833 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1834 if (r < 0)
1835 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
1836
1837 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1838 if (r < 0)
1839 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
1840
1841 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1842 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1843 if (r < 0)
1844 return r;
1845
1846 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1847 if (r < 0)
1848 return r;
1849
1850 /* machined will MS_MOVE into that directory, and that's only
1851 * supported for non-shared mounts. */
1852 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
1853 }
1854
1855 static int setup_machine_id(const char *directory) {
1856 const char *etc_machine_id;
1857 sd_id128_t id;
1858 int r;
1859
1860 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
1861 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
1862 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
1863 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
1864 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
1865 * container behaves nicely). */
1866
1867 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1868
1869 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
1870 if (r < 0) {
1871 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
1872 return log_error_errno(r, "Failed to read machine ID from container image: %m");
1873
1874 if (sd_id128_is_null(arg_uuid)) {
1875 r = sd_id128_randomize(&arg_uuid);
1876 if (r < 0)
1877 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
1878 }
1879 } else {
1880 if (sd_id128_is_null(id)) {
1881 log_error("Machine ID in container image is zero, refusing.");
1882 return -EINVAL;
1883 }
1884
1885 arg_uuid = id;
1886 }
1887
1888 return 0;
1889 }
1890
1891 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
1892 int r;
1893
1894 assert(directory);
1895
1896 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
1897 return 0;
1898
1899 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
1900 if (r == -EOPNOTSUPP)
1901 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
1902 if (r == -EBADE)
1903 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
1904 if (r < 0)
1905 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
1906 if (r == 0)
1907 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
1908 else
1909 log_debug("Patched directory tree to match UID/GID range.");
1910
1911 return r;
1912 }
1913
1914 /*
1915 * Return values:
1916 * < 0 : wait_for_terminate() failed to get the state of the
1917 * container, the container was terminated by a signal, or
1918 * failed for an unknown reason. No change is made to the
1919 * container argument.
1920 * > 0 : The program executed in the container terminated with an
1921 * error. The exit code of the program executed in the
1922 * container is returned. The container argument has been set
1923 * to CONTAINER_TERMINATED.
1924 * 0 : The container is being rebooted, has been shut down or exited
1925 * successfully. The container argument has been set to either
1926 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
1927 *
1928 * That is, success is indicated by a return value of zero, and an
1929 * error is indicated by a non-zero value.
1930 */
1931 static int wait_for_container(pid_t pid, ContainerStatus *container) {
1932 siginfo_t status;
1933 int r;
1934
1935 r = wait_for_terminate(pid, &status);
1936 if (r < 0)
1937 return log_warning_errno(r, "Failed to wait for container: %m");
1938
1939 switch (status.si_code) {
1940
1941 case CLD_EXITED:
1942 if (status.si_status == 0)
1943 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
1944 else
1945 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
1946
1947 *container = CONTAINER_TERMINATED;
1948 return status.si_status;
1949
1950 case CLD_KILLED:
1951 if (status.si_status == SIGINT) {
1952 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
1953 *container = CONTAINER_TERMINATED;
1954 return 0;
1955
1956 } else if (status.si_status == SIGHUP) {
1957 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
1958 *container = CONTAINER_REBOOTED;
1959 return 0;
1960 }
1961
1962 /* fall through */
1963
1964 case CLD_DUMPED:
1965 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1966 return -EIO;
1967
1968 default:
1969 log_error("Container %s failed due to unknown reason.", arg_machine);
1970 return -EIO;
1971 }
1972 }
1973
1974 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1975 pid_t pid;
1976
1977 pid = PTR_TO_PID(userdata);
1978 if (pid > 0) {
1979 if (kill(pid, arg_kill_signal) >= 0) {
1980 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1981 sd_event_source_set_userdata(s, NULL);
1982 return 0;
1983 }
1984 }
1985
1986 sd_event_exit(sd_event_source_get_event(s), 0);
1987 return 0;
1988 }
1989
1990 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
1991 for (;;) {
1992 siginfo_t si = {};
1993 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
1994 return log_error_errno(errno, "Failed to waitid(): %m");
1995 if (si.si_pid == 0) /* No pending children. */
1996 break;
1997 if (si.si_pid == PTR_TO_PID(userdata)) {
1998 /* The main process we care for has exited. Return from
1999 * signal handler but leave the zombie. */
2000 sd_event_exit(sd_event_source_get_event(s), 0);
2001 break;
2002 }
2003 /* Reap all other children. */
2004 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2005 }
2006
2007 return 0;
2008 }
2009
2010 static int determine_names(void) {
2011 int r;
2012
2013 if (arg_template && !arg_directory && arg_machine) {
2014
2015 /* If --template= was specified then we should not
2016 * search for a machine, but instead create a new one
2017 * in /var/lib/machine. */
2018
2019 arg_directory = strjoin("/var/lib/machines/", arg_machine);
2020 if (!arg_directory)
2021 return log_oom();
2022 }
2023
2024 if (!arg_image && !arg_directory) {
2025 if (arg_machine) {
2026 _cleanup_(image_unrefp) Image *i = NULL;
2027
2028 r = image_find(arg_machine, &i);
2029 if (r < 0)
2030 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2031 if (r == 0) {
2032 log_error("No image for machine '%s': %m", arg_machine);
2033 return -ENOENT;
2034 }
2035
2036 if (i->type == IMAGE_RAW)
2037 r = free_and_strdup(&arg_image, i->path);
2038 else
2039 r = free_and_strdup(&arg_directory, i->path);
2040 if (r < 0)
2041 return log_oom();
2042
2043 if (!arg_ephemeral)
2044 arg_read_only = arg_read_only || i->read_only;
2045 } else
2046 arg_directory = get_current_dir_name();
2047
2048 if (!arg_directory && !arg_image) {
2049 log_error("Failed to determine path, please use -D or -i.");
2050 return -EINVAL;
2051 }
2052 }
2053
2054 if (!arg_machine) {
2055
2056 if (arg_directory && path_equal(arg_directory, "/"))
2057 arg_machine = gethostname_malloc();
2058 else {
2059 if (arg_image) {
2060 char *e;
2061
2062 arg_machine = strdup(basename(arg_image));
2063
2064 /* Truncate suffix if there is one */
2065 e = endswith(arg_machine, ".raw");
2066 if (e)
2067 *e = 0;
2068 } else
2069 arg_machine = strdup(basename(arg_directory));
2070 }
2071 if (!arg_machine)
2072 return log_oom();
2073
2074 hostname_cleanup(arg_machine);
2075 if (!machine_name_is_valid(arg_machine)) {
2076 log_error("Failed to determine machine name automatically, please use -M.");
2077 return -EINVAL;
2078 }
2079
2080 if (arg_ephemeral) {
2081 char *b;
2082
2083 /* Add a random suffix when this is an
2084 * ephemeral machine, so that we can run many
2085 * instances at once without manually having
2086 * to specify -M each time. */
2087
2088 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2089 return log_oom();
2090
2091 free(arg_machine);
2092 arg_machine = b;
2093 }
2094 }
2095
2096 return 0;
2097 }
2098
/* Replaces the path stored in *p by the result of chase_symlinks() on it.
 *
 * A NULL *p is left alone. On success the old string is freed and *p takes ownership of the
 * newly returned one. Returns 0 on success (or when there was nothing to do), or the negative
 * error from chase_symlinks() after logging it; in that case *p is left untouched. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                free(*p);
                *p = resolved;
        }

        return 0;
}
2117
2118 static int determine_uid_shift(const char *directory) {
2119 int r;
2120
2121 if (arg_userns_mode == USER_NAMESPACE_NO) {
2122 arg_uid_shift = 0;
2123 return 0;
2124 }
2125
2126 if (arg_uid_shift == UID_INVALID) {
2127 struct stat st;
2128
2129 r = stat(directory, &st);
2130 if (r < 0)
2131 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2132
2133 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2134
2135 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2136 log_error("UID and GID base of %s don't match.", directory);
2137 return -EINVAL;
2138 }
2139
2140 arg_uid_range = UINT32_C(0x10000);
2141 }
2142
2143 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2144 log_error("UID base too high for UID range.");
2145 return -EINVAL;
2146 }
2147
2148 return 0;
2149 }
2150
2151 static int inner_child(
2152 Barrier *barrier,
2153 const char *directory,
2154 bool secondary,
2155 int kmsg_socket,
2156 int rtnl_socket,
2157 FDSet *fds) {
2158
2159 _cleanup_free_ char *home = NULL;
2160 char as_uuid[37];
2161 unsigned n_env = 1;
2162 const char *envp[] = {
2163 "PATH=" DEFAULT_PATH_SPLIT_USR,
2164 NULL, /* container */
2165 NULL, /* TERM */
2166 NULL, /* HOME */
2167 NULL, /* USER */
2168 NULL, /* LOGNAME */
2169 NULL, /* container_uuid */
2170 NULL, /* LISTEN_FDS */
2171 NULL, /* LISTEN_PID */
2172 NULL, /* NOTIFY_SOCKET */
2173 NULL
2174 };
2175 const char *exec_target;
2176
2177 _cleanup_strv_free_ char **env_use = NULL;
2178 int r;
2179
2180 assert(barrier);
2181 assert(directory);
2182 assert(kmsg_socket >= 0);
2183
2184 if (arg_userns_mode != USER_NAMESPACE_NO) {
2185 /* Tell the parent, that it now can write the UID map. */
2186 (void) barrier_place(barrier); /* #1 */
2187
2188 /* Wait until the parent wrote the UID map */
2189 if (!barrier_place_and_sync(barrier)) { /* #2 */
2190 log_error("Parent died too early");
2191 return -ESRCH;
2192 }
2193 }
2194
2195 r = reset_uid_gid();
2196 if (r < 0)
2197 return log_error_errno(r, "Couldn't become new root: %m");
2198
2199 r = mount_all(NULL,
2200 arg_mount_settings | MOUNT_IN_USERNS,
2201 arg_uid_shift,
2202 arg_uid_range,
2203 arg_selinux_apifs_context);
2204
2205 if (r < 0)
2206 return r;
2207
2208 r = mount_sysfs(NULL, arg_mount_settings);
2209 if (r < 0)
2210 return r;
2211
2212 /* Wait until we are cgroup-ified, so that we
2213 * can mount the right cgroup path writable */
2214 if (!barrier_place_and_sync(barrier)) { /* #3 */
2215 log_error("Parent died too early");
2216 return -ESRCH;
2217 }
2218
2219 if (arg_use_cgns && cg_ns_supported()) {
2220 r = unshare(CLONE_NEWCGROUP);
2221 if (r < 0)
2222 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2223 r = mount_cgroups(
2224 "",
2225 arg_unified_cgroup_hierarchy,
2226 arg_userns_mode != USER_NAMESPACE_NO,
2227 arg_uid_shift,
2228 arg_uid_range,
2229 arg_selinux_apifs_context,
2230 true);
2231 if (r < 0)
2232 return r;
2233 } else {
2234 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2235 if (r < 0)
2236 return r;
2237 }
2238
2239 r = setup_boot_id(NULL);
2240 if (r < 0)
2241 return r;
2242
2243 r = setup_kmsg(NULL, kmsg_socket);
2244 if (r < 0)
2245 return r;
2246 kmsg_socket = safe_close(kmsg_socket);
2247
2248 umask(0022);
2249
2250 if (setsid() < 0)
2251 return log_error_errno(errno, "setsid() failed: %m");
2252
2253 if (arg_private_network)
2254 loopback_setup();
2255
2256 if (arg_expose_ports) {
2257 r = expose_port_send_rtnl(rtnl_socket);
2258 if (r < 0)
2259 return r;
2260 rtnl_socket = safe_close(rtnl_socket);
2261 }
2262
2263 r = drop_capabilities();
2264 if (r < 0)
2265 return log_error_errno(r, "drop_capabilities() failed: %m");
2266
2267 setup_hostname();
2268
2269 if (arg_personality != PERSONALITY_INVALID) {
2270 if (personality(arg_personality) < 0)
2271 return log_error_errno(errno, "personality() failed: %m");
2272 } else if (secondary) {
2273 if (personality(PER_LINUX32) < 0)
2274 return log_error_errno(errno, "personality() failed: %m");
2275 }
2276
2277 #ifdef HAVE_SELINUX
2278 if (arg_selinux_context)
2279 if (setexeccon(arg_selinux_context) < 0)
2280 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2281 #endif
2282
2283 r = change_uid_gid(arg_user, &home);
2284 if (r < 0)
2285 return r;
2286
2287 /* LXC sets container=lxc, so follow the scheme here */
2288 envp[n_env++] = strjoina("container=", arg_container_service_name);
2289
2290 envp[n_env] = strv_find_prefix(environ, "TERM=");
2291 if (envp[n_env])
2292 n_env++;
2293
2294 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2295 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2296 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2297 return log_oom();
2298
2299 assert(!sd_id128_is_null(arg_uuid));
2300
2301 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
2302 return log_oom();
2303
2304 if (fdset_size(fds) > 0) {
2305 r = fdset_cloexec(fds, false);
2306 if (r < 0)
2307 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2308
2309 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2310 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2311 return log_oom();
2312 }
2313 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2314 return log_oom();
2315
2316 env_use = strv_env_merge(2, envp, arg_setenv);
2317 if (!env_use)
2318 return log_oom();
2319
2320 /* Let the parent know that we are ready and
2321 * wait until the parent is ready with the
2322 * setup, too... */
2323 if (!barrier_place_and_sync(barrier)) { /* #4 */
2324 log_error("Parent died too early");
2325 return -ESRCH;
2326 }
2327
2328 if (arg_chdir)
2329 if (chdir(arg_chdir) < 0)
2330 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2331
2332 if (arg_start_mode == START_PID2) {
2333 r = stub_pid1(arg_uuid);
2334 if (r < 0)
2335 return r;
2336 }
2337
2338 /* Now, explicitly close the log, so that we
2339 * then can close all remaining fds. Closing
2340 * the log explicitly first has the benefit
2341 * that the logging subsystem knows about it,
2342 * and is thus ready to be reopened should we
2343 * need it again. Note that the other fds
2344 * closed here are at least the locking and
2345 * barrier fds. */
2346 log_close();
2347 (void) fdset_close_others(fds);
2348
2349 if (arg_start_mode == START_BOOT) {
2350 char **a;
2351 size_t m;
2352
2353 /* Automatically search for the init system */
2354
2355 m = strv_length(arg_parameters);
2356 a = newa(char*, m + 2);
2357 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2358 a[1 + m] = NULL;
2359
2360 a[0] = (char*) "/usr/lib/systemd/systemd";
2361 execve(a[0], a, env_use);
2362
2363 a[0] = (char*) "/lib/systemd/systemd";
2364 execve(a[0], a, env_use);
2365
2366 a[0] = (char*) "/sbin/init";
2367 execve(a[0], a, env_use);
2368
2369 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
2370 } else if (!strv_isempty(arg_parameters)) {
2371 exec_target = arg_parameters[0];
2372 execvpe(arg_parameters[0], arg_parameters, env_use);
2373 } else {
2374 if (!arg_chdir)
2375 /* If we cannot change the directory, we'll end up in /, that is expected. */
2376 (void) chdir(home ?: "/root");
2377
2378 execle("/bin/bash", "-bash", NULL, env_use);
2379 execle("/bin/sh", "-sh", NULL, env_use);
2380
2381 exec_target = "/bin/bash, /bin/sh";
2382 }
2383
2384 r = -errno;
2385 (void) log_open();
2386 return log_error_errno(r, "execv(%s) failed: %m", exec_target);
2387 }
2388
2389 static int setup_sd_notify_child(void) {
2390 static const int one = 1;
2391 int fd = -1;
2392 union sockaddr_union sa = {
2393 .sa.sa_family = AF_UNIX,
2394 };
2395 int r;
2396
2397 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2398 if (fd < 0)
2399 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2400
2401 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2402 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2403
2404 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2405 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2406 if (r < 0) {
2407 safe_close(fd);
2408 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2409 }
2410
2411 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
2412 if (r < 0) {
2413 safe_close(fd);
2414 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
2415 }
2416
2417 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2418 if (r < 0) {
2419 safe_close(fd);
2420 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2421 }
2422
2423 return fd;
2424 }
2425
2426 static int outer_child(
2427 Barrier *barrier,
2428 const char *directory,
2429 const char *console,
2430 DissectedImage *dissected_image,
2431 bool interactive,
2432 bool secondary,
2433 int pid_socket,
2434 int uuid_socket,
2435 int notify_socket,
2436 int kmsg_socket,
2437 int rtnl_socket,
2438 int uid_shift_socket,
2439 FDSet *fds) {
2440
2441 pid_t pid;
2442 ssize_t l;
2443 int r;
2444 _cleanup_close_ int fd = -1;
2445
2446 assert(barrier);
2447 assert(directory);
2448 assert(console);
2449 assert(pid_socket >= 0);
2450 assert(uuid_socket >= 0);
2451 assert(notify_socket >= 0);
2452 assert(kmsg_socket >= 0);
2453
2454 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2455 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2456
2457 if (interactive) {
2458 close_nointr(STDIN_FILENO);
2459 close_nointr(STDOUT_FILENO);
2460 close_nointr(STDERR_FILENO);
2461
2462 r = open_terminal(console, O_RDWR);
2463 if (r != STDIN_FILENO) {
2464 if (r >= 0) {
2465 safe_close(r);
2466 r = -EINVAL;
2467 }
2468
2469 return log_error_errno(r, "Failed to open console: %m");
2470 }
2471
2472 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2473 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2474 return log_error_errno(errno, "Failed to duplicate console: %m");
2475 }
2476
2477 r = reset_audit_loginuid();
2478 if (r < 0)
2479 return r;
2480
2481 /* Mark everything as slave, so that we still
2482 * receive mounts from the real root, but don't
2483 * propagate mounts to the real root. */
2484 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2485 if (r < 0)
2486 return r;
2487
2488 if (dissected_image) {
2489 r = dissected_image_mount(dissected_image, directory, DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2490 if (r < 0)
2491 return r;
2492 }
2493
2494 r = determine_uid_shift(directory);
2495 if (r < 0)
2496 return r;
2497
2498 if (arg_userns_mode != USER_NAMESPACE_NO) {
2499 /* Let the parent know which UID shift we read from the image */
2500 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2501 if (l < 0)
2502 return log_error_errno(errno, "Failed to send UID shift: %m");
2503 if (l != sizeof(arg_uid_shift)) {
2504 log_error("Short write while sending UID shift.");
2505 return -EIO;
2506 }
2507
2508 if (arg_userns_mode == USER_NAMESPACE_PICK) {
2509 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2510 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2511 * not it will pick a different one, and send it back to us. */
2512
2513 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2514 if (l < 0)
2515 return log_error_errno(errno, "Failed to recv UID shift: %m");
2516 if (l != sizeof(arg_uid_shift)) {
2517 log_error("Short read while receiving UID shift.");
2518 return -EIO;
2519 }
2520 }
2521
2522 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2523 }
2524
2525 /* Turn directory into bind mount */
2526 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2527 if (r < 0)
2528 return r;
2529
2530 r = setup_pivot_root(
2531 directory,
2532 arg_pivot_root_new,
2533 arg_pivot_root_old);
2534 if (r < 0)
2535 return r;
2536
2537 r = setup_volatile(
2538 directory,
2539 arg_volatile_mode,
2540 arg_userns_mode != USER_NAMESPACE_NO,
2541 arg_uid_shift,
2542 arg_uid_range,
2543 arg_selinux_context);
2544 if (r < 0)
2545 return r;
2546
2547 r = setup_volatile_state(
2548 directory,
2549 arg_volatile_mode,
2550 arg_userns_mode != USER_NAMESPACE_NO,
2551 arg_uid_shift,
2552 arg_uid_range,
2553 arg_selinux_context);
2554 if (r < 0)
2555 return r;
2556
2557 /* Mark everything as shared so our mounts get propagated down. This is
2558 * required to make new bind mounts available in systemd services
2559 * inside the containter that create a new mount namespace.
2560 * See https://github.com/systemd/systemd/issues/3860
2561 * Further submounts (such as /dev) done after this will inherit the
2562 * shared propagation mode. */
2563 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2564 if (r < 0)
2565 return r;
2566
2567 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2568 if (r < 0)
2569 return r;
2570
2571 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2572 if (r < 0)
2573 return r;
2574
2575 if (arg_read_only) {
2576 r = bind_remount_recursive(directory, true, NULL);
2577 if (r < 0)
2578 return log_error_errno(r, "Failed to make tree read-only: %m");
2579 }
2580
2581 r = mount_all(directory,
2582 arg_mount_settings,
2583 arg_uid_shift,
2584 arg_uid_range,
2585 arg_selinux_apifs_context);
2586 if (r < 0)
2587 return r;
2588
2589 r = copy_devnodes(directory);
2590 if (r < 0)
2591 return r;
2592
2593 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2594
2595 r = setup_pts(directory);
2596 if (r < 0)
2597 return r;
2598
2599 r = setup_propagate(directory);
2600 if (r < 0)
2601 return r;
2602
2603 r = setup_dev_console(directory, console);
2604 if (r < 0)
2605 return r;
2606
2607 r = setup_seccomp(arg_caps_retain);
2608 if (r < 0)
2609 return r;
2610
2611 r = setup_timezone(directory);
2612 if (r < 0)
2613 return r;
2614
2615 r = setup_resolv_conf(directory);
2616 if (r < 0)
2617 return r;
2618
2619 r = setup_machine_id(directory);
2620 if (r < 0)
2621 return r;
2622
2623 r = setup_journal(directory);
2624 if (r < 0)
2625 return r;
2626
2627 r = mount_custom(
2628 directory,
2629 arg_custom_mounts,
2630 arg_n_custom_mounts,
2631 arg_userns_mode != USER_NAMESPACE_NO,
2632 arg_uid_shift,
2633 arg_uid_range,
2634 arg_selinux_apifs_context);
2635 if (r < 0)
2636 return r;
2637
2638 if (!arg_use_cgns || !cg_ns_supported()) {
2639 r = mount_cgroups(
2640 directory,
2641 arg_unified_cgroup_hierarchy,
2642 arg_userns_mode != USER_NAMESPACE_NO,
2643 arg_uid_shift,
2644 arg_uid_range,
2645 arg_selinux_apifs_context,
2646 false);
2647 if (r < 0)
2648 return r;
2649 }
2650
2651 r = mount_move_root(directory);
2652 if (r < 0)
2653 return log_error_errno(r, "Failed to move root directory: %m");
2654
2655 fd = setup_sd_notify_child();
2656 if (fd < 0)
2657 return fd;
2658
2659 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2660 arg_clone_ns_flags |
2661 (arg_private_network ? CLONE_NEWNET : 0) |
2662 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
2663 if (pid < 0)
2664 return log_error_errno(errno, "Failed to fork inner child: %m");
2665 if (pid == 0) {
2666 pid_socket = safe_close(pid_socket);
2667 uuid_socket = safe_close(uuid_socket);
2668 notify_socket = safe_close(notify_socket);
2669 uid_shift_socket = safe_close(uid_shift_socket);
2670
2671 /* The inner child has all namespaces that are
2672 * requested, so that we all are owned by the user if
2673 * user namespaces are turned on. */
2674
2675 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2676 if (r < 0)
2677 _exit(EXIT_FAILURE);
2678
2679 _exit(EXIT_SUCCESS);
2680 }
2681
2682 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2683 if (l < 0)
2684 return log_error_errno(errno, "Failed to send PID: %m");
2685 if (l != sizeof(pid)) {
2686 log_error("Short write while sending PID.");
2687 return -EIO;
2688 }
2689
2690 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
2691 if (l < 0)
2692 return log_error_errno(errno, "Failed to send machine ID: %m");
2693 if (l != sizeof(arg_uuid)) {
2694 log_error("Short write while sending machine ID.");
2695 return -EIO;
2696 }
2697
2698 l = send_one_fd(notify_socket, fd, 0);
2699 if (l < 0)
2700 return log_error_errno(errno, "Failed to send notify fd: %m");
2701
2702 pid_socket = safe_close(pid_socket);
2703 uuid_socket = safe_close(uuid_socket);
2704 notify_socket = safe_close(notify_socket);
2705 kmsg_socket = safe_close(kmsg_socket);
2706 rtnl_socket = safe_close(rtnl_socket);
2707
2708 return 0;
2709 }
2710
2711 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
2712 unsigned n_tries = 100;
2713 uid_t candidate;
2714 int r;
2715
2716 assert(shift);
2717 assert(ret_lock_file);
2718 assert(arg_userns_mode == USER_NAMESPACE_PICK);
2719 assert(arg_uid_range == 0x10000U);
2720
2721 candidate = *shift;
2722
2723 (void) mkdir("/run/systemd/nspawn-uid", 0755);
2724
2725 for (;;) {
2726 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
2727 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
2728
2729 if (--n_tries <= 0)
2730 return -EBUSY;
2731
2732 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
2733 goto next;
2734 if ((candidate & UINT32_C(0xFFFF)) != 0)
2735 goto next;
2736
2737 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
2738 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
2739 if (r == -EBUSY) /* Range already taken by another nspawn instance */
2740 goto next;
2741 if (r < 0)
2742 return r;
2743
2744 /* Make some superficial checks whether the range is currently known in the user database */
2745 if (getpwuid(candidate))
2746 goto next;
2747 if (getpwuid(candidate + UINT32_C(0xFFFE)))
2748 goto next;
2749 if (getgrgid(candidate))
2750 goto next;
2751 if (getgrgid(candidate + UINT32_C(0xFFFE)))
2752 goto next;
2753
2754 *ret_lock_file = lf;
2755 lf = (struct LockFile) LOCK_FILE_INIT;
2756 *shift = candidate;
2757 return 0;
2758
2759 next:
2760 random_bytes(&candidate, sizeof(candidate));
2761 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
2762 candidate &= (uid_t) UINT32_C(0xFFFF0000);
2763 }
2764 }
2765
2766 static int setup_uid_map(pid_t pid) {
2767 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2768 int r;
2769
2770 assert(pid > 1);
2771
2772 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2773 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2774 r = write_string_file(uid_map, line, 0);
2775 if (r < 0)
2776 return log_error_errno(r, "Failed to write UID map: %m");
2777
2778 /* We always assign the same UID and GID ranges */
2779 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2780 r = write_string_file(uid_map, line, 0);
2781 if (r < 0)
2782 return log_error_errno(r, "Failed to write GID map: %m");
2783
2784 return 0;
2785 }
2786
2787 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
2788 char buf[NOTIFY_BUFFER_MAX+1];
2789 char *p = NULL;
2790 struct iovec iovec = {
2791 .iov_base = buf,
2792 .iov_len = sizeof(buf)-1,
2793 };
2794 union {
2795 struct cmsghdr cmsghdr;
2796 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
2797 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
2798 } control = {};
2799 struct msghdr msghdr = {
2800 .msg_iov = &iovec,
2801 .msg_iovlen = 1,
2802 .msg_control = &control,
2803 .msg_controllen = sizeof(control),
2804 };
2805 struct cmsghdr *cmsg;
2806 struct ucred *ucred = NULL;
2807 ssize_t n;
2808 pid_t inner_child_pid;
2809 _cleanup_strv_free_ char **tags = NULL;
2810
2811 assert(userdata);
2812
2813 inner_child_pid = PTR_TO_PID(userdata);
2814
2815 if (revents != EPOLLIN) {
2816 log_warning("Got unexpected poll event for notify fd.");
2817 return 0;
2818 }
2819
2820 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
2821 if (n < 0) {
2822 if (errno == EAGAIN || errno == EINTR)
2823 return 0;
2824
2825 return log_warning_errno(errno, "Couldn't read notification socket: %m");
2826 }
2827 cmsg_close_all(&msghdr);
2828
2829 CMSG_FOREACH(cmsg, &msghdr) {
2830 if (cmsg->cmsg_level == SOL_SOCKET &&
2831 cmsg->cmsg_type == SCM_CREDENTIALS &&
2832 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
2833
2834 ucred = (struct ucred*) CMSG_DATA(cmsg);
2835 }
2836 }
2837
2838 if (!ucred || ucred->pid != inner_child_pid) {
2839 log_warning("Received notify message without valid credentials. Ignoring.");
2840 return 0;
2841 }
2842
2843 if ((size_t) n >= sizeof(buf)) {
2844 log_warning("Received notify message exceeded maximum size. Ignoring.");
2845 return 0;
2846 }
2847
2848 buf[n] = 0;
2849 tags = strv_split(buf, "\n\r");
2850 if (!tags)
2851 return log_oom();
2852
2853 if (strv_find(tags, "READY=1"))
2854 sd_notifyf(false, "READY=1\n");
2855
2856 p = strv_find_startswith(tags, "STATUS=");
2857 if (p)
2858 sd_notifyf(false, "STATUS=Container running: %s", p);
2859
2860 return 0;
2861 }
2862
2863 static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
2864 int r;
2865
2866 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
2867 if (r < 0)
2868 return log_error_errno(r, "Failed to allocate notify event source: %m");
2869
2870 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
2871
2872 return 0;
2873 }
2874
2875 static int load_settings(void) {
2876 _cleanup_(settings_freep) Settings *settings = NULL;
2877 _cleanup_fclose_ FILE *f = NULL;
2878 _cleanup_free_ char *p = NULL;
2879 const char *fn, *i;
2880 int r;
2881
2882 /* If all settings are masked, there's no point in looking for
2883 * the settings file */
2884 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2885 return 0;
2886
2887 fn = strjoina(arg_machine, ".nspawn");
2888
2889 /* We first look in the admin's directories in /etc and /run */
2890 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2891 _cleanup_free_ char *j = NULL;
2892
2893 j = strjoin(i, "/", fn);
2894 if (!j)
2895 return log_oom();
2896
2897 f = fopen(j, "re");
2898 if (f) {
2899 p = j;
2900 j = NULL;
2901
2902 /* By default, we trust configuration from /etc and /run */
2903 if (arg_settings_trusted < 0)
2904 arg_settings_trusted = true;
2905
2906 break;
2907 }
2908
2909 if (errno != ENOENT)
2910 return log_error_errno(errno, "Failed to open %s: %m", j);
2911 }
2912
2913 if (!f) {
2914 /* After that, let's look for a file next to the
2915 * actual image we shall boot. */
2916
2917 if (arg_image) {
2918 p = file_in_same_dir(arg_image, fn);
2919 if (!p)
2920 return log_oom();
2921 } else if (arg_directory) {
2922 p = file_in_same_dir(arg_directory, fn);
2923 if (!p)
2924 return log_oom();
2925 }
2926
2927 if (p) {
2928 f = fopen(p, "re");
2929 if (!f && errno != ENOENT)
2930 return log_error_errno(errno, "Failed to open %s: %m", p);
2931
2932 /* By default, we do not trust configuration from /var/lib/machines */
2933 if (arg_settings_trusted < 0)
2934 arg_settings_trusted = false;
2935 }
2936 }
2937
2938 if (!f)
2939 return 0;
2940
2941 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2942
2943 r = settings_load(f, p, &settings);
2944 if (r < 0)
2945 return r;
2946
2947 /* Copy over bits from the settings, unless they have been
2948 * explicitly masked by command line switches. */
2949
2950 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
2951 settings->start_mode >= 0) {
2952 arg_start_mode = settings->start_mode;
2953
2954 strv_free(arg_parameters);
2955 arg_parameters = settings->parameters;
2956 settings->parameters = NULL;
2957 }
2958
2959 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
2960 settings->pivot_root_new) {
2961 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
2962 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
2963 }
2964
2965 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
2966 settings->working_directory) {
2967 free(arg_chdir);
2968 arg_chdir = settings->working_directory;
2969 settings->working_directory = NULL;
2970 }
2971
2972 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2973 settings->environment) {
2974 strv_free(arg_setenv);
2975 arg_setenv = settings->environment;
2976 settings->environment = NULL;
2977 }
2978
2979 if ((arg_settings_mask & SETTING_USER) == 0 &&
2980 settings->user) {
2981 free(arg_user);
2982 arg_user = settings->user;
2983 settings->user = NULL;
2984 }
2985
2986 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2987 uint64_t plus;
2988
2989 plus = settings->capability;
2990 if (settings_private_network(settings))
2991 plus |= (1ULL << CAP_NET_ADMIN);
2992
2993 if (!arg_settings_trusted && plus != 0) {
2994 if (settings->capability != 0)
2995 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2996 } else
2997 arg_caps_retain |= plus;
2998
2999 arg_caps_retain &= ~settings->drop_capability;
3000 }
3001
3002 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3003 settings->kill_signal > 0)
3004 arg_kill_signal = settings->kill_signal;
3005
3006 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3007 settings->personality != PERSONALITY_INVALID)
3008 arg_personality = settings->personality;
3009
3010 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3011 !sd_id128_is_null(settings->machine_id)) {
3012
3013 if (!arg_settings_trusted)
3014 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3015 else
3016 arg_uuid = settings->machine_id;
3017 }
3018
3019 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3020 settings->read_only >= 0)
3021 arg_read_only = settings->read_only;
3022
3023 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3024 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3025 arg_volatile_mode = settings->volatile_mode;
3026
3027 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3028 settings->n_custom_mounts > 0) {
3029
3030 if (!arg_settings_trusted)
3031 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3032 else {
3033 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3034 arg_custom_mounts = settings->custom_mounts;
3035 arg_n_custom_mounts = settings->n_custom_mounts;
3036
3037 settings->custom_mounts = NULL;
3038 settings->n_custom_mounts = 0;
3039 }
3040 }
3041
3042 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3043 (settings->private_network >= 0 ||
3044 settings->network_veth >= 0 ||
3045 settings->network_bridge ||
3046 settings->network_zone ||
3047 settings->network_interfaces ||
3048 settings->network_macvlan ||
3049 settings->network_ipvlan ||
3050 settings->network_veth_extra)) {
3051
3052 if (!arg_settings_trusted)
3053 log_warning("Ignoring network settings, file %s is not trusted.", p);
3054 else {
3055 arg_network_veth = settings_network_veth(settings);
3056 arg_private_network = settings_private_network(settings);
3057
3058 strv_free(arg_network_interfaces);
3059 arg_network_interfaces = settings->network_interfaces;
3060 settings->network_interfaces = NULL;
3061
3062 strv_free(arg_network_macvlan);
3063 arg_network_macvlan = settings->network_macvlan;
3064 settings->network_macvlan = NULL;
3065
3066 strv_free(arg_network_ipvlan);
3067 arg_network_ipvlan = settings->network_ipvlan;
3068 settings->network_ipvlan = NULL;
3069
3070 strv_free(arg_network_veth_extra);
3071 arg_network_veth_extra = settings->network_veth_extra;
3072 settings->network_veth_extra = NULL;
3073
3074 free(arg_network_bridge);
3075 arg_network_bridge = settings->network_bridge;
3076 settings->network_bridge = NULL;
3077
3078 free(arg_network_zone);
3079 arg_network_zone = settings->network_zone;
3080 settings->network_zone = NULL;
3081 }
3082 }
3083
3084 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3085 settings->expose_ports) {
3086
3087 if (!arg_settings_trusted)
3088 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3089 else {
3090 expose_port_free_all(arg_expose_ports);
3091 arg_expose_ports = settings->expose_ports;
3092 settings->expose_ports = NULL;
3093 }
3094 }
3095
3096 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3097 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3098
3099 if (!arg_settings_trusted)
3100 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3101 else {
3102 arg_userns_mode = settings->userns_mode;
3103 arg_uid_shift = settings->uid_shift;
3104 arg_uid_range = settings->uid_range;
3105 arg_userns_chown = settings->userns_chown;
3106 }
3107 }
3108
3109 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3110 arg_notify_ready = settings->notify_ready;
3111
3112 return 0;
3113 }
3114
3115 static int run(int master,
3116 const char* console,
3117 DissectedImage *dissected_image,
3118 bool interactive,
3119 bool secondary,
3120 FDSet *fds,
3121 char veth_name[IFNAMSIZ], bool *veth_created,
3122 union in_addr_union *exposed,
3123 pid_t *pid, int *ret) {
3124
3125 static const struct sigaction sa = {
3126 .sa_handler = nop_signal_handler,
3127 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
3128 };
3129
3130 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3131 _cleanup_close_ int etc_passwd_lock = -1;
3132 _cleanup_close_pair_ int
3133 kmsg_socket_pair[2] = { -1, -1 },
3134 rtnl_socket_pair[2] = { -1, -1 },
3135 pid_socket_pair[2] = { -1, -1 },
3136 uuid_socket_pair[2] = { -1, -1 },
3137 notify_socket_pair[2] = { -1, -1 },
3138 uid_shift_socket_pair[2] = { -1, -1 };
3139 _cleanup_close_ int notify_socket= -1;
3140 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3141 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
3142 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3143 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3144 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3145 ContainerStatus container_status = 0;
3146 char last_char = 0;
3147 int ifi = 0, r;
3148 ssize_t l;
3149 sigset_t mask_chld;
3150
3151 assert_se(sigemptyset(&mask_chld) == 0);
3152 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3153
3154 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3155 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3156 * check with getpwuid() if the specific user already exists. Note that /etc might be
3157 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3158 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3159 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3160 * really ours. */
3161
3162 etc_passwd_lock = take_etc_passwd_lock(NULL);
3163 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3164 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3165 }
3166
3167 r = barrier_create(&barrier);
3168 if (r < 0)
3169 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3170
3171 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3172 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3173
3174 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3175 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3176
3177 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3178 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3179
3180 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3181 return log_error_errno(errno, "Failed to create id socket pair: %m");
3182
3183 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3184 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3185
3186 if (arg_userns_mode != USER_NAMESPACE_NO)
3187 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3188 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3189
3190 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3191 * parent's blocking calls and give it a chance to call wait() and terminate. */
3192 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3193 if (r < 0)
3194 return log_error_errno(errno, "Failed to change the signal mask: %m");
3195
3196 r = sigaction(SIGCHLD, &sa, NULL);
3197 if (r < 0)
3198 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3199
3200 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3201 if (*pid < 0)
3202 return log_error_errno(errno, "clone() failed%s: %m",
3203 errno == EINVAL ?
3204 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3205
3206 if (*pid == 0) {
3207 /* The outer child only has a file system namespace. */
3208 barrier_set_role(&barrier, BARRIER_CHILD);
3209
3210 master = safe_close(master);
3211
3212 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3213 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3214 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3215 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3216 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3217 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3218
3219 (void) reset_all_signal_handlers();
3220 (void) reset_signal_mask();
3221
3222 r = outer_child(&barrier,
3223 arg_directory,
3224 console,
3225 dissected_image,
3226 interactive,
3227 secondary,
3228 pid_socket_pair[1],
3229 uuid_socket_pair[1],
3230 notify_socket_pair[1],
3231 kmsg_socket_pair[1],
3232 rtnl_socket_pair[1],
3233 uid_shift_socket_pair[1],
3234 fds);
3235 if (r < 0)
3236 _exit(EXIT_FAILURE);
3237
3238 _exit(EXIT_SUCCESS);
3239 }
3240
3241 barrier_set_role(&barrier, BARRIER_PARENT);
3242
3243 fds = fdset_free(fds);
3244
3245 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3246 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3247 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3248 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3249 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3250 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3251
3252 if (arg_userns_mode != USER_NAMESPACE_NO) {
3253 /* The child just let us know the UID shift it might have read from the image. */
3254 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3255 if (l < 0)
3256 return log_error_errno(errno, "Failed to read UID shift: %m");
3257 if (l != sizeof arg_uid_shift) {
3258 log_error("Short read while reading UID shift.");
3259 return -EIO;
3260 }
3261
3262 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3263 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3264 * image, but if that's already in use, pick a new one, and report back to the child,
3265 * which one we now picked. */
3266
3267 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3268 if (r < 0)
3269 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3270
3271 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3272 if (l < 0)
3273 return log_error_errno(errno, "Failed to send UID shift: %m");
3274 if (l != sizeof arg_uid_shift) {
3275 log_error("Short write while writing UID shift.");
3276 return -EIO;
3277 }
3278 }
3279 }
3280
3281 /* Wait for the outer child. */
3282 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
3283 if (r != 0)
3284 return r < 0 ? r : -EIO;
3285
3286 /* And now retrieve the PID of the inner child. */
3287 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3288 if (l < 0)
3289 return log_error_errno(errno, "Failed to read inner child PID: %m");
3290 if (l != sizeof *pid) {
3291 log_error("Short read while reading inner child PID.");
3292 return -EIO;
3293 }
3294
3295 /* We also retrieve container UUID in case it was generated by outer child */
3296 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3297 if (l < 0)
3298 return log_error_errno(errno, "Failed to read container machine ID: %m");
3299 if (l != sizeof(arg_uuid)) {
3300 log_error("Short read while reading container machined ID.");
3301 return -EIO;
3302 }
3303
3304 /* We also retrieve the socket used for notifications generated by outer child */
3305 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3306 if (notify_socket < 0)
3307 return log_error_errno(notify_socket,
3308 "Failed to receive notification socket from the outer child: %m");
3309
3310 log_debug("Init process invoked as PID "PID_FMT, *pid);
3311
3312 if (arg_userns_mode != USER_NAMESPACE_NO) {
3313 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3314 log_error("Child died too early.");
3315 return -ESRCH;
3316 }
3317
3318 r = setup_uid_map(*pid);
3319 if (r < 0)
3320 return r;
3321
3322 (void) barrier_place(&barrier); /* #2 */
3323 }
3324
3325 if (arg_private_network) {
3326
3327 r = move_network_interfaces(*pid, arg_network_interfaces);
3328 if (r < 0)
3329 return r;
3330
3331 if (arg_network_veth) {
3332 r = setup_veth(arg_machine, *pid, veth_name,
3333 arg_network_bridge || arg_network_zone);
3334 if (r < 0)
3335 return r;
3336 else if (r > 0)
3337 ifi = r;
3338
3339 if (arg_network_bridge) {
3340 /* Add the interface to a bridge */
3341 r = setup_bridge(veth_name, arg_network_bridge, false);
3342 if (r < 0)
3343 return r;
3344 if (r > 0)
3345 ifi = r;
3346 } else if (arg_network_zone) {
3347 /* Add the interface to a bridge, possibly creating it */
3348 r = setup_bridge(veth_name, arg_network_zone, true);
3349 if (r < 0)
3350 return r;
3351 if (r > 0)
3352 ifi = r;
3353 }
3354 }
3355
3356 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3357 if (r < 0)
3358 return r;
3359
3360 /* We created the primary and extra veth links now; let's remember this, so that we know to
3361 remove them later on. Note that we don't bother with removing veth links that were created
3362 here when their setup failed half-way, because in that case the kernel should be able to
3363 remove them on its own, since they cannot be referenced by anything yet. */
3364 *veth_created = true;
3365
3366 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3367 if (r < 0)
3368 return r;
3369
3370 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3371 if (r < 0)
3372 return r;
3373 }
3374
3375 if (arg_register) {
3376 r = register_machine(
3377 arg_machine,
3378 *pid,
3379 arg_directory,
3380 arg_uuid,
3381 ifi,
3382 arg_slice,
3383 arg_custom_mounts, arg_n_custom_mounts,
3384 arg_kill_signal,
3385 arg_property,
3386 arg_keep_unit,
3387 arg_container_service_name);
3388 if (r < 0)
3389 return r;
3390 }
3391
3392 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
3393 if (r < 0)
3394 return r;
3395
3396 if (arg_keep_unit) {
3397 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3398 if (r < 0)
3399 return r;
3400 }
3401
3402 r = chown_cgroup(*pid, arg_uid_shift);
3403 if (r < 0)
3404 return r;
3405
3406 /* Notify the child that the parent is ready with all
3407 * its setup (including cgroup-ification), and that
3408 * the child can now hand over control to the code to
3409 * run inside the container. */
3410 (void) barrier_place(&barrier); /* #3 */
3411
3412 /* Block SIGCHLD here, before notifying child.
3413 * process_pty() will handle it with the other signals. */
3414 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3415
3416 /* Reset signal to default */
3417 r = default_signals(SIGCHLD, -1);
3418 if (r < 0)
3419 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3420
3421 r = sd_event_new(&event);
3422 if (r < 0)
3423 return log_error_errno(r, "Failed to get default event source: %m");
3424
3425 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
3426 if (r < 0)
3427 return r;
3428
3429 /* Let the child know that we are ready and wait that the child is completely ready now. */
3430 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3431 log_error("Child died too early.");
3432 return -ESRCH;
3433 }
3434
3435 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3436 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3437 etc_passwd_lock = safe_close(etc_passwd_lock);
3438
3439 sd_notifyf(false,
3440 "STATUS=Container running.\n"
3441 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3442 if (!arg_notify_ready)
3443 sd_notify(false, "READY=1\n");
3444
3445 if (arg_kill_signal > 0) {
3446 /* Try to kill the init system on SIGINT or SIGTERM */
3447 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3448 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3449 } else {
3450 /* Immediately exit */
3451 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3452 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3453 }
3454
3455 /* Exit when the child exits */
3456 sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
3457
3458 if (arg_expose_ports) {
3459 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3460 if (r < 0)
3461 return r;
3462
3463 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3464 }
3465
3466 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3467
3468 r = pty_forward_new(event, master,
3469 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3470 &forward);
3471 if (r < 0)
3472 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3473
3474 r = sd_event_loop(event);
3475 if (r < 0)
3476 return log_error_errno(r, "Failed to run event loop: %m");
3477
3478 pty_forward_get_last_char(forward, &last_char);
3479
3480 forward = pty_forward_free(forward);
3481
3482 if (!arg_quiet && last_char != '\n')
3483 putc('\n', stdout);
3484
3485 /* Kill if it is not dead yet anyway */
3486 if (arg_register && !arg_keep_unit)
3487 terminate_machine(*pid);
3488
3489 /* Normally redundant, but better safe than sorry */
3490 (void) kill(*pid, SIGKILL);
3491
3492 r = wait_for_container(*pid, &container_status);
3493 *pid = 0;
3494
3495 if (r < 0)
3496 /* We failed to wait for the container, or the container exited abnormally. */
3497 return r;
3498 if (r > 0 || container_status == CONTAINER_TERMINATED) {
3499 /* r > 0 → The container exited with a non-zero status.
3500 * As a special case, we need to replace 133 with a different value,
3501 * because 133 is special-cased in the service file to reboot the container.
3502 * otherwise → The container exited with zero status and a reboot was not requested.
3503 */
3504 if (r == EXIT_FORCE_RESTART)
3505 r = EXIT_FAILURE; /* replace 133 with the general failure code */
3506 *ret = r;
3507 return 0; /* finito */
3508 }
3509
3510 /* CONTAINER_REBOOTED, loop again */
3511
3512 if (arg_keep_unit) {
3513 /* Special handling if we are running as a service: instead of simply
3514 * restarting the machine we want to restart the entire service, so let's
3515 * inform systemd about this with the special exit code 133. The service
3516 * file uses RestartForceExitStatus=133 so that this results in a full
3517 * nspawn restart. This is necessary since we might have cgroup parameters
3518 * set we want to have flushed out. */
3519 *ret = EXIT_FORCE_RESTART;
3520 return 0; /* finito */
3521 }
3522
3523 expose_port_flush(arg_expose_ports, exposed);
3524
3525 (void) remove_veth_links(veth_name, arg_network_veth_extra);
3526 *veth_created = false;
3527 return 1; /* loop again */
3528 }
3529
3530 int main(int argc, char *argv[]) {
3531
3532 _cleanup_free_ char *console = NULL;
3533 _cleanup_close_ int master = -1;
3534 _cleanup_fdset_free_ FDSet *fds = NULL;
3535 int r, n_fd_passed, ret = EXIT_SUCCESS;
3536 char veth_name[IFNAMSIZ] = "";
3537 bool secondary = false, remove_directory = false, remove_image = false;
3538 pid_t pid = 0;
3539 union in_addr_union exposed = {};
3540 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3541 bool interactive, veth_created = false, remove_tmprootdir = false;
3542 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
3543 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
3544 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
3545 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
3546
3547 log_parse_environment();
3548 log_open();
3549
3550 /* Make sure rename_process() in the stub init process can work */
3551 saved_argv = argv;
3552 saved_argc = argc;
3553
3554 r = parse_argv(argc, argv);
3555 if (r <= 0)
3556 goto finish;
3557
3558 if (geteuid() != 0) {
3559 log_error("Need to be root.");
3560 r = -EPERM;
3561 goto finish;
3562 }
3563 r = determine_names();
3564 if (r < 0)
3565 goto finish;
3566
3567 r = load_settings();
3568 if (r < 0)
3569 goto finish;
3570
3571 r = verify_arguments();
3572 if (r < 0)
3573 goto finish;
3574
3575 n_fd_passed = sd_listen_fds(false);
3576 if (n_fd_passed > 0) {
3577 r = fdset_new_listen_fds(&fds, false);
3578 if (r < 0) {
3579 log_error_errno(r, "Failed to collect file descriptors: %m");
3580 goto finish;
3581 }
3582 }
3583
3584 if (arg_directory) {
3585 assert(!arg_image);
3586
3587 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3588 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3589 r = -EINVAL;
3590 goto finish;
3591 }
3592
3593 if (arg_ephemeral) {
3594 _cleanup_free_ char *np = NULL;
3595
3596 r = chase_symlinks_and_update(&arg_directory, 0);
3597 if (r < 0)
3598 goto finish;
3599
3600 /* If the specified path is a mount point we
3601 * generate the new snapshot immediately
3602 * inside it under a random name. However if
3603 * the specified is not a mount point we
3604 * create the new snapshot in the parent
3605 * directory, just next to it. */
3606 r = path_is_mount_point(arg_directory, NULL, 0);
3607 if (r < 0) {
3608 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3609 goto finish;
3610 }
3611 if (r > 0)
3612 r = tempfn_random_child(arg_directory, "machine.", &np);
3613 else
3614 r = tempfn_random(arg_directory, "machine.", &np);
3615 if (r < 0) {
3616 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
3617 goto finish;
3618 }
3619
3620 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3621 if (r < 0) {
3622 log_error_errno(r, "Failed to lock %s: %m", np);
3623 goto finish;
3624 }
3625
3626 r = btrfs_subvol_snapshot(arg_directory, np,
3627 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3628 BTRFS_SNAPSHOT_FALLBACK_COPY |
3629 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3630 BTRFS_SNAPSHOT_RECURSIVE |
3631 BTRFS_SNAPSHOT_QUOTA);
3632 if (r < 0) {
3633 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3634 goto finish;
3635 }
3636
3637 free(arg_directory);
3638 arg_directory = np;
3639 np = NULL;
3640
3641 remove_directory = true;
3642
3643 } else {
3644 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
3645 if (r < 0)
3646 goto finish;
3647
3648 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3649 if (r == -EBUSY) {
3650 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3651 goto finish;
3652 }
3653 if (r < 0) {
3654 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3655 goto finish;
3656 }
3657
3658 if (arg_template) {
3659 r = chase_symlinks_and_update(&arg_template, 0);
3660 if (r < 0)
3661 goto finish;
3662
3663 r = btrfs_subvol_snapshot(arg_template, arg_directory,
3664 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3665 BTRFS_SNAPSHOT_FALLBACK_COPY |
3666 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3667 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
3668 BTRFS_SNAPSHOT_RECURSIVE |
3669 BTRFS_SNAPSHOT_QUOTA);
3670 if (r == -EEXIST) {
3671 if (!arg_quiet)
3672 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3673 } else if (r < 0) {
3674 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3675 goto finish;
3676 } else {
3677 if (!arg_quiet)
3678 log_info("Populated %s from template %s.", arg_directory, arg_template);
3679 }
3680 }
3681 }
3682
3683 if (arg_start_mode == START_BOOT) {
3684 if (path_is_os_tree(arg_directory) <= 0) {
3685 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3686 r = -EINVAL;
3687 goto finish;
3688 }
3689 } else {
3690 const char *p;
3691
3692 p = strjoina(arg_directory, "/usr/");
3693 if (laccess(p, F_OK) < 0) {
3694 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3695 r = -EINVAL;
3696 goto finish;
3697 }
3698 }
3699
3700 } else {
3701 assert(arg_image);
3702 assert(!arg_template);
3703
3704 r = chase_symlinks_and_update(&arg_image, 0);
3705 if (r < 0)
3706 goto finish;
3707
3708 if (arg_ephemeral) {
3709 _cleanup_free_ char *np = NULL;
3710
3711 r = tempfn_random(arg_image, "machine.", &np);
3712 if (r < 0) {
3713 log_error_errno(r, "Failed to generate name for image snapshot: %m");
3714 goto finish;
3715 }
3716
3717 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3718 if (r < 0) {
3719 r = log_error_errno(r, "Failed to create image lock: %m");
3720 goto finish;
3721 }
3722
3723 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
3724 if (r < 0) {
3725 r = log_error_errno(r, "Failed to copy image file: %m");
3726 goto finish;
3727 }
3728
3729 free(arg_image);
3730 arg_image = np;
3731 np = NULL;
3732
3733 remove_image = true;
3734 } else {
3735 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3736 if (r == -EBUSY) {
3737 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3738 goto finish;
3739 }
3740 if (r < 0) {
3741 r = log_error_errno(r, "Failed to create image lock: %m");
3742 goto finish;
3743 }
3744
3745 if (!arg_root_hash) {
3746 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
3747 if (r < 0) {
3748 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
3749 goto finish;
3750 }
3751 }
3752 }
3753
3754 if (!mkdtemp(tmprootdir)) {
3755 r = log_error_errno(errno, "Failed to create temporary directory: %m");
3756 goto finish;
3757 }
3758
3759 remove_tmprootdir = true;
3760
3761 arg_directory = strdup(tmprootdir);
3762 if (!arg_directory) {
3763 r = log_oom();
3764 goto finish;
3765 }
3766
3767 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
3768 if (r < 0) {
3769 log_error_errno(r, "Failed to set up loopback block device: %m");
3770 goto finish;
3771 }
3772
3773 r = dissect_image(
3774 loop->fd,
3775 arg_root_hash, arg_root_hash_size,
3776 DISSECT_IMAGE_REQUIRE_ROOT,
3777 &dissected_image);
3778 if (r == -ENOPKG) {
3779 log_error_errno(r, "Could not find a suitable file system or partition table in image: %s", arg_image);
3780
3781 log_notice("Note that the disk image needs to\n"
3782 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
3783 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
3784 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
3785 " d) or contain a file system without a partition table\n"
3786 "in order to be bootable with systemd-nspawn.");
3787 goto finish;
3788 }
3789 if (r == -EADDRNOTAVAIL) {
3790 log_error_errno(r, "No root partition for specified root hash found.");
3791 goto finish;
3792 }
3793 if (r == -EOPNOTSUPP) {
3794 log_error_errno(r, "--image= is not supported, compiled without blkid support.");
3795 goto finish;
3796 }
3797 if (r < 0) {
3798 log_error_errno(r, "Failed to dissect image: %m");
3799 goto finish;
3800 }
3801
3802 if (!arg_root_hash && dissected_image->can_verity)
3803 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
3804
3805 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
3806 if (r < 0)
3807 goto finish;
3808
3809 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
3810 if (remove_image && unlink(arg_image) >= 0)
3811 remove_image = false;
3812 }
3813
3814 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
3815 if (r < 0)
3816 goto finish;
3817
3818 r = detect_unified_cgroup_hierarchy(arg_directory);
3819 if (r < 0)
3820 goto finish;
3821
3822 interactive =
3823 isatty(STDIN_FILENO) > 0 &&
3824 isatty(STDOUT_FILENO) > 0;
3825
3826 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3827 if (master < 0) {
3828 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3829 goto finish;
3830 }
3831
3832 r = ptsname_malloc(master, &console);
3833 if (r < 0) {
3834 r = log_error_errno(r, "Failed to determine tty name: %m");
3835 goto finish;
3836 }
3837
3838 if (arg_selinux_apifs_context) {
3839 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3840 if (r < 0)
3841 goto finish;
3842 }
3843
3844 if (unlockpt(master) < 0) {
3845 r = log_error_errno(errno, "Failed to unlock tty: %m");
3846 goto finish;
3847 }
3848
3849 if (!arg_quiet)
3850 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3851 arg_machine, arg_image ?: arg_directory);
3852
3853 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3854
3855 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3856 r = log_error_errno(errno, "Failed to become subreaper: %m");
3857 goto finish;
3858 }
3859
3860 for (;;) {
3861 r = run(master,
3862 console,
3863 dissected_image,
3864 interactive, secondary,
3865 fds,
3866 veth_name, &veth_created,
3867 &exposed,
3868 &pid, &ret);
3869 if (r <= 0)
3870 break;
3871 }
3872
3873 finish:
3874 sd_notify(false,
3875 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
3876 "STOPPING=1\nSTATUS=Terminating...");
3877
3878 if (pid > 0)
3879 (void) kill(pid, SIGKILL);
3880
3881 /* Try to flush whatever is still queued in the pty */
3882 if (master >= 0) {
3883 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
3884 master = safe_close(master);
3885 }
3886
3887 if (pid > 0)
3888 (void) wait_for_terminate(pid, NULL);
3889
3890 if (remove_directory && arg_directory) {
3891 int k;
3892
3893 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
3894 if (k < 0)
3895 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
3896 }
3897
3898 if (remove_image && arg_image) {
3899 if (unlink(arg_image) < 0)
3900 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
3901 }
3902
3903 if (remove_tmprootdir) {
3904 if (rmdir(tmprootdir) < 0)
3905 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
3906 }
3907
3908 if (arg_machine) {
3909 const char *p;
3910
3911 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3912 (void) rm_rf(p, REMOVE_ROOT);
3913 }
3914
3915 expose_port_flush(arg_expose_ports, &exposed);
3916
3917 if (veth_created)
3918 (void) remove_veth_links(veth_name, arg_network_veth_extra);
3919 (void) remove_bridge(arg_network_zone);
3920
3921 free(arg_directory);
3922 free(arg_template);
3923 free(arg_image);
3924 free(arg_machine);
3925 free(arg_user);
3926 free(arg_pivot_root_new);
3927 free(arg_pivot_root_old);
3928 free(arg_chdir);
3929 strv_free(arg_setenv);
3930 free(arg_network_bridge);
3931 strv_free(arg_network_interfaces);
3932 strv_free(arg_network_macvlan);
3933 strv_free(arg_network_ipvlan);
3934 strv_free(arg_network_veth_extra);
3935 strv_free(arg_parameters);
3936 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3937 expose_port_free_all(arg_expose_ports);
3938 free(arg_root_hash);
3939
3940 return r < 0 ? EXIT_FAILURE : ret;
3941 }