]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Add SPDX license identifiers to source files under the LGPL
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #if HAVE_BLKID
22 #include <blkid.h>
23 #endif
24 #include <errno.h>
25 #include <getopt.h>
26 #include <grp.h>
27 #include <linux/loop.h>
28 #include <pwd.h>
29 #include <sched.h>
30 #if HAVE_SELINUX
31 #include <selinux/selinux.h>
32 #endif
33 #include <signal.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <sys/file.h>
38 #include <sys/mount.h>
39 #include <sys/personality.h>
40 #include <sys/prctl.h>
41 #include <sys/types.h>
42 #include <sys/wait.h>
43 #include <unistd.h>
44
45 #include "sd-bus.h"
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "alloc-util.h"
50 #include "barrier.h"
51 #include "base-filesystem.h"
52 #include "blkid-util.h"
53 #include "btrfs-util.h"
54 #include "bus-util.h"
55 #include "cap-list.h"
56 #include "capability-util.h"
57 #include "cgroup-util.h"
58 #include "copy.h"
59 #include "dev-setup.h"
60 #include "dissect-image.h"
61 #include "env-util.h"
62 #include "fd-util.h"
63 #include "fdset.h"
64 #include "fileio.h"
65 #include "format-util.h"
66 #include "fs-util.h"
67 #include "gpt.h"
68 #include "hexdecoct.h"
69 #include "hostname-util.h"
70 #include "id128-util.h"
71 #include "log.h"
72 #include "loop-util.h"
73 #include "loopback-setup.h"
74 #include "machine-image.h"
75 #include "macro.h"
76 #include "missing.h"
77 #include "mkdir.h"
78 #include "mount-util.h"
79 #include "netlink-util.h"
80 #include "nspawn-cgroup.h"
81 #include "nspawn-def.h"
82 #include "nspawn-expose-ports.h"
83 #include "nspawn-mount.h"
84 #include "nspawn-network.h"
85 #include "nspawn-patch-uid.h"
86 #include "nspawn-register.h"
87 #include "nspawn-seccomp.h"
88 #include "nspawn-settings.h"
89 #include "nspawn-setuid.h"
90 #include "nspawn-stub-pid1.h"
91 #include "parse-util.h"
92 #include "path-util.h"
93 #include "process-util.h"
94 #include "ptyfwd.h"
95 #include "random-util.h"
96 #include "raw-clone.h"
97 #include "rm-rf.h"
98 #include "selinux-util.h"
99 #include "signal-util.h"
100 #include "socket-util.h"
101 #include "stat-util.h"
102 #include "stdio-util.h"
103 #include "string-util.h"
104 #include "strv.h"
105 #include "terminal-util.h"
106 #include "udev-util.h"
107 #include "umask-util.h"
108 #include "user-util.h"
109 #include "util.h"
110
/* nspawn listens on a socket at the path given by NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to
 * the container. The init process inside the container can send messages to nspawn over this socket,
 * following the sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"

/* NOTE(review): not referenced in the part of the file visible here; presumably a dedicated exit status
 * used to request a container restart -- confirm against the users of this constant. */
#define EXIT_FORCE_RESTART 133
117
/* The two ways a container run can be reported as having ended. The numeric values are the same ones
 * the compiler would assign implicitly; they are merely spelled out. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
122
/* Journal linking mode as selected with --link-journal= (or -j). The "try-guest"/"try-host" spellings
 * map onto LINK_GUEST/LINK_HOST with the separate arg_link_journal_try flag set. The numeric values
 * match the implicit numbering. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
129
/* Process-wide configuration. Each variable is initialized to its built-in default here; parse_argv()
 * and the environment helpers below overwrite them. Where an option is named in a comment, that is the
 * command line switch whose handler in parse_argv() assigns the variable. */

/* Container root / image selection */
static char *arg_directory = NULL;              /* -D/--directory= */
static char *arg_template = NULL;               /* --template= */
static char *arg_chdir = NULL;                  /* --chdir=, must be an absolute path */
static char *arg_pivot_root_new = NULL;         /* --pivot-root=, filled in by pivot_root_parse() */
static char *arg_pivot_root_old = NULL;         /* --pivot-root=, filled in by pivot_root_parse() */
static char *arg_user = NULL;                   /* -u/--user= */
static sd_id128_t arg_uuid = {};                /* --uuid=, an all-zero value is rejected */
static char *arg_machine = NULL;                /* -M/--machine= */
/* The following three are assigned straight from optarg (not copied), hence the const pointers */
static const char *arg_selinux_context = NULL;       /* -Z/--selinux-context= */
static const char *arg_selinux_apifs_context = NULL; /* -L/--selinux-apifs-context= */
static const char *arg_slice = NULL;                 /* -S/--slice= */
static bool arg_private_network = false;        /* --private-network, also implied by the --network-* options */
static bool arg_read_only = false;              /* --read-only */
static StartMode arg_start_mode = START_PID1;   /* -b/--boot and -a/--as-pid2 (mutually exclusive) */
static bool arg_ephemeral = false;              /* -x/--ephemeral */
static LinkJournal arg_link_journal = LINK_AUTO; /* --link-journal= / -j */
static bool arg_link_journal_try = false;       /* set for the "try-" variants of --link-journal= and for -j */
/* Default capability set, one bit per CAP_* number. The help text describes --capability= as adding to
 * "the default" and --drop-capability= as removing from it. */
static uint64_t arg_caps_retain =
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_MKNOD) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
/* Array (and its length) collected from --bind=, --bind-ro=, --tmpfs=, --overlay= and --overlay-ro= */
static CustomMount *arg_custom_mounts = NULL;
static unsigned arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;                /* -E/--setenv= assignments */
static bool arg_quiet = false;                  /* -q/--quiet */
static bool arg_register = true;                /* --register= */
static bool arg_keep_unit = false;              /* --keep-unit */
/* Networking */
static char **arg_network_interfaces = NULL;    /* --network-interface= */
static char **arg_network_macvlan = NULL;       /* --network-macvlan= */
static char **arg_network_ipvlan = NULL;        /* --network-ipvlan= */
static bool arg_network_veth = false;           /* -n/--network-veth, also set by --network-bridge=/--network-zone= */
static char **arg_network_veth_extra = NULL;    /* --network-veth-extra= */
static char *arg_network_bridge = NULL;         /* --network-bridge= */
static char *arg_network_zone = NULL;           /* --network-zone=, stored with a "vz-" prefix */
static unsigned long arg_personality = PERSONALITY_INVALID; /* --personality= */
static char *arg_image = NULL;                  /* -i/--image= */
static VolatileMode arg_volatile_mode = VOLATILE_NO; /* --volatile[=MODE] */
static ExposePort *arg_expose_ports = NULL;     /* -p/--port= */
static char **arg_property = NULL;              /* --property= */
/* User namespacing: --private-users[=...], -U and --private-users-chown */
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns_chown = false;
static int arg_kill_signal = 0;                 /* --kill-signal= */
/* Filled in by detect_unified_cgroup_hierarchy() */
static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* One SETTING_* bit is set for each setting that was given explicitly on the command line */
static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;           /* --settings=: true for "trusted", false for "no", -1 otherwise */
static char **arg_parameters = NULL;            /* NOTE(review): assigned outside the visible part of the file */
static const char *arg_container_service_name = "systemd-nspawn";
static bool arg_notify_ready = false;           /* --notify-ready= */
static bool arg_use_cgns = true;                /* NOTE(review): assigned outside the visible part of the file */
/* Namespaces to unshare; adjusted by parse_share_ns_env() and cleared by --share-system */
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
/* Adjusted by parse_mount_settings_env() and at the end of option parsing */
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
/* --root-hash=: decoded hash bytes and their length */
static void *arg_root_hash = NULL;
static size_t arg_root_hash_size = 0;
/* --system-call-filter=: entries prefixed with '~' land in the blacklist, the rest in the whitelist */
static char **arg_syscall_whitelist = NULL;
static char **arg_syscall_blacklist = NULL;
209
/* Prints the command line usage summary to standard output. The program name shown on the first line
 * is taken from program_invocation_short_name.
 *
 * Compared to the previous text, three typos in the emitted help were corrected:
 *   - "--bind-ro=PATH[:PATH[:OPTIONS]" was missing its closing bracket,
 *   - the --network-zone= description repeated the word "an" across the line break,
 *   - "a ipvlan" now reads "an ipvlan".
 *
 * NOTE(review): the column-alignment whitespace inside these literals appears collapsed in this copy
 * of the file; it is reproduced here as found and should be confirmed against the canonical source. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " --version Print version string\n"
               " -q --quiet Do not show status information\n"
               " -D --directory=PATH Root directory for the container\n"
               " --template=PATH Initialize root directory from template directory,\n"
               " if missing\n"
               " -x --ephemeral Run container with snapshot of root directory, and\n"
               " remove it after exit\n"
               " -i --image=PATH File system device or disk image for the container\n"
               " --root-hash=HASH Specify verity root hash\n"
               " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " --chdir=PATH Set working directory in the container\n"
               " --pivot-root=PATH[:PATH]\n"
               " Pivot root to given directory in the container\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -M --machine=NAME Set the machine name for the container\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " -S --slice=SLICE Place the container in the specified slice\n"
               " --property=NAME=VALUE Set scope unit property\n"
               " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
               " --private-users[=UIDBASE[:NUIDS]]\n"
               " Similar, but with user configured UID/GID range\n"
               " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
               " --private-network Disable network in container\n"
               " --network-interface=INTERFACE\n"
               " Assign an existing network interface to the\n"
               " container\n"
               " --network-macvlan=INTERFACE\n"
               " Create a macvlan network interface based on an\n"
               " existing network interface to the container\n"
               " --network-ipvlan=INTERFACE\n"
               " Create an ipvlan network interface based on an\n"
               " existing network interface to the container\n"
               " -n --network-veth Add a virtual Ethernet connection between host\n"
               " and container\n"
               " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
               " Add an additional virtual Ethernet link between\n"
               " host and container\n"
               " --network-bridge=INTERFACE\n"
               " Add a virtual Ethernet connection to the container\n"
               " and attach it to an existing bridge on the host\n"
               " --network-zone=NAME Similar, but attach the new interface to an\n"
               " automatically managed bridge interface\n"
               " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               " Expose a container IP port on the host\n"
               " -Z --selinux-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " processes in the container\n"
               " -L --selinux-apifs-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " API/tmpfs file systems in the container\n"
               " --capability=CAP In addition to the default, retain specified\n"
               " capability\n"
               " --drop-capability=CAP Drop the specified capability from the default set\n"
               " --system-call-filter=LIST|~LIST\n"
               " Permit/prohibit specific system calls\n"
               " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
               " host, try-guest, try-host\n"
               " -j Equivalent to --link-journal=try-guest\n"
               " --read-only Mount the root directory read-only\n"
               " --bind=PATH[:PATH[:OPTIONS]]\n"
               " Bind mount a file or directory from the host into\n"
               " the container\n"
               " --bind-ro=PATH[:PATH[:OPTIONS]]\n"
               " Similar, but creates a read-only bind mount\n"
               " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               " --overlay=PATH[:PATH...]:PATH\n"
               " Create an overlay mount from the host to \n"
               " the container\n"
               " --overlay-ro=PATH[:PATH...]:PATH\n"
               " Similar, but creates a read-only overlay mount\n"
               " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
               " --register=BOOLEAN Register container as machine\n"
               " --keep-unit Do not register a scope for the machine, reuse\n"
               " the service unit nspawn is running in\n"
               " --volatile[=MODE] Run the system in volatile mode\n"
               " --settings=BOOLEAN Load additional settings from .nspawn file\n"
               " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
               , program_invocation_short_name);
}
295
296 static int custom_mount_check_all(void) {
297 unsigned i;
298
299 for (i = 0; i < arg_n_custom_mounts; i++) {
300 CustomMount *m = &arg_custom_mounts[i];
301
302 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
303
304 if (arg_userns_chown) {
305 log_error("--private-users-chown may not be combined with custom root mounts.");
306 return -EINVAL;
307 } else if (arg_uid_shift == UID_INVALID) {
308 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
309 return -EINVAL;
310 }
311 }
312 }
313
314 return 0;
315 }
316
317 static int detect_unified_cgroup_hierarchy(const char *directory) {
318 const char *e;
319 int r;
320
321 /* Allow the user to control whether the unified hierarchy is used */
322 e = getenv("UNIFIED_CGROUP_HIERARCHY");
323 if (e) {
324 r = parse_boolean(e);
325 if (r < 0)
326 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
327 if (r > 0)
328 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
329 else
330 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
331
332 return 0;
333 }
334
335 /* Otherwise inherit the default from the host system */
336 r = cg_all_unified();
337 if (r < 0)
338 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
339 if (r > 0) {
340 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
341 * routine only detects 231, so we'll have a false negative here for 230. */
342 r = systemd_installation_has_version(directory, 230);
343 if (r < 0)
344 return log_error_errno(r, "Failed to determine systemd version in container: %m");
345 if (r > 0)
346 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
347 else
348 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
349 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
350 /* Mixed cgroup hierarchy support was added in 233 */
351 r = systemd_installation_has_version(directory, 233);
352 if (r < 0)
353 return log_error_errno(r, "Failed to determine systemd version in container: %m");
354 if (r > 0)
355 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
356 else
357 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
358 } else
359 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
360
361 return 0;
362 }
363
364 static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
365 int r;
366
367 r = getenv_bool(name);
368 if (r == -ENXIO)
369 return;
370 if (r < 0)
371 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
372 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
373 }
374
375 static void parse_mount_settings_env(void) {
376 int r;
377 const char *e;
378
379 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
380 if (!e)
381 return;
382
383 if (streq(e, "network")) {
384 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
385 return;
386 }
387
388 r = parse_boolean(e);
389 if (r < 0) {
390 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
391 return;
392 }
393
394 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
395 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
396 }
397
398 static int parse_argv(int argc, char *argv[]) {
399
400 enum {
401 ARG_VERSION = 0x100,
402 ARG_PRIVATE_NETWORK,
403 ARG_UUID,
404 ARG_READ_ONLY,
405 ARG_CAPABILITY,
406 ARG_DROP_CAPABILITY,
407 ARG_LINK_JOURNAL,
408 ARG_BIND,
409 ARG_BIND_RO,
410 ARG_TMPFS,
411 ARG_OVERLAY,
412 ARG_OVERLAY_RO,
413 ARG_SHARE_SYSTEM,
414 ARG_REGISTER,
415 ARG_KEEP_UNIT,
416 ARG_NETWORK_INTERFACE,
417 ARG_NETWORK_MACVLAN,
418 ARG_NETWORK_IPVLAN,
419 ARG_NETWORK_BRIDGE,
420 ARG_NETWORK_ZONE,
421 ARG_NETWORK_VETH_EXTRA,
422 ARG_PERSONALITY,
423 ARG_VOLATILE,
424 ARG_TEMPLATE,
425 ARG_PROPERTY,
426 ARG_PRIVATE_USERS,
427 ARG_KILL_SIGNAL,
428 ARG_SETTINGS,
429 ARG_CHDIR,
430 ARG_PIVOT_ROOT,
431 ARG_PRIVATE_USERS_CHOWN,
432 ARG_NOTIFY_READY,
433 ARG_ROOT_HASH,
434 ARG_SYSTEM_CALL_FILTER,
435 };
436
437 static const struct option options[] = {
438 { "help", no_argument, NULL, 'h' },
439 { "version", no_argument, NULL, ARG_VERSION },
440 { "directory", required_argument, NULL, 'D' },
441 { "template", required_argument, NULL, ARG_TEMPLATE },
442 { "ephemeral", no_argument, NULL, 'x' },
443 { "user", required_argument, NULL, 'u' },
444 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
445 { "as-pid2", no_argument, NULL, 'a' },
446 { "boot", no_argument, NULL, 'b' },
447 { "uuid", required_argument, NULL, ARG_UUID },
448 { "read-only", no_argument, NULL, ARG_READ_ONLY },
449 { "capability", required_argument, NULL, ARG_CAPABILITY },
450 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
451 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
452 { "bind", required_argument, NULL, ARG_BIND },
453 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
454 { "tmpfs", required_argument, NULL, ARG_TMPFS },
455 { "overlay", required_argument, NULL, ARG_OVERLAY },
456 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
457 { "machine", required_argument, NULL, 'M' },
458 { "slice", required_argument, NULL, 'S' },
459 { "setenv", required_argument, NULL, 'E' },
460 { "selinux-context", required_argument, NULL, 'Z' },
461 { "selinux-apifs-context", required_argument, NULL, 'L' },
462 { "quiet", no_argument, NULL, 'q' },
463 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
464 { "register", required_argument, NULL, ARG_REGISTER },
465 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
466 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
467 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
468 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
469 { "network-veth", no_argument, NULL, 'n' },
470 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
471 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
472 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
473 { "personality", required_argument, NULL, ARG_PERSONALITY },
474 { "image", required_argument, NULL, 'i' },
475 { "volatile", optional_argument, NULL, ARG_VOLATILE },
476 { "port", required_argument, NULL, 'p' },
477 { "property", required_argument, NULL, ARG_PROPERTY },
478 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
479 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
480 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
481 { "settings", required_argument, NULL, ARG_SETTINGS },
482 { "chdir", required_argument, NULL, ARG_CHDIR },
483 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
484 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
485 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
486 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
487 {}
488 };
489
490 int c, r;
491 const char *p, *e;
492 uint64_t plus = 0, minus = 0;
493 bool mask_all_settings = false, mask_no_settings = false;
494
495 assert(argc >= 0);
496 assert(argv);
497
498 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
499
500 switch (c) {
501
502 case 'h':
503 help();
504 return 0;
505
506 case ARG_VERSION:
507 return version();
508
509 case 'D':
510 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
511 if (r < 0)
512 return r;
513 break;
514
515 case ARG_TEMPLATE:
516 r = parse_path_argument_and_warn(optarg, false, &arg_template);
517 if (r < 0)
518 return r;
519 break;
520
521 case 'i':
522 r = parse_path_argument_and_warn(optarg, false, &arg_image);
523 if (r < 0)
524 return r;
525 break;
526
527 case 'x':
528 arg_ephemeral = true;
529 break;
530
531 case 'u':
532 r = free_and_strdup(&arg_user, optarg);
533 if (r < 0)
534 return log_oom();
535
536 arg_settings_mask |= SETTING_USER;
537 break;
538
539 case ARG_NETWORK_ZONE: {
540 char *j;
541
542 j = strappend("vz-", optarg);
543 if (!j)
544 return log_oom();
545
546 if (!ifname_valid(j)) {
547 log_error("Network zone name not valid: %s", j);
548 free(j);
549 return -EINVAL;
550 }
551
552 free(arg_network_zone);
553 arg_network_zone = j;
554
555 arg_network_veth = true;
556 arg_private_network = true;
557 arg_settings_mask |= SETTING_NETWORK;
558 break;
559 }
560
561 case ARG_NETWORK_BRIDGE:
562
563 if (!ifname_valid(optarg)) {
564 log_error("Bridge interface name not valid: %s", optarg);
565 return -EINVAL;
566 }
567
568 r = free_and_strdup(&arg_network_bridge, optarg);
569 if (r < 0)
570 return log_oom();
571
572 /* fall through */
573
574 case 'n':
575 arg_network_veth = true;
576 arg_private_network = true;
577 arg_settings_mask |= SETTING_NETWORK;
578 break;
579
580 case ARG_NETWORK_VETH_EXTRA:
581 r = veth_extra_parse(&arg_network_veth_extra, optarg);
582 if (r < 0)
583 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
584
585 arg_private_network = true;
586 arg_settings_mask |= SETTING_NETWORK;
587 break;
588
589 case ARG_NETWORK_INTERFACE:
590
591 if (!ifname_valid(optarg)) {
592 log_error("Network interface name not valid: %s", optarg);
593 return -EINVAL;
594 }
595
596 if (strv_extend(&arg_network_interfaces, optarg) < 0)
597 return log_oom();
598
599 arg_private_network = true;
600 arg_settings_mask |= SETTING_NETWORK;
601 break;
602
603 case ARG_NETWORK_MACVLAN:
604
605 if (!ifname_valid(optarg)) {
606 log_error("MACVLAN network interface name not valid: %s", optarg);
607 return -EINVAL;
608 }
609
610 if (strv_extend(&arg_network_macvlan, optarg) < 0)
611 return log_oom();
612
613 arg_private_network = true;
614 arg_settings_mask |= SETTING_NETWORK;
615 break;
616
617 case ARG_NETWORK_IPVLAN:
618
619 if (!ifname_valid(optarg)) {
620 log_error("IPVLAN network interface name not valid: %s", optarg);
621 return -EINVAL;
622 }
623
624 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
625 return log_oom();
626
627 /* fall through */
628
629 case ARG_PRIVATE_NETWORK:
630 arg_private_network = true;
631 arg_settings_mask |= SETTING_NETWORK;
632 break;
633
634 case 'b':
635 if (arg_start_mode == START_PID2) {
636 log_error("--boot and --as-pid2 may not be combined.");
637 return -EINVAL;
638 }
639
640 arg_start_mode = START_BOOT;
641 arg_settings_mask |= SETTING_START_MODE;
642 break;
643
644 case 'a':
645 if (arg_start_mode == START_BOOT) {
646 log_error("--boot and --as-pid2 may not be combined.");
647 return -EINVAL;
648 }
649
650 arg_start_mode = START_PID2;
651 arg_settings_mask |= SETTING_START_MODE;
652 break;
653
654 case ARG_UUID:
655 r = sd_id128_from_string(optarg, &arg_uuid);
656 if (r < 0)
657 return log_error_errno(r, "Invalid UUID: %s", optarg);
658
659 if (sd_id128_is_null(arg_uuid)) {
660 log_error("Machine UUID may not be all zeroes.");
661 return -EINVAL;
662 }
663
664 arg_settings_mask |= SETTING_MACHINE_ID;
665 break;
666
667 case 'S':
668 arg_slice = optarg;
669 break;
670
671 case 'M':
672 if (isempty(optarg))
673 arg_machine = mfree(arg_machine);
674 else {
675 if (!machine_name_is_valid(optarg)) {
676 log_error("Invalid machine name: %s", optarg);
677 return -EINVAL;
678 }
679
680 r = free_and_strdup(&arg_machine, optarg);
681 if (r < 0)
682 return log_oom();
683 }
684 break;
685
686 case 'Z':
687 arg_selinux_context = optarg;
688 break;
689
690 case 'L':
691 arg_selinux_apifs_context = optarg;
692 break;
693
694 case ARG_READ_ONLY:
695 arg_read_only = true;
696 arg_settings_mask |= SETTING_READ_ONLY;
697 break;
698
699 case ARG_CAPABILITY:
700 case ARG_DROP_CAPABILITY: {
701 p = optarg;
702 for (;;) {
703 _cleanup_free_ char *t = NULL;
704
705 r = extract_first_word(&p, &t, ",", 0);
706 if (r < 0)
707 return log_error_errno(r, "Failed to parse capability %s.", t);
708
709 if (r == 0)
710 break;
711
712 if (streq(t, "all")) {
713 if (c == ARG_CAPABILITY)
714 plus = (uint64_t) -1;
715 else
716 minus = (uint64_t) -1;
717 } else {
718 int cap;
719
720 cap = capability_from_name(t);
721 if (cap < 0) {
722 log_error("Failed to parse capability %s.", t);
723 return -EINVAL;
724 }
725
726 if (c == ARG_CAPABILITY)
727 plus |= 1ULL << (uint64_t) cap;
728 else
729 minus |= 1ULL << (uint64_t) cap;
730 }
731 }
732
733 arg_settings_mask |= SETTING_CAPABILITY;
734 break;
735 }
736
737 case 'j':
738 arg_link_journal = LINK_GUEST;
739 arg_link_journal_try = true;
740 break;
741
742 case ARG_LINK_JOURNAL:
743 if (streq(optarg, "auto")) {
744 arg_link_journal = LINK_AUTO;
745 arg_link_journal_try = false;
746 } else if (streq(optarg, "no")) {
747 arg_link_journal = LINK_NO;
748 arg_link_journal_try = false;
749 } else if (streq(optarg, "guest")) {
750 arg_link_journal = LINK_GUEST;
751 arg_link_journal_try = false;
752 } else if (streq(optarg, "host")) {
753 arg_link_journal = LINK_HOST;
754 arg_link_journal_try = false;
755 } else if (streq(optarg, "try-guest")) {
756 arg_link_journal = LINK_GUEST;
757 arg_link_journal_try = true;
758 } else if (streq(optarg, "try-host")) {
759 arg_link_journal = LINK_HOST;
760 arg_link_journal_try = true;
761 } else {
762 log_error("Failed to parse link journal mode %s", optarg);
763 return -EINVAL;
764 }
765
766 break;
767
768 case ARG_BIND:
769 case ARG_BIND_RO:
770 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
771 if (r < 0)
772 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
773
774 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
775 break;
776
777 case ARG_TMPFS:
778 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
779 if (r < 0)
780 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
781
782 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
783 break;
784
785 case ARG_OVERLAY:
786 case ARG_OVERLAY_RO:
787 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
788 if (r == -EADDRNOTAVAIL)
789 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
790 if (r < 0)
791 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
792
793 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
794 break;
795
796 case 'E': {
797 char **n;
798
799 if (!env_assignment_is_valid(optarg)) {
800 log_error("Environment variable assignment '%s' is not valid.", optarg);
801 return -EINVAL;
802 }
803
804 n = strv_env_set(arg_setenv, optarg);
805 if (!n)
806 return log_oom();
807
808 strv_free(arg_setenv);
809 arg_setenv = n;
810
811 arg_settings_mask |= SETTING_ENVIRONMENT;
812 break;
813 }
814
815 case 'q':
816 arg_quiet = true;
817 break;
818
819 case ARG_SHARE_SYSTEM:
820 /* We don't officially support this anymore, except for compat reasons. People should use the
821 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
822 arg_clone_ns_flags = 0;
823 break;
824
825 case ARG_REGISTER:
826 r = parse_boolean(optarg);
827 if (r < 0) {
828 log_error("Failed to parse --register= argument: %s", optarg);
829 return r;
830 }
831
832 arg_register = r;
833 break;
834
835 case ARG_KEEP_UNIT:
836 arg_keep_unit = true;
837 break;
838
839 case ARG_PERSONALITY:
840
841 arg_personality = personality_from_string(optarg);
842 if (arg_personality == PERSONALITY_INVALID) {
843 log_error("Unknown or unsupported personality '%s'.", optarg);
844 return -EINVAL;
845 }
846
847 arg_settings_mask |= SETTING_PERSONALITY;
848 break;
849
850 case ARG_VOLATILE:
851
852 if (!optarg)
853 arg_volatile_mode = VOLATILE_YES;
854 else {
855 VolatileMode m;
856
857 m = volatile_mode_from_string(optarg);
858 if (m < 0) {
859 log_error("Failed to parse --volatile= argument: %s", optarg);
860 return -EINVAL;
861 } else
862 arg_volatile_mode = m;
863 }
864
865 arg_settings_mask |= SETTING_VOLATILE_MODE;
866 break;
867
868 case 'p':
869 r = expose_port_parse(&arg_expose_ports, optarg);
870 if (r == -EEXIST)
871 return log_error_errno(r, "Duplicate port specification: %s", optarg);
872 if (r < 0)
873 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
874
875 arg_settings_mask |= SETTING_EXPOSE_PORTS;
876 break;
877
878 case ARG_PROPERTY:
879 if (strv_extend(&arg_property, optarg) < 0)
880 return log_oom();
881
882 break;
883
884 case ARG_PRIVATE_USERS: {
885 int boolean = -1;
886
887 if (!optarg)
888 boolean = true;
889 else if (!in_charset(optarg, DIGITS))
890 /* do *not* parse numbers as booleans */
891 boolean = parse_boolean(optarg);
892
893 if (boolean == false) {
894 /* no: User namespacing off */
895 arg_userns_mode = USER_NAMESPACE_NO;
896 arg_uid_shift = UID_INVALID;
897 arg_uid_range = UINT32_C(0x10000);
898 } else if (boolean == true) {
899 /* yes: User namespacing on, UID range is read from root dir */
900 arg_userns_mode = USER_NAMESPACE_FIXED;
901 arg_uid_shift = UID_INVALID;
902 arg_uid_range = UINT32_C(0x10000);
903 } else if (streq(optarg, "pick")) {
904 /* pick: User namespacing on, UID range is picked randomly */
905 arg_userns_mode = USER_NAMESPACE_PICK;
906 arg_uid_shift = UID_INVALID;
907 arg_uid_range = UINT32_C(0x10000);
908 } else {
909 _cleanup_free_ char *buffer = NULL;
910 const char *range, *shift;
911
912 /* anything else: User namespacing on, UID range is explicitly configured */
913
914 range = strchr(optarg, ':');
915 if (range) {
916 buffer = strndup(optarg, range - optarg);
917 if (!buffer)
918 return log_oom();
919 shift = buffer;
920
921 range++;
922 r = safe_atou32(range, &arg_uid_range);
923 if (r < 0)
924 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
925 } else
926 shift = optarg;
927
928 r = parse_uid(shift, &arg_uid_shift);
929 if (r < 0)
930 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
931
932 arg_userns_mode = USER_NAMESPACE_FIXED;
933 }
934
935 if (arg_uid_range <= 0) {
936 log_error("UID range cannot be 0.");
937 return -EINVAL;
938 }
939
940 arg_settings_mask |= SETTING_USERNS;
941 break;
942 }
943
944 case 'U':
945 if (userns_supported()) {
946 arg_userns_mode = USER_NAMESPACE_PICK;
947 arg_uid_shift = UID_INVALID;
948 arg_uid_range = UINT32_C(0x10000);
949
950 arg_settings_mask |= SETTING_USERNS;
951 }
952
953 break;
954
955 case ARG_PRIVATE_USERS_CHOWN:
956 arg_userns_chown = true;
957
958 arg_settings_mask |= SETTING_USERNS;
959 break;
960
961 case ARG_KILL_SIGNAL:
962 arg_kill_signal = signal_from_string_try_harder(optarg);
963 if (arg_kill_signal < 0) {
964 log_error("Cannot parse signal: %s", optarg);
965 return -EINVAL;
966 }
967
968 arg_settings_mask |= SETTING_KILL_SIGNAL;
969 break;
970
971 case ARG_SETTINGS:
972
973 /* no → do not read files
974 * yes → read files, do not override cmdline, trust only subset
975 * override → read files, override cmdline, trust only subset
976 * trusted → read files, do not override cmdline, trust all
977 */
978
979 r = parse_boolean(optarg);
980 if (r < 0) {
981 if (streq(optarg, "trusted")) {
982 mask_all_settings = false;
983 mask_no_settings = false;
984 arg_settings_trusted = true;
985
986 } else if (streq(optarg, "override")) {
987 mask_all_settings = false;
988 mask_no_settings = true;
989 arg_settings_trusted = -1;
990 } else
991 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
992 } else if (r > 0) {
993 /* yes */
994 mask_all_settings = false;
995 mask_no_settings = false;
996 arg_settings_trusted = -1;
997 } else {
998 /* no */
999 mask_all_settings = true;
1000 mask_no_settings = false;
1001 arg_settings_trusted = false;
1002 }
1003
1004 break;
1005
1006 case ARG_CHDIR:
1007 if (!path_is_absolute(optarg)) {
1008 log_error("Working directory %s is not an absolute path.", optarg);
1009 return -EINVAL;
1010 }
1011
1012 r = free_and_strdup(&arg_chdir, optarg);
1013 if (r < 0)
1014 return log_oom();
1015
1016 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1017 break;
1018
1019 case ARG_PIVOT_ROOT:
1020 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1021 if (r < 0)
1022 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1023
1024 arg_settings_mask |= SETTING_PIVOT_ROOT;
1025 break;
1026
1027 case ARG_NOTIFY_READY:
1028 r = parse_boolean(optarg);
1029 if (r < 0) {
1030 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1031 return -EINVAL;
1032 }
1033 arg_notify_ready = r;
1034 arg_settings_mask |= SETTING_NOTIFY_READY;
1035 break;
1036
1037 case ARG_ROOT_HASH: {
1038 void *k;
1039 size_t l;
1040
1041 r = unhexmem(optarg, strlen(optarg), &k, &l);
1042 if (r < 0)
1043 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1044 if (l < sizeof(sd_id128_t)) {
1045 log_error("Root hash must be at least 128bit long: %s", optarg);
1046 free(k);
1047 return -EINVAL;
1048 }
1049
1050 free(arg_root_hash);
1051 arg_root_hash = k;
1052 arg_root_hash_size = l;
1053 break;
1054 }
1055
1056 case ARG_SYSTEM_CALL_FILTER: {
1057 bool negative;
1058 const char *items;
1059
1060 negative = optarg[0] == '~';
1061 items = negative ? optarg + 1 : optarg;
1062
1063 for (;;) {
1064 _cleanup_free_ char *word = NULL;
1065
1066 r = extract_first_word(&items, &word, NULL, 0);
1067 if (r == 0)
1068 break;
1069 if (r == -ENOMEM)
1070 return log_oom();
1071 if (r < 0)
1072 return log_error_errno(r, "Failed to parse system call filter: %m");
1073
1074 if (negative)
1075 r = strv_extend(&arg_syscall_blacklist, word);
1076 else
1077 r = strv_extend(&arg_syscall_whitelist, word);
1078 if (r < 0)
1079 return log_oom();
1080 }
1081
1082 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1083 break;
1084 }
1085
1086 case '?':
1087 return -EINVAL;
1088
1089 default:
1090 assert_not_reached("Unhandled option");
1091 }
1092
1093 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1094 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1095 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1096 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
1097
1098 if (arg_userns_mode != USER_NAMESPACE_NO)
1099 arg_mount_settings |= MOUNT_USE_USERNS;
1100
1101 if (arg_private_network)
1102 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1103
1104 parse_mount_settings_env();
1105
1106 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1107 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1108 arg_register = false;
1109 if (arg_start_mode != START_PID1) {
1110 log_error("--boot cannot be used without namespacing.");
1111 return -EINVAL;
1112 }
1113 }
1114
1115 if (arg_userns_mode == USER_NAMESPACE_PICK)
1116 arg_userns_chown = true;
1117
1118 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) {
1119 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1120 * The latter is not technically a user session, but we don't need to labour the point. */
1121 log_error("--keep-unit --register=yes may not be used when invoked from a user session.");
1122 return -EINVAL;
1123 }
1124
1125 if (arg_directory && arg_image) {
1126 log_error("--directory= and --image= may not be combined.");
1127 return -EINVAL;
1128 }
1129
1130 if (arg_template && arg_image) {
1131 log_error("--template= and --image= may not be combined.");
1132 return -EINVAL;
1133 }
1134
1135 if (arg_ephemeral && arg_template && !arg_directory) {
1136 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1137 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1138 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1139 * --directory=". */
1140
1141 arg_directory = arg_template;
1142 arg_template = NULL;
1143 }
1144
1145 if (arg_template && !(arg_directory || arg_machine)) {
1146 log_error("--template= needs --directory= or --machine=.");
1147 return -EINVAL;
1148 }
1149
1150 if (arg_ephemeral && arg_template) {
1151 log_error("--ephemeral and --template= may not be combined.");
1152 return -EINVAL;
1153 }
1154
1155 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1156 log_error("--ephemeral and --link-journal= may not be combined.");
1157 return -EINVAL;
1158 }
1159
1160 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
1161 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1162 return -EOPNOTSUPP;
1163 }
1164
1165 if (arg_userns_chown && arg_read_only) {
1166 log_error("--read-only and --private-users-chown may not be combined.");
1167 return -EINVAL;
1168 }
1169
1170 if (arg_network_bridge && arg_network_zone) {
1171 log_error("--network-bridge= and --network-zone= may not be combined.");
1172 return -EINVAL;
1173 }
1174
1175 if (argc > optind) {
1176 arg_parameters = strv_copy(argv + optind);
1177 if (!arg_parameters)
1178 return log_oom();
1179
1180 arg_settings_mask |= SETTING_START_MODE;
1181 }
1182
1183 /* Load all settings from .nspawn files */
1184 if (mask_no_settings)
1185 arg_settings_mask = 0;
1186
1187 /* Don't load any settings from .nspawn files */
1188 if (mask_all_settings)
1189 arg_settings_mask = _SETTINGS_MASK_ALL;
1190
1191 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1192
1193 r = cg_unified_flush();
1194 if (r < 0)
1195 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
1196
1197 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1198 if (e)
1199 arg_container_service_name = e;
1200
1201 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1202 if (r < 0)
1203 arg_use_cgns = cg_ns_supported();
1204 else
1205 arg_use_cgns = r;
1206
1207 r = custom_mount_check_all();
1208 if (r < 0)
1209 return r;
1210
1211 return 1;
1212 }
1213
1214 static int verify_arguments(void) {
1215 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1216 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1217 return -EINVAL;
1218 }
1219
1220 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1221 log_error("Cannot combine --private-users with read-write mounts.");
1222 return -EINVAL;
1223 }
1224
1225 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
1226 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1227 return -EINVAL;
1228 }
1229
1230 if (arg_expose_ports && !arg_private_network) {
1231 log_error("Cannot use --port= without private networking.");
1232 return -EINVAL;
1233 }
1234
1235 #if ! HAVE_LIBIPTC
1236 if (arg_expose_ports) {
1237 log_error("--port= is not supported, compiled without libiptc support.");
1238 return -EOPNOTSUPP;
1239 }
1240 #endif
1241
1242 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1243 arg_kill_signal = SIGRTMIN+3;
1244
1245 return 0;
1246 }
1247
1248 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1249 assert(p);
1250
1251 if (arg_userns_mode == USER_NAMESPACE_NO)
1252 return 0;
1253
1254 if (uid == UID_INVALID && gid == GID_INVALID)
1255 return 0;
1256
1257 if (uid != UID_INVALID) {
1258 uid += arg_uid_shift;
1259
1260 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1261 return -EOVERFLOW;
1262 }
1263
1264 if (gid != GID_INVALID) {
1265 gid += (gid_t) arg_uid_shift;
1266
1267 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1268 return -EOVERFLOW;
1269 }
1270
1271 if (lchown(p, uid, gid) < 0)
1272 return -errno;
1273
1274 return 0;
1275 }
1276
/* Creates 'path' below 'root' with the given mode and, if it was newly created, hands it to the shifted
 * UID/GID via userns_lchown(). An already existing entry is left untouched and counts as success. Returns
 * -errno if mkdir() fails for any other reason. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1289
1290 static int setup_timezone(const char *dest) {
1291 _cleanup_free_ char *p = NULL, *q = NULL;
1292 const char *where, *check, *what;
1293 char *z, *y;
1294 int r;
1295
1296 assert(dest);
1297
1298 /* Fix the timezone, if possible */
1299 r = readlink_malloc("/etc/localtime", &p);
1300 if (r < 0) {
1301 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1302 /* to handle warning, delete /etc/localtime and replace it
1303 * with a symbolic link to a time zone data file.
1304 *
1305 * Example:
1306 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1307 */
1308 return 0;
1309 }
1310
1311 z = path_startswith(p, "../usr/share/zoneinfo/");
1312 if (!z)
1313 z = path_startswith(p, "/usr/share/zoneinfo/");
1314 if (!z) {
1315 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1316 return 0;
1317 }
1318
1319 where = prefix_roota(dest, "/etc/localtime");
1320 r = readlink_malloc(where, &q);
1321 if (r >= 0) {
1322 y = path_startswith(q, "../usr/share/zoneinfo/");
1323 if (!y)
1324 y = path_startswith(q, "/usr/share/zoneinfo/");
1325
1326 /* Already pointing to the right place? Then do nothing .. */
1327 if (y && streq(y, z))
1328 return 0;
1329 }
1330
1331 check = strjoina("/usr/share/zoneinfo/", z);
1332 check = prefix_roota(dest, check);
1333 if (laccess(check, F_OK) < 0) {
1334 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1335 return 0;
1336 }
1337
1338 if (unlink(where) < 0 && errno != ENOENT) {
1339 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1340 errno,
1341 "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1342 return 0;
1343 }
1344
1345 what = strjoina("../usr/share/zoneinfo/", z);
1346 if (symlink(what, where) < 0) {
1347 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1348 errno,
1349 "Failed to correct timezone of container, ignoring: %m");
1350 return 0;
1351 }
1352
1353 r = userns_lchown(where, 0, 0);
1354 if (r < 0)
1355 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1356
1357 return 0;
1358 }
1359
1360 static int resolved_listening(void) {
1361 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1362 _cleanup_free_ char *dns_stub_listener_mode = NULL;
1363 int r;
1364
1365 /* Check if resolved is listening */
1366
1367 r = sd_bus_open_system(&bus);
1368 if (r < 0)
1369 return r;
1370
1371 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1372 if (r <= 0)
1373 return r;
1374
1375 r = sd_bus_get_property_string(bus,
1376 "org.freedesktop.resolve1",
1377 "/org/freedesktop/resolve1",
1378 "org.freedesktop.resolve1.Manager",
1379 "DNSStubListener",
1380 NULL,
1381 &dns_stub_listener_mode);
1382 if (r < 0)
1383 return r;
1384
1385 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
1386 }
1387
1388 static int setup_resolv_conf(const char *dest) {
1389 _cleanup_free_ char *resolved = NULL, *etc = NULL;
1390 const char *where;
1391 int r, found;
1392
1393 assert(dest);
1394
1395 if (arg_private_network)
1396 return 0;
1397
1398 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1399 if (r < 0) {
1400 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1401 return 0;
1402 }
1403
1404 where = strjoina(etc, "/resolv.conf");
1405 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1406 if (found < 0) {
1407 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1408 return 0;
1409 }
1410
1411 if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0 &&
1412 resolved_listening() > 0) {
1413
1414 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1415 * container, so that the container can use the host's resolver. Given that network namespacing is
1416 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1417 * advantage that the container will be able to follow the host's DNS server configuration changes
1418 * transparently. */
1419
1420 if (found == 0) /* missing? */
1421 (void) touch(resolved);
1422
1423 r = mount_verbose(LOG_DEBUG, "/usr/lib/systemd/resolv.conf", resolved, NULL, MS_BIND, NULL);
1424 if (r >= 0)
1425 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1426 }
1427
1428 /* If that didn't work, let's copy the file */
1429 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
1430 if (r < 0) {
1431 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1432 * resolved or something similar runs inside and the symlink points there.
1433 *
1434 * If the disk image is read-only, there's also no point in complaining.
1435 */
1436 log_full_errno(IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1437 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
1438 return 0;
1439 }
1440
1441 r = userns_lchown(where, 0, 0);
1442 if (r < 0)
1443 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
1444
1445 return 0;
1446 }
1447
1448 static int setup_boot_id(const char *dest) {
1449 sd_id128_t rnd = SD_ID128_NULL;
1450 const char *from, *to;
1451 int r;
1452
1453 /* Generate a new randomized boot ID, so that each boot-up of
1454 * the container gets a new one */
1455
1456 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1457 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1458
1459 r = sd_id128_randomize(&rnd);
1460 if (r < 0)
1461 return log_error_errno(r, "Failed to generate random boot id: %m");
1462
1463 r = id128_write(from, ID128_UUID, rnd, false);
1464 if (r < 0)
1465 return log_error_errno(r, "Failed to write boot id: %m");
1466
1467 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1468 if (r >= 0)
1469 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1470 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1471
1472 (void) unlink(from);
1473 return r;
1474 }
1475
1476 static int copy_devnodes(const char *dest) {
1477
1478 static const char devnodes[] =
1479 "null\0"
1480 "zero\0"
1481 "full\0"
1482 "random\0"
1483 "urandom\0"
1484 "tty\0"
1485 "net/tun\0";
1486
1487 const char *d;
1488 int r = 0;
1489 _cleanup_umask_ mode_t u;
1490
1491 assert(dest);
1492
1493 u = umask(0000);
1494
1495 /* Create /dev/net, so that we can create /dev/net/tun in it */
1496 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1497 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1498
1499 NULSTR_FOREACH(d, devnodes) {
1500 _cleanup_free_ char *from = NULL, *to = NULL;
1501 struct stat st;
1502
1503 from = strappend("/dev/", d);
1504 to = prefix_root(dest, from);
1505
1506 if (stat(from, &st) < 0) {
1507
1508 if (errno != ENOENT)
1509 return log_error_errno(errno, "Failed to stat %s: %m", from);
1510
1511 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1512
1513 log_error("%s is not a char or block device, cannot copy.", from);
1514 return -EIO;
1515
1516 } else {
1517 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1518 /* Explicitly warn the user when /dev is already populated. */
1519 if (errno == EEXIST)
1520 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
1521 if (errno != EPERM)
1522 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1523
1524 /* Some systems abusively restrict mknod but
1525 * allow bind mounts. */
1526 r = touch(to);
1527 if (r < 0)
1528 return log_error_errno(r, "touch (%s) failed: %m", to);
1529 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1530 if (r < 0)
1531 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
1532 }
1533
1534 r = userns_lchown(to, 0, 0);
1535 if (r < 0)
1536 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1537 }
1538 }
1539
1540 return r;
1541 }
1542
1543 static int setup_pts(const char *dest) {
1544 _cleanup_free_ char *options = NULL;
1545 const char *p;
1546 int r;
1547
1548 #if HAVE_SELINUX
1549 if (arg_selinux_apifs_context)
1550 (void) asprintf(&options,
1551 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1552 arg_uid_shift + TTY_GID,
1553 arg_selinux_apifs_context);
1554 else
1555 #endif
1556 (void) asprintf(&options,
1557 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1558 arg_uid_shift + TTY_GID);
1559
1560 if (!options)
1561 return log_oom();
1562
1563 /* Mount /dev/pts itself */
1564 p = prefix_roota(dest, "/dev/pts");
1565 if (mkdir(p, 0755) < 0)
1566 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1567 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1568 if (r < 0)
1569 return r;
1570 r = userns_lchown(p, 0, 0);
1571 if (r < 0)
1572 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1573
1574 /* Create /dev/ptmx symlink */
1575 p = prefix_roota(dest, "/dev/ptmx");
1576 if (symlink("pts/ptmx", p) < 0)
1577 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1578 r = userns_lchown(p, 0, 0);
1579 if (r < 0)
1580 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1581
1582 /* And fix /dev/pts/ptmx ownership */
1583 p = prefix_roota(dest, "/dev/pts/ptmx");
1584 r = userns_lchown(p, 0, 0);
1585 if (r < 0)
1586 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1587
1588 return 0;
1589 }
1590
1591 static int setup_dev_console(const char *dest, const char *console) {
1592 _cleanup_umask_ mode_t u;
1593 const char *to;
1594 int r;
1595
1596 assert(dest);
1597 assert(console);
1598
1599 u = umask(0000);
1600
1601 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1602 if (r < 0)
1603 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1604
1605 /* We need to bind mount the right tty to /dev/console since
1606 * ptys can only exist on pts file systems. To have something
1607 * to bind mount things on we create a empty regular file. */
1608
1609 to = prefix_roota(dest, "/dev/console");
1610 r = touch(to);
1611 if (r < 0)
1612 return log_error_errno(r, "touch() for /dev/console failed: %m");
1613
1614 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
1615 }
1616
1617 static int setup_keyring(void) {
1618 key_serial_t keyring;
1619
1620 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
1621 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
1622 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
1623 * these system calls let's make sure we don't leak anything into the container. */
1624
1625 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
1626 if (keyring == -1) {
1627 if (errno == ENOSYS)
1628 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
1629 else if (IN_SET(errno, EACCES, EPERM))
1630 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
1631 else
1632 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
1633 }
1634
1635 return 0;
1636 }
1637
1638 static int setup_kmsg(const char *dest, int kmsg_socket) {
1639 const char *from, *to;
1640 _cleanup_umask_ mode_t u;
1641 int fd, r;
1642
1643 assert(kmsg_socket >= 0);
1644
1645 u = umask(0000);
1646
1647 /* We create the kmsg FIFO as /run/kmsg, but immediately
1648 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1649 * on the reading side behave very similar to /proc/kmsg,
1650 * their writing side behaves differently from /dev/kmsg in
1651 * that writing blocks when nothing is reading. In order to
1652 * avoid any problems with containers deadlocking due to this
1653 * we simply make /dev/kmsg unavailable to the container. */
1654 from = prefix_roota(dest, "/run/kmsg");
1655 to = prefix_roota(dest, "/proc/kmsg");
1656
1657 if (mkfifo(from, 0600) < 0)
1658 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1659 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1660 if (r < 0)
1661 return r;
1662
1663 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1664 if (fd < 0)
1665 return log_error_errno(errno, "Failed to open fifo: %m");
1666
1667 /* Store away the fd in the socket, so that it stays open as
1668 * long as we run the child */
1669 r = send_one_fd(kmsg_socket, fd, 0);
1670 safe_close(fd);
1671
1672 if (r < 0)
1673 return log_error_errno(r, "Failed to send FIFO fd: %m");
1674
1675 /* And now make the FIFO unavailable as /run/kmsg... */
1676 (void) unlink(from);
1677
1678 return 0;
1679 }
1680
1681 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1682 union in_addr_union *exposed = userdata;
1683
1684 assert(rtnl);
1685 assert(m);
1686 assert(exposed);
1687
1688 expose_port_execute(rtnl, arg_expose_ports, exposed);
1689 return 0;
1690 }
1691
1692 static int setup_hostname(void) {
1693
1694 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
1695 return 0;
1696
1697 if (sethostname_idempotent(arg_machine) < 0)
1698 return -errno;
1699
1700 return 0;
1701 }
1702
1703 static int setup_journal(const char *directory) {
1704 sd_id128_t this_id;
1705 _cleanup_free_ char *d = NULL;
1706 const char *p, *q;
1707 bool try;
1708 char id[33];
1709 int r;
1710
1711 /* Don't link journals in ephemeral mode */
1712 if (arg_ephemeral)
1713 return 0;
1714
1715 if (arg_link_journal == LINK_NO)
1716 return 0;
1717
1718 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1719
1720 r = sd_id128_get_machine(&this_id);
1721 if (r < 0)
1722 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1723
1724 if (sd_id128_equal(arg_uuid, this_id)) {
1725 log_full(try ? LOG_WARNING : LOG_ERR,
1726 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
1727 if (try)
1728 return 0;
1729 return -EEXIST;
1730 }
1731
1732 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1733 if (r < 0)
1734 return log_error_errno(r, "Failed to create /var: %m");
1735
1736 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1737 if (r < 0)
1738 return log_error_errno(r, "Failed to create /var/log: %m");
1739
1740 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1741 if (r < 0)
1742 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1743
1744 (void) sd_id128_to_string(arg_uuid, id);
1745
1746 p = strjoina("/var/log/journal/", id);
1747 q = prefix_roota(directory, p);
1748
1749 if (path_is_mount_point(p, NULL, 0) > 0) {
1750 if (try)
1751 return 0;
1752
1753 log_error("%s: already a mount point, refusing to use for journal", p);
1754 return -EEXIST;
1755 }
1756
1757 if (path_is_mount_point(q, NULL, 0) > 0) {
1758 if (try)
1759 return 0;
1760
1761 log_error("%s: already a mount point, refusing to use for journal", q);
1762 return -EEXIST;
1763 }
1764
1765 r = readlink_and_make_absolute(p, &d);
1766 if (r >= 0) {
1767 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
1768 path_equal(d, q)) {
1769
1770 r = userns_mkdir(directory, p, 0755, 0, 0);
1771 if (r < 0)
1772 log_warning_errno(r, "Failed to create directory %s: %m", q);
1773 return 0;
1774 }
1775
1776 if (unlink(p) < 0)
1777 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1778 } else if (r == -EINVAL) {
1779
1780 if (arg_link_journal == LINK_GUEST &&
1781 rmdir(p) < 0) {
1782
1783 if (errno == ENOTDIR) {
1784 log_error("%s already exists and is neither a symlink nor a directory", p);
1785 return r;
1786 } else
1787 return log_error_errno(errno, "Failed to remove %s: %m", p);
1788 }
1789 } else if (r != -ENOENT)
1790 return log_error_errno(r, "readlink(%s) failed: %m", p);
1791
1792 if (arg_link_journal == LINK_GUEST) {
1793
1794 if (symlink(q, p) < 0) {
1795 if (try) {
1796 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1797 return 0;
1798 } else
1799 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1800 }
1801
1802 r = userns_mkdir(directory, p, 0755, 0, 0);
1803 if (r < 0)
1804 log_warning_errno(r, "Failed to create directory %s: %m", q);
1805 return 0;
1806 }
1807
1808 if (arg_link_journal == LINK_HOST) {
1809 /* don't create parents here — if the host doesn't have
1810 * permanent journal set up, don't force it here */
1811
1812 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
1813 if (try) {
1814 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1815 return 0;
1816 } else
1817 return log_error_errno(errno, "Failed to create %s: %m", p);
1818 }
1819
1820 } else if (access(p, F_OK) < 0)
1821 return 0;
1822
1823 if (dir_is_empty(q) == 0)
1824 log_warning("%s is not empty, proceeding anyway.", q);
1825
1826 r = userns_mkdir(directory, p, 0755, 0, 0);
1827 if (r < 0)
1828 return log_error_errno(r, "Failed to create %s: %m", q);
1829
1830 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1831 if (r < 0)
1832 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1833
1834 return 0;
1835 }
1836
1837 static int drop_capabilities(void) {
1838 return capability_bounding_set_drop(arg_caps_retain, false);
1839 }
1840
1841 static int reset_audit_loginuid(void) {
1842 _cleanup_free_ char *p = NULL;
1843 int r;
1844
1845 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
1846 return 0;
1847
1848 r = read_one_line_file("/proc/self/loginuid", &p);
1849 if (r == -ENOENT)
1850 return 0;
1851 if (r < 0)
1852 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1853
1854 /* Already reset? */
1855 if (streq(p, "4294967295"))
1856 return 0;
1857
1858 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1859 if (r < 0) {
1860 log_error_errno(r,
1861 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1862 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1863 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1864 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1865 "using systemd-nspawn. Sleeping for 5s... (%m)");
1866
1867 sleep(5);
1868 }
1869
1870 return 0;
1871 }
1872
1873
1874 static int setup_propagate(const char *root) {
1875 const char *p, *q;
1876 int r;
1877
1878 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1879 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1880 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1881 (void) mkdir_p(p, 0600);
1882
1883 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1884 if (r < 0)
1885 return log_error_errno(r, "Failed to create /run/systemd: %m");
1886
1887 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1888 if (r < 0)
1889 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
1890
1891 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1892 if (r < 0)
1893 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
1894
1895 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1896 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1897 if (r < 0)
1898 return r;
1899
1900 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1901 if (r < 0)
1902 return r;
1903
1904 /* machined will MS_MOVE into that directory, and that's only
1905 * supported for non-shared mounts. */
1906 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
1907 }
1908
1909 static int setup_machine_id(const char *directory) {
1910 const char *etc_machine_id;
1911 sd_id128_t id;
1912 int r;
1913
1914 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
1915 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
1916 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
1917 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
1918 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
1919 * container behaves nicely). */
1920
1921 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1922
1923 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
1924 if (r < 0) {
1925 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
1926 return log_error_errno(r, "Failed to read machine ID from container image: %m");
1927
1928 if (sd_id128_is_null(arg_uuid)) {
1929 r = sd_id128_randomize(&arg_uuid);
1930 if (r < 0)
1931 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
1932 }
1933 } else {
1934 if (sd_id128_is_null(id)) {
1935 log_error("Machine ID in container image is zero, refusing.");
1936 return -EINVAL;
1937 }
1938
1939 arg_uuid = id;
1940 }
1941
1942 return 0;
1943 }
1944
1945 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
1946 int r;
1947
1948 assert(directory);
1949
1950 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
1951 return 0;
1952
1953 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
1954 if (r == -EOPNOTSUPP)
1955 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
1956 if (r == -EBADE)
1957 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
1958 if (r < 0)
1959 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
1960 if (r == 0)
1961 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
1962 else
1963 log_debug("Patched directory tree to match UID/GID range.");
1964
1965 return r;
1966 }
1967
1968 /*
1969 * Return values:
1970 * < 0 : wait_for_terminate() failed to get the state of the
1971 * container, the container was terminated by a signal, or
1972 * failed for an unknown reason. No change is made to the
1973 * container argument.
1974 * > 0 : The program executed in the container terminated with an
1975 * error. The exit code of the program executed in the
1976 * container is returned. The container argument has been set
1977 * to CONTAINER_TERMINATED.
1978 * 0 : The container is being rebooted, has been shut down or exited
1979 * successfully. The container argument has been set to either
1980 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
1981 *
1982 * That is, success is indicated by a return value of zero, and an
1983 * error is indicated by a non-zero value.
1984 */
1985 static int wait_for_container(pid_t pid, ContainerStatus *container) {
1986 siginfo_t status;
1987 int r;
1988
1989 r = wait_for_terminate(pid, &status);
1990 if (r < 0)
1991 return log_warning_errno(r, "Failed to wait for container: %m");
1992
1993 switch (status.si_code) {
1994
1995 case CLD_EXITED:
1996 if (status.si_status == 0)
1997 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
1998 else
1999 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2000
2001 *container = CONTAINER_TERMINATED;
2002 return status.si_status;
2003
2004 case CLD_KILLED:
2005 if (status.si_status == SIGINT) {
2006 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2007 *container = CONTAINER_TERMINATED;
2008 return 0;
2009
2010 } else if (status.si_status == SIGHUP) {
2011 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2012 *container = CONTAINER_REBOOTED;
2013 return 0;
2014 }
2015
2016 /* fall through */
2017
2018 case CLD_DUMPED:
2019 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2020 return -EIO;
2021
2022 default:
2023 log_error("Container %s failed due to unknown reason.", arg_machine);
2024 return -EIO;
2025 }
2026 }
2027
2028 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2029 pid_t pid;
2030
2031 pid = PTR_TO_PID(userdata);
2032 if (pid > 0) {
2033 if (kill(pid, arg_kill_signal) >= 0) {
2034 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2035 sd_event_source_set_userdata(s, NULL);
2036 return 0;
2037 }
2038 }
2039
2040 sd_event_exit(sd_event_source_get_event(s), 0);
2041 return 0;
2042 }
2043
2044 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2045 for (;;) {
2046 siginfo_t si = {};
2047 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2048 return log_error_errno(errno, "Failed to waitid(): %m");
2049 if (si.si_pid == 0) /* No pending children. */
2050 break;
2051 if (si.si_pid == PTR_TO_PID(userdata)) {
2052 /* The main process we care for has exited. Return from
2053 * signal handler but leave the zombie. */
2054 sd_event_exit(sd_event_source_get_event(s), 0);
2055 break;
2056 }
2057 /* Reap all other children. */
2058 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2059 }
2060
2061 return 0;
2062 }
2063
2064 static int determine_names(void) {
2065 int r;
2066
2067 if (arg_template && !arg_directory && arg_machine) {
2068
2069 /* If --template= was specified then we should not
2070 * search for a machine, but instead create a new one
2071 * in /var/lib/machine. */
2072
2073 arg_directory = strjoin("/var/lib/machines/", arg_machine);
2074 if (!arg_directory)
2075 return log_oom();
2076 }
2077
2078 if (!arg_image && !arg_directory) {
2079 if (arg_machine) {
2080 _cleanup_(image_unrefp) Image *i = NULL;
2081
2082 r = image_find(arg_machine, &i);
2083 if (r < 0)
2084 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2085 if (r == 0) {
2086 log_error("No image for machine '%s'.", arg_machine);
2087 return -ENOENT;
2088 }
2089
2090 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
2091 r = free_and_strdup(&arg_image, i->path);
2092 else
2093 r = free_and_strdup(&arg_directory, i->path);
2094 if (r < 0)
2095 return log_oom();
2096
2097 if (!arg_ephemeral)
2098 arg_read_only = arg_read_only || i->read_only;
2099 } else
2100 arg_directory = get_current_dir_name();
2101
2102 if (!arg_directory && !arg_image) {
2103 log_error("Failed to determine path, please use -D or -i.");
2104 return -EINVAL;
2105 }
2106 }
2107
2108 if (!arg_machine) {
2109
2110 if (arg_directory && path_equal(arg_directory, "/"))
2111 arg_machine = gethostname_malloc();
2112 else {
2113 if (arg_image) {
2114 char *e;
2115
2116 arg_machine = strdup(basename(arg_image));
2117
2118 /* Truncate suffix if there is one */
2119 e = endswith(arg_machine, ".raw");
2120 if (e)
2121 *e = 0;
2122 } else
2123 arg_machine = strdup(basename(arg_directory));
2124 }
2125 if (!arg_machine)
2126 return log_oom();
2127
2128 hostname_cleanup(arg_machine);
2129 if (!machine_name_is_valid(arg_machine)) {
2130 log_error("Failed to determine machine name automatically, please use -M.");
2131 return -EINVAL;
2132 }
2133
2134 if (arg_ephemeral) {
2135 char *b;
2136
2137 /* Add a random suffix when this is an
2138 * ephemeral machine, so that we can run many
2139 * instances at once without manually having
2140 * to specify -M each time. */
2141
2142 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2143 return log_oom();
2144
2145 free(arg_machine);
2146 arg_machine = b;
2147 }
2148 }
2149
2150 return 0;
2151 }
2152
/* Resolves the path stored in *p with chase_symlinks() (no root directory, the given flags) and
 * replaces *p by the newly allocated result, freeing the old string. A NULL *p is left alone.
 * Returns 0 on success, or the (logged) negative error of chase_symlinks(). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                free(*p);
                *p = resolved;
        }

        return 0;
}
2171
2172 static int determine_uid_shift(const char *directory) {
2173 int r;
2174
2175 if (arg_userns_mode == USER_NAMESPACE_NO) {
2176 arg_uid_shift = 0;
2177 return 0;
2178 }
2179
2180 if (arg_uid_shift == UID_INVALID) {
2181 struct stat st;
2182
2183 r = stat(directory, &st);
2184 if (r < 0)
2185 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2186
2187 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2188
2189 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2190 log_error("UID and GID base of %s don't match.", directory);
2191 return -EINVAL;
2192 }
2193
2194 arg_uid_range = UINT32_C(0x10000);
2195 }
2196
2197 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2198 log_error("UID base too high for UID range.");
2199 return -EINVAL;
2200 }
2201
2202 return 0;
2203 }
2204
2205 static int inner_child(
2206 Barrier *barrier,
2207 const char *directory,
2208 bool secondary,
2209 int kmsg_socket,
2210 int rtnl_socket,
2211 FDSet *fds) {
2212
2213 _cleanup_free_ char *home = NULL;
2214 char as_uuid[37];
2215 unsigned n_env = 1;
2216 const char *envp[] = {
2217 "PATH=" DEFAULT_PATH_SPLIT_USR,
2218 NULL, /* container */
2219 NULL, /* TERM */
2220 NULL, /* HOME */
2221 NULL, /* USER */
2222 NULL, /* LOGNAME */
2223 NULL, /* container_uuid */
2224 NULL, /* LISTEN_FDS */
2225 NULL, /* LISTEN_PID */
2226 NULL, /* NOTIFY_SOCKET */
2227 NULL
2228 };
2229 const char *exec_target;
2230
2231 _cleanup_strv_free_ char **env_use = NULL;
2232 int r;
2233
2234 assert(barrier);
2235 assert(directory);
2236 assert(kmsg_socket >= 0);
2237
2238 if (arg_userns_mode != USER_NAMESPACE_NO) {
2239 /* Tell the parent, that it now can write the UID map. */
2240 (void) barrier_place(barrier); /* #1 */
2241
2242 /* Wait until the parent wrote the UID map */
2243 if (!barrier_place_and_sync(barrier)) { /* #2 */
2244 log_error("Parent died too early");
2245 return -ESRCH;
2246 }
2247 }
2248
2249 r = reset_uid_gid();
2250 if (r < 0)
2251 return log_error_errno(r, "Couldn't become new root: %m");
2252
2253 r = mount_all(NULL,
2254 arg_mount_settings | MOUNT_IN_USERNS,
2255 arg_uid_shift,
2256 arg_uid_range,
2257 arg_selinux_apifs_context);
2258
2259 if (r < 0)
2260 return r;
2261
2262 r = mount_sysfs(NULL, arg_mount_settings);
2263 if (r < 0)
2264 return r;
2265
2266 /* Wait until we are cgroup-ified, so that we
2267 * can mount the right cgroup path writable */
2268 if (!barrier_place_and_sync(barrier)) { /* #3 */
2269 log_error("Parent died too early");
2270 return -ESRCH;
2271 }
2272
2273 if (arg_use_cgns && cg_ns_supported()) {
2274 r = unshare(CLONE_NEWCGROUP);
2275 if (r < 0)
2276 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2277 r = mount_cgroups(
2278 "",
2279 arg_unified_cgroup_hierarchy,
2280 arg_userns_mode != USER_NAMESPACE_NO,
2281 arg_uid_shift,
2282 arg_uid_range,
2283 arg_selinux_apifs_context,
2284 true);
2285 if (r < 0)
2286 return r;
2287 } else {
2288 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2289 if (r < 0)
2290 return r;
2291 }
2292
2293 r = setup_boot_id(NULL);
2294 if (r < 0)
2295 return r;
2296
2297 r = setup_kmsg(NULL, kmsg_socket);
2298 if (r < 0)
2299 return r;
2300 kmsg_socket = safe_close(kmsg_socket);
2301
2302 umask(0022);
2303
2304 if (setsid() < 0)
2305 return log_error_errno(errno, "setsid() failed: %m");
2306
2307 if (arg_private_network)
2308 loopback_setup();
2309
2310 if (arg_expose_ports) {
2311 r = expose_port_send_rtnl(rtnl_socket);
2312 if (r < 0)
2313 return r;
2314 rtnl_socket = safe_close(rtnl_socket);
2315 }
2316
2317 r = drop_capabilities();
2318 if (r < 0)
2319 return log_error_errno(r, "drop_capabilities() failed: %m");
2320
2321 setup_hostname();
2322
2323 if (arg_personality != PERSONALITY_INVALID) {
2324 r = safe_personality(arg_personality);
2325 if (r < 0)
2326 return log_error_errno(r, "personality() failed: %m");
2327 } else if (secondary) {
2328 r = safe_personality(PER_LINUX32);
2329 if (r < 0)
2330 return log_error_errno(r, "personality() failed: %m");
2331 }
2332
2333 #if HAVE_SELINUX
2334 if (arg_selinux_context)
2335 if (setexeccon(arg_selinux_context) < 0)
2336 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2337 #endif
2338
2339 r = change_uid_gid(arg_user, &home);
2340 if (r < 0)
2341 return r;
2342
2343 /* LXC sets container=lxc, so follow the scheme here */
2344 envp[n_env++] = strjoina("container=", arg_container_service_name);
2345
2346 envp[n_env] = strv_find_prefix(environ, "TERM=");
2347 if (envp[n_env])
2348 n_env++;
2349
2350 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2351 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2352 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2353 return log_oom();
2354
2355 assert(!sd_id128_is_null(arg_uuid));
2356
2357 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
2358 return log_oom();
2359
2360 if (fdset_size(fds) > 0) {
2361 r = fdset_cloexec(fds, false);
2362 if (r < 0)
2363 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2364
2365 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2366 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2367 return log_oom();
2368 }
2369 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2370 return log_oom();
2371
2372 env_use = strv_env_merge(2, envp, arg_setenv);
2373 if (!env_use)
2374 return log_oom();
2375
2376 /* Let the parent know that we are ready and
2377 * wait until the parent is ready with the
2378 * setup, too... */
2379 if (!barrier_place_and_sync(barrier)) { /* #4 */
2380 log_error("Parent died too early");
2381 return -ESRCH;
2382 }
2383
2384 if (arg_chdir)
2385 if (chdir(arg_chdir) < 0)
2386 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2387
2388 if (arg_start_mode == START_PID2) {
2389 r = stub_pid1(arg_uuid);
2390 if (r < 0)
2391 return r;
2392 }
2393
2394 /* Now, explicitly close the log, so that we
2395 * then can close all remaining fds. Closing
2396 * the log explicitly first has the benefit
2397 * that the logging subsystem knows about it,
2398 * and is thus ready to be reopened should we
2399 * need it again. Note that the other fds
2400 * closed here are at least the locking and
2401 * barrier fds. */
2402 log_close();
2403 (void) fdset_close_others(fds);
2404
2405 if (arg_start_mode == START_BOOT) {
2406 char **a;
2407 size_t m;
2408
2409 /* Automatically search for the init system */
2410
2411 m = strv_length(arg_parameters);
2412 a = newa(char*, m + 2);
2413 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2414 a[1 + m] = NULL;
2415
2416 a[0] = (char*) "/usr/lib/systemd/systemd";
2417 execve(a[0], a, env_use);
2418
2419 a[0] = (char*) "/lib/systemd/systemd";
2420 execve(a[0], a, env_use);
2421
2422 a[0] = (char*) "/sbin/init";
2423 execve(a[0], a, env_use);
2424
2425 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
2426 } else if (!strv_isempty(arg_parameters)) {
2427 exec_target = arg_parameters[0];
2428 execvpe(arg_parameters[0], arg_parameters, env_use);
2429 } else {
2430 if (!arg_chdir)
2431 /* If we cannot change the directory, we'll end up in /, that is expected. */
2432 (void) chdir(home ?: "/root");
2433
2434 execle("/bin/bash", "-bash", NULL, env_use);
2435 execle("/bin/sh", "-sh", NULL, env_use);
2436
2437 exec_target = "/bin/bash, /bin/sh";
2438 }
2439
2440 r = -errno;
2441 (void) log_open();
2442 return log_error_errno(r, "execv(%s) failed: %m", exec_target);
2443 }
2444
2445 static int setup_sd_notify_child(void) {
2446 static const int one = 1;
2447 int fd = -1;
2448 union sockaddr_union sa = {
2449 .sa.sa_family = AF_UNIX,
2450 };
2451 int r;
2452
2453 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2454 if (fd < 0)
2455 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2456
2457 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2458 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2459
2460 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2461 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2462 if (r < 0) {
2463 safe_close(fd);
2464 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2465 }
2466
2467 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
2468 if (r < 0) {
2469 safe_close(fd);
2470 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
2471 }
2472
2473 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2474 if (r < 0) {
2475 safe_close(fd);
2476 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2477 }
2478
2479 return fd;
2480 }
2481
2482 static int outer_child(
2483 Barrier *barrier,
2484 const char *directory,
2485 const char *console,
2486 DissectedImage *dissected_image,
2487 bool interactive,
2488 bool secondary,
2489 int pid_socket,
2490 int uuid_socket,
2491 int notify_socket,
2492 int kmsg_socket,
2493 int rtnl_socket,
2494 int uid_shift_socket,
2495 FDSet *fds) {
2496
2497 pid_t pid;
2498 ssize_t l;
2499 int r;
2500 _cleanup_close_ int fd = -1;
2501
2502 assert(barrier);
2503 assert(directory);
2504 assert(console);
2505 assert(pid_socket >= 0);
2506 assert(uuid_socket >= 0);
2507 assert(notify_socket >= 0);
2508 assert(kmsg_socket >= 0);
2509
2510 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2511 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2512
2513 if (interactive) {
2514 close_nointr(STDIN_FILENO);
2515 close_nointr(STDOUT_FILENO);
2516 close_nointr(STDERR_FILENO);
2517
2518 r = open_terminal(console, O_RDWR);
2519 if (r != STDIN_FILENO) {
2520 if (r >= 0) {
2521 safe_close(r);
2522 r = -EINVAL;
2523 }
2524
2525 return log_error_errno(r, "Failed to open console: %m");
2526 }
2527
2528 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2529 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2530 return log_error_errno(errno, "Failed to duplicate console: %m");
2531 }
2532
2533 r = reset_audit_loginuid();
2534 if (r < 0)
2535 return r;
2536
2537 /* Mark everything as slave, so that we still
2538 * receive mounts from the real root, but don't
2539 * propagate mounts to the real root. */
2540 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2541 if (r < 0)
2542 return r;
2543
2544 if (dissected_image) {
2545 r = dissected_image_mount(dissected_image, directory, DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2546 if (r < 0)
2547 return r;
2548 }
2549
2550 r = determine_uid_shift(directory);
2551 if (r < 0)
2552 return r;
2553
2554 if (arg_userns_mode != USER_NAMESPACE_NO) {
2555 /* Let the parent know which UID shift we read from the image */
2556 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2557 if (l < 0)
2558 return log_error_errno(errno, "Failed to send UID shift: %m");
2559 if (l != sizeof(arg_uid_shift)) {
2560 log_error("Short write while sending UID shift.");
2561 return -EIO;
2562 }
2563
2564 if (arg_userns_mode == USER_NAMESPACE_PICK) {
2565 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2566 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2567 * not it will pick a different one, and send it back to us. */
2568
2569 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2570 if (l < 0)
2571 return log_error_errno(errno, "Failed to recv UID shift: %m");
2572 if (l != sizeof(arg_uid_shift)) {
2573 log_error("Short read while receiving UID shift.");
2574 return -EIO;
2575 }
2576 }
2577
2578 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2579 }
2580
2581 /* Turn directory into bind mount */
2582 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2583 if (r < 0)
2584 return r;
2585
2586 r = setup_pivot_root(
2587 directory,
2588 arg_pivot_root_new,
2589 arg_pivot_root_old);
2590 if (r < 0)
2591 return r;
2592
2593 r = setup_volatile(
2594 directory,
2595 arg_volatile_mode,
2596 arg_userns_mode != USER_NAMESPACE_NO,
2597 arg_uid_shift,
2598 arg_uid_range,
2599 arg_selinux_context);
2600 if (r < 0)
2601 return r;
2602
2603 r = setup_volatile_state(
2604 directory,
2605 arg_volatile_mode,
2606 arg_userns_mode != USER_NAMESPACE_NO,
2607 arg_uid_shift,
2608 arg_uid_range,
2609 arg_selinux_context);
2610 if (r < 0)
2611 return r;
2612
2613 /* Mark everything as shared so our mounts get propagated down. This is
2614 * required to make new bind mounts available in systemd services
2615 * inside the containter that create a new mount namespace.
2616 * See https://github.com/systemd/systemd/issues/3860
2617 * Further submounts (such as /dev) done after this will inherit the
2618 * shared propagation mode. */
2619 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2620 if (r < 0)
2621 return r;
2622
2623 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2624 if (r < 0)
2625 return r;
2626
2627 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2628 if (r < 0)
2629 return r;
2630
2631 if (arg_read_only) {
2632 r = bind_remount_recursive(directory, true, NULL);
2633 if (r < 0)
2634 return log_error_errno(r, "Failed to make tree read-only: %m");
2635 }
2636
2637 r = mount_all(directory,
2638 arg_mount_settings,
2639 arg_uid_shift,
2640 arg_uid_range,
2641 arg_selinux_apifs_context);
2642 if (r < 0)
2643 return r;
2644
2645 r = copy_devnodes(directory);
2646 if (r < 0)
2647 return r;
2648
2649 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2650
2651 r = setup_pts(directory);
2652 if (r < 0)
2653 return r;
2654
2655 r = setup_propagate(directory);
2656 if (r < 0)
2657 return r;
2658
2659 r = setup_dev_console(directory, console);
2660 if (r < 0)
2661 return r;
2662
2663 r = setup_keyring();
2664 if (r < 0)
2665 return r;
2666
2667 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
2668 if (r < 0)
2669 return r;
2670
2671 r = setup_timezone(directory);
2672 if (r < 0)
2673 return r;
2674
2675 r = setup_resolv_conf(directory);
2676 if (r < 0)
2677 return r;
2678
2679 r = setup_machine_id(directory);
2680 if (r < 0)
2681 return r;
2682
2683 r = setup_journal(directory);
2684 if (r < 0)
2685 return r;
2686
2687 r = mount_custom(
2688 directory,
2689 arg_custom_mounts,
2690 arg_n_custom_mounts,
2691 arg_userns_mode != USER_NAMESPACE_NO,
2692 arg_uid_shift,
2693 arg_uid_range,
2694 arg_selinux_apifs_context);
2695 if (r < 0)
2696 return r;
2697
2698 if (!arg_use_cgns || !cg_ns_supported()) {
2699 r = mount_cgroups(
2700 directory,
2701 arg_unified_cgroup_hierarchy,
2702 arg_userns_mode != USER_NAMESPACE_NO,
2703 arg_uid_shift,
2704 arg_uid_range,
2705 arg_selinux_apifs_context,
2706 false);
2707 if (r < 0)
2708 return r;
2709 }
2710
2711 r = mount_move_root(directory);
2712 if (r < 0)
2713 return log_error_errno(r, "Failed to move root directory: %m");
2714
2715 fd = setup_sd_notify_child();
2716 if (fd < 0)
2717 return fd;
2718
2719 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2720 arg_clone_ns_flags |
2721 (arg_private_network ? CLONE_NEWNET : 0) |
2722 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
2723 if (pid < 0)
2724 return log_error_errno(errno, "Failed to fork inner child: %m");
2725 if (pid == 0) {
2726 pid_socket = safe_close(pid_socket);
2727 uuid_socket = safe_close(uuid_socket);
2728 notify_socket = safe_close(notify_socket);
2729 uid_shift_socket = safe_close(uid_shift_socket);
2730
2731 /* The inner child has all namespaces that are
2732 * requested, so that we all are owned by the user if
2733 * user namespaces are turned on. */
2734
2735 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2736 if (r < 0)
2737 _exit(EXIT_FAILURE);
2738
2739 _exit(EXIT_SUCCESS);
2740 }
2741
2742 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2743 if (l < 0)
2744 return log_error_errno(errno, "Failed to send PID: %m");
2745 if (l != sizeof(pid)) {
2746 log_error("Short write while sending PID.");
2747 return -EIO;
2748 }
2749
2750 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
2751 if (l < 0)
2752 return log_error_errno(errno, "Failed to send machine ID: %m");
2753 if (l != sizeof(arg_uuid)) {
2754 log_error("Short write while sending machine ID.");
2755 return -EIO;
2756 }
2757
2758 l = send_one_fd(notify_socket, fd, 0);
2759 if (l < 0)
2760 return log_error_errno(errno, "Failed to send notify fd: %m");
2761
2762 pid_socket = safe_close(pid_socket);
2763 uuid_socket = safe_close(uuid_socket);
2764 notify_socket = safe_close(notify_socket);
2765 kmsg_socket = safe_close(kmsg_socket);
2766 rtnl_socket = safe_close(rtnl_socket);
2767
2768 return 0;
2769 }
2770
2771 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
2772 unsigned n_tries = 100;
2773 uid_t candidate;
2774 int r;
2775
2776 assert(shift);
2777 assert(ret_lock_file);
2778 assert(arg_userns_mode == USER_NAMESPACE_PICK);
2779 assert(arg_uid_range == 0x10000U);
2780
2781 candidate = *shift;
2782
2783 (void) mkdir("/run/systemd/nspawn-uid", 0755);
2784
2785 for (;;) {
2786 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
2787 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
2788
2789 if (--n_tries <= 0)
2790 return -EBUSY;
2791
2792 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
2793 goto next;
2794 if ((candidate & UINT32_C(0xFFFF)) != 0)
2795 goto next;
2796
2797 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
2798 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
2799 if (r == -EBUSY) /* Range already taken by another nspawn instance */
2800 goto next;
2801 if (r < 0)
2802 return r;
2803
2804 /* Make some superficial checks whether the range is currently known in the user database */
2805 if (getpwuid(candidate))
2806 goto next;
2807 if (getpwuid(candidate + UINT32_C(0xFFFE)))
2808 goto next;
2809 if (getgrgid(candidate))
2810 goto next;
2811 if (getgrgid(candidate + UINT32_C(0xFFFE)))
2812 goto next;
2813
2814 *ret_lock_file = lf;
2815 lf = (struct LockFile) LOCK_FILE_INIT;
2816 *shift = candidate;
2817 return 0;
2818
2819 next:
2820 random_bytes(&candidate, sizeof(candidate));
2821 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
2822 candidate &= (uid_t) UINT32_C(0xFFFF0000);
2823 }
2824 }
2825
2826 static int setup_uid_map(pid_t pid) {
2827 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2828 int r;
2829
2830 assert(pid > 1);
2831
2832 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2833 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2834 r = write_string_file(uid_map, line, 0);
2835 if (r < 0)
2836 return log_error_errno(r, "Failed to write UID map: %m");
2837
2838 /* We always assign the same UID and GID ranges */
2839 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2840 r = write_string_file(uid_map, line, 0);
2841 if (r < 0)
2842 return log_error_errno(r, "Failed to write GID map: %m");
2843
2844 return 0;
2845 }
2846
2847 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
2848 char buf[NOTIFY_BUFFER_MAX+1];
2849 char *p = NULL;
2850 struct iovec iovec = {
2851 .iov_base = buf,
2852 .iov_len = sizeof(buf)-1,
2853 };
2854 union {
2855 struct cmsghdr cmsghdr;
2856 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
2857 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
2858 } control = {};
2859 struct msghdr msghdr = {
2860 .msg_iov = &iovec,
2861 .msg_iovlen = 1,
2862 .msg_control = &control,
2863 .msg_controllen = sizeof(control),
2864 };
2865 struct cmsghdr *cmsg;
2866 struct ucred *ucred = NULL;
2867 ssize_t n;
2868 pid_t inner_child_pid;
2869 _cleanup_strv_free_ char **tags = NULL;
2870
2871 assert(userdata);
2872
2873 inner_child_pid = PTR_TO_PID(userdata);
2874
2875 if (revents != EPOLLIN) {
2876 log_warning("Got unexpected poll event for notify fd.");
2877 return 0;
2878 }
2879
2880 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
2881 if (n < 0) {
2882 if (IN_SET(errno, EAGAIN, EINTR))
2883 return 0;
2884
2885 return log_warning_errno(errno, "Couldn't read notification socket: %m");
2886 }
2887 cmsg_close_all(&msghdr);
2888
2889 CMSG_FOREACH(cmsg, &msghdr) {
2890 if (cmsg->cmsg_level == SOL_SOCKET &&
2891 cmsg->cmsg_type == SCM_CREDENTIALS &&
2892 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
2893
2894 ucred = (struct ucred*) CMSG_DATA(cmsg);
2895 }
2896 }
2897
2898 if (!ucred || ucred->pid != inner_child_pid) {
2899 log_debug("Received notify message without valid credentials. Ignoring.");
2900 return 0;
2901 }
2902
2903 if ((size_t) n >= sizeof(buf)) {
2904 log_warning("Received notify message exceeded maximum size. Ignoring.");
2905 return 0;
2906 }
2907
2908 buf[n] = 0;
2909 tags = strv_split(buf, "\n\r");
2910 if (!tags)
2911 return log_oom();
2912
2913 if (strv_find(tags, "READY=1"))
2914 sd_notifyf(false, "READY=1\n");
2915
2916 p = strv_find_startswith(tags, "STATUS=");
2917 if (p)
2918 sd_notifyf(false, "STATUS=Container running: %s", p);
2919
2920 return 0;
2921 }
2922
2923 static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
2924 int r;
2925
2926 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
2927 if (r < 0)
2928 return log_error_errno(r, "Failed to allocate notify event source: %m");
2929
2930 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
2931
2932 return 0;
2933 }
2934
2935 static int load_settings(void) {
2936 _cleanup_(settings_freep) Settings *settings = NULL;
2937 _cleanup_fclose_ FILE *f = NULL;
2938 _cleanup_free_ char *p = NULL;
2939 const char *fn, *i;
2940 int r;
2941
2942 /* If all settings are masked, there's no point in looking for
2943 * the settings file */
2944 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2945 return 0;
2946
2947 fn = strjoina(arg_machine, ".nspawn");
2948
2949 /* We first look in the admin's directories in /etc and /run */
2950 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2951 _cleanup_free_ char *j = NULL;
2952
2953 j = strjoin(i, "/", fn);
2954 if (!j)
2955 return log_oom();
2956
2957 f = fopen(j, "re");
2958 if (f) {
2959 p = j;
2960 j = NULL;
2961
2962 /* By default, we trust configuration from /etc and /run */
2963 if (arg_settings_trusted < 0)
2964 arg_settings_trusted = true;
2965
2966 break;
2967 }
2968
2969 if (errno != ENOENT)
2970 return log_error_errno(errno, "Failed to open %s: %m", j);
2971 }
2972
2973 if (!f) {
2974 /* After that, let's look for a file next to the
2975 * actual image we shall boot. */
2976
2977 if (arg_image) {
2978 p = file_in_same_dir(arg_image, fn);
2979 if (!p)
2980 return log_oom();
2981 } else if (arg_directory) {
2982 p = file_in_same_dir(arg_directory, fn);
2983 if (!p)
2984 return log_oom();
2985 }
2986
2987 if (p) {
2988 f = fopen(p, "re");
2989 if (!f && errno != ENOENT)
2990 return log_error_errno(errno, "Failed to open %s: %m", p);
2991
2992 /* By default, we do not trust configuration from /var/lib/machines */
2993 if (arg_settings_trusted < 0)
2994 arg_settings_trusted = false;
2995 }
2996 }
2997
2998 if (!f)
2999 return 0;
3000
3001 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3002
3003 r = settings_load(f, p, &settings);
3004 if (r < 0)
3005 return r;
3006
3007 /* Copy over bits from the settings, unless they have been
3008 * explicitly masked by command line switches. */
3009
3010 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3011 settings->start_mode >= 0) {
3012 arg_start_mode = settings->start_mode;
3013
3014 strv_free(arg_parameters);
3015 arg_parameters = settings->parameters;
3016 settings->parameters = NULL;
3017 }
3018
3019 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3020 settings->pivot_root_new) {
3021 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3022 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3023 }
3024
3025 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3026 settings->working_directory) {
3027 free(arg_chdir);
3028 arg_chdir = settings->working_directory;
3029 settings->working_directory = NULL;
3030 }
3031
3032 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3033 settings->environment) {
3034 strv_free(arg_setenv);
3035 arg_setenv = settings->environment;
3036 settings->environment = NULL;
3037 }
3038
3039 if ((arg_settings_mask & SETTING_USER) == 0 &&
3040 settings->user) {
3041 free(arg_user);
3042 arg_user = settings->user;
3043 settings->user = NULL;
3044 }
3045
3046 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3047 uint64_t plus;
3048
3049 plus = settings->capability;
3050 if (settings_private_network(settings))
3051 plus |= (1ULL << CAP_NET_ADMIN);
3052
3053 if (!arg_settings_trusted && plus != 0) {
3054 if (settings->capability != 0)
3055 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3056 } else
3057 arg_caps_retain |= plus;
3058
3059 arg_caps_retain &= ~settings->drop_capability;
3060 }
3061
3062 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3063 settings->kill_signal > 0)
3064 arg_kill_signal = settings->kill_signal;
3065
3066 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3067 settings->personality != PERSONALITY_INVALID)
3068 arg_personality = settings->personality;
3069
3070 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3071 !sd_id128_is_null(settings->machine_id)) {
3072
3073 if (!arg_settings_trusted)
3074 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3075 else
3076 arg_uuid = settings->machine_id;
3077 }
3078
3079 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3080 settings->read_only >= 0)
3081 arg_read_only = settings->read_only;
3082
3083 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3084 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3085 arg_volatile_mode = settings->volatile_mode;
3086
3087 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3088 settings->n_custom_mounts > 0) {
3089
3090 if (!arg_settings_trusted)
3091 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3092 else {
3093 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3094 arg_custom_mounts = settings->custom_mounts;
3095 arg_n_custom_mounts = settings->n_custom_mounts;
3096
3097 settings->custom_mounts = NULL;
3098 settings->n_custom_mounts = 0;
3099 }
3100 }
3101
3102 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3103 (settings->private_network >= 0 ||
3104 settings->network_veth >= 0 ||
3105 settings->network_bridge ||
3106 settings->network_zone ||
3107 settings->network_interfaces ||
3108 settings->network_macvlan ||
3109 settings->network_ipvlan ||
3110 settings->network_veth_extra)) {
3111
3112 if (!arg_settings_trusted)
3113 log_warning("Ignoring network settings, file %s is not trusted.", p);
3114 else {
3115 arg_network_veth = settings_network_veth(settings);
3116 arg_private_network = settings_private_network(settings);
3117
3118 strv_free(arg_network_interfaces);
3119 arg_network_interfaces = settings->network_interfaces;
3120 settings->network_interfaces = NULL;
3121
3122 strv_free(arg_network_macvlan);
3123 arg_network_macvlan = settings->network_macvlan;
3124 settings->network_macvlan = NULL;
3125
3126 strv_free(arg_network_ipvlan);
3127 arg_network_ipvlan = settings->network_ipvlan;
3128 settings->network_ipvlan = NULL;
3129
3130 strv_free(arg_network_veth_extra);
3131 arg_network_veth_extra = settings->network_veth_extra;
3132 settings->network_veth_extra = NULL;
3133
3134 free(arg_network_bridge);
3135 arg_network_bridge = settings->network_bridge;
3136 settings->network_bridge = NULL;
3137
3138 free(arg_network_zone);
3139 arg_network_zone = settings->network_zone;
3140 settings->network_zone = NULL;
3141 }
3142 }
3143
3144 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3145 settings->expose_ports) {
3146
3147 if (!arg_settings_trusted)
3148 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3149 else {
3150 expose_port_free_all(arg_expose_ports);
3151 arg_expose_ports = settings->expose_ports;
3152 settings->expose_ports = NULL;
3153 }
3154 }
3155
3156 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3157 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3158
3159 if (!arg_settings_trusted)
3160 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3161 else {
3162 arg_userns_mode = settings->userns_mode;
3163 arg_uid_shift = settings->uid_shift;
3164 arg_uid_range = settings->uid_range;
3165 arg_userns_chown = settings->userns_chown;
3166 }
3167 }
3168
3169 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3170 arg_notify_ready = settings->notify_ready;
3171
3172 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3173
3174 if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
3175 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p);
3176 else {
3177 strv_free(arg_syscall_whitelist);
3178 strv_free(arg_syscall_blacklist);
3179
3180 arg_syscall_whitelist = settings->syscall_whitelist;
3181 arg_syscall_blacklist = settings->syscall_blacklist;
3182
3183 settings->syscall_whitelist = settings->syscall_blacklist = NULL;
3184 }
3185 }
3186
3187 return 0;
3188 }
3189
3190 static int run(int master,
3191 const char* console,
3192 DissectedImage *dissected_image,
3193 bool interactive,
3194 bool secondary,
3195 FDSet *fds,
3196 char veth_name[IFNAMSIZ], bool *veth_created,
3197 union in_addr_union *exposed,
3198 pid_t *pid, int *ret) {
3199
3200 static const struct sigaction sa = {
3201 .sa_handler = nop_signal_handler,
3202 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
3203 };
3204
3205 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3206 _cleanup_close_ int etc_passwd_lock = -1;
3207 _cleanup_close_pair_ int
3208 kmsg_socket_pair[2] = { -1, -1 },
3209 rtnl_socket_pair[2] = { -1, -1 },
3210 pid_socket_pair[2] = { -1, -1 },
3211 uuid_socket_pair[2] = { -1, -1 },
3212 notify_socket_pair[2] = { -1, -1 },
3213 uid_shift_socket_pair[2] = { -1, -1 };
3214 _cleanup_close_ int notify_socket= -1;
3215 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3216 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
3217 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3218 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3219 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3220 ContainerStatus container_status = 0;
3221 char last_char = 0;
3222 int ifi = 0, r;
3223 ssize_t l;
3224 sigset_t mask_chld;
3225
3226 assert_se(sigemptyset(&mask_chld) == 0);
3227 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3228
3229 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3230 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3231 * check with getpwuid() if the specific user already exists. Note that /etc might be
3232 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3233 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3234 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3235 * really ours. */
3236
3237 etc_passwd_lock = take_etc_passwd_lock(NULL);
3238 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3239 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3240 }
3241
3242 r = barrier_create(&barrier);
3243 if (r < 0)
3244 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3245
3246 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3247 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3248
3249 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3250 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3251
3252 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3253 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3254
3255 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3256 return log_error_errno(errno, "Failed to create id socket pair: %m");
3257
3258 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3259 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3260
3261 if (arg_userns_mode != USER_NAMESPACE_NO)
3262 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3263 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3264
3265 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3266 * parent's blocking calls and give it a chance to call wait() and terminate. */
3267 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3268 if (r < 0)
3269 return log_error_errno(errno, "Failed to change the signal mask: %m");
3270
3271 r = sigaction(SIGCHLD, &sa, NULL);
3272 if (r < 0)
3273 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3274
3275 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3276 if (*pid < 0)
3277 return log_error_errno(errno, "clone() failed%s: %m",
3278 errno == EINVAL ?
3279 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3280
3281 if (*pid == 0) {
3282 /* The outer child only has a file system namespace. */
3283 barrier_set_role(&barrier, BARRIER_CHILD);
3284
3285 master = safe_close(master);
3286
3287 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3288 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3289 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3290 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3291 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3292 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3293
3294 (void) reset_all_signal_handlers();
3295 (void) reset_signal_mask();
3296
3297 r = outer_child(&barrier,
3298 arg_directory,
3299 console,
3300 dissected_image,
3301 interactive,
3302 secondary,
3303 pid_socket_pair[1],
3304 uuid_socket_pair[1],
3305 notify_socket_pair[1],
3306 kmsg_socket_pair[1],
3307 rtnl_socket_pair[1],
3308 uid_shift_socket_pair[1],
3309 fds);
3310 if (r < 0)
3311 _exit(EXIT_FAILURE);
3312
3313 _exit(EXIT_SUCCESS);
3314 }
3315
3316 barrier_set_role(&barrier, BARRIER_PARENT);
3317
3318 fds = fdset_free(fds);
3319
3320 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3321 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3322 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3323 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3324 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3325 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3326
3327 if (arg_userns_mode != USER_NAMESPACE_NO) {
3328 /* The child just let us know the UID shift it might have read from the image. */
3329 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3330 if (l < 0)
3331 return log_error_errno(errno, "Failed to read UID shift: %m");
3332 if (l != sizeof arg_uid_shift) {
3333 log_error("Short read while reading UID shift.");
3334 return -EIO;
3335 }
3336
3337 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3338 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3339 * image, but if that's already in use, pick a new one, and report back to the child,
3340 * which one we now picked. */
3341
3342 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3343 if (r < 0)
3344 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3345
3346 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3347 if (l < 0)
3348 return log_error_errno(errno, "Failed to send UID shift: %m");
3349 if (l != sizeof arg_uid_shift) {
3350 log_error("Short write while writing UID shift.");
3351 return -EIO;
3352 }
3353 }
3354 }
3355
3356 /* Wait for the outer child. */
3357 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
3358 if (r != 0)
3359 return r < 0 ? r : -EIO;
3360
3361 /* And now retrieve the PID of the inner child. */
3362 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3363 if (l < 0)
3364 return log_error_errno(errno, "Failed to read inner child PID: %m");
3365 if (l != sizeof *pid) {
3366 log_error("Short read while reading inner child PID.");
3367 return -EIO;
3368 }
3369
3370 /* We also retrieve container UUID in case it was generated by outer child */
3371 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3372 if (l < 0)
3373 return log_error_errno(errno, "Failed to read container machine ID: %m");
3374 if (l != sizeof(arg_uuid)) {
3375 log_error("Short read while reading container machined ID.");
3376 return -EIO;
3377 }
3378
3379 /* We also retrieve the socket used for notifications generated by outer child */
3380 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3381 if (notify_socket < 0)
3382 return log_error_errno(notify_socket,
3383 "Failed to receive notification socket from the outer child: %m");
3384
3385 log_debug("Init process invoked as PID "PID_FMT, *pid);
3386
3387 if (arg_userns_mode != USER_NAMESPACE_NO) {
3388 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3389 log_error("Child died too early.");
3390 return -ESRCH;
3391 }
3392
3393 r = setup_uid_map(*pid);
3394 if (r < 0)
3395 return r;
3396
3397 (void) barrier_place(&barrier); /* #2 */
3398 }
3399
3400 if (arg_private_network) {
3401
3402 r = move_network_interfaces(*pid, arg_network_interfaces);
3403 if (r < 0)
3404 return r;
3405
3406 if (arg_network_veth) {
3407 r = setup_veth(arg_machine, *pid, veth_name,
3408 arg_network_bridge || arg_network_zone);
3409 if (r < 0)
3410 return r;
3411 else if (r > 0)
3412 ifi = r;
3413
3414 if (arg_network_bridge) {
3415 /* Add the interface to a bridge */
3416 r = setup_bridge(veth_name, arg_network_bridge, false);
3417 if (r < 0)
3418 return r;
3419 if (r > 0)
3420 ifi = r;
3421 } else if (arg_network_zone) {
3422 /* Add the interface to a bridge, possibly creating it */
3423 r = setup_bridge(veth_name, arg_network_zone, true);
3424 if (r < 0)
3425 return r;
3426 if (r > 0)
3427 ifi = r;
3428 }
3429 }
3430
3431 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3432 if (r < 0)
3433 return r;
3434
3435 /* We created the primary and extra veth links now; let's remember this, so that we know to
3436 remove them later on. Note that we don't bother with removing veth links that were created
3437 here when their setup failed half-way, because in that case the kernel should be able to
3438 remove them on its own, since they cannot be referenced by anything yet. */
3439 *veth_created = true;
3440
3441 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3442 if (r < 0)
3443 return r;
3444
3445 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3446 if (r < 0)
3447 return r;
3448 }
3449
3450 if (arg_register) {
3451 r = register_machine(
3452 arg_machine,
3453 *pid,
3454 arg_directory,
3455 arg_uuid,
3456 ifi,
3457 arg_slice,
3458 arg_custom_mounts, arg_n_custom_mounts,
3459 arg_kill_signal,
3460 arg_property,
3461 arg_keep_unit,
3462 arg_container_service_name);
3463 if (r < 0)
3464 return r;
3465 } else if (!arg_keep_unit) {
3466 r = allocate_scope(
3467 arg_machine,
3468 *pid,
3469 arg_slice,
3470 arg_custom_mounts, arg_n_custom_mounts,
3471 arg_kill_signal,
3472 arg_property);
3473 if (r < 0)
3474 return r;
3475
3476 } else if (arg_slice || arg_property)
3477 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
3478
3479 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
3480 if (r < 0)
3481 return r;
3482
3483 if (arg_keep_unit) {
3484 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3485 if (r < 0)
3486 return r;
3487 }
3488
3489 r = chown_cgroup(*pid, arg_uid_shift);
3490 if (r < 0)
3491 return r;
3492
3493 /* Notify the child that the parent is ready with all
3494 * its setup (including cgroup-ification), and that
3495 * the child can now hand over control to the code to
3496 * run inside the container. */
3497 (void) barrier_place(&barrier); /* #3 */
3498
3499 /* Block SIGCHLD here, before notifying child.
3500 * process_pty() will handle it with the other signals. */
3501 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3502
3503 /* Reset signal to default */
3504 r = default_signals(SIGCHLD, -1);
3505 if (r < 0)
3506 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3507
3508 r = sd_event_new(&event);
3509 if (r < 0)
3510 return log_error_errno(r, "Failed to get default event source: %m");
3511
3512 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
3513 if (r < 0)
3514 return r;
3515
3516 /* Let the child know that we are ready and wait that the child is completely ready now. */
3517 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3518 log_error("Child died too early.");
3519 return -ESRCH;
3520 }
3521
3522 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3523 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3524 etc_passwd_lock = safe_close(etc_passwd_lock);
3525
3526 sd_notifyf(false,
3527 "STATUS=Container running.\n"
3528 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3529 if (!arg_notify_ready)
3530 sd_notify(false, "READY=1\n");
3531
3532 if (arg_kill_signal > 0) {
3533 /* Try to kill the init system on SIGINT or SIGTERM */
3534 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3535 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3536 } else {
3537 /* Immediately exit */
3538 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3539 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3540 }
3541
3542 /* Exit when the child exits */
3543 sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
3544
3545 if (arg_expose_ports) {
3546 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3547 if (r < 0)
3548 return r;
3549
3550 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3551 }
3552
3553 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3554
3555 r = pty_forward_new(event, master,
3556 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3557 &forward);
3558 if (r < 0)
3559 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3560
3561 r = sd_event_loop(event);
3562 if (r < 0)
3563 return log_error_errno(r, "Failed to run event loop: %m");
3564
3565 pty_forward_get_last_char(forward, &last_char);
3566
3567 forward = pty_forward_free(forward);
3568
3569 if (!arg_quiet && last_char != '\n')
3570 putc('\n', stdout);
3571
3572 /* Kill if it is not dead yet anyway */
3573 if (arg_register && !arg_keep_unit)
3574 terminate_machine(*pid);
3575
3576 /* Normally redundant, but better safe than sorry */
3577 (void) kill(*pid, SIGKILL);
3578
3579 r = wait_for_container(*pid, &container_status);
3580 *pid = 0;
3581
3582 if (r < 0)
3583 /* We failed to wait for the container, or the container exited abnormally. */
3584 return r;
3585 if (r > 0 || container_status == CONTAINER_TERMINATED) {
3586 /* r > 0 → The container exited with a non-zero status.
3587 * As a special case, we need to replace 133 with a different value,
3588 * because 133 is special-cased in the service file to reboot the container.
3589 * otherwise → The container exited with zero status and a reboot was not requested.
3590 */
3591 if (r == EXIT_FORCE_RESTART)
3592 r = EXIT_FAILURE; /* replace 133 with the general failure code */
3593 *ret = r;
3594 return 0; /* finito */
3595 }
3596
3597 /* CONTAINER_REBOOTED, loop again */
3598
3599 if (arg_keep_unit) {
3600 /* Special handling if we are running as a service: instead of simply
3601 * restarting the machine we want to restart the entire service, so let's
3602 * inform systemd about this with the special exit code 133. The service
3603 * file uses RestartForceExitStatus=133 so that this results in a full
3604 * nspawn restart. This is necessary since we might have cgroup parameters
3605 * set we want to have flushed out. */
3606 *ret = EXIT_FORCE_RESTART;
3607 return 0; /* finito */
3608 }
3609
3610 expose_port_flush(arg_expose_ports, exposed);
3611
3612 (void) remove_veth_links(veth_name, arg_network_veth_extra);
3613 *veth_created = false;
3614 return 1; /* loop again */
3615 }
3616
3617 int main(int argc, char *argv[]) {
3618
3619 _cleanup_free_ char *console = NULL;
3620 _cleanup_close_ int master = -1;
3621 _cleanup_fdset_free_ FDSet *fds = NULL;
3622 int r, n_fd_passed, ret = EXIT_SUCCESS;
3623 char veth_name[IFNAMSIZ] = "";
3624 bool secondary = false, remove_directory = false, remove_image = false;
3625 pid_t pid = 0;
3626 union in_addr_union exposed = {};
3627 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3628 bool interactive, veth_created = false, remove_tmprootdir = false;
3629 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
3630 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
3631 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
3632 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
3633
3634 log_parse_environment();
3635 log_open();
3636
3637 /* Make sure rename_process() in the stub init process can work */
3638 saved_argv = argv;
3639 saved_argc = argc;
3640
3641 r = parse_argv(argc, argv);
3642 if (r <= 0)
3643 goto finish;
3644
3645 if (geteuid() != 0) {
3646 log_error("Need to be root.");
3647 r = -EPERM;
3648 goto finish;
3649 }
3650 r = determine_names();
3651 if (r < 0)
3652 goto finish;
3653
3654 r = load_settings();
3655 if (r < 0)
3656 goto finish;
3657
3658 r = verify_arguments();
3659 if (r < 0)
3660 goto finish;
3661
3662 n_fd_passed = sd_listen_fds(false);
3663 if (n_fd_passed > 0) {
3664 r = fdset_new_listen_fds(&fds, false);
3665 if (r < 0) {
3666 log_error_errno(r, "Failed to collect file descriptors: %m");
3667 goto finish;
3668 }
3669 }
3670
3671 if (arg_directory) {
3672 assert(!arg_image);
3673
3674 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3675 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3676 r = -EINVAL;
3677 goto finish;
3678 }
3679
3680 if (arg_ephemeral) {
3681 _cleanup_free_ char *np = NULL;
3682
3683 r = chase_symlinks_and_update(&arg_directory, 0);
3684 if (r < 0)
3685 goto finish;
3686
3687 /* If the specified path is a mount point we
3688 * generate the new snapshot immediately
3689 * inside it under a random name. However if
3690 * the specified is not a mount point we
3691 * create the new snapshot in the parent
3692 * directory, just next to it. */
3693 r = path_is_mount_point(arg_directory, NULL, 0);
3694 if (r < 0) {
3695 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3696 goto finish;
3697 }
3698 if (r > 0)
3699 r = tempfn_random_child(arg_directory, "machine.", &np);
3700 else
3701 r = tempfn_random(arg_directory, "machine.", &np);
3702 if (r < 0) {
3703 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
3704 goto finish;
3705 }
3706
3707 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3708 if (r < 0) {
3709 log_error_errno(r, "Failed to lock %s: %m", np);
3710 goto finish;
3711 }
3712
3713 r = btrfs_subvol_snapshot(arg_directory, np,
3714 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3715 BTRFS_SNAPSHOT_FALLBACK_COPY |
3716 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3717 BTRFS_SNAPSHOT_RECURSIVE |
3718 BTRFS_SNAPSHOT_QUOTA);
3719 if (r < 0) {
3720 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3721 goto finish;
3722 }
3723
3724 free(arg_directory);
3725 arg_directory = np;
3726 np = NULL;
3727
3728 remove_directory = true;
3729
3730 } else {
3731 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
3732 if (r < 0)
3733 goto finish;
3734
3735 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3736 if (r == -EBUSY) {
3737 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3738 goto finish;
3739 }
3740 if (r < 0) {
3741 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3742 goto finish;
3743 }
3744
3745 if (arg_template) {
3746 r = chase_symlinks_and_update(&arg_template, 0);
3747 if (r < 0)
3748 goto finish;
3749
3750 r = btrfs_subvol_snapshot(arg_template, arg_directory,
3751 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3752 BTRFS_SNAPSHOT_FALLBACK_COPY |
3753 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3754 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
3755 BTRFS_SNAPSHOT_RECURSIVE |
3756 BTRFS_SNAPSHOT_QUOTA);
3757 if (r == -EEXIST) {
3758 if (!arg_quiet)
3759 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3760 } else if (r < 0) {
3761 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3762 goto finish;
3763 } else {
3764 if (!arg_quiet)
3765 log_info("Populated %s from template %s.", arg_directory, arg_template);
3766 }
3767 }
3768 }
3769
3770 if (arg_start_mode == START_BOOT) {
3771 if (path_is_os_tree(arg_directory) <= 0) {
3772 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3773 r = -EINVAL;
3774 goto finish;
3775 }
3776 } else {
3777 const char *p;
3778
3779 p = strjoina(arg_directory, "/usr/");
3780 if (laccess(p, F_OK) < 0) {
3781 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3782 r = -EINVAL;
3783 goto finish;
3784 }
3785 }
3786
3787 } else {
3788 assert(arg_image);
3789 assert(!arg_template);
3790
3791 r = chase_symlinks_and_update(&arg_image, 0);
3792 if (r < 0)
3793 goto finish;
3794
3795 if (arg_ephemeral) {
3796 _cleanup_free_ char *np = NULL;
3797
3798 r = tempfn_random(arg_image, "machine.", &np);
3799 if (r < 0) {
3800 log_error_errno(r, "Failed to generate name for image snapshot: %m");
3801 goto finish;
3802 }
3803
3804 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3805 if (r < 0) {
3806 r = log_error_errno(r, "Failed to create image lock: %m");
3807 goto finish;
3808 }
3809
3810 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
3811 if (r < 0) {
3812 r = log_error_errno(r, "Failed to copy image file: %m");
3813 goto finish;
3814 }
3815
3816 free(arg_image);
3817 arg_image = np;
3818 np = NULL;
3819
3820 remove_image = true;
3821 } else {
3822 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3823 if (r == -EBUSY) {
3824 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3825 goto finish;
3826 }
3827 if (r < 0) {
3828 r = log_error_errno(r, "Failed to create image lock: %m");
3829 goto finish;
3830 }
3831
3832 if (!arg_root_hash) {
3833 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
3834 if (r < 0) {
3835 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
3836 goto finish;
3837 }
3838 }
3839 }
3840
3841 if (!mkdtemp(tmprootdir)) {
3842 r = log_error_errno(errno, "Failed to create temporary directory: %m");
3843 goto finish;
3844 }
3845
3846 remove_tmprootdir = true;
3847
3848 arg_directory = strdup(tmprootdir);
3849 if (!arg_directory) {
3850 r = log_oom();
3851 goto finish;
3852 }
3853
3854 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
3855 if (r < 0) {
3856 log_error_errno(r, "Failed to set up loopback block device: %m");
3857 goto finish;
3858 }
3859
3860 r = dissect_image(
3861 loop->fd,
3862 arg_root_hash, arg_root_hash_size,
3863 DISSECT_IMAGE_REQUIRE_ROOT,
3864 &dissected_image);
3865 if (r == -ENOPKG) {
3866 log_error_errno(r, "Could not find a suitable file system or partition table in image: %s", arg_image);
3867
3868 log_notice("Note that the disk image needs to\n"
3869 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
3870 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
3871 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
3872 " d) or contain a file system without a partition table\n"
3873 "in order to be bootable with systemd-nspawn.");
3874 goto finish;
3875 }
3876 if (r == -EADDRNOTAVAIL) {
3877 log_error_errno(r, "No root partition for specified root hash found.");
3878 goto finish;
3879 }
3880 if (r == -EOPNOTSUPP) {
3881 log_error_errno(r, "--image= is not supported, compiled without blkid support.");
3882 goto finish;
3883 }
3884 if (r == -EPROTONOSUPPORT) {
3885 log_error_errno(r, "Device is loopback block device with partition scanning turned off, please turn it on.");
3886 goto finish;
3887 }
3888 if (r < 0) {
3889 log_error_errno(r, "Failed to dissect image: %m");
3890 goto finish;
3891 }
3892
3893 if (!arg_root_hash && dissected_image->can_verity)
3894 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
3895
3896 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
3897 if (r < 0)
3898 goto finish;
3899
3900 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
3901 if (remove_image && unlink(arg_image) >= 0)
3902 remove_image = false;
3903 }
3904
3905 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
3906 if (r < 0)
3907 goto finish;
3908
3909 r = detect_unified_cgroup_hierarchy(arg_directory);
3910 if (r < 0)
3911 goto finish;
3912
3913 interactive =
3914 isatty(STDIN_FILENO) > 0 &&
3915 isatty(STDOUT_FILENO) > 0;
3916
3917 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3918 if (master < 0) {
3919 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3920 goto finish;
3921 }
3922
3923 r = ptsname_malloc(master, &console);
3924 if (r < 0) {
3925 r = log_error_errno(r, "Failed to determine tty name: %m");
3926 goto finish;
3927 }
3928
3929 if (arg_selinux_apifs_context) {
3930 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3931 if (r < 0)
3932 goto finish;
3933 }
3934
3935 if (unlockpt(master) < 0) {
3936 r = log_error_errno(errno, "Failed to unlock tty: %m");
3937 goto finish;
3938 }
3939
3940 if (!arg_quiet)
3941 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3942 arg_machine, arg_image ?: arg_directory);
3943
3944 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3945
3946 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3947 r = log_error_errno(errno, "Failed to become subreaper: %m");
3948 goto finish;
3949 }
3950
3951 for (;;) {
3952 r = run(master,
3953 console,
3954 dissected_image,
3955 interactive, secondary,
3956 fds,
3957 veth_name, &veth_created,
3958 &exposed,
3959 &pid, &ret);
3960 if (r <= 0)
3961 break;
3962 }
3963
3964 finish:
3965 sd_notify(false,
3966 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
3967 "STOPPING=1\nSTATUS=Terminating...");
3968
3969 if (pid > 0)
3970 (void) kill(pid, SIGKILL);
3971
3972 /* Try to flush whatever is still queued in the pty */
3973 if (master >= 0) {
3974 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
3975 master = safe_close(master);
3976 }
3977
3978 if (pid > 0)
3979 (void) wait_for_terminate(pid, NULL);
3980
3981 if (remove_directory && arg_directory) {
3982 int k;
3983
3984 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
3985 if (k < 0)
3986 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
3987 }
3988
3989 if (remove_image && arg_image) {
3990 if (unlink(arg_image) < 0)
3991 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
3992 }
3993
3994 if (remove_tmprootdir) {
3995 if (rmdir(tmprootdir) < 0)
3996 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
3997 }
3998
3999 if (arg_machine) {
4000 const char *p;
4001
4002 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4003 (void) rm_rf(p, REMOVE_ROOT);
4004 }
4005
4006 expose_port_flush(arg_expose_ports, &exposed);
4007
4008 if (veth_created)
4009 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4010 (void) remove_bridge(arg_network_zone);
4011
4012 free(arg_directory);
4013 free(arg_template);
4014 free(arg_image);
4015 free(arg_machine);
4016 free(arg_user);
4017 free(arg_pivot_root_new);
4018 free(arg_pivot_root_old);
4019 free(arg_chdir);
4020 strv_free(arg_setenv);
4021 free(arg_network_bridge);
4022 strv_free(arg_network_interfaces);
4023 strv_free(arg_network_macvlan);
4024 strv_free(arg_network_ipvlan);
4025 strv_free(arg_network_veth_extra);
4026 strv_free(arg_parameters);
4027 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4028 expose_port_free_all(arg_expose_ports);
4029 free(arg_root_hash);
4030
4031 return r < 0 ? EXIT_FAILURE : ret;
4032 }