]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
dissect: when we invoke dissection on a loop device with partscan help the user
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #if HAVE_BLKID
21 #include <blkid.h>
22 #endif
23 #include <errno.h>
24 #include <getopt.h>
25 #include <grp.h>
26 #include <linux/loop.h>
27 #include <pwd.h>
28 #include <sched.h>
29 #if HAVE_SELINUX
30 #include <selinux/selinux.h>
31 #endif
32 #include <signal.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <sys/file.h>
37 #include <sys/mount.h>
38 #include <sys/personality.h>
39 #include <sys/prctl.h>
40 #include <sys/types.h>
41 #include <sys/wait.h>
42 #include <unistd.h>
43
44 #include "sd-bus.h"
45 #include "sd-daemon.h"
46 #include "sd-id128.h"
47
48 #include "alloc-util.h"
49 #include "barrier.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
53 #include "bus-util.h"
54 #include "cap-list.h"
55 #include "capability-util.h"
56 #include "cgroup-util.h"
57 #include "copy.h"
58 #include "dev-setup.h"
59 #include "dissect-image.h"
60 #include "env-util.h"
61 #include "fd-util.h"
62 #include "fdset.h"
63 #include "fileio.h"
64 #include "format-util.h"
65 #include "fs-util.h"
66 #include "gpt.h"
67 #include "hexdecoct.h"
68 #include "hostname-util.h"
69 #include "id128-util.h"
70 #include "log.h"
71 #include "loop-util.h"
72 #include "loopback-setup.h"
73 #include "machine-image.h"
74 #include "macro.h"
75 #include "missing.h"
76 #include "mkdir.h"
77 #include "mount-util.h"
78 #include "netlink-util.h"
79 #include "nspawn-cgroup.h"
80 #include "nspawn-expose-ports.h"
81 #include "nspawn-mount.h"
82 #include "nspawn-network.h"
83 #include "nspawn-patch-uid.h"
84 #include "nspawn-register.h"
85 #include "nspawn-seccomp.h"
86 #include "nspawn-settings.h"
87 #include "nspawn-setuid.h"
88 #include "nspawn-stub-pid1.h"
89 #include "parse-util.h"
90 #include "path-util.h"
91 #include "process-util.h"
92 #include "ptyfwd.h"
93 #include "random-util.h"
94 #include "raw-clone.h"
95 #include "rm-rf.h"
96 #include "selinux-util.h"
97 #include "signal-util.h"
98 #include "socket-util.h"
99 #include "stat-util.h"
100 #include "stdio-util.h"
101 #include "string-util.h"
102 #include "strv.h"
103 #include "terminal-util.h"
104 #include "udev-util.h"
105 #include "umask-util.h"
106 #include "user-util.h"
107 #include "util.h"
108
/* devpts parses the value of its gid= parameter as a signed number, so automatically picked UID shifts stay out
 * of the upper half of the 32bit UID range. A little room is left at the bottom and a lot at the top, so that
 * other subsystems can carve out allocation ranges of their own. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))

/* Path of the socket nspawn listens on, expressed relative to the container's root. The init process inside the
 * container can send messages to nspawn through it, following the sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"

/* NOTE(review): presumably an exit status that asks for the container to be restarted; the code consuming it is
 * not part of this excerpt, confirm there. */
#define EXIT_FORCE_RESTART 133
121
/* Ways in which a container run can end. NOTE(review): the code that produces and consumes these values is
 * outside this excerpt; confirm the exact semantics there. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
126
/* Journal linking modes selectable with --link-journal=. The "try-guest" and "try-host" spellings map to
 * LINK_GUEST and LINK_HOST too, and additionally set arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* --link-journal=no */
        LINK_AUTO  = 1,   /* --link-journal=auto, also the initial value of arg_link_journal */
        LINK_HOST  = 2,   /* --link-journal=host and try-host */
        LINK_GUEST = 3,   /* --link-journal=guest and try-guest, also -j */
} LinkJournal;
133
134 static char *arg_directory = NULL;
135 static char *arg_template = NULL;
136 static char *arg_chdir = NULL;
137 static char *arg_pivot_root_new = NULL;
138 static char *arg_pivot_root_old = NULL;
139 static char *arg_user = NULL;
140 static sd_id128_t arg_uuid = {};
141 static char *arg_machine = NULL;
142 static const char *arg_selinux_context = NULL;
143 static const char *arg_selinux_apifs_context = NULL;
144 static const char *arg_slice = NULL;
145 static bool arg_private_network = false;
146 static bool arg_read_only = false;
147 static StartMode arg_start_mode = START_PID1;
148 static bool arg_ephemeral = false;
149 static LinkJournal arg_link_journal = LINK_AUTO;
150 static bool arg_link_journal_try = false;
151 static uint64_t arg_caps_retain =
152 (1ULL << CAP_AUDIT_CONTROL) |
153 (1ULL << CAP_AUDIT_WRITE) |
154 (1ULL << CAP_CHOWN) |
155 (1ULL << CAP_DAC_OVERRIDE) |
156 (1ULL << CAP_DAC_READ_SEARCH) |
157 (1ULL << CAP_FOWNER) |
158 (1ULL << CAP_FSETID) |
159 (1ULL << CAP_IPC_OWNER) |
160 (1ULL << CAP_KILL) |
161 (1ULL << CAP_LEASE) |
162 (1ULL << CAP_LINUX_IMMUTABLE) |
163 (1ULL << CAP_MKNOD) |
164 (1ULL << CAP_NET_BIND_SERVICE) |
165 (1ULL << CAP_NET_BROADCAST) |
166 (1ULL << CAP_NET_RAW) |
167 (1ULL << CAP_SETFCAP) |
168 (1ULL << CAP_SETGID) |
169 (1ULL << CAP_SETPCAP) |
170 (1ULL << CAP_SETUID) |
171 (1ULL << CAP_SYS_ADMIN) |
172 (1ULL << CAP_SYS_BOOT) |
173 (1ULL << CAP_SYS_CHROOT) |
174 (1ULL << CAP_SYS_NICE) |
175 (1ULL << CAP_SYS_PTRACE) |
176 (1ULL << CAP_SYS_RESOURCE) |
177 (1ULL << CAP_SYS_TTY_CONFIG);
178 static CustomMount *arg_custom_mounts = NULL;
179 static unsigned arg_n_custom_mounts = 0;
180 static char **arg_setenv = NULL;
181 static bool arg_quiet = false;
182 static bool arg_register = true;
183 static bool arg_keep_unit = false;
184 static char **arg_network_interfaces = NULL;
185 static char **arg_network_macvlan = NULL;
186 static char **arg_network_ipvlan = NULL;
187 static bool arg_network_veth = false;
188 static char **arg_network_veth_extra = NULL;
189 static char *arg_network_bridge = NULL;
190 static char *arg_network_zone = NULL;
191 static unsigned long arg_personality = PERSONALITY_INVALID;
192 static char *arg_image = NULL;
193 static VolatileMode arg_volatile_mode = VOLATILE_NO;
194 static ExposePort *arg_expose_ports = NULL;
195 static char **arg_property = NULL;
196 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
197 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
198 static bool arg_userns_chown = false;
199 static int arg_kill_signal = 0;
200 static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
201 static SettingsMask arg_settings_mask = 0;
202 static int arg_settings_trusted = -1;
203 static char **arg_parameters = NULL;
204 static const char *arg_container_service_name = "systemd-nspawn";
205 static bool arg_notify_ready = false;
206 static bool arg_use_cgns = true;
207 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
208 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
209 static void *arg_root_hash = NULL;
210 static size_t arg_root_hash_size = 0;
211 static char **arg_syscall_whitelist = NULL;
212 static char **arg_syscall_blacklist = NULL;
213
214 static void help(void) {
215 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
216 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
217 " -h --help Show this help\n"
218 " --version Print version string\n"
219 " -q --quiet Do not show status information\n"
220 " -D --directory=PATH Root directory for the container\n"
221 " --template=PATH Initialize root directory from template directory,\n"
222 " if missing\n"
223 " -x --ephemeral Run container with snapshot of root directory, and\n"
224 " remove it after exit\n"
225 " -i --image=PATH File system device or disk image for the container\n"
226 " --root-hash=HASH Specify verity root hash\n"
227 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
228 " -b --boot Boot up full system (i.e. invoke init)\n"
229 " --chdir=PATH Set working directory in the container\n"
230 " --pivot-root=PATH[:PATH]\n"
231 " Pivot root to given directory in the container\n"
232 " -u --user=USER Run the command under specified user or uid\n"
233 " -M --machine=NAME Set the machine name for the container\n"
234 " --uuid=UUID Set a specific machine UUID for the container\n"
235 " -S --slice=SLICE Place the container in the specified slice\n"
236 " --property=NAME=VALUE Set scope unit property\n"
237 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
238 " --private-users[=UIDBASE[:NUIDS]]\n"
239 " Similar, but with user configured UID/GID range\n"
240 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
241 " --private-network Disable network in container\n"
242 " --network-interface=INTERFACE\n"
243 " Assign an existing network interface to the\n"
244 " container\n"
245 " --network-macvlan=INTERFACE\n"
246 " Create a macvlan network interface based on an\n"
247 " existing network interface to the container\n"
248 " --network-ipvlan=INTERFACE\n"
249 " Create a ipvlan network interface based on an\n"
250 " existing network interface to the container\n"
251 " -n --network-veth Add a virtual Ethernet connection between host\n"
252 " and container\n"
253 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
254 " Add an additional virtual Ethernet link between\n"
255 " host and container\n"
256 " --network-bridge=INTERFACE\n"
257 " Add a virtual Ethernet connection to the container\n"
258 " and attach it to an existing bridge on the host\n"
259 " --network-zone=NAME Similar, but attach the new interface to an\n"
260 " an automatically managed bridge interface\n"
261 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
262 " Expose a container IP port on the host\n"
263 " -Z --selinux-context=SECLABEL\n"
264 " Set the SELinux security context to be used by\n"
265 " processes in the container\n"
266 " -L --selinux-apifs-context=SECLABEL\n"
267 " Set the SELinux security context to be used by\n"
268 " API/tmpfs file systems in the container\n"
269 " --capability=CAP In addition to the default, retain specified\n"
270 " capability\n"
271 " --drop-capability=CAP Drop the specified capability from the default set\n"
272 " --system-call-filter=LIST|~LIST\n"
273 " Permit/prohibit specific system calls\n"
274 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
275 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
276 " host, try-guest, try-host\n"
277 " -j Equivalent to --link-journal=try-guest\n"
278 " --read-only Mount the root directory read-only\n"
279 " --bind=PATH[:PATH[:OPTIONS]]\n"
280 " Bind mount a file or directory from the host into\n"
281 " the container\n"
282 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
283 " Similar, but creates a read-only bind mount\n"
284 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
285 " --overlay=PATH[:PATH...]:PATH\n"
286 " Create an overlay mount from the host to \n"
287 " the container\n"
288 " --overlay-ro=PATH[:PATH...]:PATH\n"
289 " Similar, but creates a read-only overlay mount\n"
290 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
291 " --register=BOOLEAN Register container as machine\n"
292 " --keep-unit Do not register a scope for the machine, reuse\n"
293 " the service unit nspawn is running in\n"
294 " --volatile[=MODE] Run the system in volatile mode\n"
295 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
296 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
297 , program_invocation_short_name);
298 }
299
300 static int custom_mount_check_all(void) {
301 unsigned i;
302
303 for (i = 0; i < arg_n_custom_mounts; i++) {
304 CustomMount *m = &arg_custom_mounts[i];
305
306 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
307
308 if (arg_userns_chown) {
309 log_error("--private-users-chown may not be combined with custom root mounts.");
310 return -EINVAL;
311 } else if (arg_uid_shift == UID_INVALID) {
312 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
313 return -EINVAL;
314 }
315 }
316 }
317
318 return 0;
319 }
320
321 static int detect_unified_cgroup_hierarchy(const char *directory) {
322 const char *e;
323 int r;
324
325 /* Allow the user to control whether the unified hierarchy is used */
326 e = getenv("UNIFIED_CGROUP_HIERARCHY");
327 if (e) {
328 r = parse_boolean(e);
329 if (r < 0)
330 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
331 if (r > 0)
332 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
333 else
334 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
335
336 return 0;
337 }
338
339 /* Otherwise inherit the default from the host system */
340 r = cg_all_unified();
341 if (r < 0)
342 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
343 if (r > 0) {
344 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
345 * routine only detects 231, so we'll have a false negative here for 230. */
346 r = systemd_installation_has_version(directory, 230);
347 if (r < 0)
348 return log_error_errno(r, "Failed to determine systemd version in container: %m");
349 if (r > 0)
350 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
351 else
352 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
353 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
354 /* Mixed cgroup hierarchy support was added in 233 */
355 r = systemd_installation_has_version(directory, 233);
356 if (r < 0)
357 return log_error_errno(r, "Failed to determine systemd version in container: %m");
358 if (r > 0)
359 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
360 else
361 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
362 } else
363 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
364
365 return 0;
366 }
367
368 static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
369 int r;
370
371 r = getenv_bool(name);
372 if (r == -ENXIO)
373 return;
374 if (r < 0)
375 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
376 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
377 }
378
379 static void parse_mount_settings_env(void) {
380 int r;
381 const char *e;
382
383 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
384 if (!e)
385 return;
386
387 if (streq(e, "network")) {
388 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
389 return;
390 }
391
392 r = parse_boolean(e);
393 if (r < 0) {
394 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
395 return;
396 }
397
398 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
399 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
400 }
401
402 static int parse_argv(int argc, char *argv[]) {
403
404 enum {
405 ARG_VERSION = 0x100,
406 ARG_PRIVATE_NETWORK,
407 ARG_UUID,
408 ARG_READ_ONLY,
409 ARG_CAPABILITY,
410 ARG_DROP_CAPABILITY,
411 ARG_LINK_JOURNAL,
412 ARG_BIND,
413 ARG_BIND_RO,
414 ARG_TMPFS,
415 ARG_OVERLAY,
416 ARG_OVERLAY_RO,
417 ARG_SHARE_SYSTEM,
418 ARG_REGISTER,
419 ARG_KEEP_UNIT,
420 ARG_NETWORK_INTERFACE,
421 ARG_NETWORK_MACVLAN,
422 ARG_NETWORK_IPVLAN,
423 ARG_NETWORK_BRIDGE,
424 ARG_NETWORK_ZONE,
425 ARG_NETWORK_VETH_EXTRA,
426 ARG_PERSONALITY,
427 ARG_VOLATILE,
428 ARG_TEMPLATE,
429 ARG_PROPERTY,
430 ARG_PRIVATE_USERS,
431 ARG_KILL_SIGNAL,
432 ARG_SETTINGS,
433 ARG_CHDIR,
434 ARG_PIVOT_ROOT,
435 ARG_PRIVATE_USERS_CHOWN,
436 ARG_NOTIFY_READY,
437 ARG_ROOT_HASH,
438 ARG_SYSTEM_CALL_FILTER,
439 };
440
441 static const struct option options[] = {
442 { "help", no_argument, NULL, 'h' },
443 { "version", no_argument, NULL, ARG_VERSION },
444 { "directory", required_argument, NULL, 'D' },
445 { "template", required_argument, NULL, ARG_TEMPLATE },
446 { "ephemeral", no_argument, NULL, 'x' },
447 { "user", required_argument, NULL, 'u' },
448 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
449 { "as-pid2", no_argument, NULL, 'a' },
450 { "boot", no_argument, NULL, 'b' },
451 { "uuid", required_argument, NULL, ARG_UUID },
452 { "read-only", no_argument, NULL, ARG_READ_ONLY },
453 { "capability", required_argument, NULL, ARG_CAPABILITY },
454 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
455 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
456 { "bind", required_argument, NULL, ARG_BIND },
457 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
458 { "tmpfs", required_argument, NULL, ARG_TMPFS },
459 { "overlay", required_argument, NULL, ARG_OVERLAY },
460 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
461 { "machine", required_argument, NULL, 'M' },
462 { "slice", required_argument, NULL, 'S' },
463 { "setenv", required_argument, NULL, 'E' },
464 { "selinux-context", required_argument, NULL, 'Z' },
465 { "selinux-apifs-context", required_argument, NULL, 'L' },
466 { "quiet", no_argument, NULL, 'q' },
467 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
468 { "register", required_argument, NULL, ARG_REGISTER },
469 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
470 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
471 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
472 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
473 { "network-veth", no_argument, NULL, 'n' },
474 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
475 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
476 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
477 { "personality", required_argument, NULL, ARG_PERSONALITY },
478 { "image", required_argument, NULL, 'i' },
479 { "volatile", optional_argument, NULL, ARG_VOLATILE },
480 { "port", required_argument, NULL, 'p' },
481 { "property", required_argument, NULL, ARG_PROPERTY },
482 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
483 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
484 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
485 { "settings", required_argument, NULL, ARG_SETTINGS },
486 { "chdir", required_argument, NULL, ARG_CHDIR },
487 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
488 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
489 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
490 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
491 {}
492 };
493
494 int c, r;
495 const char *p, *e;
496 uint64_t plus = 0, minus = 0;
497 bool mask_all_settings = false, mask_no_settings = false;
498
499 assert(argc >= 0);
500 assert(argv);
501
502 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
503
504 switch (c) {
505
506 case 'h':
507 help();
508 return 0;
509
510 case ARG_VERSION:
511 return version();
512
513 case 'D':
514 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
515 if (r < 0)
516 return r;
517 break;
518
519 case ARG_TEMPLATE:
520 r = parse_path_argument_and_warn(optarg, false, &arg_template);
521 if (r < 0)
522 return r;
523 break;
524
525 case 'i':
526 r = parse_path_argument_and_warn(optarg, false, &arg_image);
527 if (r < 0)
528 return r;
529 break;
530
531 case 'x':
532 arg_ephemeral = true;
533 break;
534
535 case 'u':
536 r = free_and_strdup(&arg_user, optarg);
537 if (r < 0)
538 return log_oom();
539
540 arg_settings_mask |= SETTING_USER;
541 break;
542
543 case ARG_NETWORK_ZONE: {
544 char *j;
545
546 j = strappend("vz-", optarg);
547 if (!j)
548 return log_oom();
549
550 if (!ifname_valid(j)) {
551 log_error("Network zone name not valid: %s", j);
552 free(j);
553 return -EINVAL;
554 }
555
556 free(arg_network_zone);
557 arg_network_zone = j;
558
559 arg_network_veth = true;
560 arg_private_network = true;
561 arg_settings_mask |= SETTING_NETWORK;
562 break;
563 }
564
565 case ARG_NETWORK_BRIDGE:
566
567 if (!ifname_valid(optarg)) {
568 log_error("Bridge interface name not valid: %s", optarg);
569 return -EINVAL;
570 }
571
572 r = free_and_strdup(&arg_network_bridge, optarg);
573 if (r < 0)
574 return log_oom();
575
576 /* fall through */
577
578 case 'n':
579 arg_network_veth = true;
580 arg_private_network = true;
581 arg_settings_mask |= SETTING_NETWORK;
582 break;
583
584 case ARG_NETWORK_VETH_EXTRA:
585 r = veth_extra_parse(&arg_network_veth_extra, optarg);
586 if (r < 0)
587 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
588
589 arg_private_network = true;
590 arg_settings_mask |= SETTING_NETWORK;
591 break;
592
593 case ARG_NETWORK_INTERFACE:
594
595 if (!ifname_valid(optarg)) {
596 log_error("Network interface name not valid: %s", optarg);
597 return -EINVAL;
598 }
599
600 if (strv_extend(&arg_network_interfaces, optarg) < 0)
601 return log_oom();
602
603 arg_private_network = true;
604 arg_settings_mask |= SETTING_NETWORK;
605 break;
606
607 case ARG_NETWORK_MACVLAN:
608
609 if (!ifname_valid(optarg)) {
610 log_error("MACVLAN network interface name not valid: %s", optarg);
611 return -EINVAL;
612 }
613
614 if (strv_extend(&arg_network_macvlan, optarg) < 0)
615 return log_oom();
616
617 arg_private_network = true;
618 arg_settings_mask |= SETTING_NETWORK;
619 break;
620
621 case ARG_NETWORK_IPVLAN:
622
623 if (!ifname_valid(optarg)) {
624 log_error("IPVLAN network interface name not valid: %s", optarg);
625 return -EINVAL;
626 }
627
628 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
629 return log_oom();
630
631 /* fall through */
632
633 case ARG_PRIVATE_NETWORK:
634 arg_private_network = true;
635 arg_settings_mask |= SETTING_NETWORK;
636 break;
637
638 case 'b':
639 if (arg_start_mode == START_PID2) {
640 log_error("--boot and --as-pid2 may not be combined.");
641 return -EINVAL;
642 }
643
644 arg_start_mode = START_BOOT;
645 arg_settings_mask |= SETTING_START_MODE;
646 break;
647
648 case 'a':
649 if (arg_start_mode == START_BOOT) {
650 log_error("--boot and --as-pid2 may not be combined.");
651 return -EINVAL;
652 }
653
654 arg_start_mode = START_PID2;
655 arg_settings_mask |= SETTING_START_MODE;
656 break;
657
658 case ARG_UUID:
659 r = sd_id128_from_string(optarg, &arg_uuid);
660 if (r < 0)
661 return log_error_errno(r, "Invalid UUID: %s", optarg);
662
663 if (sd_id128_is_null(arg_uuid)) {
664 log_error("Machine UUID may not be all zeroes.");
665 return -EINVAL;
666 }
667
668 arg_settings_mask |= SETTING_MACHINE_ID;
669 break;
670
671 case 'S':
672 arg_slice = optarg;
673 break;
674
675 case 'M':
676 if (isempty(optarg))
677 arg_machine = mfree(arg_machine);
678 else {
679 if (!machine_name_is_valid(optarg)) {
680 log_error("Invalid machine name: %s", optarg);
681 return -EINVAL;
682 }
683
684 r = free_and_strdup(&arg_machine, optarg);
685 if (r < 0)
686 return log_oom();
687 }
688 break;
689
690 case 'Z':
691 arg_selinux_context = optarg;
692 break;
693
694 case 'L':
695 arg_selinux_apifs_context = optarg;
696 break;
697
698 case ARG_READ_ONLY:
699 arg_read_only = true;
700 arg_settings_mask |= SETTING_READ_ONLY;
701 break;
702
703 case ARG_CAPABILITY:
704 case ARG_DROP_CAPABILITY: {
705 p = optarg;
706 for (;;) {
707 _cleanup_free_ char *t = NULL;
708
709 r = extract_first_word(&p, &t, ",", 0);
710 if (r < 0)
711 return log_error_errno(r, "Failed to parse capability %s.", t);
712
713 if (r == 0)
714 break;
715
716 if (streq(t, "all")) {
717 if (c == ARG_CAPABILITY)
718 plus = (uint64_t) -1;
719 else
720 minus = (uint64_t) -1;
721 } else {
722 int cap;
723
724 cap = capability_from_name(t);
725 if (cap < 0) {
726 log_error("Failed to parse capability %s.", t);
727 return -EINVAL;
728 }
729
730 if (c == ARG_CAPABILITY)
731 plus |= 1ULL << (uint64_t) cap;
732 else
733 minus |= 1ULL << (uint64_t) cap;
734 }
735 }
736
737 arg_settings_mask |= SETTING_CAPABILITY;
738 break;
739 }
740
741 case 'j':
742 arg_link_journal = LINK_GUEST;
743 arg_link_journal_try = true;
744 break;
745
746 case ARG_LINK_JOURNAL:
747 if (streq(optarg, "auto")) {
748 arg_link_journal = LINK_AUTO;
749 arg_link_journal_try = false;
750 } else if (streq(optarg, "no")) {
751 arg_link_journal = LINK_NO;
752 arg_link_journal_try = false;
753 } else if (streq(optarg, "guest")) {
754 arg_link_journal = LINK_GUEST;
755 arg_link_journal_try = false;
756 } else if (streq(optarg, "host")) {
757 arg_link_journal = LINK_HOST;
758 arg_link_journal_try = false;
759 } else if (streq(optarg, "try-guest")) {
760 arg_link_journal = LINK_GUEST;
761 arg_link_journal_try = true;
762 } else if (streq(optarg, "try-host")) {
763 arg_link_journal = LINK_HOST;
764 arg_link_journal_try = true;
765 } else {
766 log_error("Failed to parse link journal mode %s", optarg);
767 return -EINVAL;
768 }
769
770 break;
771
772 case ARG_BIND:
773 case ARG_BIND_RO:
774 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
775 if (r < 0)
776 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
777
778 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
779 break;
780
781 case ARG_TMPFS:
782 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
783 if (r < 0)
784 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
785
786 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
787 break;
788
789 case ARG_OVERLAY:
790 case ARG_OVERLAY_RO:
791 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
792 if (r == -EADDRNOTAVAIL)
793 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
794 if (r < 0)
795 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
796
797 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
798 break;
799
800 case 'E': {
801 char **n;
802
803 if (!env_assignment_is_valid(optarg)) {
804 log_error("Environment variable assignment '%s' is not valid.", optarg);
805 return -EINVAL;
806 }
807
808 n = strv_env_set(arg_setenv, optarg);
809 if (!n)
810 return log_oom();
811
812 strv_free(arg_setenv);
813 arg_setenv = n;
814
815 arg_settings_mask |= SETTING_ENVIRONMENT;
816 break;
817 }
818
819 case 'q':
820 arg_quiet = true;
821 break;
822
823 case ARG_SHARE_SYSTEM:
824 /* We don't officially support this anymore, except for compat reasons. People should use the
825 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
826 arg_clone_ns_flags = 0;
827 break;
828
829 case ARG_REGISTER:
830 r = parse_boolean(optarg);
831 if (r < 0) {
832 log_error("Failed to parse --register= argument: %s", optarg);
833 return r;
834 }
835
836 arg_register = r;
837 break;
838
839 case ARG_KEEP_UNIT:
840 arg_keep_unit = true;
841 break;
842
843 case ARG_PERSONALITY:
844
845 arg_personality = personality_from_string(optarg);
846 if (arg_personality == PERSONALITY_INVALID) {
847 log_error("Unknown or unsupported personality '%s'.", optarg);
848 return -EINVAL;
849 }
850
851 arg_settings_mask |= SETTING_PERSONALITY;
852 break;
853
854 case ARG_VOLATILE:
855
856 if (!optarg)
857 arg_volatile_mode = VOLATILE_YES;
858 else {
859 VolatileMode m;
860
861 m = volatile_mode_from_string(optarg);
862 if (m < 0) {
863 log_error("Failed to parse --volatile= argument: %s", optarg);
864 return -EINVAL;
865 } else
866 arg_volatile_mode = m;
867 }
868
869 arg_settings_mask |= SETTING_VOLATILE_MODE;
870 break;
871
872 case 'p':
873 r = expose_port_parse(&arg_expose_ports, optarg);
874 if (r == -EEXIST)
875 return log_error_errno(r, "Duplicate port specification: %s", optarg);
876 if (r < 0)
877 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
878
879 arg_settings_mask |= SETTING_EXPOSE_PORTS;
880 break;
881
882 case ARG_PROPERTY:
883 if (strv_extend(&arg_property, optarg) < 0)
884 return log_oom();
885
886 break;
887
888 case ARG_PRIVATE_USERS: {
889 int boolean = -1;
890
891 if (!optarg)
892 boolean = true;
893 else if (!in_charset(optarg, DIGITS))
894 /* do *not* parse numbers as booleans */
895 boolean = parse_boolean(optarg);
896
897 if (boolean == false) {
898 /* no: User namespacing off */
899 arg_userns_mode = USER_NAMESPACE_NO;
900 arg_uid_shift = UID_INVALID;
901 arg_uid_range = UINT32_C(0x10000);
902 } else if (boolean == true) {
903 /* yes: User namespacing on, UID range is read from root dir */
904 arg_userns_mode = USER_NAMESPACE_FIXED;
905 arg_uid_shift = UID_INVALID;
906 arg_uid_range = UINT32_C(0x10000);
907 } else if (streq(optarg, "pick")) {
908 /* pick: User namespacing on, UID range is picked randomly */
909 arg_userns_mode = USER_NAMESPACE_PICK;
910 arg_uid_shift = UID_INVALID;
911 arg_uid_range = UINT32_C(0x10000);
912 } else {
913 _cleanup_free_ char *buffer = NULL;
914 const char *range, *shift;
915
916 /* anything else: User namespacing on, UID range is explicitly configured */
917
918 range = strchr(optarg, ':');
919 if (range) {
920 buffer = strndup(optarg, range - optarg);
921 if (!buffer)
922 return log_oom();
923 shift = buffer;
924
925 range++;
926 r = safe_atou32(range, &arg_uid_range);
927 if (r < 0)
928 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
929 } else
930 shift = optarg;
931
932 r = parse_uid(shift, &arg_uid_shift);
933 if (r < 0)
934 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
935
936 arg_userns_mode = USER_NAMESPACE_FIXED;
937 }
938
939 if (arg_uid_range <= 0) {
940 log_error("UID range cannot be 0.");
941 return -EINVAL;
942 }
943
944 arg_settings_mask |= SETTING_USERNS;
945 break;
946 }
947
948 case 'U':
949 if (userns_supported()) {
950 arg_userns_mode = USER_NAMESPACE_PICK;
951 arg_uid_shift = UID_INVALID;
952 arg_uid_range = UINT32_C(0x10000);
953
954 arg_settings_mask |= SETTING_USERNS;
955 }
956
957 break;
958
959 case ARG_PRIVATE_USERS_CHOWN:
960 arg_userns_chown = true;
961
962 arg_settings_mask |= SETTING_USERNS;
963 break;
964
965 case ARG_KILL_SIGNAL:
966 arg_kill_signal = signal_from_string_try_harder(optarg);
967 if (arg_kill_signal < 0) {
968 log_error("Cannot parse signal: %s", optarg);
969 return -EINVAL;
970 }
971
972 arg_settings_mask |= SETTING_KILL_SIGNAL;
973 break;
974
975 case ARG_SETTINGS:
976
977 /* no → do not read files
978 * yes → read files, do not override cmdline, trust only subset
979 * override → read files, override cmdline, trust only subset
980 * trusted → read files, do not override cmdline, trust all
981 */
982
983 r = parse_boolean(optarg);
984 if (r < 0) {
985 if (streq(optarg, "trusted")) {
986 mask_all_settings = false;
987 mask_no_settings = false;
988 arg_settings_trusted = true;
989
990 } else if (streq(optarg, "override")) {
991 mask_all_settings = false;
992 mask_no_settings = true;
993 arg_settings_trusted = -1;
994 } else
995 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
996 } else if (r > 0) {
997 /* yes */
998 mask_all_settings = false;
999 mask_no_settings = false;
1000 arg_settings_trusted = -1;
1001 } else {
1002 /* no */
1003 mask_all_settings = true;
1004 mask_no_settings = false;
1005 arg_settings_trusted = false;
1006 }
1007
1008 break;
1009
1010 case ARG_CHDIR:
1011 if (!path_is_absolute(optarg)) {
1012 log_error("Working directory %s is not an absolute path.", optarg);
1013 return -EINVAL;
1014 }
1015
1016 r = free_and_strdup(&arg_chdir, optarg);
1017 if (r < 0)
1018 return log_oom();
1019
1020 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1021 break;
1022
1023 case ARG_PIVOT_ROOT:
1024 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1025 if (r < 0)
1026 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1027
1028 arg_settings_mask |= SETTING_PIVOT_ROOT;
1029 break;
1030
1031 case ARG_NOTIFY_READY:
1032 r = parse_boolean(optarg);
1033 if (r < 0) {
1034 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1035 return -EINVAL;
1036 }
1037 arg_notify_ready = r;
1038 arg_settings_mask |= SETTING_NOTIFY_READY;
1039 break;
1040
1041 case ARG_ROOT_HASH: {
1042 void *k;
1043 size_t l;
1044
1045 r = unhexmem(optarg, strlen(optarg), &k, &l);
1046 if (r < 0)
1047 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1048 if (l < sizeof(sd_id128_t)) {
1049 log_error("Root hash must be at least 128bit long: %s", optarg);
1050 free(k);
1051 return -EINVAL;
1052 }
1053
1054 free(arg_root_hash);
1055 arg_root_hash = k;
1056 arg_root_hash_size = l;
1057 break;
1058 }
1059
1060 case ARG_SYSTEM_CALL_FILTER: {
1061 bool negative;
1062 const char *items;
1063
1064 negative = optarg[0] == '~';
1065 items = negative ? optarg + 1 : optarg;
1066
1067 for (;;) {
1068 _cleanup_free_ char *word = NULL;
1069
1070 r = extract_first_word(&items, &word, NULL, 0);
1071 if (r == 0)
1072 break;
1073 if (r == -ENOMEM)
1074 return log_oom();
1075 if (r < 0)
1076 return log_error_errno(r, "Failed to parse system call filter: %m");
1077
1078 if (negative)
1079 r = strv_extend(&arg_syscall_blacklist, word);
1080 else
1081 r = strv_extend(&arg_syscall_whitelist, word);
1082 if (r < 0)
1083 return log_oom();
1084 }
1085
1086 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1087 break;
1088 }
1089
1090 case '?':
1091 return -EINVAL;
1092
1093 default:
1094 assert_not_reached("Unhandled option");
1095 }
1096
1097 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1098 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1099 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1100 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
1101
1102 if (arg_userns_mode != USER_NAMESPACE_NO)
1103 arg_mount_settings |= MOUNT_USE_USERNS;
1104
1105 if (arg_private_network)
1106 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1107
1108 parse_mount_settings_env();
1109
1110 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1111 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1112 arg_register = false;
1113 if (arg_start_mode != START_PID1) {
1114 log_error("--boot cannot be used without namespacing.");
1115 return -EINVAL;
1116 }
1117 }
1118
1119 if (arg_userns_mode == USER_NAMESPACE_PICK)
1120 arg_userns_chown = true;
1121
1122 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) {
1123 log_error("--keep-unit --register=yes may not be used when invoked from a user session.");
1124 return -EINVAL;
1125 }
1126
1127 if (arg_directory && arg_image) {
1128 log_error("--directory= and --image= may not be combined.");
1129 return -EINVAL;
1130 }
1131
1132 if (arg_template && arg_image) {
1133 log_error("--template= and --image= may not be combined.");
1134 return -EINVAL;
1135 }
1136
1137 if (arg_ephemeral && arg_template && !arg_directory) {
1138 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1139 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1140 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1141 * --directory=". */
1142
1143 arg_directory = arg_template;
1144 arg_template = NULL;
1145 }
1146
1147 if (arg_template && !(arg_directory || arg_machine)) {
1148 log_error("--template= needs --directory= or --machine=.");
1149 return -EINVAL;
1150 }
1151
1152 if (arg_ephemeral && arg_template) {
1153 log_error("--ephemeral and --template= may not be combined.");
1154 return -EINVAL;
1155 }
1156
1157 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1158 log_error("--ephemeral and --link-journal= may not be combined.");
1159 return -EINVAL;
1160 }
1161
1162 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
1163 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1164 return -EOPNOTSUPP;
1165 }
1166
1167 if (arg_userns_chown && arg_read_only) {
1168 log_error("--read-only and --private-users-chown may not be combined.");
1169 return -EINVAL;
1170 }
1171
1172 if (arg_network_bridge && arg_network_zone) {
1173 log_error("--network-bridge= and --network-zone= may not be combined.");
1174 return -EINVAL;
1175 }
1176
1177 if (argc > optind) {
1178 arg_parameters = strv_copy(argv + optind);
1179 if (!arg_parameters)
1180 return log_oom();
1181
1182 arg_settings_mask |= SETTING_START_MODE;
1183 }
1184
1185 /* Load all settings from .nspawn files */
1186 if (mask_no_settings)
1187 arg_settings_mask = 0;
1188
1189 /* Don't load any settings from .nspawn files */
1190 if (mask_all_settings)
1191 arg_settings_mask = _SETTINGS_MASK_ALL;
1192
1193 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1194
1195 r = cg_unified_flush();
1196 if (r < 0)
1197 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
1198
1199 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1200 if (e)
1201 arg_container_service_name = e;
1202
1203 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1204 if (r < 0)
1205 arg_use_cgns = cg_ns_supported();
1206 else
1207 arg_use_cgns = r;
1208
1209 r = custom_mount_check_all();
1210 if (r < 0)
1211 return r;
1212
1213 return 1;
1214 }
1215
1216 static int verify_arguments(void) {
1217 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1218 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1219 return -EINVAL;
1220 }
1221
1222 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1223 log_error("Cannot combine --private-users with read-write mounts.");
1224 return -EINVAL;
1225 }
1226
1227 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
1228 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1229 return -EINVAL;
1230 }
1231
1232 if (arg_expose_ports && !arg_private_network) {
1233 log_error("Cannot use --port= without private networking.");
1234 return -EINVAL;
1235 }
1236
1237 #if ! HAVE_LIBIPTC
1238 if (arg_expose_ports) {
1239 log_error("--port= is not supported, compiled without libiptc support.");
1240 return -EOPNOTSUPP;
1241 }
1242 #endif
1243
1244 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1245 arg_kill_signal = SIGRTMIN+3;
1246
1247 return 0;
1248 }
1249
1250 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1251 assert(p);
1252
1253 if (arg_userns_mode == USER_NAMESPACE_NO)
1254 return 0;
1255
1256 if (uid == UID_INVALID && gid == GID_INVALID)
1257 return 0;
1258
1259 if (uid != UID_INVALID) {
1260 uid += arg_uid_shift;
1261
1262 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1263 return -EOVERFLOW;
1264 }
1265
1266 if (gid != GID_INVALID) {
1267 gid += (gid_t) arg_uid_shift;
1268
1269 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1270 return -EOVERFLOW;
1271 }
1272
1273 if (lchown(p, uid, gid) < 0)
1274 return -errno;
1275
1276 return 0;
1277 }
1278
/* Creates the directory "path" below "root" with the given mode and, if it was newly created, hands it
 * to the (shifted) uid/gid via userns_lchown(). A directory entry that already exists is not an error
 * and is left untouched. Returns 0 on success, a negative errno-style code otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1291
1292 static int setup_timezone(const char *dest) {
1293 _cleanup_free_ char *p = NULL, *q = NULL;
1294 const char *where, *check, *what;
1295 char *z, *y;
1296 int r;
1297
1298 assert(dest);
1299
1300 /* Fix the timezone, if possible */
1301 r = readlink_malloc("/etc/localtime", &p);
1302 if (r < 0) {
1303 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1304 /* to handle warning, delete /etc/localtime and replace it
1305 * with a symbolic link to a time zone data file.
1306 *
1307 * Example:
1308 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1309 */
1310 return 0;
1311 }
1312
1313 z = path_startswith(p, "../usr/share/zoneinfo/");
1314 if (!z)
1315 z = path_startswith(p, "/usr/share/zoneinfo/");
1316 if (!z) {
1317 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1318 return 0;
1319 }
1320
1321 where = prefix_roota(dest, "/etc/localtime");
1322 r = readlink_malloc(where, &q);
1323 if (r >= 0) {
1324 y = path_startswith(q, "../usr/share/zoneinfo/");
1325 if (!y)
1326 y = path_startswith(q, "/usr/share/zoneinfo/");
1327
1328 /* Already pointing to the right place? Then do nothing .. */
1329 if (y && streq(y, z))
1330 return 0;
1331 }
1332
1333 check = strjoina("/usr/share/zoneinfo/", z);
1334 check = prefix_roota(dest, check);
1335 if (laccess(check, F_OK) < 0) {
1336 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1337 return 0;
1338 }
1339
1340 if (unlink(where) < 0 && errno != ENOENT) {
1341 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1342 errno,
1343 "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1344 return 0;
1345 }
1346
1347 what = strjoina("../usr/share/zoneinfo/", z);
1348 if (symlink(what, where) < 0) {
1349 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1350 errno,
1351 "Failed to correct timezone of container, ignoring: %m");
1352 return 0;
1353 }
1354
1355 r = userns_lchown(where, 0, 0);
1356 if (r < 0)
1357 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1358
1359 return 0;
1360 }
1361
1362 static int resolved_listening(void) {
1363 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1364 _cleanup_free_ char *dns_stub_listener_mode = NULL;
1365 int r;
1366
1367 /* Check if resolved is listening */
1368
1369 r = sd_bus_open_system(&bus);
1370 if (r < 0)
1371 return r;
1372
1373 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1374 if (r <= 0)
1375 return r;
1376
1377 r = sd_bus_get_property_string(bus,
1378 "org.freedesktop.resolve1",
1379 "/org/freedesktop/resolve1",
1380 "org.freedesktop.resolve1.Manager",
1381 "DNSStubListener",
1382 NULL,
1383 &dns_stub_listener_mode);
1384 if (r < 0)
1385 return r;
1386
1387 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
1388 }
1389
1390 static int setup_resolv_conf(const char *dest) {
1391 _cleanup_free_ char *resolved = NULL, *etc = NULL;
1392 const char *where;
1393 int r, found;
1394
1395 assert(dest);
1396
1397 if (arg_private_network)
1398 return 0;
1399
1400 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1401 if (r < 0) {
1402 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1403 return 0;
1404 }
1405
1406 where = strjoina(etc, "/resolv.conf");
1407 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1408 if (found < 0) {
1409 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1410 return 0;
1411 }
1412
1413 if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0 &&
1414 resolved_listening() > 0) {
1415
1416 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1417 * container, so that the container can use the host's resolver. Given that network namespacing is
1418 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1419 * advantage that the container will be able to follow the host's DNS server configuration changes
1420 * transparently. */
1421
1422 if (found == 0) /* missing? */
1423 (void) touch(resolved);
1424
1425 r = mount_verbose(LOG_DEBUG, "/usr/lib/systemd/resolv.conf", resolved, NULL, MS_BIND, NULL);
1426 if (r >= 0)
1427 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1428 }
1429
1430 /* If that didn't work, let's copy the file */
1431 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
1432 if (r < 0) {
1433 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1434 * resolved or something similar runs inside and the symlink points there.
1435 *
1436 * If the disk image is read-only, there's also no point in complaining.
1437 */
1438 log_full_errno(IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1439 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
1440 return 0;
1441 }
1442
1443 r = userns_lchown(where, 0, 0);
1444 if (r < 0)
1445 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
1446
1447 return 0;
1448 }
1449
1450 static int setup_boot_id(const char *dest) {
1451 sd_id128_t rnd = SD_ID128_NULL;
1452 const char *from, *to;
1453 int r;
1454
1455 /* Generate a new randomized boot ID, so that each boot-up of
1456 * the container gets a new one */
1457
1458 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1459 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1460
1461 r = sd_id128_randomize(&rnd);
1462 if (r < 0)
1463 return log_error_errno(r, "Failed to generate random boot id: %m");
1464
1465 r = id128_write(from, ID128_UUID, rnd, false);
1466 if (r < 0)
1467 return log_error_errno(r, "Failed to write boot id: %m");
1468
1469 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1470 if (r >= 0)
1471 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1472 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1473
1474 (void) unlink(from);
1475 return r;
1476 }
1477
1478 static int copy_devnodes(const char *dest) {
1479
1480 static const char devnodes[] =
1481 "null\0"
1482 "zero\0"
1483 "full\0"
1484 "random\0"
1485 "urandom\0"
1486 "tty\0"
1487 "net/tun\0";
1488
1489 const char *d;
1490 int r = 0;
1491 _cleanup_umask_ mode_t u;
1492
1493 assert(dest);
1494
1495 u = umask(0000);
1496
1497 /* Create /dev/net, so that we can create /dev/net/tun in it */
1498 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1499 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1500
1501 NULSTR_FOREACH(d, devnodes) {
1502 _cleanup_free_ char *from = NULL, *to = NULL;
1503 struct stat st;
1504
1505 from = strappend("/dev/", d);
1506 to = prefix_root(dest, from);
1507
1508 if (stat(from, &st) < 0) {
1509
1510 if (errno != ENOENT)
1511 return log_error_errno(errno, "Failed to stat %s: %m", from);
1512
1513 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1514
1515 log_error("%s is not a char or block device, cannot copy.", from);
1516 return -EIO;
1517
1518 } else {
1519 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1520 /* Explicitly warn the user when /dev is already populated. */
1521 if (errno == EEXIST)
1522 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
1523 if (errno != EPERM)
1524 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1525
1526 /* Some systems abusively restrict mknod but
1527 * allow bind mounts. */
1528 r = touch(to);
1529 if (r < 0)
1530 return log_error_errno(r, "touch (%s) failed: %m", to);
1531 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1532 if (r < 0)
1533 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
1534 }
1535
1536 r = userns_lchown(to, 0, 0);
1537 if (r < 0)
1538 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1539 }
1540 }
1541
1542 return r;
1543 }
1544
1545 static int setup_pts(const char *dest) {
1546 _cleanup_free_ char *options = NULL;
1547 const char *p;
1548 int r;
1549
1550 #if HAVE_SELINUX
1551 if (arg_selinux_apifs_context)
1552 (void) asprintf(&options,
1553 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1554 arg_uid_shift + TTY_GID,
1555 arg_selinux_apifs_context);
1556 else
1557 #endif
1558 (void) asprintf(&options,
1559 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1560 arg_uid_shift + TTY_GID);
1561
1562 if (!options)
1563 return log_oom();
1564
1565 /* Mount /dev/pts itself */
1566 p = prefix_roota(dest, "/dev/pts");
1567 if (mkdir(p, 0755) < 0)
1568 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1569 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1570 if (r < 0)
1571 return r;
1572 r = userns_lchown(p, 0, 0);
1573 if (r < 0)
1574 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1575
1576 /* Create /dev/ptmx symlink */
1577 p = prefix_roota(dest, "/dev/ptmx");
1578 if (symlink("pts/ptmx", p) < 0)
1579 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1580 r = userns_lchown(p, 0, 0);
1581 if (r < 0)
1582 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1583
1584 /* And fix /dev/pts/ptmx ownership */
1585 p = prefix_roota(dest, "/dev/pts/ptmx");
1586 r = userns_lchown(p, 0, 0);
1587 if (r < 0)
1588 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1589
1590 return 0;
1591 }
1592
1593 static int setup_dev_console(const char *dest, const char *console) {
1594 _cleanup_umask_ mode_t u;
1595 const char *to;
1596 int r;
1597
1598 assert(dest);
1599 assert(console);
1600
1601 u = umask(0000);
1602
1603 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1604 if (r < 0)
1605 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1606
1607 /* We need to bind mount the right tty to /dev/console since
1608 * ptys can only exist on pts file systems. To have something
1609 * to bind mount things on we create a empty regular file. */
1610
1611 to = prefix_roota(dest, "/dev/console");
1612 r = touch(to);
1613 if (r < 0)
1614 return log_error_errno(r, "touch() for /dev/console failed: %m");
1615
1616 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
1617 }
1618
1619 static int setup_keyring(void) {
1620 key_serial_t keyring;
1621
1622 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
1623 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
1624 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
1625 * these system calls let's make sure we don't leak anything into the container. */
1626
1627 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
1628 if (keyring == -1) {
1629 if (errno == ENOSYS)
1630 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
1631 else if (IN_SET(errno, EACCES, EPERM))
1632 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
1633 else
1634 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
1635 }
1636
1637 return 0;
1638 }
1639
1640 static int setup_kmsg(const char *dest, int kmsg_socket) {
1641 const char *from, *to;
1642 _cleanup_umask_ mode_t u;
1643 int fd, r;
1644
1645 assert(kmsg_socket >= 0);
1646
1647 u = umask(0000);
1648
1649 /* We create the kmsg FIFO as /run/kmsg, but immediately
1650 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1651 * on the reading side behave very similar to /proc/kmsg,
1652 * their writing side behaves differently from /dev/kmsg in
1653 * that writing blocks when nothing is reading. In order to
1654 * avoid any problems with containers deadlocking due to this
1655 * we simply make /dev/kmsg unavailable to the container. */
1656 from = prefix_roota(dest, "/run/kmsg");
1657 to = prefix_roota(dest, "/proc/kmsg");
1658
1659 if (mkfifo(from, 0600) < 0)
1660 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1661 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1662 if (r < 0)
1663 return r;
1664
1665 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1666 if (fd < 0)
1667 return log_error_errno(errno, "Failed to open fifo: %m");
1668
1669 /* Store away the fd in the socket, so that it stays open as
1670 * long as we run the child */
1671 r = send_one_fd(kmsg_socket, fd, 0);
1672 safe_close(fd);
1673
1674 if (r < 0)
1675 return log_error_errno(r, "Failed to send FIFO fd: %m");
1676
1677 /* And now make the FIFO unavailable as /run/kmsg... */
1678 (void) unlink(from);
1679
1680 return 0;
1681 }
1682
1683 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1684 union in_addr_union *exposed = userdata;
1685
1686 assert(rtnl);
1687 assert(m);
1688 assert(exposed);
1689
1690 expose_port_execute(rtnl, arg_expose_ports, exposed);
1691 return 0;
1692 }
1693
1694 static int setup_hostname(void) {
1695
1696 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
1697 return 0;
1698
1699 if (sethostname_idempotent(arg_machine) < 0)
1700 return -errno;
1701
1702 return 0;
1703 }
1704
1705 static int setup_journal(const char *directory) {
1706 sd_id128_t this_id;
1707 _cleanup_free_ char *d = NULL;
1708 const char *p, *q;
1709 bool try;
1710 char id[33];
1711 int r;
1712
1713 /* Don't link journals in ephemeral mode */
1714 if (arg_ephemeral)
1715 return 0;
1716
1717 if (arg_link_journal == LINK_NO)
1718 return 0;
1719
1720 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1721
1722 r = sd_id128_get_machine(&this_id);
1723 if (r < 0)
1724 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1725
1726 if (sd_id128_equal(arg_uuid, this_id)) {
1727 log_full(try ? LOG_WARNING : LOG_ERR,
1728 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
1729 if (try)
1730 return 0;
1731 return -EEXIST;
1732 }
1733
1734 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1735 if (r < 0)
1736 return log_error_errno(r, "Failed to create /var: %m");
1737
1738 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1739 if (r < 0)
1740 return log_error_errno(r, "Failed to create /var/log: %m");
1741
1742 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1743 if (r < 0)
1744 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1745
1746 (void) sd_id128_to_string(arg_uuid, id);
1747
1748 p = strjoina("/var/log/journal/", id);
1749 q = prefix_roota(directory, p);
1750
1751 if (path_is_mount_point(p, NULL, 0) > 0) {
1752 if (try)
1753 return 0;
1754
1755 log_error("%s: already a mount point, refusing to use for journal", p);
1756 return -EEXIST;
1757 }
1758
1759 if (path_is_mount_point(q, NULL, 0) > 0) {
1760 if (try)
1761 return 0;
1762
1763 log_error("%s: already a mount point, refusing to use for journal", q);
1764 return -EEXIST;
1765 }
1766
1767 r = readlink_and_make_absolute(p, &d);
1768 if (r >= 0) {
1769 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
1770 path_equal(d, q)) {
1771
1772 r = userns_mkdir(directory, p, 0755, 0, 0);
1773 if (r < 0)
1774 log_warning_errno(r, "Failed to create directory %s: %m", q);
1775 return 0;
1776 }
1777
1778 if (unlink(p) < 0)
1779 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1780 } else if (r == -EINVAL) {
1781
1782 if (arg_link_journal == LINK_GUEST &&
1783 rmdir(p) < 0) {
1784
1785 if (errno == ENOTDIR) {
1786 log_error("%s already exists and is neither a symlink nor a directory", p);
1787 return r;
1788 } else
1789 return log_error_errno(errno, "Failed to remove %s: %m", p);
1790 }
1791 } else if (r != -ENOENT)
1792 return log_error_errno(r, "readlink(%s) failed: %m", p);
1793
1794 if (arg_link_journal == LINK_GUEST) {
1795
1796 if (symlink(q, p) < 0) {
1797 if (try) {
1798 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1799 return 0;
1800 } else
1801 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1802 }
1803
1804 r = userns_mkdir(directory, p, 0755, 0, 0);
1805 if (r < 0)
1806 log_warning_errno(r, "Failed to create directory %s: %m", q);
1807 return 0;
1808 }
1809
1810 if (arg_link_journal == LINK_HOST) {
1811 /* don't create parents here — if the host doesn't have
1812 * permanent journal set up, don't force it here */
1813
1814 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
1815 if (try) {
1816 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1817 return 0;
1818 } else
1819 return log_error_errno(errno, "Failed to create %s: %m", p);
1820 }
1821
1822 } else if (access(p, F_OK) < 0)
1823 return 0;
1824
1825 if (dir_is_empty(q) == 0)
1826 log_warning("%s is not empty, proceeding anyway.", q);
1827
1828 r = userns_mkdir(directory, p, 0755, 0, 0);
1829 if (r < 0)
1830 return log_error_errno(r, "Failed to create %s: %m", q);
1831
1832 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1833 if (r < 0)
1834 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1835
1836 return 0;
1837 }
1838
1839 static int drop_capabilities(void) {
1840 return capability_bounding_set_drop(arg_caps_retain, false);
1841 }
1842
1843 static int reset_audit_loginuid(void) {
1844 _cleanup_free_ char *p = NULL;
1845 int r;
1846
1847 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
1848 return 0;
1849
1850 r = read_one_line_file("/proc/self/loginuid", &p);
1851 if (r == -ENOENT)
1852 return 0;
1853 if (r < 0)
1854 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1855
1856 /* Already reset? */
1857 if (streq(p, "4294967295"))
1858 return 0;
1859
1860 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1861 if (r < 0) {
1862 log_error_errno(r,
1863 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1864 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1865 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1866 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1867 "using systemd-nspawn. Sleeping for 5s... (%m)");
1868
1869 sleep(5);
1870 }
1871
1872 return 0;
1873 }
1874
1875
1876 static int setup_propagate(const char *root) {
1877 const char *p, *q;
1878 int r;
1879
1880 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1881 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1882 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1883 (void) mkdir_p(p, 0600);
1884
1885 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1886 if (r < 0)
1887 return log_error_errno(r, "Failed to create /run/systemd: %m");
1888
1889 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1890 if (r < 0)
1891 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
1892
1893 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1894 if (r < 0)
1895 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
1896
1897 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1898 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1899 if (r < 0)
1900 return r;
1901
1902 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1903 if (r < 0)
1904 return r;
1905
1906 /* machined will MS_MOVE into that directory, and that's only
1907 * supported for non-shared mounts. */
1908 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
1909 }
1910
1911 static int setup_machine_id(const char *directory) {
1912 const char *etc_machine_id;
1913 sd_id128_t id;
1914 int r;
1915
1916 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
1917 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
1918 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
1919 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
1920 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
1921 * container behaves nicely). */
1922
1923 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1924
1925 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
1926 if (r < 0) {
1927 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
1928 return log_error_errno(r, "Failed to read machine ID from container image: %m");
1929
1930 if (sd_id128_is_null(arg_uuid)) {
1931 r = sd_id128_randomize(&arg_uuid);
1932 if (r < 0)
1933 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
1934 }
1935 } else {
1936 if (sd_id128_is_null(id)) {
1937 log_error("Machine ID in container image is zero, refusing.");
1938 return -EINVAL;
1939 }
1940
1941 arg_uuid = id;
1942 }
1943
1944 return 0;
1945 }
1946
1947 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
1948 int r;
1949
1950 assert(directory);
1951
1952 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
1953 return 0;
1954
1955 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
1956 if (r == -EOPNOTSUPP)
1957 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
1958 if (r == -EBADE)
1959 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
1960 if (r < 0)
1961 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
1962 if (r == 0)
1963 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
1964 else
1965 log_debug("Patched directory tree to match UID/GID range.");
1966
1967 return r;
1968 }
1969
1970 /*
1971 * Return values:
1972 * < 0 : wait_for_terminate() failed to get the state of the
1973 * container, the container was terminated by a signal, or
1974 * failed for an unknown reason. No change is made to the
1975 * container argument.
1976 * > 0 : The program executed in the container terminated with an
1977 * error. The exit code of the program executed in the
1978 * container is returned. The container argument has been set
1979 * to CONTAINER_TERMINATED.
1980 * 0 : The container is being rebooted, has been shut down or exited
1981 * successfully. The container argument has been set to either
1982 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
1983 *
1984 * That is, success is indicated by a return value of zero, and an
1985 * error is indicated by a non-zero value.
1986 */
1987 static int wait_for_container(pid_t pid, ContainerStatus *container) {
1988 siginfo_t status;
1989 int r;
1990
1991 r = wait_for_terminate(pid, &status);
1992 if (r < 0)
1993 return log_warning_errno(r, "Failed to wait for container: %m");
1994
1995 switch (status.si_code) {
1996
1997 case CLD_EXITED:
1998 if (status.si_status == 0)
1999 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2000 else
2001 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2002
2003 *container = CONTAINER_TERMINATED;
2004 return status.si_status;
2005
2006 case CLD_KILLED:
2007 if (status.si_status == SIGINT) {
2008 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2009 *container = CONTAINER_TERMINATED;
2010 return 0;
2011
2012 } else if (status.si_status == SIGHUP) {
2013 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2014 *container = CONTAINER_REBOOTED;
2015 return 0;
2016 }
2017
2018 /* fall through */
2019
2020 case CLD_DUMPED:
2021 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2022 return -EIO;
2023
2024 default:
2025 log_error("Container %s failed due to unknown reason.", arg_machine);
2026 return -EIO;
2027 }
2028 }
2029
2030 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2031 pid_t pid;
2032
2033 pid = PTR_TO_PID(userdata);
2034 if (pid > 0) {
2035 if (kill(pid, arg_kill_signal) >= 0) {
2036 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2037 sd_event_source_set_userdata(s, NULL);
2038 return 0;
2039 }
2040 }
2041
2042 sd_event_exit(sd_event_source_get_event(s), 0);
2043 return 0;
2044 }
2045
2046 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2047 for (;;) {
2048 siginfo_t si = {};
2049 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2050 return log_error_errno(errno, "Failed to waitid(): %m");
2051 if (si.si_pid == 0) /* No pending children. */
2052 break;
2053 if (si.si_pid == PTR_TO_PID(userdata)) {
2054 /* The main process we care for has exited. Return from
2055 * signal handler but leave the zombie. */
2056 sd_event_exit(sd_event_source_get_event(s), 0);
2057 break;
2058 }
2059 /* Reap all other children. */
2060 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2061 }
2062
2063 return 0;
2064 }
2065
2066 static int determine_names(void) {
2067 int r;
2068
2069 if (arg_template && !arg_directory && arg_machine) {
2070
2071 /* If --template= was specified then we should not
2072 * search for a machine, but instead create a new one
2073 * in /var/lib/machine. */
2074
2075 arg_directory = strjoin("/var/lib/machines/", arg_machine);
2076 if (!arg_directory)
2077 return log_oom();
2078 }
2079
2080 if (!arg_image && !arg_directory) {
2081 if (arg_machine) {
2082 _cleanup_(image_unrefp) Image *i = NULL;
2083
2084 r = image_find(arg_machine, &i);
2085 if (r < 0)
2086 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2087 if (r == 0) {
2088 log_error("No image for machine '%s'.", arg_machine);
2089 return -ENOENT;
2090 }
2091
2092 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
2093 r = free_and_strdup(&arg_image, i->path);
2094 else
2095 r = free_and_strdup(&arg_directory, i->path);
2096 if (r < 0)
2097 return log_oom();
2098
2099 if (!arg_ephemeral)
2100 arg_read_only = arg_read_only || i->read_only;
2101 } else
2102 arg_directory = get_current_dir_name();
2103
2104 if (!arg_directory && !arg_image) {
2105 log_error("Failed to determine path, please use -D or -i.");
2106 return -EINVAL;
2107 }
2108 }
2109
2110 if (!arg_machine) {
2111
2112 if (arg_directory && path_equal(arg_directory, "/"))
2113 arg_machine = gethostname_malloc();
2114 else {
2115 if (arg_image) {
2116 char *e;
2117
2118 arg_machine = strdup(basename(arg_image));
2119
2120 /* Truncate suffix if there is one */
2121 e = endswith(arg_machine, ".raw");
2122 if (e)
2123 *e = 0;
2124 } else
2125 arg_machine = strdup(basename(arg_directory));
2126 }
2127 if (!arg_machine)
2128 return log_oom();
2129
2130 hostname_cleanup(arg_machine);
2131 if (!machine_name_is_valid(arg_machine)) {
2132 log_error("Failed to determine machine name automatically, please use -M.");
2133 return -EINVAL;
2134 }
2135
2136 if (arg_ephemeral) {
2137 char *b;
2138
2139 /* Add a random suffix when this is an
2140 * ephemeral machine, so that we can run many
2141 * instances at once without manually having
2142 * to specify -M each time. */
2143
2144 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2145 return log_oom();
2146
2147 free(arg_machine);
2148 arg_machine = b;
2149 }
2150 }
2151
2152 return 0;
2153 }
2154
/* Resolves the path stored in *p with chase_symlinks() and replaces *p with the result, freeing the old
 * string. A NULL *p is left alone. Returns 0 on success, or the logged negative error on failure, in which
 * case *p is unchanged. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                free(*p);
                *p = resolved;
        }

        return 0;
}
2173
2174 static int determine_uid_shift(const char *directory) {
2175 int r;
2176
2177 if (arg_userns_mode == USER_NAMESPACE_NO) {
2178 arg_uid_shift = 0;
2179 return 0;
2180 }
2181
2182 if (arg_uid_shift == UID_INVALID) {
2183 struct stat st;
2184
2185 r = stat(directory, &st);
2186 if (r < 0)
2187 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2188
2189 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2190
2191 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2192 log_error("UID and GID base of %s don't match.", directory);
2193 return -EINVAL;
2194 }
2195
2196 arg_uid_range = UINT32_C(0x10000);
2197 }
2198
2199 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2200 log_error("UID base too high for UID range.");
2201 return -EINVAL;
2202 }
2203
2204 return 0;
2205 }
2206
2207 static int inner_child(
2208 Barrier *barrier,
2209 const char *directory,
2210 bool secondary,
2211 int kmsg_socket,
2212 int rtnl_socket,
2213 FDSet *fds) {
2214
2215 _cleanup_free_ char *home = NULL;
2216 char as_uuid[37];
2217 unsigned n_env = 1;
2218 const char *envp[] = {
2219 "PATH=" DEFAULT_PATH_SPLIT_USR,
2220 NULL, /* container */
2221 NULL, /* TERM */
2222 NULL, /* HOME */
2223 NULL, /* USER */
2224 NULL, /* LOGNAME */
2225 NULL, /* container_uuid */
2226 NULL, /* LISTEN_FDS */
2227 NULL, /* LISTEN_PID */
2228 NULL, /* NOTIFY_SOCKET */
2229 NULL
2230 };
2231 const char *exec_target;
2232
2233 _cleanup_strv_free_ char **env_use = NULL;
2234 int r;
2235
2236 assert(barrier);
2237 assert(directory);
2238 assert(kmsg_socket >= 0);
2239
2240 if (arg_userns_mode != USER_NAMESPACE_NO) {
2241 /* Tell the parent, that it now can write the UID map. */
2242 (void) barrier_place(barrier); /* #1 */
2243
2244 /* Wait until the parent wrote the UID map */
2245 if (!barrier_place_and_sync(barrier)) { /* #2 */
2246 log_error("Parent died too early");
2247 return -ESRCH;
2248 }
2249 }
2250
2251 r = reset_uid_gid();
2252 if (r < 0)
2253 return log_error_errno(r, "Couldn't become new root: %m");
2254
2255 r = mount_all(NULL,
2256 arg_mount_settings | MOUNT_IN_USERNS,
2257 arg_uid_shift,
2258 arg_uid_range,
2259 arg_selinux_apifs_context);
2260
2261 if (r < 0)
2262 return r;
2263
2264 r = mount_sysfs(NULL, arg_mount_settings);
2265 if (r < 0)
2266 return r;
2267
2268 /* Wait until we are cgroup-ified, so that we
2269 * can mount the right cgroup path writable */
2270 if (!barrier_place_and_sync(barrier)) { /* #3 */
2271 log_error("Parent died too early");
2272 return -ESRCH;
2273 }
2274
2275 if (arg_use_cgns && cg_ns_supported()) {
2276 r = unshare(CLONE_NEWCGROUP);
2277 if (r < 0)
2278 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2279 r = mount_cgroups(
2280 "",
2281 arg_unified_cgroup_hierarchy,
2282 arg_userns_mode != USER_NAMESPACE_NO,
2283 arg_uid_shift,
2284 arg_uid_range,
2285 arg_selinux_apifs_context,
2286 true);
2287 if (r < 0)
2288 return r;
2289 } else {
2290 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2291 if (r < 0)
2292 return r;
2293 }
2294
2295 r = setup_boot_id(NULL);
2296 if (r < 0)
2297 return r;
2298
2299 r = setup_kmsg(NULL, kmsg_socket);
2300 if (r < 0)
2301 return r;
2302 kmsg_socket = safe_close(kmsg_socket);
2303
2304 umask(0022);
2305
2306 if (setsid() < 0)
2307 return log_error_errno(errno, "setsid() failed: %m");
2308
2309 if (arg_private_network)
2310 loopback_setup();
2311
2312 if (arg_expose_ports) {
2313 r = expose_port_send_rtnl(rtnl_socket);
2314 if (r < 0)
2315 return r;
2316 rtnl_socket = safe_close(rtnl_socket);
2317 }
2318
2319 r = drop_capabilities();
2320 if (r < 0)
2321 return log_error_errno(r, "drop_capabilities() failed: %m");
2322
2323 setup_hostname();
2324
2325 if (arg_personality != PERSONALITY_INVALID) {
2326 r = safe_personality(arg_personality);
2327 if (r < 0)
2328 return log_error_errno(r, "personality() failed: %m");
2329 } else if (secondary) {
2330 r = safe_personality(PER_LINUX32);
2331 if (r < 0)
2332 return log_error_errno(r, "personality() failed: %m");
2333 }
2334
2335 #if HAVE_SELINUX
2336 if (arg_selinux_context)
2337 if (setexeccon(arg_selinux_context) < 0)
2338 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2339 #endif
2340
2341 r = change_uid_gid(arg_user, &home);
2342 if (r < 0)
2343 return r;
2344
2345 /* LXC sets container=lxc, so follow the scheme here */
2346 envp[n_env++] = strjoina("container=", arg_container_service_name);
2347
2348 envp[n_env] = strv_find_prefix(environ, "TERM=");
2349 if (envp[n_env])
2350 n_env++;
2351
2352 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2353 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2354 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2355 return log_oom();
2356
2357 assert(!sd_id128_is_null(arg_uuid));
2358
2359 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
2360 return log_oom();
2361
2362 if (fdset_size(fds) > 0) {
2363 r = fdset_cloexec(fds, false);
2364 if (r < 0)
2365 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2366
2367 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2368 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2369 return log_oom();
2370 }
2371 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2372 return log_oom();
2373
2374 env_use = strv_env_merge(2, envp, arg_setenv);
2375 if (!env_use)
2376 return log_oom();
2377
2378 /* Let the parent know that we are ready and
2379 * wait until the parent is ready with the
2380 * setup, too... */
2381 if (!barrier_place_and_sync(barrier)) { /* #4 */
2382 log_error("Parent died too early");
2383 return -ESRCH;
2384 }
2385
2386 if (arg_chdir)
2387 if (chdir(arg_chdir) < 0)
2388 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2389
2390 if (arg_start_mode == START_PID2) {
2391 r = stub_pid1(arg_uuid);
2392 if (r < 0)
2393 return r;
2394 }
2395
2396 /* Now, explicitly close the log, so that we
2397 * then can close all remaining fds. Closing
2398 * the log explicitly first has the benefit
2399 * that the logging subsystem knows about it,
2400 * and is thus ready to be reopened should we
2401 * need it again. Note that the other fds
2402 * closed here are at least the locking and
2403 * barrier fds. */
2404 log_close();
2405 (void) fdset_close_others(fds);
2406
2407 if (arg_start_mode == START_BOOT) {
2408 char **a;
2409 size_t m;
2410
2411 /* Automatically search for the init system */
2412
2413 m = strv_length(arg_parameters);
2414 a = newa(char*, m + 2);
2415 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2416 a[1 + m] = NULL;
2417
2418 a[0] = (char*) "/usr/lib/systemd/systemd";
2419 execve(a[0], a, env_use);
2420
2421 a[0] = (char*) "/lib/systemd/systemd";
2422 execve(a[0], a, env_use);
2423
2424 a[0] = (char*) "/sbin/init";
2425 execve(a[0], a, env_use);
2426
2427 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
2428 } else if (!strv_isempty(arg_parameters)) {
2429 exec_target = arg_parameters[0];
2430 execvpe(arg_parameters[0], arg_parameters, env_use);
2431 } else {
2432 if (!arg_chdir)
2433 /* If we cannot change the directory, we'll end up in /, that is expected. */
2434 (void) chdir(home ?: "/root");
2435
2436 execle("/bin/bash", "-bash", NULL, env_use);
2437 execle("/bin/sh", "-sh", NULL, env_use);
2438
2439 exec_target = "/bin/bash, /bin/sh";
2440 }
2441
2442 r = -errno;
2443 (void) log_open();
2444 return log_error_errno(r, "execv(%s) failed: %m", exec_target);
2445 }
2446
2447 static int setup_sd_notify_child(void) {
2448 static const int one = 1;
2449 int fd = -1;
2450 union sockaddr_union sa = {
2451 .sa.sa_family = AF_UNIX,
2452 };
2453 int r;
2454
2455 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2456 if (fd < 0)
2457 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2458
2459 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2460 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2461
2462 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2463 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2464 if (r < 0) {
2465 safe_close(fd);
2466 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2467 }
2468
2469 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
2470 if (r < 0) {
2471 safe_close(fd);
2472 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
2473 }
2474
2475 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2476 if (r < 0) {
2477 safe_close(fd);
2478 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2479 }
2480
2481 return fd;
2482 }
2483
2484 static int outer_child(
2485 Barrier *barrier,
2486 const char *directory,
2487 const char *console,
2488 DissectedImage *dissected_image,
2489 bool interactive,
2490 bool secondary,
2491 int pid_socket,
2492 int uuid_socket,
2493 int notify_socket,
2494 int kmsg_socket,
2495 int rtnl_socket,
2496 int uid_shift_socket,
2497 FDSet *fds) {
2498
2499 pid_t pid;
2500 ssize_t l;
2501 int r;
2502 _cleanup_close_ int fd = -1;
2503
2504 assert(barrier);
2505 assert(directory);
2506 assert(console);
2507 assert(pid_socket >= 0);
2508 assert(uuid_socket >= 0);
2509 assert(notify_socket >= 0);
2510 assert(kmsg_socket >= 0);
2511
2512 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2513 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2514
2515 if (interactive) {
2516 close_nointr(STDIN_FILENO);
2517 close_nointr(STDOUT_FILENO);
2518 close_nointr(STDERR_FILENO);
2519
2520 r = open_terminal(console, O_RDWR);
2521 if (r != STDIN_FILENO) {
2522 if (r >= 0) {
2523 safe_close(r);
2524 r = -EINVAL;
2525 }
2526
2527 return log_error_errno(r, "Failed to open console: %m");
2528 }
2529
2530 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2531 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2532 return log_error_errno(errno, "Failed to duplicate console: %m");
2533 }
2534
2535 r = reset_audit_loginuid();
2536 if (r < 0)
2537 return r;
2538
2539 /* Mark everything as slave, so that we still
2540 * receive mounts from the real root, but don't
2541 * propagate mounts to the real root. */
2542 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2543 if (r < 0)
2544 return r;
2545
2546 if (dissected_image) {
2547 r = dissected_image_mount(dissected_image, directory, DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2548 if (r < 0)
2549 return r;
2550 }
2551
2552 r = determine_uid_shift(directory);
2553 if (r < 0)
2554 return r;
2555
2556 if (arg_userns_mode != USER_NAMESPACE_NO) {
2557 /* Let the parent know which UID shift we read from the image */
2558 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2559 if (l < 0)
2560 return log_error_errno(errno, "Failed to send UID shift: %m");
2561 if (l != sizeof(arg_uid_shift)) {
2562 log_error("Short write while sending UID shift.");
2563 return -EIO;
2564 }
2565
2566 if (arg_userns_mode == USER_NAMESPACE_PICK) {
2567 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2568 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2569 * not it will pick a different one, and send it back to us. */
2570
2571 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2572 if (l < 0)
2573 return log_error_errno(errno, "Failed to recv UID shift: %m");
2574 if (l != sizeof(arg_uid_shift)) {
2575 log_error("Short read while receiving UID shift.");
2576 return -EIO;
2577 }
2578 }
2579
2580 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2581 }
2582
2583 /* Turn directory into bind mount */
2584 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2585 if (r < 0)
2586 return r;
2587
2588 r = setup_pivot_root(
2589 directory,
2590 arg_pivot_root_new,
2591 arg_pivot_root_old);
2592 if (r < 0)
2593 return r;
2594
2595 r = setup_volatile(
2596 directory,
2597 arg_volatile_mode,
2598 arg_userns_mode != USER_NAMESPACE_NO,
2599 arg_uid_shift,
2600 arg_uid_range,
2601 arg_selinux_context);
2602 if (r < 0)
2603 return r;
2604
2605 r = setup_volatile_state(
2606 directory,
2607 arg_volatile_mode,
2608 arg_userns_mode != USER_NAMESPACE_NO,
2609 arg_uid_shift,
2610 arg_uid_range,
2611 arg_selinux_context);
2612 if (r < 0)
2613 return r;
2614
2615 /* Mark everything as shared so our mounts get propagated down. This is
2616 * required to make new bind mounts available in systemd services
2617 * inside the containter that create a new mount namespace.
2618 * See https://github.com/systemd/systemd/issues/3860
2619 * Further submounts (such as /dev) done after this will inherit the
2620 * shared propagation mode. */
2621 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2622 if (r < 0)
2623 return r;
2624
2625 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2626 if (r < 0)
2627 return r;
2628
2629 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2630 if (r < 0)
2631 return r;
2632
2633 if (arg_read_only) {
2634 r = bind_remount_recursive(directory, true, NULL);
2635 if (r < 0)
2636 return log_error_errno(r, "Failed to make tree read-only: %m");
2637 }
2638
2639 r = mount_all(directory,
2640 arg_mount_settings,
2641 arg_uid_shift,
2642 arg_uid_range,
2643 arg_selinux_apifs_context);
2644 if (r < 0)
2645 return r;
2646
2647 r = copy_devnodes(directory);
2648 if (r < 0)
2649 return r;
2650
2651 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2652
2653 r = setup_pts(directory);
2654 if (r < 0)
2655 return r;
2656
2657 r = setup_propagate(directory);
2658 if (r < 0)
2659 return r;
2660
2661 r = setup_dev_console(directory, console);
2662 if (r < 0)
2663 return r;
2664
2665 r = setup_keyring();
2666 if (r < 0)
2667 return r;
2668
2669 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
2670 if (r < 0)
2671 return r;
2672
2673 r = setup_timezone(directory);
2674 if (r < 0)
2675 return r;
2676
2677 r = setup_resolv_conf(directory);
2678 if (r < 0)
2679 return r;
2680
2681 r = setup_machine_id(directory);
2682 if (r < 0)
2683 return r;
2684
2685 r = setup_journal(directory);
2686 if (r < 0)
2687 return r;
2688
2689 r = mount_custom(
2690 directory,
2691 arg_custom_mounts,
2692 arg_n_custom_mounts,
2693 arg_userns_mode != USER_NAMESPACE_NO,
2694 arg_uid_shift,
2695 arg_uid_range,
2696 arg_selinux_apifs_context);
2697 if (r < 0)
2698 return r;
2699
2700 if (!arg_use_cgns || !cg_ns_supported()) {
2701 r = mount_cgroups(
2702 directory,
2703 arg_unified_cgroup_hierarchy,
2704 arg_userns_mode != USER_NAMESPACE_NO,
2705 arg_uid_shift,
2706 arg_uid_range,
2707 arg_selinux_apifs_context,
2708 false);
2709 if (r < 0)
2710 return r;
2711 }
2712
2713 r = mount_move_root(directory);
2714 if (r < 0)
2715 return log_error_errno(r, "Failed to move root directory: %m");
2716
2717 fd = setup_sd_notify_child();
2718 if (fd < 0)
2719 return fd;
2720
2721 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2722 arg_clone_ns_flags |
2723 (arg_private_network ? CLONE_NEWNET : 0) |
2724 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
2725 if (pid < 0)
2726 return log_error_errno(errno, "Failed to fork inner child: %m");
2727 if (pid == 0) {
2728 pid_socket = safe_close(pid_socket);
2729 uuid_socket = safe_close(uuid_socket);
2730 notify_socket = safe_close(notify_socket);
2731 uid_shift_socket = safe_close(uid_shift_socket);
2732
2733 /* The inner child has all namespaces that are
2734 * requested, so that we all are owned by the user if
2735 * user namespaces are turned on. */
2736
2737 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2738 if (r < 0)
2739 _exit(EXIT_FAILURE);
2740
2741 _exit(EXIT_SUCCESS);
2742 }
2743
2744 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2745 if (l < 0)
2746 return log_error_errno(errno, "Failed to send PID: %m");
2747 if (l != sizeof(pid)) {
2748 log_error("Short write while sending PID.");
2749 return -EIO;
2750 }
2751
2752 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
2753 if (l < 0)
2754 return log_error_errno(errno, "Failed to send machine ID: %m");
2755 if (l != sizeof(arg_uuid)) {
2756 log_error("Short write while sending machine ID.");
2757 return -EIO;
2758 }
2759
2760 l = send_one_fd(notify_socket, fd, 0);
2761 if (l < 0)
2762 return log_error_errno(errno, "Failed to send notify fd: %m");
2763
2764 pid_socket = safe_close(pid_socket);
2765 uuid_socket = safe_close(uuid_socket);
2766 notify_socket = safe_close(notify_socket);
2767 kmsg_socket = safe_close(kmsg_socket);
2768 rtnl_socket = safe_close(rtnl_socket);
2769
2770 return 0;
2771 }
2772
2773 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
2774 unsigned n_tries = 100;
2775 uid_t candidate;
2776 int r;
2777
2778 assert(shift);
2779 assert(ret_lock_file);
2780 assert(arg_userns_mode == USER_NAMESPACE_PICK);
2781 assert(arg_uid_range == 0x10000U);
2782
2783 candidate = *shift;
2784
2785 (void) mkdir("/run/systemd/nspawn-uid", 0755);
2786
2787 for (;;) {
2788 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
2789 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
2790
2791 if (--n_tries <= 0)
2792 return -EBUSY;
2793
2794 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
2795 goto next;
2796 if ((candidate & UINT32_C(0xFFFF)) != 0)
2797 goto next;
2798
2799 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
2800 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
2801 if (r == -EBUSY) /* Range already taken by another nspawn instance */
2802 goto next;
2803 if (r < 0)
2804 return r;
2805
2806 /* Make some superficial checks whether the range is currently known in the user database */
2807 if (getpwuid(candidate))
2808 goto next;
2809 if (getpwuid(candidate + UINT32_C(0xFFFE)))
2810 goto next;
2811 if (getgrgid(candidate))
2812 goto next;
2813 if (getgrgid(candidate + UINT32_C(0xFFFE)))
2814 goto next;
2815
2816 *ret_lock_file = lf;
2817 lf = (struct LockFile) LOCK_FILE_INIT;
2818 *shift = candidate;
2819 return 0;
2820
2821 next:
2822 random_bytes(&candidate, sizeof(candidate));
2823 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
2824 candidate &= (uid_t) UINT32_C(0xFFFF0000);
2825 }
2826 }
2827
2828 static int setup_uid_map(pid_t pid) {
2829 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2830 int r;
2831
2832 assert(pid > 1);
2833
2834 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2835 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2836 r = write_string_file(uid_map, line, 0);
2837 if (r < 0)
2838 return log_error_errno(r, "Failed to write UID map: %m");
2839
2840 /* We always assign the same UID and GID ranges */
2841 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2842 r = write_string_file(uid_map, line, 0);
2843 if (r < 0)
2844 return log_error_errno(r, "Failed to write GID map: %m");
2845
2846 return 0;
2847 }
2848
2849 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
2850 char buf[NOTIFY_BUFFER_MAX+1];
2851 char *p = NULL;
2852 struct iovec iovec = {
2853 .iov_base = buf,
2854 .iov_len = sizeof(buf)-1,
2855 };
2856 union {
2857 struct cmsghdr cmsghdr;
2858 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
2859 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
2860 } control = {};
2861 struct msghdr msghdr = {
2862 .msg_iov = &iovec,
2863 .msg_iovlen = 1,
2864 .msg_control = &control,
2865 .msg_controllen = sizeof(control),
2866 };
2867 struct cmsghdr *cmsg;
2868 struct ucred *ucred = NULL;
2869 ssize_t n;
2870 pid_t inner_child_pid;
2871 _cleanup_strv_free_ char **tags = NULL;
2872
2873 assert(userdata);
2874
2875 inner_child_pid = PTR_TO_PID(userdata);
2876
2877 if (revents != EPOLLIN) {
2878 log_warning("Got unexpected poll event for notify fd.");
2879 return 0;
2880 }
2881
2882 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
2883 if (n < 0) {
2884 if (IN_SET(errno, EAGAIN, EINTR))
2885 return 0;
2886
2887 return log_warning_errno(errno, "Couldn't read notification socket: %m");
2888 }
2889 cmsg_close_all(&msghdr);
2890
2891 CMSG_FOREACH(cmsg, &msghdr) {
2892 if (cmsg->cmsg_level == SOL_SOCKET &&
2893 cmsg->cmsg_type == SCM_CREDENTIALS &&
2894 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
2895
2896 ucred = (struct ucred*) CMSG_DATA(cmsg);
2897 }
2898 }
2899
2900 if (!ucred || ucred->pid != inner_child_pid) {
2901 log_debug("Received notify message without valid credentials. Ignoring.");
2902 return 0;
2903 }
2904
2905 if ((size_t) n >= sizeof(buf)) {
2906 log_warning("Received notify message exceeded maximum size. Ignoring.");
2907 return 0;
2908 }
2909
2910 buf[n] = 0;
2911 tags = strv_split(buf, "\n\r");
2912 if (!tags)
2913 return log_oom();
2914
2915 if (strv_find(tags, "READY=1"))
2916 sd_notifyf(false, "READY=1\n");
2917
2918 p = strv_find_startswith(tags, "STATUS=");
2919 if (p)
2920 sd_notifyf(false, "STATUS=Container running: %s", p);
2921
2922 return 0;
2923 }
2924
2925 static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
2926 int r;
2927
2928 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
2929 if (r < 0)
2930 return log_error_errno(r, "Failed to allocate notify event source: %m");
2931
2932 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
2933
2934 return 0;
2935 }
2936
2937 static int load_settings(void) {
2938 _cleanup_(settings_freep) Settings *settings = NULL;
2939 _cleanup_fclose_ FILE *f = NULL;
2940 _cleanup_free_ char *p = NULL;
2941 const char *fn, *i;
2942 int r;
2943
2944 /* If all settings are masked, there's no point in looking for
2945 * the settings file */
2946 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2947 return 0;
2948
2949 fn = strjoina(arg_machine, ".nspawn");
2950
2951 /* We first look in the admin's directories in /etc and /run */
2952 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2953 _cleanup_free_ char *j = NULL;
2954
2955 j = strjoin(i, "/", fn);
2956 if (!j)
2957 return log_oom();
2958
2959 f = fopen(j, "re");
2960 if (f) {
2961 p = j;
2962 j = NULL;
2963
2964 /* By default, we trust configuration from /etc and /run */
2965 if (arg_settings_trusted < 0)
2966 arg_settings_trusted = true;
2967
2968 break;
2969 }
2970
2971 if (errno != ENOENT)
2972 return log_error_errno(errno, "Failed to open %s: %m", j);
2973 }
2974
2975 if (!f) {
2976 /* After that, let's look for a file next to the
2977 * actual image we shall boot. */
2978
2979 if (arg_image) {
2980 p = file_in_same_dir(arg_image, fn);
2981 if (!p)
2982 return log_oom();
2983 } else if (arg_directory) {
2984 p = file_in_same_dir(arg_directory, fn);
2985 if (!p)
2986 return log_oom();
2987 }
2988
2989 if (p) {
2990 f = fopen(p, "re");
2991 if (!f && errno != ENOENT)
2992 return log_error_errno(errno, "Failed to open %s: %m", p);
2993
2994 /* By default, we do not trust configuration from /var/lib/machines */
2995 if (arg_settings_trusted < 0)
2996 arg_settings_trusted = false;
2997 }
2998 }
2999
3000 if (!f)
3001 return 0;
3002
3003 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3004
3005 r = settings_load(f, p, &settings);
3006 if (r < 0)
3007 return r;
3008
3009 /* Copy over bits from the settings, unless they have been
3010 * explicitly masked by command line switches. */
3011
3012 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3013 settings->start_mode >= 0) {
3014 arg_start_mode = settings->start_mode;
3015
3016 strv_free(arg_parameters);
3017 arg_parameters = settings->parameters;
3018 settings->parameters = NULL;
3019 }
3020
3021 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3022 settings->pivot_root_new) {
3023 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3024 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3025 }
3026
3027 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3028 settings->working_directory) {
3029 free(arg_chdir);
3030 arg_chdir = settings->working_directory;
3031 settings->working_directory = NULL;
3032 }
3033
3034 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3035 settings->environment) {
3036 strv_free(arg_setenv);
3037 arg_setenv = settings->environment;
3038 settings->environment = NULL;
3039 }
3040
3041 if ((arg_settings_mask & SETTING_USER) == 0 &&
3042 settings->user) {
3043 free(arg_user);
3044 arg_user = settings->user;
3045 settings->user = NULL;
3046 }
3047
3048 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3049 uint64_t plus;
3050
3051 plus = settings->capability;
3052 if (settings_private_network(settings))
3053 plus |= (1ULL << CAP_NET_ADMIN);
3054
3055 if (!arg_settings_trusted && plus != 0) {
3056 if (settings->capability != 0)
3057 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3058 } else
3059 arg_caps_retain |= plus;
3060
3061 arg_caps_retain &= ~settings->drop_capability;
3062 }
3063
3064 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3065 settings->kill_signal > 0)
3066 arg_kill_signal = settings->kill_signal;
3067
3068 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3069 settings->personality != PERSONALITY_INVALID)
3070 arg_personality = settings->personality;
3071
3072 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3073 !sd_id128_is_null(settings->machine_id)) {
3074
3075 if (!arg_settings_trusted)
3076 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3077 else
3078 arg_uuid = settings->machine_id;
3079 }
3080
3081 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3082 settings->read_only >= 0)
3083 arg_read_only = settings->read_only;
3084
3085 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3086 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3087 arg_volatile_mode = settings->volatile_mode;
3088
3089 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3090 settings->n_custom_mounts > 0) {
3091
3092 if (!arg_settings_trusted)
3093 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3094 else {
3095 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3096 arg_custom_mounts = settings->custom_mounts;
3097 arg_n_custom_mounts = settings->n_custom_mounts;
3098
3099 settings->custom_mounts = NULL;
3100 settings->n_custom_mounts = 0;
3101 }
3102 }
3103
3104 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3105 (settings->private_network >= 0 ||
3106 settings->network_veth >= 0 ||
3107 settings->network_bridge ||
3108 settings->network_zone ||
3109 settings->network_interfaces ||
3110 settings->network_macvlan ||
3111 settings->network_ipvlan ||
3112 settings->network_veth_extra)) {
3113
3114 if (!arg_settings_trusted)
3115 log_warning("Ignoring network settings, file %s is not trusted.", p);
3116 else {
3117 arg_network_veth = settings_network_veth(settings);
3118 arg_private_network = settings_private_network(settings);
3119
3120 strv_free(arg_network_interfaces);
3121 arg_network_interfaces = settings->network_interfaces;
3122 settings->network_interfaces = NULL;
3123
3124 strv_free(arg_network_macvlan);
3125 arg_network_macvlan = settings->network_macvlan;
3126 settings->network_macvlan = NULL;
3127
3128 strv_free(arg_network_ipvlan);
3129 arg_network_ipvlan = settings->network_ipvlan;
3130 settings->network_ipvlan = NULL;
3131
3132 strv_free(arg_network_veth_extra);
3133 arg_network_veth_extra = settings->network_veth_extra;
3134 settings->network_veth_extra = NULL;
3135
3136 free(arg_network_bridge);
3137 arg_network_bridge = settings->network_bridge;
3138 settings->network_bridge = NULL;
3139
3140 free(arg_network_zone);
3141 arg_network_zone = settings->network_zone;
3142 settings->network_zone = NULL;
3143 }
3144 }
3145
3146 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3147 settings->expose_ports) {
3148
3149 if (!arg_settings_trusted)
3150 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3151 else {
3152 expose_port_free_all(arg_expose_ports);
3153 arg_expose_ports = settings->expose_ports;
3154 settings->expose_ports = NULL;
3155 }
3156 }
3157
3158 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3159 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3160
3161 if (!arg_settings_trusted)
3162 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3163 else {
3164 arg_userns_mode = settings->userns_mode;
3165 arg_uid_shift = settings->uid_shift;
3166 arg_uid_range = settings->uid_range;
3167 arg_userns_chown = settings->userns_chown;
3168 }
3169 }
3170
3171 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3172 arg_notify_ready = settings->notify_ready;
3173
3174 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3175
3176 if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
3177 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p);
3178 else {
3179 strv_free(arg_syscall_whitelist);
3180 strv_free(arg_syscall_blacklist);
3181
3182 arg_syscall_whitelist = settings->syscall_whitelist;
3183 arg_syscall_blacklist = settings->syscall_blacklist;
3184
3185 settings->syscall_whitelist = settings->syscall_blacklist = NULL;
3186 }
3187 }
3188
3189 return 0;
3190 }
3191
3192 static int run(int master,
3193 const char* console,
3194 DissectedImage *dissected_image,
3195 bool interactive,
3196 bool secondary,
3197 FDSet *fds,
3198 char veth_name[IFNAMSIZ], bool *veth_created,
3199 union in_addr_union *exposed,
3200 pid_t *pid, int *ret) {
3201
3202 static const struct sigaction sa = {
3203 .sa_handler = nop_signal_handler,
3204 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
3205 };
3206
3207 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3208 _cleanup_close_ int etc_passwd_lock = -1;
3209 _cleanup_close_pair_ int
3210 kmsg_socket_pair[2] = { -1, -1 },
3211 rtnl_socket_pair[2] = { -1, -1 },
3212 pid_socket_pair[2] = { -1, -1 },
3213 uuid_socket_pair[2] = { -1, -1 },
3214 notify_socket_pair[2] = { -1, -1 },
3215 uid_shift_socket_pair[2] = { -1, -1 };
3216 _cleanup_close_ int notify_socket= -1;
3217 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3218 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
3219 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3220 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3221 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3222 ContainerStatus container_status = 0;
3223 char last_char = 0;
3224 int ifi = 0, r;
3225 ssize_t l;
3226 sigset_t mask_chld;
3227
3228 assert_se(sigemptyset(&mask_chld) == 0);
3229 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3230
3231 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3232 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3233 * check with getpwuid() if the specific user already exists. Note that /etc might be
3234 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3235 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3236 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3237 * really ours. */
3238
3239 etc_passwd_lock = take_etc_passwd_lock(NULL);
3240 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3241 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3242 }
3243
3244 r = barrier_create(&barrier);
3245 if (r < 0)
3246 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3247
3248 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3249 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3250
3251 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3252 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3253
3254 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3255 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3256
3257 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3258 return log_error_errno(errno, "Failed to create id socket pair: %m");
3259
3260 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3261 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3262
3263 if (arg_userns_mode != USER_NAMESPACE_NO)
3264 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3265 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3266
3267 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3268 * parent's blocking calls and give it a chance to call wait() and terminate. */
3269 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3270 if (r < 0)
3271 return log_error_errno(errno, "Failed to change the signal mask: %m");
3272
3273 r = sigaction(SIGCHLD, &sa, NULL);
3274 if (r < 0)
3275 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3276
3277 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3278 if (*pid < 0)
3279 return log_error_errno(errno, "clone() failed%s: %m",
3280 errno == EINVAL ?
3281 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3282
3283 if (*pid == 0) {
3284 /* The outer child only has a file system namespace. */
3285 barrier_set_role(&barrier, BARRIER_CHILD);
3286
3287 master = safe_close(master);
3288
3289 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3290 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3291 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3292 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3293 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3294 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3295
3296 (void) reset_all_signal_handlers();
3297 (void) reset_signal_mask();
3298
3299 r = outer_child(&barrier,
3300 arg_directory,
3301 console,
3302 dissected_image,
3303 interactive,
3304 secondary,
3305 pid_socket_pair[1],
3306 uuid_socket_pair[1],
3307 notify_socket_pair[1],
3308 kmsg_socket_pair[1],
3309 rtnl_socket_pair[1],
3310 uid_shift_socket_pair[1],
3311 fds);
3312 if (r < 0)
3313 _exit(EXIT_FAILURE);
3314
3315 _exit(EXIT_SUCCESS);
3316 }
3317
3318 barrier_set_role(&barrier, BARRIER_PARENT);
3319
3320 fds = fdset_free(fds);
3321
3322 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3323 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3324 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3325 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3326 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3327 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3328
3329 if (arg_userns_mode != USER_NAMESPACE_NO) {
3330 /* The child just let us know the UID shift it might have read from the image. */
3331 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3332 if (l < 0)
3333 return log_error_errno(errno, "Failed to read UID shift: %m");
3334 if (l != sizeof arg_uid_shift) {
3335 log_error("Short read while reading UID shift.");
3336 return -EIO;
3337 }
3338
3339 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3340 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3341 * image, but if that's already in use, pick a new one, and report back to the child,
3342 * which one we now picked. */
3343
3344 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3345 if (r < 0)
3346 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3347
3348 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3349 if (l < 0)
3350 return log_error_errno(errno, "Failed to send UID shift: %m");
3351 if (l != sizeof arg_uid_shift) {
3352 log_error("Short write while writing UID shift.");
3353 return -EIO;
3354 }
3355 }
3356 }
3357
3358 /* Wait for the outer child. */
3359 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
3360 if (r != 0)
3361 return r < 0 ? r : -EIO;
3362
3363 /* And now retrieve the PID of the inner child. */
3364 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3365 if (l < 0)
3366 return log_error_errno(errno, "Failed to read inner child PID: %m");
3367 if (l != sizeof *pid) {
3368 log_error("Short read while reading inner child PID.");
3369 return -EIO;
3370 }
3371
3372 /* We also retrieve container UUID in case it was generated by outer child */
3373 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3374 if (l < 0)
3375 return log_error_errno(errno, "Failed to read container machine ID: %m");
3376 if (l != sizeof(arg_uuid)) {
3377 log_error("Short read while reading container machined ID.");
3378 return -EIO;
3379 }
3380
3381 /* We also retrieve the socket used for notifications generated by outer child */
3382 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3383 if (notify_socket < 0)
3384 return log_error_errno(notify_socket,
3385 "Failed to receive notification socket from the outer child: %m");
3386
3387 log_debug("Init process invoked as PID "PID_FMT, *pid);
3388
3389 if (arg_userns_mode != USER_NAMESPACE_NO) {
3390 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3391 log_error("Child died too early.");
3392 return -ESRCH;
3393 }
3394
3395 r = setup_uid_map(*pid);
3396 if (r < 0)
3397 return r;
3398
3399 (void) barrier_place(&barrier); /* #2 */
3400 }
3401
3402 if (arg_private_network) {
3403
3404 r = move_network_interfaces(*pid, arg_network_interfaces);
3405 if (r < 0)
3406 return r;
3407
3408 if (arg_network_veth) {
3409 r = setup_veth(arg_machine, *pid, veth_name,
3410 arg_network_bridge || arg_network_zone);
3411 if (r < 0)
3412 return r;
3413 else if (r > 0)
3414 ifi = r;
3415
3416 if (arg_network_bridge) {
3417 /* Add the interface to a bridge */
3418 r = setup_bridge(veth_name, arg_network_bridge, false);
3419 if (r < 0)
3420 return r;
3421 if (r > 0)
3422 ifi = r;
3423 } else if (arg_network_zone) {
3424 /* Add the interface to a bridge, possibly creating it */
3425 r = setup_bridge(veth_name, arg_network_zone, true);
3426 if (r < 0)
3427 return r;
3428 if (r > 0)
3429 ifi = r;
3430 }
3431 }
3432
3433 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3434 if (r < 0)
3435 return r;
3436
3437 /* We created the primary and extra veth links now; let's remember this, so that we know to
3438 remove them later on. Note that we don't bother with removing veth links that were created
3439 here when their setup failed half-way, because in that case the kernel should be able to
3440 remove them on its own, since they cannot be referenced by anything yet. */
3441 *veth_created = true;
3442
3443 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3444 if (r < 0)
3445 return r;
3446
3447 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3448 if (r < 0)
3449 return r;
3450 }
3451
3452 if (arg_register) {
3453 r = register_machine(
3454 arg_machine,
3455 *pid,
3456 arg_directory,
3457 arg_uuid,
3458 ifi,
3459 arg_slice,
3460 arg_custom_mounts, arg_n_custom_mounts,
3461 arg_kill_signal,
3462 arg_property,
3463 arg_keep_unit,
3464 arg_container_service_name);
3465 if (r < 0)
3466 return r;
3467 } else if (!arg_keep_unit) {
3468 r = allocate_scope(
3469 arg_machine,
3470 *pid,
3471 arg_slice,
3472 arg_custom_mounts, arg_n_custom_mounts,
3473 arg_kill_signal,
3474 arg_property);
3475 if (r < 0)
3476 return r;
3477
3478 } else if (arg_slice || arg_property)
3479 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
3480
3481 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
3482 if (r < 0)
3483 return r;
3484
3485 if (arg_keep_unit) {
3486 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3487 if (r < 0)
3488 return r;
3489 }
3490
3491 r = chown_cgroup(*pid, arg_uid_shift);
3492 if (r < 0)
3493 return r;
3494
3495 /* Notify the child that the parent is ready with all
3496 * its setup (including cgroup-ification), and that
3497 * the child can now hand over control to the code to
3498 * run inside the container. */
3499 (void) barrier_place(&barrier); /* #3 */
3500
3501 /* Block SIGCHLD here, before notifying child.
3502 * process_pty() will handle it with the other signals. */
3503 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3504
3505 /* Reset signal to default */
3506 r = default_signals(SIGCHLD, -1);
3507 if (r < 0)
3508 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3509
3510 r = sd_event_new(&event);
3511 if (r < 0)
3512 return log_error_errno(r, "Failed to get default event source: %m");
3513
3514 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
3515 if (r < 0)
3516 return r;
3517
3518 /* Let the child know that we are ready and wait that the child is completely ready now. */
3519 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3520 log_error("Child died too early.");
3521 return -ESRCH;
3522 }
3523
3524 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3525 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3526 etc_passwd_lock = safe_close(etc_passwd_lock);
3527
3528 sd_notifyf(false,
3529 "STATUS=Container running.\n"
3530 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3531 if (!arg_notify_ready)
3532 sd_notify(false, "READY=1\n");
3533
3534 if (arg_kill_signal > 0) {
3535 /* Try to kill the init system on SIGINT or SIGTERM */
3536 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3537 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3538 } else {
3539 /* Immediately exit */
3540 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3541 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3542 }
3543
3544 /* Exit when the child exits */
3545 sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
3546
3547 if (arg_expose_ports) {
3548 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3549 if (r < 0)
3550 return r;
3551
3552 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3553 }
3554
3555 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3556
3557 r = pty_forward_new(event, master,
3558 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3559 &forward);
3560 if (r < 0)
3561 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3562
3563 r = sd_event_loop(event);
3564 if (r < 0)
3565 return log_error_errno(r, "Failed to run event loop: %m");
3566
3567 pty_forward_get_last_char(forward, &last_char);
3568
3569 forward = pty_forward_free(forward);
3570
3571 if (!arg_quiet && last_char != '\n')
3572 putc('\n', stdout);
3573
3574 /* Kill if it is not dead yet anyway */
3575 if (arg_register && !arg_keep_unit)
3576 terminate_machine(*pid);
3577
3578 /* Normally redundant, but better safe than sorry */
3579 (void) kill(*pid, SIGKILL);
3580
3581 r = wait_for_container(*pid, &container_status);
3582 *pid = 0;
3583
3584 if (r < 0)
3585 /* We failed to wait for the container, or the container exited abnormally. */
3586 return r;
3587 if (r > 0 || container_status == CONTAINER_TERMINATED) {
3588 /* r > 0 → The container exited with a non-zero status.
3589 * As a special case, we need to replace 133 with a different value,
3590 * because 133 is special-cased in the service file to reboot the container.
3591 * otherwise → The container exited with zero status and a reboot was not requested.
3592 */
3593 if (r == EXIT_FORCE_RESTART)
3594 r = EXIT_FAILURE; /* replace 133 with the general failure code */
3595 *ret = r;
3596 return 0; /* finito */
3597 }
3598
3599 /* CONTAINER_REBOOTED, loop again */
3600
3601 if (arg_keep_unit) {
3602 /* Special handling if we are running as a service: instead of simply
3603 * restarting the machine we want to restart the entire service, so let's
3604 * inform systemd about this with the special exit code 133. The service
3605 * file uses RestartForceExitStatus=133 so that this results in a full
3606 * nspawn restart. This is necessary since we might have cgroup parameters
3607 * set we want to have flushed out. */
3608 *ret = EXIT_FORCE_RESTART;
3609 return 0; /* finito */
3610 }
3611
3612 expose_port_flush(arg_expose_ports, exposed);
3613
3614 (void) remove_veth_links(veth_name, arg_network_veth_extra);
3615 *veth_created = false;
3616 return 1; /* loop again */
3617 }
3618
3619 int main(int argc, char *argv[]) {
3620
3621 _cleanup_free_ char *console = NULL;
3622 _cleanup_close_ int master = -1;
3623 _cleanup_fdset_free_ FDSet *fds = NULL;
3624 int r, n_fd_passed, ret = EXIT_SUCCESS;
3625 char veth_name[IFNAMSIZ] = "";
3626 bool secondary = false, remove_directory = false, remove_image = false;
3627 pid_t pid = 0;
3628 union in_addr_union exposed = {};
3629 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3630 bool interactive, veth_created = false, remove_tmprootdir = false;
3631 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
3632 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
3633 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
3634 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
3635
3636 log_parse_environment();
3637 log_open();
3638
3639 /* Make sure rename_process() in the stub init process can work */
3640 saved_argv = argv;
3641 saved_argc = argc;
3642
3643 r = parse_argv(argc, argv);
3644 if (r <= 0)
3645 goto finish;
3646
3647 if (geteuid() != 0) {
3648 log_error("Need to be root.");
3649 r = -EPERM;
3650 goto finish;
3651 }
3652 r = determine_names();
3653 if (r < 0)
3654 goto finish;
3655
3656 r = load_settings();
3657 if (r < 0)
3658 goto finish;
3659
3660 r = verify_arguments();
3661 if (r < 0)
3662 goto finish;
3663
3664 n_fd_passed = sd_listen_fds(false);
3665 if (n_fd_passed > 0) {
3666 r = fdset_new_listen_fds(&fds, false);
3667 if (r < 0) {
3668 log_error_errno(r, "Failed to collect file descriptors: %m");
3669 goto finish;
3670 }
3671 }
3672
3673 if (arg_directory) {
3674 assert(!arg_image);
3675
3676 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3677 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3678 r = -EINVAL;
3679 goto finish;
3680 }
3681
3682 if (arg_ephemeral) {
3683 _cleanup_free_ char *np = NULL;
3684
3685 r = chase_symlinks_and_update(&arg_directory, 0);
3686 if (r < 0)
3687 goto finish;
3688
3689 /* If the specified path is a mount point we
3690 * generate the new snapshot immediately
3691 * inside it under a random name. However if
3692 * the specified is not a mount point we
3693 * create the new snapshot in the parent
3694 * directory, just next to it. */
3695 r = path_is_mount_point(arg_directory, NULL, 0);
3696 if (r < 0) {
3697 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3698 goto finish;
3699 }
3700 if (r > 0)
3701 r = tempfn_random_child(arg_directory, "machine.", &np);
3702 else
3703 r = tempfn_random(arg_directory, "machine.", &np);
3704 if (r < 0) {
3705 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
3706 goto finish;
3707 }
3708
3709 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3710 if (r < 0) {
3711 log_error_errno(r, "Failed to lock %s: %m", np);
3712 goto finish;
3713 }
3714
3715 r = btrfs_subvol_snapshot(arg_directory, np,
3716 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3717 BTRFS_SNAPSHOT_FALLBACK_COPY |
3718 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3719 BTRFS_SNAPSHOT_RECURSIVE |
3720 BTRFS_SNAPSHOT_QUOTA);
3721 if (r < 0) {
3722 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3723 goto finish;
3724 }
3725
3726 free(arg_directory);
3727 arg_directory = np;
3728 np = NULL;
3729
3730 remove_directory = true;
3731
3732 } else {
3733 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
3734 if (r < 0)
3735 goto finish;
3736
3737 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3738 if (r == -EBUSY) {
3739 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3740 goto finish;
3741 }
3742 if (r < 0) {
3743 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3744 goto finish;
3745 }
3746
3747 if (arg_template) {
3748 r = chase_symlinks_and_update(&arg_template, 0);
3749 if (r < 0)
3750 goto finish;
3751
3752 r = btrfs_subvol_snapshot(arg_template, arg_directory,
3753 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3754 BTRFS_SNAPSHOT_FALLBACK_COPY |
3755 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3756 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
3757 BTRFS_SNAPSHOT_RECURSIVE |
3758 BTRFS_SNAPSHOT_QUOTA);
3759 if (r == -EEXIST) {
3760 if (!arg_quiet)
3761 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3762 } else if (r < 0) {
3763 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3764 goto finish;
3765 } else {
3766 if (!arg_quiet)
3767 log_info("Populated %s from template %s.", arg_directory, arg_template);
3768 }
3769 }
3770 }
3771
3772 if (arg_start_mode == START_BOOT) {
3773 if (path_is_os_tree(arg_directory) <= 0) {
3774 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3775 r = -EINVAL;
3776 goto finish;
3777 }
3778 } else {
3779 const char *p;
3780
3781 p = strjoina(arg_directory, "/usr/");
3782 if (laccess(p, F_OK) < 0) {
3783 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3784 r = -EINVAL;
3785 goto finish;
3786 }
3787 }
3788
3789 } else {
3790 assert(arg_image);
3791 assert(!arg_template);
3792
3793 r = chase_symlinks_and_update(&arg_image, 0);
3794 if (r < 0)
3795 goto finish;
3796
3797 if (arg_ephemeral) {
3798 _cleanup_free_ char *np = NULL;
3799
3800 r = tempfn_random(arg_image, "machine.", &np);
3801 if (r < 0) {
3802 log_error_errno(r, "Failed to generate name for image snapshot: %m");
3803 goto finish;
3804 }
3805
3806 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3807 if (r < 0) {
3808 r = log_error_errno(r, "Failed to create image lock: %m");
3809 goto finish;
3810 }
3811
3812 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
3813 if (r < 0) {
3814 r = log_error_errno(r, "Failed to copy image file: %m");
3815 goto finish;
3816 }
3817
3818 free(arg_image);
3819 arg_image = np;
3820 np = NULL;
3821
3822 remove_image = true;
3823 } else {
3824 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3825 if (r == -EBUSY) {
3826 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3827 goto finish;
3828 }
3829 if (r < 0) {
3830 r = log_error_errno(r, "Failed to create image lock: %m");
3831 goto finish;
3832 }
3833
3834 if (!arg_root_hash) {
3835 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
3836 if (r < 0) {
3837 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
3838 goto finish;
3839 }
3840 }
3841 }
3842
3843 if (!mkdtemp(tmprootdir)) {
3844 r = log_error_errno(errno, "Failed to create temporary directory: %m");
3845 goto finish;
3846 }
3847
3848 remove_tmprootdir = true;
3849
3850 arg_directory = strdup(tmprootdir);
3851 if (!arg_directory) {
3852 r = log_oom();
3853 goto finish;
3854 }
3855
3856 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
3857 if (r < 0) {
3858 log_error_errno(r, "Failed to set up loopback block device: %m");
3859 goto finish;
3860 }
3861
3862 r = dissect_image(
3863 loop->fd,
3864 arg_root_hash, arg_root_hash_size,
3865 DISSECT_IMAGE_REQUIRE_ROOT,
3866 &dissected_image);
3867 if (r == -ENOPKG) {
3868 log_error_errno(r, "Could not find a suitable file system or partition table in image: %s", arg_image);
3869
3870 log_notice("Note that the disk image needs to\n"
3871 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
3872 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
3873 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
3874 " d) or contain a file system without a partition table\n"
3875 "in order to be bootable with systemd-nspawn.");
3876 goto finish;
3877 }
3878 if (r == -EADDRNOTAVAIL) {
3879 log_error_errno(r, "No root partition for specified root hash found.");
3880 goto finish;
3881 }
3882 if (r == -EOPNOTSUPP) {
3883 log_error_errno(r, "--image= is not supported, compiled without blkid support.");
3884 goto finish;
3885 }
3886 if (r == -EPROTONOSUPPORT) {
3887 log_error_errno(r, "Device is loopback block device with partition scanning turned off, please turn it on.");
3888 goto finish;
3889 }
3890 if (r < 0) {
3891 log_error_errno(r, "Failed to dissect image: %m");
3892 goto finish;
3893 }
3894
3895 if (!arg_root_hash && dissected_image->can_verity)
3896 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
3897
3898 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
3899 if (r < 0)
3900 goto finish;
3901
3902 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
3903 if (remove_image && unlink(arg_image) >= 0)
3904 remove_image = false;
3905 }
3906
3907 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
3908 if (r < 0)
3909 goto finish;
3910
3911 r = detect_unified_cgroup_hierarchy(arg_directory);
3912 if (r < 0)
3913 goto finish;
3914
3915 interactive =
3916 isatty(STDIN_FILENO) > 0 &&
3917 isatty(STDOUT_FILENO) > 0;
3918
3919 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3920 if (master < 0) {
3921 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3922 goto finish;
3923 }
3924
3925 r = ptsname_malloc(master, &console);
3926 if (r < 0) {
3927 r = log_error_errno(r, "Failed to determine tty name: %m");
3928 goto finish;
3929 }
3930
3931 if (arg_selinux_apifs_context) {
3932 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3933 if (r < 0)
3934 goto finish;
3935 }
3936
3937 if (unlockpt(master) < 0) {
3938 r = log_error_errno(errno, "Failed to unlock tty: %m");
3939 goto finish;
3940 }
3941
3942 if (!arg_quiet)
3943 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3944 arg_machine, arg_image ?: arg_directory);
3945
3946 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3947
3948 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3949 r = log_error_errno(errno, "Failed to become subreaper: %m");
3950 goto finish;
3951 }
3952
3953 for (;;) {
3954 r = run(master,
3955 console,
3956 dissected_image,
3957 interactive, secondary,
3958 fds,
3959 veth_name, &veth_created,
3960 &exposed,
3961 &pid, &ret);
3962 if (r <= 0)
3963 break;
3964 }
3965
3966 finish:
3967 sd_notify(false,
3968 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
3969 "STOPPING=1\nSTATUS=Terminating...");
3970
3971 if (pid > 0)
3972 (void) kill(pid, SIGKILL);
3973
3974 /* Try to flush whatever is still queued in the pty */
3975 if (master >= 0) {
3976 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
3977 master = safe_close(master);
3978 }
3979
3980 if (pid > 0)
3981 (void) wait_for_terminate(pid, NULL);
3982
3983 if (remove_directory && arg_directory) {
3984 int k;
3985
3986 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
3987 if (k < 0)
3988 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
3989 }
3990
3991 if (remove_image && arg_image) {
3992 if (unlink(arg_image) < 0)
3993 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
3994 }
3995
3996 if (remove_tmprootdir) {
3997 if (rmdir(tmprootdir) < 0)
3998 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
3999 }
4000
4001 if (arg_machine) {
4002 const char *p;
4003
4004 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4005 (void) rm_rf(p, REMOVE_ROOT);
4006 }
4007
4008 expose_port_flush(arg_expose_ports, &exposed);
4009
4010 if (veth_created)
4011 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4012 (void) remove_bridge(arg_network_zone);
4013
4014 free(arg_directory);
4015 free(arg_template);
4016 free(arg_image);
4017 free(arg_machine);
4018 free(arg_user);
4019 free(arg_pivot_root_new);
4020 free(arg_pivot_root_old);
4021 free(arg_chdir);
4022 strv_free(arg_setenv);
4023 free(arg_network_bridge);
4024 strv_free(arg_network_interfaces);
4025 strv_free(arg_network_macvlan);
4026 strv_free(arg_network_ipvlan);
4027 strv_free(arg_network_veth_extra);
4028 strv_free(arg_parameters);
4029 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4030 expose_port_free_all(arg_expose_ports);
4031 free(arg_root_hash);
4032
4033 return r < 0 ? EXIT_FAILURE : ret;
4034 }