]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
tree-wide: use sd_id128_is_null() instead of sd_id128_equal where appropriate
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #ifdef HAVE_BLKID
21 #include <blkid/blkid.h>
22 #endif
23 #include <errno.h>
24 #include <getopt.h>
25 #include <grp.h>
26 #include <linux/loop.h>
27 #include <pwd.h>
28 #include <sched.h>
29 #ifdef HAVE_SELINUX
30 #include <selinux/selinux.h>
31 #endif
32 #include <signal.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <sys/file.h>
37 #include <sys/mount.h>
38 #include <sys/personality.h>
39 #include <sys/prctl.h>
40 #include <sys/types.h>
41 #include <unistd.h>
42
43 #include "sd-daemon.h"
44 #include "sd-id128.h"
45
46 #include "alloc-util.h"
47 #include "barrier.h"
48 #include "base-filesystem.h"
49 #include "blkid-util.h"
50 #include "btrfs-util.h"
51 #include "cap-list.h"
52 #include "capability-util.h"
53 #include "cgroup-util.h"
54 #include "copy.h"
55 #include "dev-setup.h"
56 #include "env-util.h"
57 #include "fd-util.h"
58 #include "fdset.h"
59 #include "fileio.h"
60 #include "formats-util.h"
61 #include "fs-util.h"
62 #include "gpt.h"
63 #include "hostname-util.h"
64 #include "log.h"
65 #include "loopback-setup.h"
66 #include "machine-id-setup.h"
67 #include "machine-image.h"
68 #include "macro.h"
69 #include "missing.h"
70 #include "mkdir.h"
71 #include "mount-util.h"
72 #include "netlink-util.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-expose-ports.h"
75 #include "nspawn-mount.h"
76 #include "nspawn-network.h"
77 #include "nspawn-patch-uid.h"
78 #include "nspawn-register.h"
79 #include "nspawn-settings.h"
80 #include "nspawn-setuid.h"
81 #include "nspawn-stub-pid1.h"
82 #include "nspawn-seccomp.h"
83 #include "parse-util.h"
84 #include "path-util.h"
85 #include "process-util.h"
86 #include "ptyfwd.h"
87 #include "random-util.h"
88 #include "raw-clone.h"
89 #include "rm-rf.h"
90 #include "selinux-util.h"
91 #include "signal-util.h"
92 #include "socket-util.h"
93 #include "stat-util.h"
94 #include "stdio-util.h"
95 #include "string-util.h"
96 #include "strv.h"
97 #include "terminal-util.h"
98 #include "udev-util.h"
99 #include "umask-util.h"
100 #include "user-util.h"
101 #include "util.h"
102
103 /* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
104 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
105 * may have their own allocation ranges too. */
106 #define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
107 #define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
108
109 /* nspawn listens on the socket whose path is given by the constant NSPAWN_NOTIFY_SOCKET_PATH.
110 * NSPAWN_NOTIFY_SOCKET_PATH is relative to the container.
111 * The init process in the container can send messages to nspawn over it, following the sd_notify(3) protocol. */
112 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
113
/* Status values for a container run. The code that produces and consumes these values is not part of this
 * excerpt. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
118
/* Journal linking mode, as selected with --link-journal= (or -j). The "try-" variants of the command line
 * option map to LINK_GUEST/LINK_HOST as well and are tracked in a separate boolean. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto */
        LINK_HOST  = 2,  /* --link-journal=host, try-host */
        LINK_GUEST = 3,  /* --link-journal=guest, try-guest, and -j */
} LinkJournal;
125
126 static char *arg_directory = NULL;
127 static char *arg_template = NULL;
128 static char *arg_chdir = NULL;
129 static char *arg_user = NULL;
130 static sd_id128_t arg_uuid = {};
131 static char *arg_machine = NULL;
132 static const char *arg_selinux_context = NULL;
133 static const char *arg_selinux_apifs_context = NULL;
134 static const char *arg_slice = NULL;
135 static bool arg_private_network = false;
136 static bool arg_read_only = false;
137 static StartMode arg_start_mode = START_PID1;
138 static bool arg_ephemeral = false;
139 static LinkJournal arg_link_journal = LINK_AUTO;
140 static bool arg_link_journal_try = false;
141 static uint64_t arg_caps_retain =
142 (1ULL << CAP_AUDIT_CONTROL) |
143 (1ULL << CAP_AUDIT_WRITE) |
144 (1ULL << CAP_CHOWN) |
145 (1ULL << CAP_DAC_OVERRIDE) |
146 (1ULL << CAP_DAC_READ_SEARCH) |
147 (1ULL << CAP_FOWNER) |
148 (1ULL << CAP_FSETID) |
149 (1ULL << CAP_IPC_OWNER) |
150 (1ULL << CAP_KILL) |
151 (1ULL << CAP_LEASE) |
152 (1ULL << CAP_LINUX_IMMUTABLE) |
153 (1ULL << CAP_MKNOD) |
154 (1ULL << CAP_NET_BIND_SERVICE) |
155 (1ULL << CAP_NET_BROADCAST) |
156 (1ULL << CAP_NET_RAW) |
157 (1ULL << CAP_SETFCAP) |
158 (1ULL << CAP_SETGID) |
159 (1ULL << CAP_SETPCAP) |
160 (1ULL << CAP_SETUID) |
161 (1ULL << CAP_SYS_ADMIN) |
162 (1ULL << CAP_SYS_BOOT) |
163 (1ULL << CAP_SYS_CHROOT) |
164 (1ULL << CAP_SYS_NICE) |
165 (1ULL << CAP_SYS_PTRACE) |
166 (1ULL << CAP_SYS_RESOURCE) |
167 (1ULL << CAP_SYS_TTY_CONFIG);
168 static CustomMount *arg_custom_mounts = NULL;
169 static unsigned arg_n_custom_mounts = 0;
170 static char **arg_setenv = NULL;
171 static bool arg_quiet = false;
172 static bool arg_share_system = false;
173 static bool arg_register = true;
174 static bool arg_keep_unit = false;
175 static char **arg_network_interfaces = NULL;
176 static char **arg_network_macvlan = NULL;
177 static char **arg_network_ipvlan = NULL;
178 static bool arg_network_veth = false;
179 static char **arg_network_veth_extra = NULL;
180 static char *arg_network_bridge = NULL;
181 static char *arg_network_zone = NULL;
182 static unsigned long arg_personality = PERSONALITY_INVALID;
183 static char *arg_image = NULL;
184 static VolatileMode arg_volatile_mode = VOLATILE_NO;
185 static ExposePort *arg_expose_ports = NULL;
186 static char **arg_property = NULL;
187 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
188 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
189 static bool arg_userns_chown = false;
190 static int arg_kill_signal = 0;
191 static bool arg_unified_cgroup_hierarchy = false;
192 static SettingsMask arg_settings_mask = 0;
193 static int arg_settings_trusted = -1;
194 static char **arg_parameters = NULL;
195 static const char *arg_container_service_name = "systemd-nspawn";
196 static bool arg_notify_ready = false;
197
/* Prints the command line synopsis and the list of supported options to standard output.
 *
 * Descriptions are aligned in one column, continuation lines are indented to that column, and option
 * names are spelled exactly as they are registered in parse_argv(). */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "  -a --as-pid2              Maintain a stub init as PID1, invoke binary as PID2\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "     --chdir=PATH           Set working directory in the container\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --property=NAME=VALUE  Set scope unit property\n"
               "  -U --private-users=pick   Run within user namespace, pick UID/GID range automatically\n"
               "     --private-users[=UIDBASE[:NUIDS]]\n"
               "                            Run within user namespace, user configured UID/GID range\n"
               "     --private-users-chown  Adjust OS tree file ownership for private UID/GID range\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-ipvlan=INTERFACE\n"
               "                            Create a ipvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "  -n --network-veth         Add a virtual Ethernet connection between host\n"
               "                            and container\n"
               "     --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
               "                            Add an additional virtual Ethernet link between\n"
               "                            host and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual Ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "     --network-zone=NAME    Add a virtual Ethernet connection to the container,\n"
               "                            and add it to an automatically managed bridge interface\n"
               "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               "                            Expose a container IP port on the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest,\n"
               "                            host, try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH[:OPTIONS]]\n"
               "                            Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH[:OPTIONS]]\n"
               "                            Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --overlay=PATH[:PATH...]:PATH\n"
               "                            Create an overlay mount from the host to\n"
               "                            the container\n"
               "     --overlay-ro=PATH[:PATH...]:PATH\n"
               "                            Similar, but creates a read-only overlay mount\n"
               "  -E --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n"
               "     --settings=BOOLEAN     Load additional settings from .nspawn file\n"
               "     --notify-ready=BOOLEAN Receive notifications from the container's init process\n"
               , program_invocation_short_name);
}
281
282 static int custom_mounts_prepare(void) {
283 unsigned i;
284 int r;
285
286 /* Ensure the mounts are applied prefix first. */
287 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
288
289 /* Allocate working directories for the overlay file systems that need it */
290 for (i = 0; i < arg_n_custom_mounts; i++) {
291 CustomMount *m = &arg_custom_mounts[i];
292
293 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
294
295 if (arg_userns_chown) {
296 log_error("--private-users-chown may not be combined with custom root mounts.");
297 return -EINVAL;
298 } else if (arg_uid_shift == UID_INVALID) {
299 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
300 return -EINVAL;
301 }
302 }
303
304 if (m->type != CUSTOM_MOUNT_OVERLAY)
305 continue;
306
307 if (m->work_dir)
308 continue;
309
310 if (m->read_only)
311 continue;
312
313 r = tempfn_random(m->source, NULL, &m->work_dir);
314 if (r < 0)
315 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
316 }
317
318 return 0;
319 }
320
321 static int detect_unified_cgroup_hierarchy(void) {
322 const char *e;
323 int r;
324
325 /* Allow the user to control whether the unified hierarchy is used */
326 e = getenv("UNIFIED_CGROUP_HIERARCHY");
327 if (e) {
328 r = parse_boolean(e);
329 if (r < 0)
330 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
331
332 arg_unified_cgroup_hierarchy = r;
333 return 0;
334 }
335
336 /* Otherwise inherit the default from the host system */
337 r = cg_unified();
338 if (r < 0)
339 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
340
341 arg_unified_cgroup_hierarchy = r;
342 return 0;
343 }
344
345 static int parse_argv(int argc, char *argv[]) {
346
347 enum {
348 ARG_VERSION = 0x100,
349 ARG_PRIVATE_NETWORK,
350 ARG_UUID,
351 ARG_READ_ONLY,
352 ARG_CAPABILITY,
353 ARG_DROP_CAPABILITY,
354 ARG_LINK_JOURNAL,
355 ARG_BIND,
356 ARG_BIND_RO,
357 ARG_TMPFS,
358 ARG_OVERLAY,
359 ARG_OVERLAY_RO,
360 ARG_SHARE_SYSTEM,
361 ARG_REGISTER,
362 ARG_KEEP_UNIT,
363 ARG_NETWORK_INTERFACE,
364 ARG_NETWORK_MACVLAN,
365 ARG_NETWORK_IPVLAN,
366 ARG_NETWORK_BRIDGE,
367 ARG_NETWORK_ZONE,
368 ARG_NETWORK_VETH_EXTRA,
369 ARG_PERSONALITY,
370 ARG_VOLATILE,
371 ARG_TEMPLATE,
372 ARG_PROPERTY,
373 ARG_PRIVATE_USERS,
374 ARG_KILL_SIGNAL,
375 ARG_SETTINGS,
376 ARG_CHDIR,
377 ARG_PRIVATE_USERS_CHOWN,
378 ARG_NOTIFY_READY,
379 };
380
381 static const struct option options[] = {
382 { "help", no_argument, NULL, 'h' },
383 { "version", no_argument, NULL, ARG_VERSION },
384 { "directory", required_argument, NULL, 'D' },
385 { "template", required_argument, NULL, ARG_TEMPLATE },
386 { "ephemeral", no_argument, NULL, 'x' },
387 { "user", required_argument, NULL, 'u' },
388 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
389 { "as-pid2", no_argument, NULL, 'a' },
390 { "boot", no_argument, NULL, 'b' },
391 { "uuid", required_argument, NULL, ARG_UUID },
392 { "read-only", no_argument, NULL, ARG_READ_ONLY },
393 { "capability", required_argument, NULL, ARG_CAPABILITY },
394 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
395 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
396 { "bind", required_argument, NULL, ARG_BIND },
397 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
398 { "tmpfs", required_argument, NULL, ARG_TMPFS },
399 { "overlay", required_argument, NULL, ARG_OVERLAY },
400 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
401 { "machine", required_argument, NULL, 'M' },
402 { "slice", required_argument, NULL, 'S' },
403 { "setenv", required_argument, NULL, 'E' },
404 { "selinux-context", required_argument, NULL, 'Z' },
405 { "selinux-apifs-context", required_argument, NULL, 'L' },
406 { "quiet", no_argument, NULL, 'q' },
407 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
408 { "register", required_argument, NULL, ARG_REGISTER },
409 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
410 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
411 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
412 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
413 { "network-veth", no_argument, NULL, 'n' },
414 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
415 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
416 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
417 { "personality", required_argument, NULL, ARG_PERSONALITY },
418 { "image", required_argument, NULL, 'i' },
419 { "volatile", optional_argument, NULL, ARG_VOLATILE },
420 { "port", required_argument, NULL, 'p' },
421 { "property", required_argument, NULL, ARG_PROPERTY },
422 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
423 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN},
424 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
425 { "settings", required_argument, NULL, ARG_SETTINGS },
426 { "chdir", required_argument, NULL, ARG_CHDIR },
427 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
428 {}
429 };
430
431 int c, r;
432 const char *p, *e;
433 uint64_t plus = 0, minus = 0;
434 bool mask_all_settings = false, mask_no_settings = false;
435
436 assert(argc >= 0);
437 assert(argv);
438
439 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
440
441 switch (c) {
442
443 case 'h':
444 help();
445 return 0;
446
447 case ARG_VERSION:
448 return version();
449
450 case 'D':
451 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
452 if (r < 0)
453 return r;
454 break;
455
456 case ARG_TEMPLATE:
457 r = parse_path_argument_and_warn(optarg, false, &arg_template);
458 if (r < 0)
459 return r;
460 break;
461
462 case 'i':
463 r = parse_path_argument_and_warn(optarg, false, &arg_image);
464 if (r < 0)
465 return r;
466 break;
467
468 case 'x':
469 arg_ephemeral = true;
470 break;
471
472 case 'u':
473 r = free_and_strdup(&arg_user, optarg);
474 if (r < 0)
475 return log_oom();
476
477 arg_settings_mask |= SETTING_USER;
478 break;
479
480 case ARG_NETWORK_ZONE: {
481 char *j;
482
483 j = strappend("vz-", optarg);
484 if (!j)
485 return log_oom();
486
487 if (!ifname_valid(j)) {
488 log_error("Network zone name not valid: %s", j);
489 free(j);
490 return -EINVAL;
491 }
492
493 free(arg_network_zone);
494 arg_network_zone = j;
495
496 arg_network_veth = true;
497 arg_private_network = true;
498 arg_settings_mask |= SETTING_NETWORK;
499 break;
500 }
501
502 case ARG_NETWORK_BRIDGE:
503
504 if (!ifname_valid(optarg)) {
505 log_error("Bridge interface name not valid: %s", optarg);
506 return -EINVAL;
507 }
508
509 r = free_and_strdup(&arg_network_bridge, optarg);
510 if (r < 0)
511 return log_oom();
512
513 /* fall through */
514
515 case 'n':
516 arg_network_veth = true;
517 arg_private_network = true;
518 arg_settings_mask |= SETTING_NETWORK;
519 break;
520
521 case ARG_NETWORK_VETH_EXTRA:
522 r = veth_extra_parse(&arg_network_veth_extra, optarg);
523 if (r < 0)
524 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
525
526 arg_private_network = true;
527 arg_settings_mask |= SETTING_NETWORK;
528 break;
529
530 case ARG_NETWORK_INTERFACE:
531
532 if (!ifname_valid(optarg)) {
533 log_error("Network interface name not valid: %s", optarg);
534 return -EINVAL;
535 }
536
537 if (strv_extend(&arg_network_interfaces, optarg) < 0)
538 return log_oom();
539
540 arg_private_network = true;
541 arg_settings_mask |= SETTING_NETWORK;
542 break;
543
544 case ARG_NETWORK_MACVLAN:
545
546 if (!ifname_valid(optarg)) {
547 log_error("MACVLAN network interface name not valid: %s", optarg);
548 return -EINVAL;
549 }
550
551 if (strv_extend(&arg_network_macvlan, optarg) < 0)
552 return log_oom();
553
554 arg_private_network = true;
555 arg_settings_mask |= SETTING_NETWORK;
556 break;
557
558 case ARG_NETWORK_IPVLAN:
559
560 if (!ifname_valid(optarg)) {
561 log_error("IPVLAN network interface name not valid: %s", optarg);
562 return -EINVAL;
563 }
564
565 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
566 return log_oom();
567
568 /* fall through */
569
570 case ARG_PRIVATE_NETWORK:
571 arg_private_network = true;
572 arg_settings_mask |= SETTING_NETWORK;
573 break;
574
575 case 'b':
576 if (arg_start_mode == START_PID2) {
577 log_error("--boot and --as-pid2 may not be combined.");
578 return -EINVAL;
579 }
580
581 arg_start_mode = START_BOOT;
582 arg_settings_mask |= SETTING_START_MODE;
583 break;
584
585 case 'a':
586 if (arg_start_mode == START_BOOT) {
587 log_error("--boot and --as-pid2 may not be combined.");
588 return -EINVAL;
589 }
590
591 arg_start_mode = START_PID2;
592 arg_settings_mask |= SETTING_START_MODE;
593 break;
594
595 case ARG_UUID:
596 r = sd_id128_from_string(optarg, &arg_uuid);
597 if (r < 0) {
598 log_error("Invalid UUID: %s", optarg);
599 return r;
600 }
601
602 arg_settings_mask |= SETTING_MACHINE_ID;
603 break;
604
605 case 'S':
606 arg_slice = optarg;
607 break;
608
609 case 'M':
610 if (isempty(optarg))
611 arg_machine = mfree(arg_machine);
612 else {
613 if (!machine_name_is_valid(optarg)) {
614 log_error("Invalid machine name: %s", optarg);
615 return -EINVAL;
616 }
617
618 r = free_and_strdup(&arg_machine, optarg);
619 if (r < 0)
620 return log_oom();
621
622 break;
623 }
624
625 case 'Z':
626 arg_selinux_context = optarg;
627 break;
628
629 case 'L':
630 arg_selinux_apifs_context = optarg;
631 break;
632
633 case ARG_READ_ONLY:
634 arg_read_only = true;
635 arg_settings_mask |= SETTING_READ_ONLY;
636 break;
637
638 case ARG_CAPABILITY:
639 case ARG_DROP_CAPABILITY: {
640 p = optarg;
641 for (;;) {
642 _cleanup_free_ char *t = NULL;
643
644 r = extract_first_word(&p, &t, ",", 0);
645 if (r < 0)
646 return log_error_errno(r, "Failed to parse capability %s.", t);
647
648 if (r == 0)
649 break;
650
651 if (streq(t, "all")) {
652 if (c == ARG_CAPABILITY)
653 plus = (uint64_t) -1;
654 else
655 minus = (uint64_t) -1;
656 } else {
657 int cap;
658
659 cap = capability_from_name(t);
660 if (cap < 0) {
661 log_error("Failed to parse capability %s.", t);
662 return -EINVAL;
663 }
664
665 if (c == ARG_CAPABILITY)
666 plus |= 1ULL << (uint64_t) cap;
667 else
668 minus |= 1ULL << (uint64_t) cap;
669 }
670 }
671
672 arg_settings_mask |= SETTING_CAPABILITY;
673 break;
674 }
675
676 case 'j':
677 arg_link_journal = LINK_GUEST;
678 arg_link_journal_try = true;
679 break;
680
681 case ARG_LINK_JOURNAL:
682 if (streq(optarg, "auto")) {
683 arg_link_journal = LINK_AUTO;
684 arg_link_journal_try = false;
685 } else if (streq(optarg, "no")) {
686 arg_link_journal = LINK_NO;
687 arg_link_journal_try = false;
688 } else if (streq(optarg, "guest")) {
689 arg_link_journal = LINK_GUEST;
690 arg_link_journal_try = false;
691 } else if (streq(optarg, "host")) {
692 arg_link_journal = LINK_HOST;
693 arg_link_journal_try = false;
694 } else if (streq(optarg, "try-guest")) {
695 arg_link_journal = LINK_GUEST;
696 arg_link_journal_try = true;
697 } else if (streq(optarg, "try-host")) {
698 arg_link_journal = LINK_HOST;
699 arg_link_journal_try = true;
700 } else {
701 log_error("Failed to parse link journal mode %s", optarg);
702 return -EINVAL;
703 }
704
705 break;
706
707 case ARG_BIND:
708 case ARG_BIND_RO:
709 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
710 if (r < 0)
711 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
712
713 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
714 break;
715
716 case ARG_TMPFS:
717 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
718 if (r < 0)
719 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
720
721 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
722 break;
723
724 case ARG_OVERLAY:
725 case ARG_OVERLAY_RO: {
726 _cleanup_free_ char *upper = NULL, *destination = NULL;
727 _cleanup_strv_free_ char **lower = NULL;
728 CustomMount *m;
729 unsigned n = 0;
730 char **i;
731
732 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
733 if (r == -ENOMEM)
734 return log_oom();
735 else if (r < 0) {
736 log_error("Invalid overlay specification: %s", optarg);
737 return r;
738 }
739
740 STRV_FOREACH(i, lower) {
741 if (!path_is_absolute(*i)) {
742 log_error("Overlay path %s is not absolute.", *i);
743 return -EINVAL;
744 }
745
746 n++;
747 }
748
749 if (n < 2) {
750 log_error("--overlay= needs at least two colon-separated directories specified.");
751 return -EINVAL;
752 }
753
754 if (n == 2) {
755 /* If two parameters are specified,
756 * the first one is the lower, the
757 * second one the upper directory. And
758 * we'll also define the destination
759 * mount point the same as the upper. */
760 upper = lower[1];
761 lower[1] = NULL;
762
763 destination = strdup(upper);
764 if (!destination)
765 return log_oom();
766
767 } else {
768 upper = lower[n - 2];
769 destination = lower[n - 1];
770 lower[n - 2] = NULL;
771 }
772
773 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
774 if (!m)
775 return log_oom();
776
777 m->destination = destination;
778 m->source = upper;
779 m->lower = lower;
780 m->read_only = c == ARG_OVERLAY_RO;
781
782 upper = destination = NULL;
783 lower = NULL;
784
785 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
786 break;
787 }
788
789 case 'E': {
790 char **n;
791
792 if (!env_assignment_is_valid(optarg)) {
793 log_error("Environment variable assignment '%s' is not valid.", optarg);
794 return -EINVAL;
795 }
796
797 n = strv_env_set(arg_setenv, optarg);
798 if (!n)
799 return log_oom();
800
801 strv_free(arg_setenv);
802 arg_setenv = n;
803
804 arg_settings_mask |= SETTING_ENVIRONMENT;
805 break;
806 }
807
808 case 'q':
809 arg_quiet = true;
810 break;
811
812 case ARG_SHARE_SYSTEM:
813 arg_share_system = true;
814 break;
815
816 case ARG_REGISTER:
817 r = parse_boolean(optarg);
818 if (r < 0) {
819 log_error("Failed to parse --register= argument: %s", optarg);
820 return r;
821 }
822
823 arg_register = r;
824 break;
825
826 case ARG_KEEP_UNIT:
827 arg_keep_unit = true;
828 break;
829
830 case ARG_PERSONALITY:
831
832 arg_personality = personality_from_string(optarg);
833 if (arg_personality == PERSONALITY_INVALID) {
834 log_error("Unknown or unsupported personality '%s'.", optarg);
835 return -EINVAL;
836 }
837
838 arg_settings_mask |= SETTING_PERSONALITY;
839 break;
840
841 case ARG_VOLATILE:
842
843 if (!optarg)
844 arg_volatile_mode = VOLATILE_YES;
845 else {
846 VolatileMode m;
847
848 m = volatile_mode_from_string(optarg);
849 if (m < 0) {
850 log_error("Failed to parse --volatile= argument: %s", optarg);
851 return -EINVAL;
852 } else
853 arg_volatile_mode = m;
854 }
855
856 arg_settings_mask |= SETTING_VOLATILE_MODE;
857 break;
858
859 case 'p':
860 r = expose_port_parse(&arg_expose_ports, optarg);
861 if (r == -EEXIST)
862 return log_error_errno(r, "Duplicate port specification: %s", optarg);
863 if (r < 0)
864 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
865
866 arg_settings_mask |= SETTING_EXPOSE_PORTS;
867 break;
868
869 case ARG_PROPERTY:
870 if (strv_extend(&arg_property, optarg) < 0)
871 return log_oom();
872
873 break;
874
875 case ARG_PRIVATE_USERS:
876
877 r = optarg ? parse_boolean(optarg) : 1;
878 if (r == 0) {
879 /* no: User namespacing off */
880 arg_userns_mode = USER_NAMESPACE_NO;
881 arg_uid_shift = UID_INVALID;
882 arg_uid_range = UINT32_C(0x10000);
883 } else if (r > 0) {
884 /* yes: User namespacing on, UID range is read from root dir */
885 arg_userns_mode = USER_NAMESPACE_FIXED;
886 arg_uid_shift = UID_INVALID;
887 arg_uid_range = UINT32_C(0x10000);
888 } else if (streq(optarg, "pick")) {
889 /* pick: User namespacing on, UID range is picked randomly */
890 arg_userns_mode = USER_NAMESPACE_PICK;
891 arg_uid_shift = UID_INVALID;
892 arg_uid_range = UINT32_C(0x10000);
893 } else {
894 _cleanup_free_ char *buffer = NULL;
895 const char *range, *shift;
896
897 /* anything else: User namespacing on, UID range is explicitly configured */
898
899 range = strchr(optarg, ':');
900 if (range) {
901 buffer = strndup(optarg, range - optarg);
902 if (!buffer)
903 return log_oom();
904 shift = buffer;
905
906 range++;
907 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
908 log_error("Failed to parse UID range: %s", range);
909 return -EINVAL;
910 }
911 } else
912 shift = optarg;
913
914 if (parse_uid(shift, &arg_uid_shift) < 0) {
915 log_error("Failed to parse UID: %s", optarg);
916 return -EINVAL;
917 }
918
919 arg_userns_mode = USER_NAMESPACE_FIXED;
920 }
921
922 arg_settings_mask |= SETTING_USERNS;
923 break;
924
925 case 'U':
926 if (userns_supported()) {
927 arg_userns_mode = USER_NAMESPACE_PICK;
928 arg_uid_shift = UID_INVALID;
929 arg_uid_range = UINT32_C(0x10000);
930
931 arg_settings_mask |= SETTING_USERNS;
932 }
933
934 break;
935
936 case ARG_PRIVATE_USERS_CHOWN:
937 arg_userns_chown = true;
938
939 arg_settings_mask |= SETTING_USERNS;
940 break;
941
942 case ARG_KILL_SIGNAL:
943 arg_kill_signal = signal_from_string_try_harder(optarg);
944 if (arg_kill_signal < 0) {
945 log_error("Cannot parse signal: %s", optarg);
946 return -EINVAL;
947 }
948
949 arg_settings_mask |= SETTING_KILL_SIGNAL;
950 break;
951
952 case ARG_SETTINGS:
953
954 /* no → do not read files
955 * yes → read files, do not override cmdline, trust only subset
956 * override → read files, override cmdline, trust only subset
957 * trusted → read files, do not override cmdline, trust all
958 */
959
960 r = parse_boolean(optarg);
961 if (r < 0) {
962 if (streq(optarg, "trusted")) {
963 mask_all_settings = false;
964 mask_no_settings = false;
965 arg_settings_trusted = true;
966
967 } else if (streq(optarg, "override")) {
968 mask_all_settings = false;
969 mask_no_settings = true;
970 arg_settings_trusted = -1;
971 } else
972 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
973 } else if (r > 0) {
974 /* yes */
975 mask_all_settings = false;
976 mask_no_settings = false;
977 arg_settings_trusted = -1;
978 } else {
979 /* no */
980 mask_all_settings = true;
981 mask_no_settings = false;
982 arg_settings_trusted = false;
983 }
984
985 break;
986
987 case ARG_CHDIR:
988 if (!path_is_absolute(optarg)) {
989 log_error("Working directory %s is not an absolute path.", optarg);
990 return -EINVAL;
991 }
992
993 r = free_and_strdup(&arg_chdir, optarg);
994 if (r < 0)
995 return log_oom();
996
997 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
998 break;
999
1000 case ARG_NOTIFY_READY:
1001 r = parse_boolean(optarg);
1002 if (r < 0) {
1003 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1004 return -EINVAL;
1005 }
1006 arg_notify_ready = r;
1007 arg_settings_mask |= SETTING_NOTIFY_READY;
1008 break;
1009
1010 case '?':
1011 return -EINVAL;
1012
1013 default:
1014 assert_not_reached("Unhandled option");
1015 }
1016
1017 if (arg_share_system)
1018 arg_register = false;
1019
1020 if (arg_userns_mode == USER_NAMESPACE_PICK)
1021 arg_userns_chown = true;
1022
1023 if (arg_start_mode != START_PID1 && arg_share_system) {
1024 log_error("--boot and --share-system may not be combined.");
1025 return -EINVAL;
1026 }
1027
1028 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1029 log_error("--keep-unit may not be used when invoked from a user session.");
1030 return -EINVAL;
1031 }
1032
1033 if (arg_directory && arg_image) {
1034 log_error("--directory= and --image= may not be combined.");
1035 return -EINVAL;
1036 }
1037
1038 if (arg_template && arg_image) {
1039 log_error("--template= and --image= may not be combined.");
1040 return -EINVAL;
1041 }
1042
1043 if (arg_template && !(arg_directory || arg_machine)) {
1044 log_error("--template= needs --directory= or --machine=.");
1045 return -EINVAL;
1046 }
1047
1048 if (arg_ephemeral && arg_template) {
1049 log_error("--ephemeral and --template= may not be combined.");
1050 return -EINVAL;
1051 }
1052
1053 if (arg_ephemeral && arg_image) {
1054 log_error("--ephemeral and --image= may not be combined.");
1055 return -EINVAL;
1056 }
1057
1058 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1059 log_error("--ephemeral and --link-journal= may not be combined.");
1060 return -EINVAL;
1061 }
1062
1063 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
1064 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1065 return -EOPNOTSUPP;
1066 }
1067
1068 if (arg_userns_chown && arg_read_only) {
1069 log_error("--read-only and --private-users-chown may not be combined.");
1070 return -EINVAL;
1071 }
1072
1073 if (arg_network_bridge && arg_network_zone) {
1074 log_error("--network-bridge= and --network-zone= may not be combined.");
1075 return -EINVAL;
1076 }
1077
1078 if (argc > optind) {
1079 arg_parameters = strv_copy(argv + optind);
1080 if (!arg_parameters)
1081 return log_oom();
1082
1083 arg_settings_mask |= SETTING_START_MODE;
1084 }
1085
1086 /* Load all settings from .nspawn files */
1087 if (mask_no_settings)
1088 arg_settings_mask = 0;
1089
1090 /* Don't load any settings from .nspawn files */
1091 if (mask_all_settings)
1092 arg_settings_mask = _SETTINGS_MASK_ALL;
1093
1094 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1095
1096 r = detect_unified_cgroup_hierarchy();
1097 if (r < 0)
1098 return r;
1099
1100 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1101 if (e)
1102 arg_container_service_name = e;
1103
1104 return 1;
1105 }
1106
1107 static int verify_arguments(void) {
1108
1109 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
1110 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1111 return -EINVAL;
1112 }
1113
1114 if (arg_expose_ports && !arg_private_network) {
1115 log_error("Cannot use --port= without private networking.");
1116 return -EINVAL;
1117 }
1118
1119 #ifndef HAVE_LIBIPTC
1120 if (arg_expose_ports) {
1121 log_error("--port= is not supported, compiled without libiptc support.");
1122 return -EOPNOTSUPP;
1123 }
1124 #endif
1125
1126 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1127 arg_kill_signal = SIGRTMIN+3;
1128
1129 return 0;
1130 }
1131
1132 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1133 assert(p);
1134
1135 if (arg_userns_mode == USER_NAMESPACE_NO)
1136 return 0;
1137
1138 if (uid == UID_INVALID && gid == GID_INVALID)
1139 return 0;
1140
1141 if (uid != UID_INVALID) {
1142 uid += arg_uid_shift;
1143
1144 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1145 return -EOVERFLOW;
1146 }
1147
1148 if (gid != GID_INVALID) {
1149 gid += (gid_t) arg_uid_shift;
1150
1151 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1152 return -EOVERFLOW;
1153 }
1154
1155 if (lchown(p, uid, gid) < 0)
1156 return -errno;
1157
1158 return 0;
1159 }
1160
/* Creates 'path' below 'root' with the given mode and, if the directory was
 * newly created, adjusts its ownership via userns_lchown().
 *
 * An already existing entry is left untouched (no chown) and reported as
 * success. Returns 0 on success, a negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1173
1174 static int setup_timezone(const char *dest) {
1175 _cleanup_free_ char *p = NULL, *q = NULL;
1176 const char *where, *check, *what;
1177 char *z, *y;
1178 int r;
1179
1180 assert(dest);
1181
1182 /* Fix the timezone, if possible */
1183 r = readlink_malloc("/etc/localtime", &p);
1184 if (r < 0) {
1185 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1186 return 0;
1187 }
1188
1189 z = path_startswith(p, "../usr/share/zoneinfo/");
1190 if (!z)
1191 z = path_startswith(p, "/usr/share/zoneinfo/");
1192 if (!z) {
1193 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1194 return 0;
1195 }
1196
1197 where = prefix_roota(dest, "/etc/localtime");
1198 r = readlink_malloc(where, &q);
1199 if (r >= 0) {
1200 y = path_startswith(q, "../usr/share/zoneinfo/");
1201 if (!y)
1202 y = path_startswith(q, "/usr/share/zoneinfo/");
1203
1204 /* Already pointing to the right place? Then do nothing .. */
1205 if (y && streq(y, z))
1206 return 0;
1207 }
1208
1209 check = strjoina("/usr/share/zoneinfo/", z);
1210 check = prefix_roota(dest, check);
1211 if (laccess(check, F_OK) < 0) {
1212 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1213 return 0;
1214 }
1215
1216 r = unlink(where);
1217 if (r < 0 && errno != ENOENT) {
1218 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1219 return 0;
1220 }
1221
1222 what = strjoina("../usr/share/zoneinfo/", z);
1223 if (symlink(what, where) < 0) {
1224 log_error_errno(errno, "Failed to correct timezone of container: %m");
1225 return 0;
1226 }
1227
1228 r = userns_lchown(where, 0, 0);
1229 if (r < 0)
1230 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1231
1232 return 0;
1233 }
1234
1235 static int setup_resolv_conf(const char *dest) {
1236 const char *where = NULL;
1237 int r;
1238
1239 assert(dest);
1240
1241 if (arg_private_network)
1242 return 0;
1243
1244 /* Fix resolv.conf, if possible */
1245 where = prefix_roota(dest, "/etc/resolv.conf");
1246
1247 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1248 if (r < 0) {
1249 /* If the file already exists as symlink, let's
1250 * suppress the warning, under the assumption that
1251 * resolved or something similar runs inside and the
1252 * symlink points there.
1253 *
1254 * If the disk image is read-only, there's also no
1255 * point in complaining.
1256 */
1257 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1258 "Failed to copy /etc/resolv.conf to %s: %m", where);
1259 return 0;
1260 }
1261
1262 r = userns_lchown(where, 0, 0);
1263 if (r < 0)
1264 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1265
1266 return 0;
1267 }
1268
1269 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1270 assert(s);
1271
1272 snprintf(s, 37,
1273 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1274 SD_ID128_FORMAT_VAL(id));
1275
1276 return s;
1277 }
1278
1279 static int setup_boot_id(const char *dest) {
1280 sd_id128_t rnd = SD_ID128_NULL;
1281 const char *from, *to;
1282 char as_uuid[37];
1283 int r;
1284
1285 if (arg_share_system)
1286 return 0;
1287
1288 /* Generate a new randomized boot ID, so that each boot-up of
1289 * the container gets a new one */
1290
1291 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1292 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1293
1294 r = sd_id128_randomize(&rnd);
1295 if (r < 0)
1296 return log_error_errno(r, "Failed to generate random boot id: %m");
1297
1298 id128_format_as_uuid(rnd, as_uuid);
1299
1300 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1301 if (r < 0)
1302 return log_error_errno(r, "Failed to write boot id: %m");
1303
1304 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1305 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1306 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1307 log_warning_errno(errno, "Failed to make boot id read-only, ignoring: %m");
1308
1309 (void) unlink(from);
1310 return r;
1311 }
1312
1313 static int copy_devnodes(const char *dest) {
1314
1315 static const char devnodes[] =
1316 "null\0"
1317 "zero\0"
1318 "full\0"
1319 "random\0"
1320 "urandom\0"
1321 "tty\0"
1322 "net/tun\0";
1323
1324 const char *d;
1325 int r = 0;
1326 _cleanup_umask_ mode_t u;
1327
1328 assert(dest);
1329
1330 u = umask(0000);
1331
1332 /* Create /dev/net, so that we can create /dev/net/tun in it */
1333 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1334 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1335
1336 NULSTR_FOREACH(d, devnodes) {
1337 _cleanup_free_ char *from = NULL, *to = NULL;
1338 struct stat st;
1339
1340 from = strappend("/dev/", d);
1341 to = prefix_root(dest, from);
1342
1343 if (stat(from, &st) < 0) {
1344
1345 if (errno != ENOENT)
1346 return log_error_errno(errno, "Failed to stat %s: %m", from);
1347
1348 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1349
1350 log_error("%s is not a char or block device, cannot copy.", from);
1351 return -EIO;
1352
1353 } else {
1354 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1355 if (errno != EPERM)
1356 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1357
1358 /* Some systems abusively restrict mknod but
1359 * allow bind mounts. */
1360 r = touch(to);
1361 if (r < 0)
1362 return log_error_errno(r, "touch (%s) failed: %m", to);
1363 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1364 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1365 }
1366
1367 r = userns_lchown(to, 0, 0);
1368 if (r < 0)
1369 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1370 }
1371 }
1372
1373 return r;
1374 }
1375
1376 static int setup_pts(const char *dest) {
1377 _cleanup_free_ char *options = NULL;
1378 const char *p;
1379 int r;
1380
1381 #ifdef HAVE_SELINUX
1382 if (arg_selinux_apifs_context)
1383 (void) asprintf(&options,
1384 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1385 arg_uid_shift + TTY_GID,
1386 arg_selinux_apifs_context);
1387 else
1388 #endif
1389 (void) asprintf(&options,
1390 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1391 arg_uid_shift + TTY_GID);
1392
1393 if (!options)
1394 return log_oom();
1395
1396 /* Mount /dev/pts itself */
1397 p = prefix_roota(dest, "/dev/pts");
1398 if (mkdir(p, 0755) < 0)
1399 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1400 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1401 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1402 r = userns_lchown(p, 0, 0);
1403 if (r < 0)
1404 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1405
1406 /* Create /dev/ptmx symlink */
1407 p = prefix_roota(dest, "/dev/ptmx");
1408 if (symlink("pts/ptmx", p) < 0)
1409 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1410 r = userns_lchown(p, 0, 0);
1411 if (r < 0)
1412 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1413
1414 /* And fix /dev/pts/ptmx ownership */
1415 p = prefix_roota(dest, "/dev/pts/ptmx");
1416 r = userns_lchown(p, 0, 0);
1417 if (r < 0)
1418 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1419
1420 return 0;
1421 }
1422
1423 static int setup_dev_console(const char *dest, const char *console) {
1424 _cleanup_umask_ mode_t u;
1425 const char *to;
1426 int r;
1427
1428 assert(dest);
1429 assert(console);
1430
1431 u = umask(0000);
1432
1433 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1434 if (r < 0)
1435 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1436
1437 /* We need to bind mount the right tty to /dev/console since
1438 * ptys can only exist on pts file systems. To have something
1439 * to bind mount things on we create a empty regular file. */
1440
1441 to = prefix_roota(dest, "/dev/console");
1442 r = touch(to);
1443 if (r < 0)
1444 return log_error_errno(r, "touch() for /dev/console failed: %m");
1445
1446 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1447 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1448
1449 return 0;
1450 }
1451
1452 static int setup_kmsg(const char *dest, int kmsg_socket) {
1453 const char *from, *to;
1454 _cleanup_umask_ mode_t u;
1455 int fd, r;
1456
1457 assert(kmsg_socket >= 0);
1458
1459 u = umask(0000);
1460
1461 /* We create the kmsg FIFO as /run/kmsg, but immediately
1462 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1463 * on the reading side behave very similar to /proc/kmsg,
1464 * their writing side behaves differently from /dev/kmsg in
1465 * that writing blocks when nothing is reading. In order to
1466 * avoid any problems with containers deadlocking due to this
1467 * we simply make /dev/kmsg unavailable to the container. */
1468 from = prefix_roota(dest, "/run/kmsg");
1469 to = prefix_roota(dest, "/proc/kmsg");
1470
1471 if (mkfifo(from, 0600) < 0)
1472 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1473 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1474 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1475
1476 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1477 if (fd < 0)
1478 return log_error_errno(errno, "Failed to open fifo: %m");
1479
1480 /* Store away the fd in the socket, so that it stays open as
1481 * long as we run the child */
1482 r = send_one_fd(kmsg_socket, fd, 0);
1483 safe_close(fd);
1484
1485 if (r < 0)
1486 return log_error_errno(r, "Failed to send FIFO fd: %m");
1487
1488 /* And now make the FIFO unavailable as /run/kmsg... */
1489 (void) unlink(from);
1490
1491 return 0;
1492 }
1493
1494 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1495 union in_addr_union *exposed = userdata;
1496
1497 assert(rtnl);
1498 assert(m);
1499 assert(exposed);
1500
1501 expose_port_execute(rtnl, arg_expose_ports, exposed);
1502 return 0;
1503 }
1504
1505 static int setup_hostname(void) {
1506
1507 if (arg_share_system)
1508 return 0;
1509
1510 if (sethostname_idempotent(arg_machine) < 0)
1511 return -errno;
1512
1513 return 0;
1514 }
1515
1516 static int setup_journal(const char *directory) {
1517 sd_id128_t this_id;
1518 _cleanup_free_ char *d = NULL;
1519 const char *p, *q;
1520 bool try;
1521 char id[33];
1522 int r;
1523
1524 /* Don't link journals in ephemeral mode */
1525 if (arg_ephemeral)
1526 return 0;
1527
1528 if (arg_link_journal == LINK_NO)
1529 return 0;
1530
1531 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1532
1533 r = sd_id128_get_machine(&this_id);
1534 if (r < 0)
1535 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1536
1537 if (sd_id128_equal(arg_uuid, this_id)) {
1538 log_full(try ? LOG_WARNING : LOG_ERR,
1539 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
1540 if (try)
1541 return 0;
1542 return -EEXIST;
1543 }
1544
1545 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1546 if (r < 0)
1547 return log_error_errno(r, "Failed to create /var: %m");
1548
1549 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1550 if (r < 0)
1551 return log_error_errno(r, "Failed to create /var/log: %m");
1552
1553 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1554 if (r < 0)
1555 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1556
1557 (void) sd_id128_to_string(arg_uuid, id);
1558
1559 p = strjoina("/var/log/journal/", id);
1560 q = prefix_roota(directory, p);
1561
1562 if (path_is_mount_point(p, 0) > 0) {
1563 if (try)
1564 return 0;
1565
1566 log_error("%s: already a mount point, refusing to use for journal", p);
1567 return -EEXIST;
1568 }
1569
1570 if (path_is_mount_point(q, 0) > 0) {
1571 if (try)
1572 return 0;
1573
1574 log_error("%s: already a mount point, refusing to use for journal", q);
1575 return -EEXIST;
1576 }
1577
1578 r = readlink_and_make_absolute(p, &d);
1579 if (r >= 0) {
1580 if ((arg_link_journal == LINK_GUEST ||
1581 arg_link_journal == LINK_AUTO) &&
1582 path_equal(d, q)) {
1583
1584 r = userns_mkdir(directory, p, 0755, 0, 0);
1585 if (r < 0)
1586 log_warning_errno(r, "Failed to create directory %s: %m", q);
1587 return 0;
1588 }
1589
1590 if (unlink(p) < 0)
1591 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1592 } else if (r == -EINVAL) {
1593
1594 if (arg_link_journal == LINK_GUEST &&
1595 rmdir(p) < 0) {
1596
1597 if (errno == ENOTDIR) {
1598 log_error("%s already exists and is neither a symlink nor a directory", p);
1599 return r;
1600 } else
1601 return log_error_errno(errno, "Failed to remove %s: %m", p);
1602 }
1603 } else if (r != -ENOENT)
1604 return log_error_errno(r, "readlink(%s) failed: %m", p);
1605
1606 if (arg_link_journal == LINK_GUEST) {
1607
1608 if (symlink(q, p) < 0) {
1609 if (try) {
1610 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1611 return 0;
1612 } else
1613 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1614 }
1615
1616 r = userns_mkdir(directory, p, 0755, 0, 0);
1617 if (r < 0)
1618 log_warning_errno(r, "Failed to create directory %s: %m", q);
1619 return 0;
1620 }
1621
1622 if (arg_link_journal == LINK_HOST) {
1623 /* don't create parents here — if the host doesn't have
1624 * permanent journal set up, don't force it here */
1625
1626 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
1627 if (try) {
1628 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1629 return 0;
1630 } else
1631 return log_error_errno(errno, "Failed to create %s: %m", p);
1632 }
1633
1634 } else if (access(p, F_OK) < 0)
1635 return 0;
1636
1637 if (dir_is_empty(q) == 0)
1638 log_warning("%s is not empty, proceeding anyway.", q);
1639
1640 r = userns_mkdir(directory, p, 0755, 0, 0);
1641 if (r < 0)
1642 return log_error_errno(r, "Failed to create %s: %m", q);
1643
1644 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1645 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1646
1647 return 0;
1648 }
1649
1650 static int drop_capabilities(void) {
1651 return capability_bounding_set_drop(arg_caps_retain, false);
1652 }
1653
1654 static int reset_audit_loginuid(void) {
1655 _cleanup_free_ char *p = NULL;
1656 int r;
1657
1658 if (arg_share_system)
1659 return 0;
1660
1661 r = read_one_line_file("/proc/self/loginuid", &p);
1662 if (r == -ENOENT)
1663 return 0;
1664 if (r < 0)
1665 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1666
1667 /* Already reset? */
1668 if (streq(p, "4294967295"))
1669 return 0;
1670
1671 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1672 if (r < 0) {
1673 log_error_errno(r,
1674 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1675 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1676 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1677 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1678 "using systemd-nspawn. Sleeping for 5s... (%m)");
1679
1680 sleep(5);
1681 }
1682
1683 return 0;
1684 }
1685
1686
1687 static int setup_propagate(const char *root) {
1688 const char *p, *q;
1689 int r;
1690
1691 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1692 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1693 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1694 (void) mkdir_p(p, 0600);
1695
1696 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1697 if (r < 0)
1698 return log_error_errno(r, "Failed to create /run/systemd: %m");
1699
1700 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1701 if (r < 0)
1702 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
1703
1704 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1705 if (r < 0)
1706 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
1707
1708 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1709 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1710 return log_error_errno(errno, "Failed to install propagation bind mount.");
1711
1712 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1713 return log_error_errno(errno, "Failed to make propagation mount read-only");
1714
1715 return 0;
1716 }
1717
1718 static int setup_image(char **device_path, int *loop_nr) {
1719 struct loop_info64 info = {
1720 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1721 };
1722 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1723 _cleanup_free_ char* loopdev = NULL;
1724 struct stat st;
1725 int r, nr;
1726
1727 assert(device_path);
1728 assert(loop_nr);
1729 assert(arg_image);
1730
1731 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1732 if (fd < 0)
1733 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1734
1735 if (fstat(fd, &st) < 0)
1736 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1737
1738 if (S_ISBLK(st.st_mode)) {
1739 char *p;
1740
1741 p = strdup(arg_image);
1742 if (!p)
1743 return log_oom();
1744
1745 *device_path = p;
1746
1747 *loop_nr = -1;
1748
1749 r = fd;
1750 fd = -1;
1751
1752 return r;
1753 }
1754
1755 if (!S_ISREG(st.st_mode)) {
1756 log_error("%s is not a regular file or block device.", arg_image);
1757 return -EINVAL;
1758 }
1759
1760 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1761 if (control < 0)
1762 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1763
1764 nr = ioctl(control, LOOP_CTL_GET_FREE);
1765 if (nr < 0)
1766 return log_error_errno(errno, "Failed to allocate loop device: %m");
1767
1768 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1769 return log_oom();
1770
1771 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1772 if (loop < 0)
1773 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1774
1775 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1776 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1777
1778 if (arg_read_only)
1779 info.lo_flags |= LO_FLAGS_READ_ONLY;
1780
1781 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1782 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1783
1784 *device_path = loopdev;
1785 loopdev = NULL;
1786
1787 *loop_nr = nr;
1788
1789 r = loop;
1790 loop = -1;
1791
1792 return r;
1793 }
1794
/* Explanatory text appended to the error messages emitted when no usable
 * root partition can be identified in a disk image. */
#define PARTITION_TABLE_BLURB                                                                                                     \
        "Note that the disk image needs to either contain only a single MBR partition of\n"                                      \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                                          \
        "to be bootable with systemd-nspawn."
1801
/* Inspects the partition table of the block device open as 'fd' and picks the
 * partitions to mount as root, /home and /srv.
 *
 * For GPT tables, partitions are selected by type UUID (GPT_HOME, GPT_SRV,
 * and - where defined for this architecture - GPT_ROOT_NATIVE and
 * GPT_ROOT_SECONDARY); for each of these the one with the lowest partition
 * number wins. Partitions flagged GPT_FLAG_NO_AUTO are ignored. For MBR
 * tables only partitions of type 0x83 carrying exactly the bootable flag are
 * considered. Partitions of the generic Linux type (GPT_LINUX_GENERIC, or the
 * MBR ones just described) serve as root file system only if no dedicated
 * root partition was found and exactly one such partition exists.
 *
 * On success 0 is returned, *root_device receives the device node of the
 * root partition (ownership passes to the caller), *root_device_rw whether it
 * may be mounted writable and *secondary whether the secondary architecture's
 * root partition was chosen. *home_device/*home_device_rw and
 * *srv_device/*srv_device_rw are only written if such a partition was found.
 * The *_rw pointers are dereferenced without a prior check.
 *
 * On failure a negative errno-style value is returned; -EOPNOTSUPP if
 * compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has picked up all partitions blkid knows
         * about, giving up after 10 rounds. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The comparison
                 * below is against m + 1, i.e. the enumeration is expected
                 * to contain one entry more than there are partitions. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        /* Now classify every partition */
        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Of several candidates, keep the lowest-numbered one */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Remember this as the *generic* candidate, so
                                 * that a second match is detected above and
                                 * rejected below, rather than silently
                                 * replacing the first one as root. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        /* Pick the root file system: native root first, then secondary
         * architecture root, then a lone generic partition. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2171
/* Mounts the block device 'what' on 'where' (with 'directory' appended to
 * 'where' if it is non-NULL), using the file system type detected by blkid.
 *
 * The mount is made read-only if 'rw' is false or arg_read_only is set, and
 * always carries MS_NODEV. LUKS volumes are refused with -EOPNOTSUPP.
 *
 * Returns 0 on success, a negative errno-style value on failure;
 * -EOPNOTSUPP if compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. In both cases
                 * the probe itself worked, but no single file system type
                 * could be determined. Genuine probe errors (-1) take the
                 * branch below, same as in dissect_image(). */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2233
2234 static int setup_machine_id(const char *directory) {
2235 const char *etc_machine_id, *t;
2236 _cleanup_free_ char *s = NULL;
2237 int r;
2238
2239 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2240
2241 r = read_one_line_file(etc_machine_id, &s);
2242 if (r < 0)
2243 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2244
2245 t = strstrip(s);
2246
2247 if (!isempty(t)) {
2248 r = sd_id128_from_string(t, &arg_uuid);
2249 if (r < 0)
2250 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2251 } else {
2252 if (sd_id128_is_null(arg_uuid)) {
2253 r = sd_id128_randomize(&arg_uuid);
2254 if (r < 0)
2255 return log_error_errno(r, "Failed to generate random machine ID: %m");
2256 }
2257 }
2258
2259 r = machine_id_setup(directory, arg_uuid);
2260 if (r < 0)
2261 return log_error_errno(r, "Failed to setup machine ID: %m");
2262
2263 return 0;
2264 }
2265
2266 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2267 int r;
2268
2269 assert(directory);
2270
2271 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
2272 return 0;
2273
2274 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2275 if (r == -EOPNOTSUPP)
2276 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2277 if (r == -EBADE)
2278 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2279 if (r < 0)
2280 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2281 if (r == 0)
2282 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2283 else
2284 log_debug("Patched directory tree to match UID/GID range.");
2285
2286 return r;
2287 }
2288
2289 static int mount_devices(
2290 const char *where,
2291 const char *root_device, bool root_device_rw,
2292 const char *home_device, bool home_device_rw,
2293 const char *srv_device, bool srv_device_rw) {
2294 int r;
2295
2296 assert(where);
2297
2298 if (root_device) {
2299 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2300 if (r < 0)
2301 return log_error_errno(r, "Failed to mount root directory: %m");
2302 }
2303
2304 if (home_device) {
2305 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2306 if (r < 0)
2307 return log_error_errno(r, "Failed to mount home directory: %m");
2308 }
2309
2310 if (srv_device) {
2311 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2312 if (r < 0)
2313 return log_error_errno(r, "Failed to mount server data directory: %m");
2314 }
2315
2316 return 0;
2317 }
2318
2319 static void loop_remove(int nr, int *image_fd) {
2320 _cleanup_close_ int control = -1;
2321 int r;
2322
2323 if (nr < 0)
2324 return;
2325
2326 if (image_fd && *image_fd >= 0) {
2327 r = ioctl(*image_fd, LOOP_CLR_FD);
2328 if (r < 0)
2329 log_debug_errno(errno, "Failed to close loop image: %m");
2330 *image_fd = safe_close(*image_fd);
2331 }
2332
2333 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2334 if (control < 0) {
2335 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2336 return;
2337 }
2338
2339 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2340 if (r < 0)
2341 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2342 }
2343
2344 /*
2345 * Return values:
2346 * < 0 : wait_for_terminate() failed to get the state of the
2347 * container, the container was terminated by a signal, or
2348 * failed for an unknown reason. No change is made to the
2349 * container argument.
2350 * > 0 : The program executed in the container terminated with an
2351 * error. The exit code of the program executed in the
2352 * container is returned. The container argument has been set
2353 * to CONTAINER_TERMINATED.
2354 * 0 : The container is being rebooted, has been shut down or exited
2355 * successfully. The container argument has been set to either
2356 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2357 *
2358 * That is, success is indicated by a return value of zero, and an
2359 * error is indicated by a non-zero value.
2360 */
2361 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2362 siginfo_t status;
2363 int r;
2364
2365 r = wait_for_terminate(pid, &status);
2366 if (r < 0)
2367 return log_warning_errno(r, "Failed to wait for container: %m");
2368
2369 switch (status.si_code) {
2370
2371 case CLD_EXITED:
2372 if (status.si_status == 0)
2373 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2374 else
2375 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2376
2377 *container = CONTAINER_TERMINATED;
2378 return status.si_status;
2379
2380 case CLD_KILLED:
2381 if (status.si_status == SIGINT) {
2382 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2383 *container = CONTAINER_TERMINATED;
2384 return 0;
2385
2386 } else if (status.si_status == SIGHUP) {
2387 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2388 *container = CONTAINER_REBOOTED;
2389 return 0;
2390 }
2391
2392 /* CLD_KILLED fallthrough */
2393
2394 case CLD_DUMPED:
2395 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2396 return -EIO;
2397
2398 default:
2399 log_error("Container %s failed due to unknown reason.", arg_machine);
2400 return -EIO;
2401 }
2402 }
2403
2404 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2405 pid_t pid;
2406
2407 pid = PTR_TO_PID(userdata);
2408 if (pid > 0) {
2409 if (kill(pid, arg_kill_signal) >= 0) {
2410 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2411 sd_event_source_set_userdata(s, NULL);
2412 return 0;
2413 }
2414 }
2415
2416 sd_event_exit(sd_event_source_get_event(s), 0);
2417 return 0;
2418 }
2419
2420 static int determine_names(void) {
2421 int r;
2422
2423 if (arg_template && !arg_directory && arg_machine) {
2424
2425 /* If --template= was specified then we should not
2426 * search for a machine, but instead create a new one
2427 * in /var/lib/machine. */
2428
2429 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2430 if (!arg_directory)
2431 return log_oom();
2432 }
2433
2434 if (!arg_image && !arg_directory) {
2435 if (arg_machine) {
2436 _cleanup_(image_unrefp) Image *i = NULL;
2437
2438 r = image_find(arg_machine, &i);
2439 if (r < 0)
2440 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2441 else if (r == 0) {
2442 log_error("No image for machine '%s': %m", arg_machine);
2443 return -ENOENT;
2444 }
2445
2446 if (i->type == IMAGE_RAW)
2447 r = free_and_strdup(&arg_image, i->path);
2448 else
2449 r = free_and_strdup(&arg_directory, i->path);
2450 if (r < 0)
2451 return log_error_errno(r, "Invalid image directory: %m");
2452
2453 if (!arg_ephemeral)
2454 arg_read_only = arg_read_only || i->read_only;
2455 } else
2456 arg_directory = get_current_dir_name();
2457
2458 if (!arg_directory && !arg_machine) {
2459 log_error("Failed to determine path, please use -D or -i.");
2460 return -EINVAL;
2461 }
2462 }
2463
2464 if (!arg_machine) {
2465 if (arg_directory && path_equal(arg_directory, "/"))
2466 arg_machine = gethostname_malloc();
2467 else
2468 arg_machine = strdup(basename(arg_image ?: arg_directory));
2469
2470 if (!arg_machine)
2471 return log_oom();
2472
2473 hostname_cleanup(arg_machine);
2474 if (!machine_name_is_valid(arg_machine)) {
2475 log_error("Failed to determine machine name automatically, please use -M.");
2476 return -EINVAL;
2477 }
2478
2479 if (arg_ephemeral) {
2480 char *b;
2481
2482 /* Add a random suffix when this is an
2483 * ephemeral machine, so that we can run many
2484 * instances at once without manually having
2485 * to specify -M each time. */
2486
2487 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2488 return log_oom();
2489
2490 free(arg_machine);
2491 arg_machine = b;
2492 }
2493 }
2494
2495 return 0;
2496 }
2497
2498 static int determine_uid_shift(const char *directory) {
2499 int r;
2500
2501 if (arg_userns_mode == USER_NAMESPACE_NO) {
2502 arg_uid_shift = 0;
2503 return 0;
2504 }
2505
2506 if (arg_uid_shift == UID_INVALID) {
2507 struct stat st;
2508
2509 r = stat(directory, &st);
2510 if (r < 0)
2511 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2512
2513 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2514
2515 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2516 log_error("UID and GID base of %s don't match.", directory);
2517 return -EINVAL;
2518 }
2519
2520 arg_uid_range = UINT32_C(0x10000);
2521 }
2522
2523 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2524 log_error("UID base too high for UID range.");
2525 return -EINVAL;
2526 }
2527
2528 return 0;
2529 }
2530
/* Body of the innermost child process, i.e. the process outer_child() creates with
 * raw_clone(). It finishes the in-container setup (API file systems, cgroup mount,
 * boot ID, kmsg, capabilities, hostname, personality, user switch), assembles the
 * environment block and finally executes the payload: an init system in START_BOOT
 * mode, the command line in arg_parameters if one was given, or a shell otherwise.
 *
 * Synchronization with the parent happens via 'barrier'; the sync points are numbered
 * #1 to #4 in the comments below.
 *
 * On success this function does not return, as the process image is replaced. It
 * returns a negative errno-style value if any setup step or the final exec fails. */
static int inner_child(
                Barrier *barrier,
                const char *directory,
                bool secondary,
                int kmsg_socket,
                int rtnl_socket,
                FDSet *fds) {

        _cleanup_free_ char *home = NULL;
        char as_uuid[37];
        /* Slot 0 of envp[] is statically taken by PATH=, hence start filling in at index 1 */
        unsigned n_env = 1;
        const char *envp[] = {
                "PATH=" DEFAULT_PATH_SPLIT_USR,
                NULL, /* container */
                NULL, /* TERM */
                NULL, /* HOME */
                NULL, /* USER */
                NULL, /* LOGNAME */
                NULL, /* container_uuid */
                NULL, /* LISTEN_FDS */
                NULL, /* LISTEN_PID */
                NULL, /* NOTIFY_SOCKET */
                NULL
        };

        _cleanup_strv_free_ char **env_use = NULL;
        int r;

        assert(barrier);
        assert(directory);
        assert(kmsg_socket >= 0);

        cg_unified_flush();

        if (arg_userns_mode != USER_NAMESPACE_NO) {
                /* Tell the parent, that it now can write the UID map. */
                (void) barrier_place(barrier); /* #1 */

                /* Wait until the parent wrote the UID map */
                if (!barrier_place_and_sync(barrier)) { /* #2 */
                        log_error("Parent died too early");
                        return -ESRCH;
                }
        }

        r = mount_all(NULL,
                      arg_userns_mode != USER_NAMESPACE_NO,
                      true,
                      arg_private_network,
                      arg_uid_shift,
                      arg_uid_range,
                      arg_selinux_apifs_context);

        if (r < 0)
                return r;

        r = mount_sysfs(NULL);
        if (r < 0)
                return r;

        /* Wait until we are cgroup-ified, so that we
         * can mount the right cgroup path writable */
        if (!barrier_place_and_sync(barrier)) { /* #3 */
                log_error("Parent died too early");
                return -ESRCH;
        }

        r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
        if (r < 0)
                return r;

        r = reset_uid_gid();
        if (r < 0)
                return log_error_errno(r, "Couldn't become new root: %m");

        r = setup_boot_id(NULL);
        if (r < 0)
                return r;

        r = setup_kmsg(NULL, kmsg_socket);
        if (r < 0)
                return r;
        kmsg_socket = safe_close(kmsg_socket);

        umask(0022);

        if (setsid() < 0)
                return log_error_errno(errno, "setsid() failed: %m");

        if (arg_private_network)
                loopback_setup();

        if (arg_expose_ports) {
                r = expose_port_send_rtnl(rtnl_socket);
                if (r < 0)
                        return r;
                rtnl_socket = safe_close(rtnl_socket);
        }

        r = drop_capabilities();
        if (r < 0)
                return log_error_errno(r, "drop_capabilities() failed: %m");

        setup_hostname();

        /* An explicitly configured personality wins; otherwise fall back to PER_LINUX32
         * if the caller flagged the image as "secondary". */
        if (arg_personality != PERSONALITY_INVALID) {
                if (personality(arg_personality) < 0)
                        return log_error_errno(errno, "personality() failed: %m");
        } else if (secondary) {
                if (personality(PER_LINUX32) < 0)
                        return log_error_errno(errno, "personality() failed: %m");
        }

#ifdef HAVE_SELINUX
        if (arg_selinux_context)
                if (setexeccon(arg_selinux_context) < 0)
                        return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
#endif

        r = change_uid_gid(arg_user, &home);
        if (r < 0)
                return r;

        /* LXC sets container=lxc, so follow the scheme here */
        envp[n_env++] = strjoina("container=", arg_container_service_name);

        /* Only consume a slot for TERM= if it is actually set in our own environment */
        envp[n_env] = strv_find_prefix(environ, "TERM=");
        if (envp[n_env])
                n_env++;

        if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
            (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
            (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
                return log_oom();

        assert(!sd_id128_is_null(arg_uuid));

        if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
                return log_oom();

        /* If file descriptors are to be passed to the payload, clear their O_CLOEXEC flag
         * and announce them via the LISTEN_FDS/LISTEN_PID variables. */
        if (fdset_size(fds) > 0) {
                r = fdset_cloexec(fds, false);
                if (r < 0)
                        return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");

                if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
                    (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
                        return log_oom();
        }
        if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
                return log_oom();

        /* Combine our own variables with the user-supplied ones (arg_setenv) */
        env_use = strv_env_merge(2, envp, arg_setenv);
        if (!env_use)
                return log_oom();

        /* Let the parent know that we are ready and
         * wait until the parent is ready with the
         * setup, too... */
        if (!barrier_place_and_sync(barrier)) { /* #4 */
                log_error("Parent died too early");
                return -ESRCH;
        }

        if (arg_chdir)
                if (chdir(arg_chdir) < 0)
                        return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);

        if (arg_start_mode == START_PID2) {
                r = stub_pid1();
                if (r < 0)
                        return r;
        }

        /* Now, explicitly close the log, so that we
         * then can close all remaining fds. Closing
         * the log explicitly first has the benefit
         * that the logging subsystem knows about it,
         * and is thus ready to be reopened should we
         * need it again. Note that the other fds
         * closed here are at least the locking and
         * barrier fds. */
        log_close();
        (void) fdset_close_others(fds);

        if (arg_start_mode == START_BOOT) {
                char **a;
                size_t m;

                /* Automatically search for the init system */

                /* Build an argv[] with room for the binary path in front of the
                 * user-supplied parameters, plus the terminating NULL. */
                m = strv_length(arg_parameters);
                a = newa(char*, m + 2);
                memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
                a[1 + m] = NULL;

                /* Try the candidate paths in order; execve() only returns on failure */
                a[0] = (char*) "/usr/lib/systemd/systemd";
                execve(a[0], a, env_use);

                a[0] = (char*) "/lib/systemd/systemd";
                execve(a[0], a, env_use);

                a[0] = (char*) "/sbin/init";
                execve(a[0], a, env_use);
        } else if (!strv_isempty(arg_parameters))
                execvpe(arg_parameters[0], arg_parameters, env_use);
        else {
                if (!arg_chdir)
                        /* If we cannot change the directory, we'll end up in /, that is expected. */
                        (void) chdir(home ?: "/root");

                execle("/bin/bash", "-bash", NULL, env_use);
                execle("/bin/sh", "-sh", NULL, env_use);
        }

        /* We only get here if every exec attempt above failed. Save errno before
         * reopening the log, then report the failure. */
        r = -errno;
        (void) log_open();
        return log_error_errno(r, "execv() failed: %m");
}
2750
2751 static int setup_sd_notify_child(void) {
2752 static const int one = 1;
2753 int fd = -1;
2754 union sockaddr_union sa = {
2755 .sa.sa_family = AF_UNIX,
2756 };
2757 int r;
2758
2759 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2760 if (fd < 0)
2761 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2762
2763 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2764 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2765
2766 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2767 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2768 if (r < 0) {
2769 safe_close(fd);
2770 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2771 }
2772
2773 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2774 if (r < 0) {
2775 safe_close(fd);
2776 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2777 }
2778
2779 return fd;
2780 }
2781
2782 static int outer_child(
2783 Barrier *barrier,
2784 const char *directory,
2785 const char *console,
2786 const char *root_device, bool root_device_rw,
2787 const char *home_device, bool home_device_rw,
2788 const char *srv_device, bool srv_device_rw,
2789 bool interactive,
2790 bool secondary,
2791 int pid_socket,
2792 int uuid_socket,
2793 int notify_socket,
2794 int kmsg_socket,
2795 int rtnl_socket,
2796 int uid_shift_socket,
2797 FDSet *fds) {
2798
2799 pid_t pid;
2800 ssize_t l;
2801 int r;
2802 _cleanup_close_ int fd = -1;
2803
2804 assert(barrier);
2805 assert(directory);
2806 assert(console);
2807 assert(pid_socket >= 0);
2808 assert(uuid_socket >= 0);
2809 assert(notify_socket >= 0);
2810 assert(kmsg_socket >= 0);
2811
2812 cg_unified_flush();
2813
2814 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2815 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2816
2817 if (interactive) {
2818 close_nointr(STDIN_FILENO);
2819 close_nointr(STDOUT_FILENO);
2820 close_nointr(STDERR_FILENO);
2821
2822 r = open_terminal(console, O_RDWR);
2823 if (r != STDIN_FILENO) {
2824 if (r >= 0) {
2825 safe_close(r);
2826 r = -EINVAL;
2827 }
2828
2829 return log_error_errno(r, "Failed to open console: %m");
2830 }
2831
2832 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2833 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2834 return log_error_errno(errno, "Failed to duplicate console: %m");
2835 }
2836
2837 r = reset_audit_loginuid();
2838 if (r < 0)
2839 return r;
2840
2841 /* Mark everything as slave, so that we still
2842 * receive mounts from the real root, but don't
2843 * propagate mounts to the real root. */
2844 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2845 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2846
2847 r = mount_devices(directory,
2848 root_device, root_device_rw,
2849 home_device, home_device_rw,
2850 srv_device, srv_device_rw);
2851 if (r < 0)
2852 return r;
2853
2854 r = determine_uid_shift(directory);
2855 if (r < 0)
2856 return r;
2857
2858 if (arg_userns_mode != USER_NAMESPACE_NO) {
2859 /* Let the parent know which UID shift we read from the image */
2860 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2861 if (l < 0)
2862 return log_error_errno(errno, "Failed to send UID shift: %m");
2863 if (l != sizeof(arg_uid_shift)) {
2864 log_error("Short write while sending UID shift.");
2865 return -EIO;
2866 }
2867
2868 if (arg_userns_mode == USER_NAMESPACE_PICK) {
2869 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2870 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2871 * not it will pick a different one, and send it back to us. */
2872
2873 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2874 if (l < 0)
2875 return log_error_errno(errno, "Failed to recv UID shift: %m");
2876 if (l != sizeof(arg_uid_shift)) {
2877 log_error("Short read while receiving UID shift.");
2878 return -EIO;
2879 }
2880 }
2881
2882 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2883 }
2884
2885 /* Turn directory into bind mount */
2886 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2887 return log_error_errno(errno, "Failed to make bind mount: %m");
2888
2889 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2890 if (r < 0)
2891 return r;
2892
2893 r = setup_volatile(
2894 directory,
2895 arg_volatile_mode,
2896 arg_userns_mode != USER_NAMESPACE_NO,
2897 arg_uid_shift,
2898 arg_uid_range,
2899 arg_selinux_context);
2900 if (r < 0)
2901 return r;
2902
2903 r = setup_volatile_state(
2904 directory,
2905 arg_volatile_mode,
2906 arg_userns_mode != USER_NAMESPACE_NO,
2907 arg_uid_shift,
2908 arg_uid_range,
2909 arg_selinux_context);
2910 if (r < 0)
2911 return r;
2912
2913 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2914 if (r < 0)
2915 return r;
2916
2917 if (arg_read_only) {
2918 r = bind_remount_recursive(directory, true);
2919 if (r < 0)
2920 return log_error_errno(r, "Failed to make tree read-only: %m");
2921 }
2922
2923 r = mount_all(directory,
2924 arg_userns_mode != USER_NAMESPACE_NO,
2925 false,
2926 arg_private_network,
2927 arg_uid_shift,
2928 arg_uid_range,
2929 arg_selinux_apifs_context);
2930 if (r < 0)
2931 return r;
2932
2933 r = copy_devnodes(directory);
2934 if (r < 0)
2935 return r;
2936
2937 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2938
2939 r = setup_pts(directory);
2940 if (r < 0)
2941 return r;
2942
2943 r = setup_propagate(directory);
2944 if (r < 0)
2945 return r;
2946
2947 r = setup_dev_console(directory, console);
2948 if (r < 0)
2949 return r;
2950
2951 r = setup_seccomp(arg_caps_retain);
2952 if (r < 0)
2953 return r;
2954
2955 r = setup_timezone(directory);
2956 if (r < 0)
2957 return r;
2958
2959 r = setup_resolv_conf(directory);
2960 if (r < 0)
2961 return r;
2962
2963 r = setup_machine_id(directory);
2964 if (r < 0)
2965 return r;
2966
2967 r = setup_journal(directory);
2968 if (r < 0)
2969 return r;
2970
2971 r = mount_custom(
2972 directory,
2973 arg_custom_mounts,
2974 arg_n_custom_mounts,
2975 arg_userns_mode != USER_NAMESPACE_NO,
2976 arg_uid_shift,
2977 arg_uid_range,
2978 arg_selinux_apifs_context);
2979 if (r < 0)
2980 return r;
2981
2982 r = mount_cgroups(
2983 directory,
2984 arg_unified_cgroup_hierarchy,
2985 arg_userns_mode != USER_NAMESPACE_NO,
2986 arg_uid_shift,
2987 arg_uid_range,
2988 arg_selinux_apifs_context);
2989 if (r < 0)
2990 return r;
2991
2992 r = mount_move_root(directory);
2993 if (r < 0)
2994 return log_error_errno(r, "Failed to move root directory: %m");
2995
2996 fd = setup_sd_notify_child();
2997 if (fd < 0)
2998 return fd;
2999
3000 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3001 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
3002 (arg_private_network ? CLONE_NEWNET : 0) |
3003 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3004 if (pid < 0)
3005 return log_error_errno(errno, "Failed to fork inner child: %m");
3006 if (pid == 0) {
3007 pid_socket = safe_close(pid_socket);
3008 uuid_socket = safe_close(uuid_socket);
3009 notify_socket = safe_close(notify_socket);
3010 uid_shift_socket = safe_close(uid_shift_socket);
3011
3012 /* The inner child has all namespaces that are
3013 * requested, so that we all are owned by the user if
3014 * user namespaces are turned on. */
3015
3016 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
3017 if (r < 0)
3018 _exit(EXIT_FAILURE);
3019
3020 _exit(EXIT_SUCCESS);
3021 }
3022
3023 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3024 if (l < 0)
3025 return log_error_errno(errno, "Failed to send PID: %m");
3026 if (l != sizeof(pid)) {
3027 log_error("Short write while sending PID.");
3028 return -EIO;
3029 }
3030
3031 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3032 if (l < 0)
3033 return log_error_errno(errno, "Failed to send machine ID: %m");
3034 if (l != sizeof(arg_uuid)) {
3035 log_error("Short write while sending machine ID.");
3036 return -EIO;
3037 }
3038
3039 l = send_one_fd(notify_socket, fd, 0);
3040 if (l < 0)
3041 return log_error_errno(errno, "Failed to send notify fd: %m");
3042
3043 pid_socket = safe_close(pid_socket);
3044 uuid_socket = safe_close(uuid_socket);
3045 notify_socket = safe_close(notify_socket);
3046 kmsg_socket = safe_close(kmsg_socket);
3047 rtnl_socket = safe_close(rtnl_socket);
3048
3049 return 0;
3050 }
3051
/* Picks a UID/GID base for the container that appears to be unused. Starting from
 * the value in *shift, candidates are tried until one is found that
 *
 *   - lies within UID_SHIFT_PICK_MIN..UID_SHIFT_PICK_MAX,
 *   - has its lower 16 bits clear,
 *   - can be locked via a lock file below /run/systemd/nspawn-uid/, and
 *   - has neither its first ID nor the ID at offset 0xFFFE present in the user
 *     and group databases.
 *
 * On success returns 0, stores the chosen base in *shift and hands the lock file
 * over to the caller in *ret_lock_file. Returns -EBUSY once the attempt counter is
 * exhausted, or another negative errno-style value if taking the lock fails for a
 * reason other than -EBUSY. */
static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
        unsigned n_tries = 100;
        uid_t candidate;
        int r;

        assert(shift);
        assert(ret_lock_file);
        assert(arg_userns_mode == USER_NAMESPACE_PICK);
        assert(arg_uid_range == 0x10000U);

        candidate = *shift;

        /* Best effort, a failure will surface when the lock file is created below */
        (void) mkdir("/run/systemd/nspawn-uid", 0755);

        for (;;) {
                char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
                /* Declared per iteration; presumably the cleanup handler releases a lock taken
                 * for a rejected candidate when the iteration ends - verify against the macro. */
                _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;

                /* The counter is decremented before each attempt, bounding the number of candidates tried */
                if (--n_tries <= 0)
                        return -EBUSY;

                if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
                        goto next;
                if ((candidate & UINT32_C(0xFFFF)) != 0)
                        goto next;

                xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
                r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
                if (r == -EBUSY) /* Range already taken by another nspawn instance */
                        goto next;
                if (r < 0)
                        return r;

                /* Make some superficial checks whether the range is currently known in the user database */
                if (getpwuid(candidate))
                        goto next;
                if (getpwuid(candidate + UINT32_C(0xFFFE)))
                        goto next;
                if (getgrgid(candidate))
                        goto next;
                if (getgrgid(candidate + UINT32_C(0xFFFE)))
                        goto next;

                /* Success: pass the lock on to the caller, and reset our copy so that the
                 * cleanup handler has nothing left to release. */
                *ret_lock_file = lf;
                lf = (struct LockFile) LOCK_FILE_INIT;
                *shift = candidate;
                return 0;

        next:
                /* Draw a new random candidate, map it into the permitted window and clear
                 * the lower 16 bits. */
                random_bytes(&candidate, sizeof(candidate));
                candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
                candidate &= (uid_t) UINT32_C(0xFFFF0000);
        }
}
3106
3107 static int setup_uid_map(pid_t pid) {
3108 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3109 int r;
3110
3111 assert(pid > 1);
3112
3113 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3114 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
3115 r = write_string_file(uid_map, line, 0);
3116 if (r < 0)
3117 return log_error_errno(r, "Failed to write UID map: %m");
3118
3119 /* We always assign the same UID and GID ranges */
3120 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
3121 r = write_string_file(uid_map, line, 0);
3122 if (r < 0)
3123 return log_error_errno(r, "Failed to write GID map: %m");
3124
3125 return 0;
3126 }
3127
/* I/O event handler for the notification socket. Reads one datagram, verifies
 * through the attached SCM_CREDENTIALS data that it was sent by the PID encoded in
 * 'userdata', and forwards two kinds of entries via sd_notifyf(): "READY=1" is passed
 * on as is, and the text of a "STATUS=" entry is passed on prefixed with
 * "Container running: ". Any file descriptors attached to the message are closed
 * right away.
 *
 * Returns 0 in all cases where the message is merely ignored, and a negative
 * errno-style value on read errors other than EAGAIN/EINTR or on allocation failure. */
static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
        char buf[NOTIFY_BUFFER_MAX+1];
        char *p = NULL;
        struct iovec iovec = {
                .iov_base = buf,
                /* Leave one byte of room for the NUL terminator appended below */
                .iov_len = sizeof(buf)-1,
        };
        /* Control buffer with room for one credentials structure plus up to NOTIFY_FD_MAX file descriptors */
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
                            CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
        } control = {};
        struct msghdr msghdr = {
                .msg_iov = &iovec,
                .msg_iovlen = 1,
                .msg_control = &control,
                .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cmsg;
        struct ucred *ucred = NULL;
        ssize_t n;
        pid_t inner_child_pid;
        _cleanup_strv_free_ char **tags = NULL;

        assert(userdata);

        /* The userdata pointer carries a PID value, not an address */
        inner_child_pid = PTR_TO_PID(userdata);

        if (revents != EPOLLIN) {
                log_warning("Got unexpected poll event for notify fd.");
                return 0;
        }

        n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
        if (n < 0) {
                if (errno == EAGAIN || errno == EINTR)
                        return 0;

                return log_warning_errno(errno, "Couldn't read notification socket: %m");
        }
        /* We have no use for passed file descriptors, get rid of them immediately */
        cmsg_close_all(&msghdr);

        /* Look for the sender's credentials among the control messages */
        CMSG_FOREACH(cmsg, &msghdr) {
                if (cmsg->cmsg_level == SOL_SOCKET &&
                    cmsg->cmsg_type == SCM_CREDENTIALS &&
                    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {

                        ucred = (struct ucred*) CMSG_DATA(cmsg);
                }
        }

        /* Only accept messages that provably originate from the expected PID */
        if (!ucred || ucred->pid != inner_child_pid) {
                log_warning("Received notify message without valid credentials. Ignoring.");
                return 0;
        }

        /* NOTE(review): since iov_len is sizeof(buf)-1, n presumably cannot reach sizeof(buf)
         * here, so this check looks like it never triggers - confirm. */
        if ((size_t) n >= sizeof(buf)) {
                log_warning("Received notify message exceeded maximum size. Ignoring.");
                return 0;
        }

        /* NUL-terminate the payload and split it into individual lines */
        buf[n] = 0;
        tags = strv_split(buf, "\n\r");
        if (!tags)
                return log_oom();

        if (strv_find(tags, "READY=1"))
                sd_notifyf(false, "READY=1\n");

        p = strv_find_startswith(tags, "STATUS=");
        if (p)
                sd_notifyf(false, "STATUS=Container running: %s", p);

        return 0;
}
3203
3204 static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) {
3205 int r;
3206 sd_event_source *notify_event_source;
3207
3208 r = sd_event_add_io(event, &notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3209 if (r < 0)
3210 return log_error_errno(r, "Failed to allocate notify event source: %m");
3211
3212 (void) sd_event_source_set_description(notify_event_source, "nspawn-notify");
3213
3214 return 0;
3215 }
3216
3217 static int load_settings(void) {
3218 _cleanup_(settings_freep) Settings *settings = NULL;
3219 _cleanup_fclose_ FILE *f = NULL;
3220 _cleanup_free_ char *p = NULL;
3221 const char *fn, *i;
3222 int r;
3223
3224 /* If all settings are masked, there's no point in looking for
3225 * the settings file */
3226 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3227 return 0;
3228
3229 fn = strjoina(arg_machine, ".nspawn");
3230
3231 /* We first look in the admin's directories in /etc and /run */
3232 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3233 _cleanup_free_ char *j = NULL;
3234
3235 j = strjoin(i, "/", fn, NULL);
3236 if (!j)
3237 return log_oom();
3238
3239 f = fopen(j, "re");
3240 if (f) {
3241 p = j;
3242 j = NULL;
3243
3244 /* By default, we trust configuration from /etc and /run */
3245 if (arg_settings_trusted < 0)
3246 arg_settings_trusted = true;
3247
3248 break;
3249 }
3250
3251 if (errno != ENOENT)
3252 return log_error_errno(errno, "Failed to open %s: %m", j);
3253 }
3254
3255 if (!f) {
3256 /* After that, let's look for a file next to the
3257 * actual image we shall boot. */
3258
3259 if (arg_image) {
3260 p = file_in_same_dir(arg_image, fn);
3261 if (!p)
3262 return log_oom();
3263 } else if (arg_directory) {
3264 p = file_in_same_dir(arg_directory, fn);
3265 if (!p)
3266 return log_oom();
3267 }
3268
3269 if (p) {
3270 f = fopen(p, "re");
3271 if (!f && errno != ENOENT)
3272 return log_error_errno(errno, "Failed to open %s: %m", p);
3273
3274 /* By default, we do not trust configuration from /var/lib/machines */
3275 if (arg_settings_trusted < 0)
3276 arg_settings_trusted = false;
3277 }
3278 }
3279
3280 if (!f)
3281 return 0;
3282
3283 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3284
3285 r = settings_load(f, p, &settings);
3286 if (r < 0)
3287 return r;
3288
3289 /* Copy over bits from the settings, unless they have been
3290 * explicitly masked by command line switches. */
3291
3292 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3293 settings->start_mode >= 0) {
3294 arg_start_mode = settings->start_mode;
3295
3296 strv_free(arg_parameters);
3297 arg_parameters = settings->parameters;
3298 settings->parameters = NULL;
3299 }
3300
3301 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3302 settings->working_directory) {
3303 free(arg_chdir);
3304 arg_chdir = settings->working_directory;
3305 settings->working_directory = NULL;
3306 }
3307
3308 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3309 settings->environment) {
3310 strv_free(arg_setenv);
3311 arg_setenv = settings->environment;
3312 settings->environment = NULL;
3313 }
3314
3315 if ((arg_settings_mask & SETTING_USER) == 0 &&
3316 settings->user) {
3317 free(arg_user);
3318 arg_user = settings->user;
3319 settings->user = NULL;
3320 }
3321
3322 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3323 uint64_t plus;
3324
3325 plus = settings->capability;
3326 if (settings_private_network(settings))
3327 plus |= (1ULL << CAP_NET_ADMIN);
3328
3329 if (!arg_settings_trusted && plus != 0) {
3330 if (settings->capability != 0)
3331 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3332 } else
3333 arg_caps_retain |= plus;
3334
3335 arg_caps_retain &= ~settings->drop_capability;
3336 }
3337
3338 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3339 settings->kill_signal > 0)
3340 arg_kill_signal = settings->kill_signal;
3341
3342 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3343 settings->personality != PERSONALITY_INVALID)
3344 arg_personality = settings->personality;
3345
3346 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3347 !sd_id128_is_null(settings->machine_id)) {
3348
3349 if (!arg_settings_trusted)
3350 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3351 else
3352 arg_uuid = settings->machine_id;
3353 }
3354
3355 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3356 settings->read_only >= 0)
3357 arg_read_only = settings->read_only;
3358
3359 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3360 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3361 arg_volatile_mode = settings->volatile_mode;
3362
3363 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3364 settings->n_custom_mounts > 0) {
3365
3366 if (!arg_settings_trusted)
3367 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3368 else {
3369 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3370 arg_custom_mounts = settings->custom_mounts;
3371 arg_n_custom_mounts = settings->n_custom_mounts;
3372
3373 settings->custom_mounts = NULL;
3374 settings->n_custom_mounts = 0;
3375 }
3376 }
3377
3378 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3379 (settings->private_network >= 0 ||
3380 settings->network_veth >= 0 ||
3381 settings->network_bridge ||
3382 settings->network_zone ||
3383 settings->network_interfaces ||
3384 settings->network_macvlan ||
3385 settings->network_ipvlan ||
3386 settings->network_veth_extra)) {
3387
3388 if (!arg_settings_trusted)
3389 log_warning("Ignoring network settings, file %s is not trusted.", p);
3390 else {
3391 arg_network_veth = settings_network_veth(settings);
3392 arg_private_network = settings_private_network(settings);
3393
3394 strv_free(arg_network_interfaces);
3395 arg_network_interfaces = settings->network_interfaces;
3396 settings->network_interfaces = NULL;
3397
3398 strv_free(arg_network_macvlan);
3399 arg_network_macvlan = settings->network_macvlan;
3400 settings->network_macvlan = NULL;
3401
3402 strv_free(arg_network_ipvlan);
3403 arg_network_ipvlan = settings->network_ipvlan;
3404 settings->network_ipvlan = NULL;
3405
3406 strv_free(arg_network_veth_extra);
3407 arg_network_veth_extra = settings->network_veth_extra;
3408 settings->network_veth_extra = NULL;
3409
3410 free(arg_network_bridge);
3411 arg_network_bridge = settings->network_bridge;
3412 settings->network_bridge = NULL;
3413
3414 free(arg_network_zone);
3415 arg_network_zone = settings->network_zone;
3416 settings->network_zone = NULL;
3417 }
3418 }
3419
3420 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3421 settings->expose_ports) {
3422
3423 if (!arg_settings_trusted)
3424 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3425 else {
3426 expose_port_free_all(arg_expose_ports);
3427 arg_expose_ports = settings->expose_ports;
3428 settings->expose_ports = NULL;
3429 }
3430 }
3431
3432 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3433 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3434
3435 if (!arg_settings_trusted)
3436 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3437 else {
3438 arg_userns_mode = settings->userns_mode;
3439 arg_uid_shift = settings->uid_shift;
3440 arg_uid_range = settings->uid_range;
3441 arg_userns_chown = settings->userns_chown;
3442 }
3443 }
3444
3445 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3446 arg_notify_ready = settings->notify_ready;
3447
3448 return 0;
3449 }
3450
3451 int main(int argc, char *argv[]) {
3452
3453 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3454 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3455 _cleanup_close_ int master = -1, image_fd = -1;
3456 _cleanup_fdset_free_ FDSet *fds = NULL;
3457 int r, n_fd_passed, loop_nr = -1;
3458 char veth_name[IFNAMSIZ] = "";
3459 bool secondary = false, remove_subvol = false;
3460 sigset_t mask_chld;
3461 pid_t pid = 0;
3462 int ret = EXIT_SUCCESS;
3463 union in_addr_union exposed = {};
3464 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3465 bool interactive, veth_created = false;
3466
3467 log_parse_environment();
3468 log_open();
3469
3470 /* Make sure rename_process() in the stub init process can work */
3471 saved_argv = argv;
3472 saved_argc = argc;
3473
3474 r = parse_argv(argc, argv);
3475 if (r <= 0)
3476 goto finish;
3477
3478 if (geteuid() != 0) {
3479 log_error("Need to be root.");
3480 r = -EPERM;
3481 goto finish;
3482 }
3483 r = determine_names();
3484 if (r < 0)
3485 goto finish;
3486
3487 r = load_settings();
3488 if (r < 0)
3489 goto finish;
3490
3491 r = verify_arguments();
3492 if (r < 0)
3493 goto finish;
3494
3495 n_fd_passed = sd_listen_fds(false);
3496 if (n_fd_passed > 0) {
3497 r = fdset_new_listen_fds(&fds, false);
3498 if (r < 0) {
3499 log_error_errno(r, "Failed to collect file descriptors: %m");
3500 goto finish;
3501 }
3502 }
3503
3504 if (arg_directory) {
3505 assert(!arg_image);
3506
3507 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3508 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3509 r = -EINVAL;
3510 goto finish;
3511 }
3512
3513 if (arg_ephemeral) {
3514 _cleanup_free_ char *np = NULL;
3515
3516 /* If the specified path is a mount point we
3517 * generate the new snapshot immediately
3518 * inside it under a random name. However if
3519 * the specified is not a mount point we
3520 * create the new snapshot in the parent
3521 * directory, just next to it. */
3522 r = path_is_mount_point(arg_directory, 0);
3523 if (r < 0) {
3524 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3525 goto finish;
3526 }
3527 if (r > 0)
3528 r = tempfn_random_child(arg_directory, "machine.", &np);
3529 else
3530 r = tempfn_random(arg_directory, "machine.", &np);
3531 if (r < 0) {
3532 log_error_errno(r, "Failed to generate name for snapshot: %m");
3533 goto finish;
3534 }
3535
3536 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3537 if (r < 0) {
3538 log_error_errno(r, "Failed to lock %s: %m", np);
3539 goto finish;
3540 }
3541
3542 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3543 if (r < 0) {
3544 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3545 goto finish;
3546 }
3547
3548 free(arg_directory);
3549 arg_directory = np;
3550 np = NULL;
3551
3552 remove_subvol = true;
3553
3554 } else {
3555 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3556 if (r == -EBUSY) {
3557 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3558 goto finish;
3559 }
3560 if (r < 0) {
3561 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3562 return r;
3563 }
3564
3565 if (arg_template) {
3566 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3567 if (r == -EEXIST) {
3568 if (!arg_quiet)
3569 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3570 } else if (r < 0) {
3571 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3572 goto finish;
3573 } else {
3574 if (!arg_quiet)
3575 log_info("Populated %s from template %s.", arg_directory, arg_template);
3576 }
3577 }
3578 }
3579
3580 if (arg_start_mode == START_BOOT) {
3581 if (path_is_os_tree(arg_directory) <= 0) {
3582 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3583 r = -EINVAL;
3584 goto finish;
3585 }
3586 } else {
3587 const char *p;
3588
3589 p = strjoina(arg_directory, "/usr/");
3590 if (laccess(p, F_OK) < 0) {
3591 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3592 r = -EINVAL;
3593 goto finish;
3594 }
3595 }
3596
3597 } else {
3598 char template[] = "/tmp/nspawn-root-XXXXXX";
3599
3600 assert(arg_image);
3601 assert(!arg_template);
3602
3603 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3604 if (r == -EBUSY) {
3605 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3606 goto finish;
3607 }
3608 if (r < 0) {
3609 r = log_error_errno(r, "Failed to create image lock: %m");
3610 goto finish;
3611 }
3612
3613 if (!mkdtemp(template)) {
3614 log_error_errno(errno, "Failed to create temporary directory: %m");
3615 r = -errno;
3616 goto finish;
3617 }
3618
3619 arg_directory = strdup(template);
3620 if (!arg_directory) {
3621 r = log_oom();
3622 goto finish;
3623 }
3624
3625 image_fd = setup_image(&device_path, &loop_nr);
3626 if (image_fd < 0) {
3627 r = image_fd;
3628 goto finish;
3629 }
3630
3631 r = dissect_image(image_fd,
3632 &root_device, &root_device_rw,
3633 &home_device, &home_device_rw,
3634 &srv_device, &srv_device_rw,
3635 &secondary);
3636 if (r < 0)
3637 goto finish;
3638 }
3639
3640 r = custom_mounts_prepare();
3641 if (r < 0)
3642 goto finish;
3643
3644 interactive =
3645 isatty(STDIN_FILENO) > 0 &&
3646 isatty(STDOUT_FILENO) > 0;
3647
3648 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3649 if (master < 0) {
3650 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3651 goto finish;
3652 }
3653
3654 r = ptsname_malloc(master, &console);
3655 if (r < 0) {
3656 r = log_error_errno(r, "Failed to determine tty name: %m");
3657 goto finish;
3658 }
3659
3660 if (arg_selinux_apifs_context) {
3661 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3662 if (r < 0)
3663 goto finish;
3664 }
3665
3666 if (unlockpt(master) < 0) {
3667 r = log_error_errno(errno, "Failed to unlock tty: %m");
3668 goto finish;
3669 }
3670
3671 if (!arg_quiet)
3672 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3673 arg_machine, arg_image ?: arg_directory);
3674
3675 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3676
3677 assert_se(sigemptyset(&mask_chld) == 0);
3678 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3679
3680 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3681 r = log_error_errno(errno, "Failed to become subreaper: %m");
3682 goto finish;
3683 }
3684
3685 for (;;) {
3686 static const struct sigaction sa = {
3687 .sa_handler = nop_signal_handler,
3688 .sa_flags = SA_NOCLDSTOP,
3689 };
3690
3691 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3692 _cleanup_close_ int etc_passwd_lock = -1;
3693 _cleanup_close_pair_ int
3694 kmsg_socket_pair[2] = { -1, -1 },
3695 rtnl_socket_pair[2] = { -1, -1 },
3696 pid_socket_pair[2] = { -1, -1 },
3697 uuid_socket_pair[2] = { -1, -1 },
3698 notify_socket_pair[2] = { -1, -1 },
3699 uid_shift_socket_pair[2] = { -1, -1 };
3700 _cleanup_close_ int notify_socket= -1;
3701 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3702 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3703 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3704 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3705 ContainerStatus container_status;
3706 char last_char = 0;
3707 int ifi = 0;
3708 ssize_t l;
3709
3710 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3711 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3712 * check with getpwuid() if the specific user already exists. Note that /etc might be
3713 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3714 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3715 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3716 * really ours. */
3717
3718 etc_passwd_lock = take_etc_passwd_lock(NULL);
3719 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) {
3720 log_error_errno(r, "Failed to take /etc/passwd lock: %m");
3721 goto finish;
3722 }
3723 }
3724
3725 r = barrier_create(&barrier);
3726 if (r < 0) {
3727 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3728 goto finish;
3729 }
3730
3731 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3732 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3733 goto finish;
3734 }
3735
3736 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3737 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3738 goto finish;
3739 }
3740
3741 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3742 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3743 goto finish;
3744 }
3745
3746 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) {
3747 r = log_error_errno(errno, "Failed to create id socket pair: %m");
3748 goto finish;
3749 }
3750
3751 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) {
3752 r = log_error_errno(errno, "Failed to create notify socket pair: %m");
3753 goto finish;
3754 }
3755
3756 if (arg_userns_mode != USER_NAMESPACE_NO)
3757 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3758 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3759 goto finish;
3760 }
3761
3762 /* Child can be killed before execv(), so handle SIGCHLD
3763 * in order to interrupt parent's blocking calls and
3764 * give it a chance to call wait() and terminate. */
3765 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3766 if (r < 0) {
3767 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3768 goto finish;
3769 }
3770
3771 r = sigaction(SIGCHLD, &sa, NULL);
3772 if (r < 0) {
3773 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3774 goto finish;
3775 }
3776
3777 pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3778 if (pid < 0) {
3779 if (errno == EINVAL)
3780 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3781 else
3782 r = log_error_errno(errno, "clone() failed: %m");
3783
3784 goto finish;
3785 }
3786
3787 if (pid == 0) {
3788 /* The outer child only has a file system namespace. */
3789 barrier_set_role(&barrier, BARRIER_CHILD);
3790
3791 master = safe_close(master);
3792
3793 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3794 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3795 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3796 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3797 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3798 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3799
3800 (void) reset_all_signal_handlers();
3801 (void) reset_signal_mask();
3802
3803 r = outer_child(&barrier,
3804 arg_directory,
3805 console,
3806 root_device, root_device_rw,
3807 home_device, home_device_rw,
3808 srv_device, srv_device_rw,
3809 interactive,
3810 secondary,
3811 pid_socket_pair[1],
3812 uuid_socket_pair[1],
3813 notify_socket_pair[1],
3814 kmsg_socket_pair[1],
3815 rtnl_socket_pair[1],
3816 uid_shift_socket_pair[1],
3817 fds);
3818 if (r < 0)
3819 _exit(EXIT_FAILURE);
3820
3821 _exit(EXIT_SUCCESS);
3822 }
3823
3824 barrier_set_role(&barrier, BARRIER_PARENT);
3825
3826 fds = fdset_free(fds);
3827
3828 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3829 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3830 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3831 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3832 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3833 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3834
3835 if (arg_userns_mode != USER_NAMESPACE_NO) {
3836 /* The child just let us know the UID shift it might have read from the image. */
3837 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3838 if (l < 0) {
3839 r = log_error_errno(errno, "Failed to read UID shift: %m");
3840 goto finish;
3841 }
3842 if (l != sizeof(arg_uid_shift)) {
3843 log_error("Short read while reading UID shift.");
3844 r = EIO;
3845 goto finish;
3846 }
3847
3848 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3849 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3850 * image, but if that's already in use, pick a new one, and report back to the child,
3851 * which one we now picked. */
3852
3853 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3854 if (r < 0) {
3855 log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3856 goto finish;
3857 }
3858
3859 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3860 if (l < 0) {
3861 r = log_error_errno(errno, "Failed to send UID shift: %m");
3862 goto finish;
3863 }
3864 if (l != sizeof(arg_uid_shift)) {
3865 log_error("Short write while writing UID shift.");
3866 r = -EIO;
3867 goto finish;
3868 }
3869 }
3870 }
3871
3872 /* Wait for the outer child. */
3873 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3874 if (r < 0)
3875 goto finish;
3876 if (r != 0) {
3877 r = -EIO;
3878 goto finish;
3879 }
3880 pid = 0;
3881
3882 /* And now retrieve the PID of the inner child. */
3883 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3884 if (l < 0) {
3885 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3886 goto finish;
3887 }
3888 if (l != sizeof(pid)) {
3889 log_error("Short read while reading inner child PID.");
3890 r = EIO;
3891 goto finish;
3892 }
3893
3894 /* We also retrieve container UUID in case it was generated by outer child */
3895 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0);
3896 if (l < 0) {
3897 r = log_error_errno(errno, "Failed to read container machine ID: %m");
3898 goto finish;
3899 }
3900 if (l != sizeof(arg_uuid)) {
3901 log_error("Short read while reading container machined ID.");
3902 r = EIO;
3903 goto finish;
3904 }
3905
3906 /* We also retrieve the socket used for notifications generated by outer child */
3907 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3908 if (notify_socket < 0) {
3909 r = log_error_errno(errno, "Failed to receive notification socket from the outer child: %m");
3910 goto finish;
3911 }
3912
3913 log_debug("Init process invoked as PID " PID_FMT, pid);
3914
3915 if (arg_userns_mode != USER_NAMESPACE_NO) {
3916 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3917 log_error("Child died too early.");
3918 r = -ESRCH;
3919 goto finish;
3920 }
3921
3922 r = setup_uid_map(pid);
3923 if (r < 0)
3924 goto finish;
3925
3926 (void) barrier_place(&barrier); /* #2 */
3927 }
3928
3929 if (arg_private_network) {
3930
3931 r = move_network_interfaces(pid, arg_network_interfaces);
3932 if (r < 0)
3933 goto finish;
3934
3935 if (arg_network_veth) {
3936 r = setup_veth(arg_machine, pid, veth_name,
3937 arg_network_bridge || arg_network_zone);
3938 if (r < 0)
3939 goto finish;
3940 else if (r > 0)
3941 ifi = r;
3942
3943 if (arg_network_bridge) {
3944 /* Add the interface to a bridge */
3945 r = setup_bridge(veth_name, arg_network_bridge, false);
3946 if (r < 0)
3947 goto finish;
3948 if (r > 0)
3949 ifi = r;
3950 } else if (arg_network_zone) {
3951 /* Add the interface to a bridge, possibly creating it */
3952 r = setup_bridge(veth_name, arg_network_zone, true);
3953 if (r < 0)
3954 goto finish;
3955 if (r > 0)
3956 ifi = r;
3957 }
3958 }
3959
3960 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3961 if (r < 0)
3962 goto finish;
3963
3964 /* We created the primary and extra veth links now; let's remember this, so that we know to
3965 remove them later on. Note that we don't bother with removing veth links that were created
3966 here when their setup failed half-way, because in that case the kernel should be able to
3967 remove them on its own, since they cannot be referenced by anything yet. */
3968 veth_created = true;
3969
3970 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3971 if (r < 0)
3972 goto finish;
3973
3974 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3975 if (r < 0)
3976 goto finish;
3977 }
3978
3979 if (arg_register) {
3980 r = register_machine(
3981 arg_machine,
3982 pid,
3983 arg_directory,
3984 arg_uuid,
3985 ifi,
3986 arg_slice,
3987 arg_custom_mounts, arg_n_custom_mounts,
3988 arg_kill_signal,
3989 arg_property,
3990 arg_keep_unit,
3991 arg_container_service_name);
3992 if (r < 0)
3993 goto finish;
3994 }
3995
3996 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3997 if (r < 0)
3998 goto finish;
3999
4000 if (arg_keep_unit) {
4001 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
4002 if (r < 0)
4003 goto finish;
4004 }
4005
4006 r = chown_cgroup(pid, arg_uid_shift);
4007 if (r < 0)
4008 goto finish;
4009
4010 /* Notify the child that the parent is ready with all
4011 * its setup (including cgroup-ification), and that
4012 * the child can now hand over control to the code to
4013 * run inside the container. */
4014 (void) barrier_place(&barrier); /* #3 */
4015
4016 /* Block SIGCHLD here, before notifying child.
4017 * process_pty() will handle it with the other signals. */
4018 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4019
4020 /* Reset signal to default */
4021 r = default_signals(SIGCHLD, -1);
4022 if (r < 0) {
4023 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4024 goto finish;
4025 }
4026
4027 r = sd_event_new(&event);
4028 if (r < 0) {
4029 log_error_errno(r, "Failed to get default event source: %m");
4030 goto finish;
4031 }
4032
4033 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(pid));
4034 if (r < 0)
4035 goto finish;
4036
4037 /* Let the child know that we are ready and wait that the child is completely ready now. */
4038 if (!barrier_place_and_sync(&barrier)) { /* #4 */
4039 log_error("Child died too early.");
4040 r = -ESRCH;
4041 goto finish;
4042 }
4043
4044 /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
4045 * in getpwuid(), thus we can release the /etc/passwd lock. */
4046 etc_passwd_lock = safe_close(etc_passwd_lock);
4047
4048 sd_notifyf(false,
4049 "STATUS=Container running.\n"
4050 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4051 if (!arg_notify_ready)
4052 sd_notify(false, "READY=1\n");
4053
4054 if (arg_kill_signal > 0) {
4055 /* Try to kill the init system on SIGINT or SIGTERM */
4056 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
4057 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
4058 } else {
4059 /* Immediately exit */
4060 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4061 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4062 }
4063
4064 /* simply exit on sigchld */
4065 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4066
4067 if (arg_expose_ports) {
4068 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
4069 if (r < 0)
4070 goto finish;
4071
4072 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
4073 }
4074
4075 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4076
4077 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
4078 if (r < 0) {
4079 log_error_errno(r, "Failed to create PTY forwarder: %m");
4080 goto finish;
4081 }
4082
4083 r = sd_event_loop(event);
4084 if (r < 0) {
4085 log_error_errno(r, "Failed to run event loop: %m");
4086 goto finish;
4087 }
4088
4089 pty_forward_get_last_char(forward, &last_char);
4090
4091 forward = pty_forward_free(forward);
4092
4093 if (!arg_quiet && last_char != '\n')
4094 putc('\n', stdout);
4095
4096 /* Kill if it is not dead yet anyway */
4097 if (arg_register && !arg_keep_unit)
4098 terminate_machine(pid);
4099
4100 /* Normally redundant, but better safe than sorry */
4101 kill(pid, SIGKILL);
4102
4103 r = wait_for_container(pid, &container_status);
4104 pid = 0;
4105
4106 if (r < 0)
4107 /* We failed to wait for the container, or the
4108 * container exited abnormally */
4109 goto finish;
4110 else if (r > 0 || container_status == CONTAINER_TERMINATED) {
4111 /* The container exited with a non-zero
4112 * status, or with zero status and no reboot
4113 * was requested. */
4114 ret = r;
4115 break;
4116 }
4117
4118 /* CONTAINER_REBOOTED, loop again */
4119
4120 if (arg_keep_unit) {
4121 /* Special handling if we are running as a
4122 * service: instead of simply restarting the
4123 * machine we want to restart the entire
4124 * service, so let's inform systemd about this
4125 * with the special exit code 133. The service
4126 * file uses RestartForceExitStatus=133 so
4127 * that this results in a full nspawn
4128 * restart. This is necessary since we might
4129 * have cgroup parameters set we want to have
4130 * flushed out. */
4131 ret = 133;
4132 r = 0;
4133 break;
4134 }
4135
4136 expose_port_flush(arg_expose_ports, &exposed);
4137
4138 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4139 veth_created = false;
4140 }
4141
4142 finish:
4143 sd_notify(false,
4144 "STOPPING=1\n"
4145 "STATUS=Terminating...");
4146
4147 if (pid > 0)
4148 kill(pid, SIGKILL);
4149
4150 /* Try to flush whatever is still queued in the pty */
4151 if (master >= 0)
4152 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
4153
4154 loop_remove(loop_nr, &image_fd);
4155
4156 if (remove_subvol && arg_directory) {
4157 int k;
4158
4159 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
4160 if (k < 0)
4161 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4162 }
4163
4164 if (arg_machine) {
4165 const char *p;
4166
4167 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4168 (void) rm_rf(p, REMOVE_ROOT);
4169 }
4170
4171 expose_port_flush(arg_expose_ports, &exposed);
4172
4173 if (veth_created)
4174 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4175 (void) remove_bridge(arg_network_zone);
4176
4177 free(arg_directory);
4178 free(arg_template);
4179 free(arg_image);
4180 free(arg_machine);
4181 free(arg_user);
4182 free(arg_chdir);
4183 strv_free(arg_setenv);
4184 free(arg_network_bridge);
4185 strv_free(arg_network_interfaces);
4186 strv_free(arg_network_macvlan);
4187 strv_free(arg_network_ipvlan);
4188 strv_free(arg_network_veth_extra);
4189 strv_free(arg_parameters);
4190 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4191 expose_port_free_all(arg_expose_ports);
4192
4193 return r < 0 ? EXIT_FAILURE : ret;
4194 }