]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
socket-util: move remaining socket-related calls from util.[ch] to socket-util.[ch]
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #ifdef HAVE_BLKID
23 #include <blkid/blkid.h>
24 #endif
25 #include <errno.h>
26 #include <getopt.h>
27 #include <linux/loop.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "barrier.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
53 #include "cap-list.h"
54 #include "capability.h"
55 #include "cgroup-util.h"
56 #include "copy.h"
57 #include "dev-setup.h"
58 #include "env-util.h"
59 #include "event-util.h"
60 #include "fd-util.h"
61 #include "fdset.h"
62 #include "fileio.h"
63 #include "formats-util.h"
64 #include "gpt.h"
65 #include "hostname-util.h"
66 #include "log.h"
67 #include "loopback-setup.h"
68 #include "machine-image.h"
69 #include "macro.h"
70 #include "missing.h"
71 #include "mkdir.h"
72 #include "netlink-util.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-expose-ports.h"
75 #include "nspawn-mount.h"
76 #include "nspawn-network.h"
77 #include "nspawn-register.h"
78 #include "nspawn-settings.h"
79 #include "nspawn-setuid.h"
80 #include "path-util.h"
81 #include "process-util.h"
82 #include "ptyfwd.h"
83 #include "random-util.h"
84 #include "rm-rf.h"
85 #ifdef HAVE_SECCOMP
86 #include "seccomp-util.h"
87 #endif
88 #include "signal-util.h"
89 #include "socket-util.h"
90 #include "string-util.h"
91 #include "strv.h"
92 #include "terminal-util.h"
93 #include "udev-util.h"
94 #include "user-util.h"
95 #include "util.h"
96
/* Result codes describing a finished container run. The code that produces and
 * consumes these values is outside this excerpt; presumably it distinguishes a
 * final exit from a reboot request -- confirm against the users. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
101
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto, also the built-in default */
        LINK_HOST  = 2,  /* --link-journal=host / try-host */
        LINK_GUEST = 3   /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
108
109 static char *arg_directory = NULL;
110 static char *arg_template = NULL;
111 static char *arg_user = NULL;
112 static sd_id128_t arg_uuid = {};
113 static char *arg_machine = NULL;
114 static const char *arg_selinux_context = NULL;
115 static const char *arg_selinux_apifs_context = NULL;
116 static const char *arg_slice = NULL;
117 static bool arg_private_network = false;
118 static bool arg_read_only = false;
119 static bool arg_boot = false;
120 static bool arg_ephemeral = false;
121 static LinkJournal arg_link_journal = LINK_AUTO;
122 static bool arg_link_journal_try = false;
123 static uint64_t arg_retain =
124 (1ULL << CAP_CHOWN) |
125 (1ULL << CAP_DAC_OVERRIDE) |
126 (1ULL << CAP_DAC_READ_SEARCH) |
127 (1ULL << CAP_FOWNER) |
128 (1ULL << CAP_FSETID) |
129 (1ULL << CAP_IPC_OWNER) |
130 (1ULL << CAP_KILL) |
131 (1ULL << CAP_LEASE) |
132 (1ULL << CAP_LINUX_IMMUTABLE) |
133 (1ULL << CAP_NET_BIND_SERVICE) |
134 (1ULL << CAP_NET_BROADCAST) |
135 (1ULL << CAP_NET_RAW) |
136 (1ULL << CAP_SETGID) |
137 (1ULL << CAP_SETFCAP) |
138 (1ULL << CAP_SETPCAP) |
139 (1ULL << CAP_SETUID) |
140 (1ULL << CAP_SYS_ADMIN) |
141 (1ULL << CAP_SYS_CHROOT) |
142 (1ULL << CAP_SYS_NICE) |
143 (1ULL << CAP_SYS_PTRACE) |
144 (1ULL << CAP_SYS_TTY_CONFIG) |
145 (1ULL << CAP_SYS_RESOURCE) |
146 (1ULL << CAP_SYS_BOOT) |
147 (1ULL << CAP_AUDIT_WRITE) |
148 (1ULL << CAP_AUDIT_CONTROL) |
149 (1ULL << CAP_MKNOD);
150 static CustomMount *arg_custom_mounts = NULL;
151 static unsigned arg_n_custom_mounts = 0;
152 static char **arg_setenv = NULL;
153 static bool arg_quiet = false;
154 static bool arg_share_system = false;
155 static bool arg_register = true;
156 static bool arg_keep_unit = false;
157 static char **arg_network_interfaces = NULL;
158 static char **arg_network_macvlan = NULL;
159 static char **arg_network_ipvlan = NULL;
160 static bool arg_network_veth = false;
161 static char *arg_network_bridge = NULL;
162 static unsigned long arg_personality = PERSONALITY_INVALID;
163 static char *arg_image = NULL;
164 static VolatileMode arg_volatile_mode = VOLATILE_NO;
165 static ExposePort *arg_expose_ports = NULL;
166 static char **arg_property = NULL;
167 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
168 static bool arg_userns = false;
169 static int arg_kill_signal = 0;
170 static bool arg_unified_cgroup_hierarchy = false;
171 static SettingsMask arg_settings_mask = 0;
172 static int arg_settings_trusted = -1;
173 static char **arg_parameters = NULL;
174
175 static void help(void) {
176 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
177 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
178 " -h --help Show this help\n"
179 " --version Print version string\n"
180 " -q --quiet Do not show status information\n"
181 " -D --directory=PATH Root directory for the container\n"
182 " --template=PATH Initialize root directory from template directory,\n"
183 " if missing\n"
184 " -x --ephemeral Run container with snapshot of root directory, and\n"
185 " remove it after exit\n"
186 " -i --image=PATH File system device or disk image for the container\n"
187 " -b --boot Boot up full system (i.e. invoke init)\n"
188 " -u --user=USER Run the command under specified user or uid\n"
189 " -M --machine=NAME Set the machine name for the container\n"
190 " --uuid=UUID Set a specific machine UUID for the container\n"
191 " -S --slice=SLICE Place the container in the specified slice\n"
192 " --property=NAME=VALUE Set scope unit property\n"
193 " --private-users[=UIDBASE[:NUIDS]]\n"
194 " Run within user namespace\n"
195 " --private-network Disable network in container\n"
196 " --network-interface=INTERFACE\n"
197 " Assign an existing network interface to the\n"
198 " container\n"
199 " --network-macvlan=INTERFACE\n"
200 " Create a macvlan network interface based on an\n"
201 " existing network interface to the container\n"
202 " --network-ipvlan=INTERFACE\n"
203 " Create a ipvlan network interface based on an\n"
204 " existing network interface to the container\n"
205 " -n --network-veth Add a virtual ethernet connection between host\n"
206 " and container\n"
207 " --network-bridge=INTERFACE\n"
208 " Add a virtual ethernet connection between host\n"
209 " and container and add it to an existing bridge on\n"
210 " the host\n"
211 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
212 " Expose a container IP port on the host\n"
213 " -Z --selinux-context=SECLABEL\n"
214 " Set the SELinux security context to be used by\n"
215 " processes in the container\n"
216 " -L --selinux-apifs-context=SECLABEL\n"
217 " Set the SELinux security context to be used by\n"
218 " API/tmpfs file systems in the container\n"
219 " --capability=CAP In addition to the default, retain specified\n"
220 " capability\n"
221 " --drop-capability=CAP Drop the specified capability from the default set\n"
222 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
223 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
224 " try-guest, try-host\n"
225 " -j Equivalent to --link-journal=try-guest\n"
226 " --read-only Mount the root directory read-only\n"
227 " --bind=PATH[:PATH[:OPTIONS]]\n"
228 " Bind mount a file or directory from the host into\n"
229 " the container\n"
230 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
231 " Similar, but creates a read-only bind mount\n"
232 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
233 " --overlay=PATH[:PATH...]:PATH\n"
234 " Create an overlay mount from the host to \n"
235 " the container\n"
236 " --overlay-ro=PATH[:PATH...]:PATH\n"
237 " Similar, but creates a read-only overlay mount\n"
238 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
239 " --share-system Share system namespaces with host\n"
240 " --register=BOOLEAN Register container as machine\n"
241 " --keep-unit Do not register a scope for the machine, reuse\n"
242 " the service unit nspawn is running in\n"
243 " --volatile[=MODE] Run the system in volatile mode\n"
244 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
245 , program_invocation_short_name);
246 }
247
248
249 static int custom_mounts_prepare(void) {
250 unsigned i;
251 int r;
252
253 /* Ensure the mounts are applied prefix first. */
254 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
255
256 /* Allocate working directories for the overlay file systems that need it */
257 for (i = 0; i < arg_n_custom_mounts; i++) {
258 CustomMount *m = &arg_custom_mounts[i];
259
260 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
261 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
262 return -EINVAL;
263 }
264
265 if (m->type != CUSTOM_MOUNT_OVERLAY)
266 continue;
267
268 if (m->work_dir)
269 continue;
270
271 if (m->read_only)
272 continue;
273
274 r = tempfn_random(m->source, NULL, &m->work_dir);
275 if (r < 0)
276 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
277 }
278
279 return 0;
280 }
281
282 static int detect_unified_cgroup_hierarchy(void) {
283 const char *e;
284 int r;
285
286 /* Allow the user to control whether the unified hierarchy is used */
287 e = getenv("UNIFIED_CGROUP_HIERARCHY");
288 if (e) {
289 r = parse_boolean(e);
290 if (r < 0)
291 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
292
293 arg_unified_cgroup_hierarchy = r;
294 return 0;
295 }
296
297 /* Otherwise inherit the default from the host system */
298 r = cg_unified();
299 if (r < 0)
300 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
301
302 arg_unified_cgroup_hierarchy = r;
303 return 0;
304 }
305
306 static int parse_argv(int argc, char *argv[]) {
307
308 enum {
309 ARG_VERSION = 0x100,
310 ARG_PRIVATE_NETWORK,
311 ARG_UUID,
312 ARG_READ_ONLY,
313 ARG_CAPABILITY,
314 ARG_DROP_CAPABILITY,
315 ARG_LINK_JOURNAL,
316 ARG_BIND,
317 ARG_BIND_RO,
318 ARG_TMPFS,
319 ARG_OVERLAY,
320 ARG_OVERLAY_RO,
321 ARG_SETENV,
322 ARG_SHARE_SYSTEM,
323 ARG_REGISTER,
324 ARG_KEEP_UNIT,
325 ARG_NETWORK_INTERFACE,
326 ARG_NETWORK_MACVLAN,
327 ARG_NETWORK_IPVLAN,
328 ARG_NETWORK_BRIDGE,
329 ARG_PERSONALITY,
330 ARG_VOLATILE,
331 ARG_TEMPLATE,
332 ARG_PROPERTY,
333 ARG_PRIVATE_USERS,
334 ARG_KILL_SIGNAL,
335 ARG_SETTINGS,
336 };
337
338 static const struct option options[] = {
339 { "help", no_argument, NULL, 'h' },
340 { "version", no_argument, NULL, ARG_VERSION },
341 { "directory", required_argument, NULL, 'D' },
342 { "template", required_argument, NULL, ARG_TEMPLATE },
343 { "ephemeral", no_argument, NULL, 'x' },
344 { "user", required_argument, NULL, 'u' },
345 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
346 { "boot", no_argument, NULL, 'b' },
347 { "uuid", required_argument, NULL, ARG_UUID },
348 { "read-only", no_argument, NULL, ARG_READ_ONLY },
349 { "capability", required_argument, NULL, ARG_CAPABILITY },
350 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
351 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
352 { "bind", required_argument, NULL, ARG_BIND },
353 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
354 { "tmpfs", required_argument, NULL, ARG_TMPFS },
355 { "overlay", required_argument, NULL, ARG_OVERLAY },
356 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
357 { "machine", required_argument, NULL, 'M' },
358 { "slice", required_argument, NULL, 'S' },
359 { "setenv", required_argument, NULL, ARG_SETENV },
360 { "selinux-context", required_argument, NULL, 'Z' },
361 { "selinux-apifs-context", required_argument, NULL, 'L' },
362 { "quiet", no_argument, NULL, 'q' },
363 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
364 { "register", required_argument, NULL, ARG_REGISTER },
365 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
366 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
367 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
368 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
369 { "network-veth", no_argument, NULL, 'n' },
370 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
371 { "personality", required_argument, NULL, ARG_PERSONALITY },
372 { "image", required_argument, NULL, 'i' },
373 { "volatile", optional_argument, NULL, ARG_VOLATILE },
374 { "port", required_argument, NULL, 'p' },
375 { "property", required_argument, NULL, ARG_PROPERTY },
376 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
377 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
378 { "settings", required_argument, NULL, ARG_SETTINGS },
379 {}
380 };
381
382 int c, r;
383 uint64_t plus = 0, minus = 0;
384 bool mask_all_settings = false, mask_no_settings = false;
385
386 assert(argc >= 0);
387 assert(argv);
388
389 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
390
391 switch (c) {
392
393 case 'h':
394 help();
395 return 0;
396
397 case ARG_VERSION:
398 return version();
399
400 case 'D':
401 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
402 if (r < 0)
403 return r;
404 break;
405
406 case ARG_TEMPLATE:
407 r = parse_path_argument_and_warn(optarg, false, &arg_template);
408 if (r < 0)
409 return r;
410 break;
411
412 case 'i':
413 r = parse_path_argument_and_warn(optarg, false, &arg_image);
414 if (r < 0)
415 return r;
416 break;
417
418 case 'x':
419 arg_ephemeral = true;
420 break;
421
422 case 'u':
423 r = free_and_strdup(&arg_user, optarg);
424 if (r < 0)
425 return log_oom();
426
427 arg_settings_mask |= SETTING_USER;
428 break;
429
430 case ARG_NETWORK_BRIDGE:
431 r = free_and_strdup(&arg_network_bridge, optarg);
432 if (r < 0)
433 return log_oom();
434
435 /* fall through */
436
437 case 'n':
438 arg_network_veth = true;
439 arg_private_network = true;
440 arg_settings_mask |= SETTING_NETWORK;
441 break;
442
443 case ARG_NETWORK_INTERFACE:
444 if (strv_extend(&arg_network_interfaces, optarg) < 0)
445 return log_oom();
446
447 arg_private_network = true;
448 arg_settings_mask |= SETTING_NETWORK;
449 break;
450
451 case ARG_NETWORK_MACVLAN:
452 if (strv_extend(&arg_network_macvlan, optarg) < 0)
453 return log_oom();
454
455 arg_private_network = true;
456 arg_settings_mask |= SETTING_NETWORK;
457 break;
458
459 case ARG_NETWORK_IPVLAN:
460 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
461 return log_oom();
462
463 /* fall through */
464
465 case ARG_PRIVATE_NETWORK:
466 arg_private_network = true;
467 arg_settings_mask |= SETTING_NETWORK;
468 break;
469
470 case 'b':
471 arg_boot = true;
472 arg_settings_mask |= SETTING_BOOT;
473 break;
474
475 case ARG_UUID:
476 r = sd_id128_from_string(optarg, &arg_uuid);
477 if (r < 0) {
478 log_error("Invalid UUID: %s", optarg);
479 return r;
480 }
481
482 arg_settings_mask |= SETTING_MACHINE_ID;
483 break;
484
485 case 'S':
486 arg_slice = optarg;
487 break;
488
489 case 'M':
490 if (isempty(optarg))
491 arg_machine = mfree(arg_machine);
492 else {
493 if (!machine_name_is_valid(optarg)) {
494 log_error("Invalid machine name: %s", optarg);
495 return -EINVAL;
496 }
497
498 r = free_and_strdup(&arg_machine, optarg);
499 if (r < 0)
500 return log_oom();
501
502 break;
503 }
504
505 case 'Z':
506 arg_selinux_context = optarg;
507 break;
508
509 case 'L':
510 arg_selinux_apifs_context = optarg;
511 break;
512
513 case ARG_READ_ONLY:
514 arg_read_only = true;
515 arg_settings_mask |= SETTING_READ_ONLY;
516 break;
517
518 case ARG_CAPABILITY:
519 case ARG_DROP_CAPABILITY: {
520 const char *state, *word;
521 size_t length;
522
523 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
524 _cleanup_free_ char *t;
525
526 t = strndup(word, length);
527 if (!t)
528 return log_oom();
529
530 if (streq(t, "all")) {
531 if (c == ARG_CAPABILITY)
532 plus = (uint64_t) -1;
533 else
534 minus = (uint64_t) -1;
535 } else {
536 int cap;
537
538 cap = capability_from_name(t);
539 if (cap < 0) {
540 log_error("Failed to parse capability %s.", t);
541 return -EINVAL;
542 }
543
544 if (c == ARG_CAPABILITY)
545 plus |= 1ULL << (uint64_t) cap;
546 else
547 minus |= 1ULL << (uint64_t) cap;
548 }
549 }
550
551 arg_settings_mask |= SETTING_CAPABILITY;
552 break;
553 }
554
555 case 'j':
556 arg_link_journal = LINK_GUEST;
557 arg_link_journal_try = true;
558 break;
559
560 case ARG_LINK_JOURNAL:
561 if (streq(optarg, "auto")) {
562 arg_link_journal = LINK_AUTO;
563 arg_link_journal_try = false;
564 } else if (streq(optarg, "no")) {
565 arg_link_journal = LINK_NO;
566 arg_link_journal_try = false;
567 } else if (streq(optarg, "guest")) {
568 arg_link_journal = LINK_GUEST;
569 arg_link_journal_try = false;
570 } else if (streq(optarg, "host")) {
571 arg_link_journal = LINK_HOST;
572 arg_link_journal_try = false;
573 } else if (streq(optarg, "try-guest")) {
574 arg_link_journal = LINK_GUEST;
575 arg_link_journal_try = true;
576 } else if (streq(optarg, "try-host")) {
577 arg_link_journal = LINK_HOST;
578 arg_link_journal_try = true;
579 } else {
580 log_error("Failed to parse link journal mode %s", optarg);
581 return -EINVAL;
582 }
583
584 break;
585
586 case ARG_BIND:
587 case ARG_BIND_RO:
588 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
589 if (r < 0)
590 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
591
592 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
593 break;
594
595 case ARG_TMPFS:
596 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
597 if (r < 0)
598 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
599
600 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
601 break;
602
603 case ARG_OVERLAY:
604 case ARG_OVERLAY_RO: {
605 _cleanup_free_ char *upper = NULL, *destination = NULL;
606 _cleanup_strv_free_ char **lower = NULL;
607 CustomMount *m;
608 unsigned n = 0;
609 char **i;
610
611 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
612 if (r == -ENOMEM)
613 return log_oom();
614 else if (r < 0) {
615 log_error("Invalid overlay specification: %s", optarg);
616 return r;
617 }
618
619 STRV_FOREACH(i, lower) {
620 if (!path_is_absolute(*i)) {
621 log_error("Overlay path %s is not absolute.", *i);
622 return -EINVAL;
623 }
624
625 n++;
626 }
627
628 if (n < 2) {
629 log_error("--overlay= needs at least two colon-separated directories specified.");
630 return -EINVAL;
631 }
632
633 if (n == 2) {
634 /* If two parameters are specified,
635 * the first one is the lower, the
636 * second one the upper directory. And
637 * we'll also define the destination
638 * mount point the same as the upper. */
639 upper = lower[1];
640 lower[1] = NULL;
641
642 destination = strdup(upper);
643 if (!destination)
644 return log_oom();
645
646 } else {
647 upper = lower[n - 2];
648 destination = lower[n - 1];
649 lower[n - 2] = NULL;
650 }
651
652 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
653 if (!m)
654 return log_oom();
655
656 m->destination = destination;
657 m->source = upper;
658 m->lower = lower;
659 m->read_only = c == ARG_OVERLAY_RO;
660
661 upper = destination = NULL;
662 lower = NULL;
663
664 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
665 break;
666 }
667
668 case ARG_SETENV: {
669 char **n;
670
671 if (!env_assignment_is_valid(optarg)) {
672 log_error("Environment variable assignment '%s' is not valid.", optarg);
673 return -EINVAL;
674 }
675
676 n = strv_env_set(arg_setenv, optarg);
677 if (!n)
678 return log_oom();
679
680 strv_free(arg_setenv);
681 arg_setenv = n;
682
683 arg_settings_mask |= SETTING_ENVIRONMENT;
684 break;
685 }
686
687 case 'q':
688 arg_quiet = true;
689 break;
690
691 case ARG_SHARE_SYSTEM:
692 arg_share_system = true;
693 break;
694
695 case ARG_REGISTER:
696 r = parse_boolean(optarg);
697 if (r < 0) {
698 log_error("Failed to parse --register= argument: %s", optarg);
699 return r;
700 }
701
702 arg_register = r;
703 break;
704
705 case ARG_KEEP_UNIT:
706 arg_keep_unit = true;
707 break;
708
709 case ARG_PERSONALITY:
710
711 arg_personality = personality_from_string(optarg);
712 if (arg_personality == PERSONALITY_INVALID) {
713 log_error("Unknown or unsupported personality '%s'.", optarg);
714 return -EINVAL;
715 }
716
717 arg_settings_mask |= SETTING_PERSONALITY;
718 break;
719
720 case ARG_VOLATILE:
721
722 if (!optarg)
723 arg_volatile_mode = VOLATILE_YES;
724 else {
725 VolatileMode m;
726
727 m = volatile_mode_from_string(optarg);
728 if (m < 0) {
729 log_error("Failed to parse --volatile= argument: %s", optarg);
730 return -EINVAL;
731 } else
732 arg_volatile_mode = m;
733 }
734
735 arg_settings_mask |= SETTING_VOLATILE_MODE;
736 break;
737
738 case 'p':
739 r = expose_port_parse(&arg_expose_ports, optarg);
740 if (r == -EEXIST)
741 return log_error_errno(r, "Duplicate port specification: %s", optarg);
742 if (r < 0)
743 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
744
745 arg_settings_mask |= SETTING_EXPOSE_PORTS;
746 break;
747
748 case ARG_PROPERTY:
749 if (strv_extend(&arg_property, optarg) < 0)
750 return log_oom();
751
752 break;
753
754 case ARG_PRIVATE_USERS:
755 if (optarg) {
756 _cleanup_free_ char *buffer = NULL;
757 const char *range, *shift;
758
759 range = strchr(optarg, ':');
760 if (range) {
761 buffer = strndup(optarg, range - optarg);
762 if (!buffer)
763 return log_oom();
764 shift = buffer;
765
766 range++;
767 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
768 log_error("Failed to parse UID range: %s", range);
769 return -EINVAL;
770 }
771 } else
772 shift = optarg;
773
774 if (parse_uid(shift, &arg_uid_shift) < 0) {
775 log_error("Failed to parse UID: %s", optarg);
776 return -EINVAL;
777 }
778 }
779
780 arg_userns = true;
781 break;
782
783 case ARG_KILL_SIGNAL:
784 arg_kill_signal = signal_from_string_try_harder(optarg);
785 if (arg_kill_signal < 0) {
786 log_error("Cannot parse signal: %s", optarg);
787 return -EINVAL;
788 }
789
790 arg_settings_mask |= SETTING_KILL_SIGNAL;
791 break;
792
793 case ARG_SETTINGS:
794
795 /* no → do not read files
796 * yes → read files, do not override cmdline, trust only subset
797 * override → read files, override cmdline, trust only subset
798 * trusted → read files, do not override cmdline, trust all
799 */
800
801 r = parse_boolean(optarg);
802 if (r < 0) {
803 if (streq(optarg, "trusted")) {
804 mask_all_settings = false;
805 mask_no_settings = false;
806 arg_settings_trusted = true;
807
808 } else if (streq(optarg, "override")) {
809 mask_all_settings = false;
810 mask_no_settings = true;
811 arg_settings_trusted = -1;
812 } else
813 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
814 } else if (r > 0) {
815 /* yes */
816 mask_all_settings = false;
817 mask_no_settings = false;
818 arg_settings_trusted = -1;
819 } else {
820 /* no */
821 mask_all_settings = true;
822 mask_no_settings = false;
823 arg_settings_trusted = false;
824 }
825
826 break;
827
828 case '?':
829 return -EINVAL;
830
831 default:
832 assert_not_reached("Unhandled option");
833 }
834
835 if (arg_share_system)
836 arg_register = false;
837
838 if (arg_boot && arg_share_system) {
839 log_error("--boot and --share-system may not be combined.");
840 return -EINVAL;
841 }
842
843 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
844 log_error("--keep-unit may not be used when invoked from a user session.");
845 return -EINVAL;
846 }
847
848 if (arg_directory && arg_image) {
849 log_error("--directory= and --image= may not be combined.");
850 return -EINVAL;
851 }
852
853 if (arg_template && arg_image) {
854 log_error("--template= and --image= may not be combined.");
855 return -EINVAL;
856 }
857
858 if (arg_template && !(arg_directory || arg_machine)) {
859 log_error("--template= needs --directory= or --machine=.");
860 return -EINVAL;
861 }
862
863 if (arg_ephemeral && arg_template) {
864 log_error("--ephemeral and --template= may not be combined.");
865 return -EINVAL;
866 }
867
868 if (arg_ephemeral && arg_image) {
869 log_error("--ephemeral and --image= may not be combined.");
870 return -EINVAL;
871 }
872
873 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
874 log_error("--ephemeral and --link-journal= may not be combined.");
875 return -EINVAL;
876 }
877
878 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
879 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
880
881 if (argc > optind) {
882 arg_parameters = strv_copy(argv + optind);
883 if (!arg_parameters)
884 return log_oom();
885
886 arg_settings_mask |= SETTING_BOOT;
887 }
888
889 /* Load all settings from .nspawn files */
890 if (mask_no_settings)
891 arg_settings_mask = 0;
892
893 /* Don't load any settings from .nspawn files */
894 if (mask_all_settings)
895 arg_settings_mask = _SETTINGS_MASK_ALL;
896
897 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
898
899 r = detect_unified_cgroup_hierarchy();
900 if (r < 0)
901 return r;
902
903 return 1;
904 }
905
906 static int verify_arguments(void) {
907
908 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
909 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
910 return -EINVAL;
911 }
912
913 if (arg_expose_ports && !arg_private_network) {
914 log_error("Cannot use --port= without private networking.");
915 return -EINVAL;
916 }
917
918 if (arg_boot && arg_kill_signal <= 0)
919 arg_kill_signal = SIGRTMIN+3;
920
921 return 0;
922 }
923
924 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
925 assert(p);
926
927 if (!arg_userns)
928 return 0;
929
930 if (uid == UID_INVALID && gid == GID_INVALID)
931 return 0;
932
933 if (uid != UID_INVALID) {
934 uid += arg_uid_shift;
935
936 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
937 return -EOVERFLOW;
938 }
939
940 if (gid != GID_INVALID) {
941 gid += (gid_t) arg_uid_shift;
942
943 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
944 return -EOVERFLOW;
945 }
946
947 if (lchown(p, uid, gid) < 0)
948 return -errno;
949
950 return 0;
951 }
952
/* Creates the directory "path" below "root" with the given mode and, if it was
 * newly created, hands it to the (shifted) uid/gid via userns_lchown().
 *
 * An already existing directory is left untouched and reported as success.
 * Returns 0 on success or a negative errno-style value. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
965
966 static int setup_timezone(const char *dest) {
967 _cleanup_free_ char *p = NULL, *q = NULL;
968 const char *where, *check, *what;
969 char *z, *y;
970 int r;
971
972 assert(dest);
973
974 /* Fix the timezone, if possible */
975 r = readlink_malloc("/etc/localtime", &p);
976 if (r < 0) {
977 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
978 return 0;
979 }
980
981 z = path_startswith(p, "../usr/share/zoneinfo/");
982 if (!z)
983 z = path_startswith(p, "/usr/share/zoneinfo/");
984 if (!z) {
985 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
986 return 0;
987 }
988
989 where = prefix_roota(dest, "/etc/localtime");
990 r = readlink_malloc(where, &q);
991 if (r >= 0) {
992 y = path_startswith(q, "../usr/share/zoneinfo/");
993 if (!y)
994 y = path_startswith(q, "/usr/share/zoneinfo/");
995
996 /* Already pointing to the right place? Then do nothing .. */
997 if (y && streq(y, z))
998 return 0;
999 }
1000
1001 check = strjoina("/usr/share/zoneinfo/", z);
1002 check = prefix_root(dest, check);
1003 if (laccess(check, F_OK) < 0) {
1004 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1005 return 0;
1006 }
1007
1008 r = unlink(where);
1009 if (r < 0 && errno != ENOENT) {
1010 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1011 return 0;
1012 }
1013
1014 what = strjoina("../usr/share/zoneinfo/", z);
1015 if (symlink(what, where) < 0) {
1016 log_error_errno(errno, "Failed to correct timezone of container: %m");
1017 return 0;
1018 }
1019
1020 r = userns_lchown(where, 0, 0);
1021 if (r < 0)
1022 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1023
1024 return 0;
1025 }
1026
1027 static int setup_resolv_conf(const char *dest) {
1028 const char *where = NULL;
1029 int r;
1030
1031 assert(dest);
1032
1033 if (arg_private_network)
1034 return 0;
1035
1036 /* Fix resolv.conf, if possible */
1037 where = prefix_roota(dest, "/etc/resolv.conf");
1038
1039 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1040 if (r < 0) {
1041 /* If the file already exists as symlink, let's
1042 * suppress the warning, under the assumption that
1043 * resolved or something similar runs inside and the
1044 * symlink points there.
1045 *
1046 * If the disk image is read-only, there's also no
1047 * point in complaining.
1048 */
1049 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1050 "Failed to copy /etc/resolv.conf to %s: %m", where);
1051 return 0;
1052 }
1053
1054 r = userns_lchown(where, 0, 0);
1055 if (r < 0)
1056 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1057
1058 return 0;
1059 }
1060
1061 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1062 assert(s);
1063
1064 snprintf(s, 37,
1065 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1066 SD_ID128_FORMAT_VAL(id));
1067
1068 return s;
1069 }
1070
1071 static int setup_boot_id(const char *dest) {
1072 const char *from, *to;
1073 sd_id128_t rnd = {};
1074 char as_uuid[37];
1075 int r;
1076
1077 if (arg_share_system)
1078 return 0;
1079
1080 /* Generate a new randomized boot ID, so that each boot-up of
1081 * the container gets a new one */
1082
1083 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1084 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1085
1086 r = sd_id128_randomize(&rnd);
1087 if (r < 0)
1088 return log_error_errno(r, "Failed to generate random boot id: %m");
1089
1090 id128_format_as_uuid(rnd, as_uuid);
1091
1092 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1093 if (r < 0)
1094 return log_error_errno(r, "Failed to write boot id: %m");
1095
1096 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1097 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1098 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1099 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1100
1101 unlink(from);
1102 return r;
1103 }
1104
1105 static int copy_devnodes(const char *dest) {
1106
1107 static const char devnodes[] =
1108 "null\0"
1109 "zero\0"
1110 "full\0"
1111 "random\0"
1112 "urandom\0"
1113 "tty\0"
1114 "net/tun\0";
1115
1116 const char *d;
1117 int r = 0;
1118 _cleanup_umask_ mode_t u;
1119
1120 assert(dest);
1121
1122 u = umask(0000);
1123
1124 /* Create /dev/net, so that we can create /dev/net/tun in it */
1125 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1126 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1127
1128 NULSTR_FOREACH(d, devnodes) {
1129 _cleanup_free_ char *from = NULL, *to = NULL;
1130 struct stat st;
1131
1132 from = strappend("/dev/", d);
1133 to = prefix_root(dest, from);
1134
1135 if (stat(from, &st) < 0) {
1136
1137 if (errno != ENOENT)
1138 return log_error_errno(errno, "Failed to stat %s: %m", from);
1139
1140 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1141
1142 log_error("%s is not a char or block device, cannot copy.", from);
1143 return -EIO;
1144
1145 } else {
1146 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1147 if (errno != EPERM)
1148 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1149
1150 /* Some systems abusively restrict mknod but
1151 * allow bind mounts. */
1152 r = touch(to);
1153 if (r < 0)
1154 return log_error_errno(r, "touch (%s) failed: %m", to);
1155 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1156 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1157 }
1158
1159 r = userns_lchown(to, 0, 0);
1160 if (r < 0)
1161 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1162 }
1163 }
1164
1165 return r;
1166 }
1167
1168 static int setup_pts(const char *dest) {
1169 _cleanup_free_ char *options = NULL;
1170 const char *p;
1171
1172 #ifdef HAVE_SELINUX
1173 if (arg_selinux_apifs_context)
1174 (void) asprintf(&options,
1175 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1176 arg_uid_shift + TTY_GID,
1177 arg_selinux_apifs_context);
1178 else
1179 #endif
1180 (void) asprintf(&options,
1181 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1182 arg_uid_shift + TTY_GID);
1183
1184 if (!options)
1185 return log_oom();
1186
1187 /* Mount /dev/pts itself */
1188 p = prefix_roota(dest, "/dev/pts");
1189 if (mkdir(p, 0755) < 0)
1190 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1191 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1192 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1193 if (userns_lchown(p, 0, 0) < 0)
1194 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1195
1196 /* Create /dev/ptmx symlink */
1197 p = prefix_roota(dest, "/dev/ptmx");
1198 if (symlink("pts/ptmx", p) < 0)
1199 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1200 if (userns_lchown(p, 0, 0) < 0)
1201 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1202
1203 /* And fix /dev/pts/ptmx ownership */
1204 p = prefix_roota(dest, "/dev/pts/ptmx");
1205 if (userns_lchown(p, 0, 0) < 0)
1206 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1207
1208 return 0;
1209 }
1210
1211 static int setup_dev_console(const char *dest, const char *console) {
1212 _cleanup_umask_ mode_t u;
1213 const char *to;
1214 int r;
1215
1216 assert(dest);
1217 assert(console);
1218
1219 u = umask(0000);
1220
1221 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1222 if (r < 0)
1223 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1224
1225 /* We need to bind mount the right tty to /dev/console since
1226 * ptys can only exist on pts file systems. To have something
1227 * to bind mount things on we create a empty regular file. */
1228
1229 to = prefix_roota(dest, "/dev/console");
1230 r = touch(to);
1231 if (r < 0)
1232 return log_error_errno(r, "touch() for /dev/console failed: %m");
1233
1234 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1235 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1236
1237 return 0;
1238 }
1239
1240 static int setup_kmsg(const char *dest, int kmsg_socket) {
1241 const char *from, *to;
1242 _cleanup_umask_ mode_t u;
1243 int fd, r;
1244
1245 assert(kmsg_socket >= 0);
1246
1247 u = umask(0000);
1248
1249 /* We create the kmsg FIFO as /run/kmsg, but immediately
1250 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1251 * on the reading side behave very similar to /proc/kmsg,
1252 * their writing side behaves differently from /dev/kmsg in
1253 * that writing blocks when nothing is reading. In order to
1254 * avoid any problems with containers deadlocking due to this
1255 * we simply make /dev/kmsg unavailable to the container. */
1256 from = prefix_roota(dest, "/run/kmsg");
1257 to = prefix_roota(dest, "/proc/kmsg");
1258
1259 if (mkfifo(from, 0600) < 0)
1260 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1261 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1262 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1263
1264 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1265 if (fd < 0)
1266 return log_error_errno(errno, "Failed to open fifo: %m");
1267
1268 /* Store away the fd in the socket, so that it stays open as
1269 * long as we run the child */
1270 r = send_one_fd(kmsg_socket, fd, 0);
1271 safe_close(fd);
1272
1273 if (r < 0)
1274 return log_error_errno(r, "Failed to send FIFO fd: %m");
1275
1276 /* And now make the FIFO unavailable as /run/kmsg... */
1277 (void) unlink(from);
1278
1279 return 0;
1280 }
1281
1282 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1283 union in_addr_union *exposed = userdata;
1284
1285 assert(rtnl);
1286 assert(m);
1287 assert(exposed);
1288
1289 expose_port_execute(rtnl, arg_expose_ports, exposed);
1290 return 0;
1291 }
1292
1293 static int setup_hostname(void) {
1294
1295 if (arg_share_system)
1296 return 0;
1297
1298 if (sethostname_idempotent(arg_machine) < 0)
1299 return -errno;
1300
1301 return 0;
1302 }
1303
1304 static int setup_journal(const char *directory) {
1305 sd_id128_t machine_id, this_id;
1306 _cleanup_free_ char *b = NULL, *d = NULL;
1307 const char *etc_machine_id, *p, *q;
1308 char *id;
1309 int r;
1310
1311 /* Don't link journals in ephemeral mode */
1312 if (arg_ephemeral)
1313 return 0;
1314
1315 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1316
1317 r = read_one_line_file(etc_machine_id, &b);
1318 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1319 return 0;
1320 else if (r < 0)
1321 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1322
1323 id = strstrip(b);
1324 if (isempty(id) && arg_link_journal == LINK_AUTO)
1325 return 0;
1326
1327 /* Verify validity */
1328 r = sd_id128_from_string(id, &machine_id);
1329 if (r < 0)
1330 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1331
1332 r = sd_id128_get_machine(&this_id);
1333 if (r < 0)
1334 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1335
1336 if (sd_id128_equal(machine_id, this_id)) {
1337 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1338 "Host and machine ids are equal (%s): refusing to link journals", id);
1339 if (arg_link_journal == LINK_AUTO)
1340 return 0;
1341 return -EEXIST;
1342 }
1343
1344 if (arg_link_journal == LINK_NO)
1345 return 0;
1346
1347 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1348 if (r < 0)
1349 return log_error_errno(r, "Failed to create /var: %m");
1350
1351 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1352 if (r < 0)
1353 return log_error_errno(r, "Failed to create /var/log: %m");
1354
1355 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1356 if (r < 0)
1357 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1358
1359 p = strjoina("/var/log/journal/", id);
1360 q = prefix_roota(directory, p);
1361
1362 if (path_is_mount_point(p, 0) > 0) {
1363 if (arg_link_journal != LINK_AUTO) {
1364 log_error("%s: already a mount point, refusing to use for journal", p);
1365 return -EEXIST;
1366 }
1367
1368 return 0;
1369 }
1370
1371 if (path_is_mount_point(q, 0) > 0) {
1372 if (arg_link_journal != LINK_AUTO) {
1373 log_error("%s: already a mount point, refusing to use for journal", q);
1374 return -EEXIST;
1375 }
1376
1377 return 0;
1378 }
1379
1380 r = readlink_and_make_absolute(p, &d);
1381 if (r >= 0) {
1382 if ((arg_link_journal == LINK_GUEST ||
1383 arg_link_journal == LINK_AUTO) &&
1384 path_equal(d, q)) {
1385
1386 r = userns_mkdir(directory, p, 0755, 0, 0);
1387 if (r < 0)
1388 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1389 return 0;
1390 }
1391
1392 if (unlink(p) < 0)
1393 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1394 } else if (r == -EINVAL) {
1395
1396 if (arg_link_journal == LINK_GUEST &&
1397 rmdir(p) < 0) {
1398
1399 if (errno == ENOTDIR) {
1400 log_error("%s already exists and is neither a symlink nor a directory", p);
1401 return r;
1402 } else {
1403 log_error_errno(errno, "Failed to remove %s: %m", p);
1404 return -errno;
1405 }
1406 }
1407 } else if (r != -ENOENT) {
1408 log_error_errno(errno, "readlink(%s) failed: %m", p);
1409 return r;
1410 }
1411
1412 if (arg_link_journal == LINK_GUEST) {
1413
1414 if (symlink(q, p) < 0) {
1415 if (arg_link_journal_try) {
1416 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1417 return 0;
1418 } else {
1419 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1420 return -errno;
1421 }
1422 }
1423
1424 r = userns_mkdir(directory, p, 0755, 0, 0);
1425 if (r < 0)
1426 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1427 return 0;
1428 }
1429
1430 if (arg_link_journal == LINK_HOST) {
1431 /* don't create parents here -- if the host doesn't have
1432 * permanent journal set up, don't force it here */
1433 r = mkdir(p, 0755);
1434 if (r < 0) {
1435 if (arg_link_journal_try) {
1436 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1437 return 0;
1438 } else {
1439 log_error_errno(errno, "Failed to create %s: %m", p);
1440 return r;
1441 }
1442 }
1443
1444 } else if (access(p, F_OK) < 0)
1445 return 0;
1446
1447 if (dir_is_empty(q) == 0)
1448 log_warning("%s is not empty, proceeding anyway.", q);
1449
1450 r = userns_mkdir(directory, p, 0755, 0, 0);
1451 if (r < 0) {
1452 log_error_errno(errno, "Failed to create %s: %m", q);
1453 return r;
1454 }
1455
1456 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1457 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1458
1459 return 0;
1460 }
1461
1462 static int drop_capabilities(void) {
1463 return capability_bounding_set_drop(~arg_retain, false);
1464 }
1465
1466 static int reset_audit_loginuid(void) {
1467 _cleanup_free_ char *p = NULL;
1468 int r;
1469
1470 if (arg_share_system)
1471 return 0;
1472
1473 r = read_one_line_file("/proc/self/loginuid", &p);
1474 if (r == -ENOENT)
1475 return 0;
1476 if (r < 0)
1477 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1478
1479 /* Already reset? */
1480 if (streq(p, "4294967295"))
1481 return 0;
1482
1483 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1484 if (r < 0) {
1485 log_error_errno(r,
1486 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1487 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1488 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1489 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1490 "using systemd-nspawn. Sleeping for 5s... (%m)");
1491
1492 sleep(5);
1493 }
1494
1495 return 0;
1496 }
1497
/* Installs the seccomp filter for the container payload. The filter allows
 * everything by default, makes a fixed list of system calls fail with EPERM
 * unless the capability associated with them is retained, and makes the
 * creation of audit netlink sockets fail with EAFNOSUPPORT. Without seccomp
 * support compiled in this is a no-op. Returns 0 on success (or when the
 * kernel rejects the filter with EINVAL), a negative errno-style value
 * otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx filter;
        unsigned k;
        int r;

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        r = seccomp_add_secondary_archs(filter);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {

                /* Only block the call if the matching capability is dropped. */
                if (!(arg_retain & (1ULL << blacklist[k].capability))) {

                        r = seccomp_rule_add(filter, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);

                        /* -EFAULT indicates an unknown syscall, which is
                         * simply skipped. */
                        if (r < 0 && r != -EFAULT) {
                                log_error_errno(r, "Failed to block syscall: %m");
                                goto finish;
                        }
                }
        }

        /* Audit does not work in containers, and much of the userspace
         * audit hookup fails when run inside of one. We don't care and just
         * turn off creation of audit sockets.
         *
         * With this rule socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
         * EAFNOSUPPORT, which audit userspace takes as indication that audit
         * is disabled in the kernel. */

        r = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(filter);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(filter);
        return r;
#else
        return 0;
#endif

}
1592
1593 static int setup_propagate(const char *root) {
1594 const char *p, *q;
1595
1596 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1597 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1598 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1599 (void) mkdir_p(p, 0600);
1600
1601 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1602 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1603
1604 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1605 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1606
1607 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1608 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
1609
1610 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1611 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1612 return log_error_errno(errno, "Failed to install propagation bind mount.");
1613
1614 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1615 return log_error_errno(errno, "Failed to make propagation mount read-only");
1616
1617 return 0;
1618 }
1619
1620 static int setup_image(char **device_path, int *loop_nr) {
1621 struct loop_info64 info = {
1622 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1623 };
1624 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1625 _cleanup_free_ char* loopdev = NULL;
1626 struct stat st;
1627 int r, nr;
1628
1629 assert(device_path);
1630 assert(loop_nr);
1631 assert(arg_image);
1632
1633 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1634 if (fd < 0)
1635 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1636
1637 if (fstat(fd, &st) < 0)
1638 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1639
1640 if (S_ISBLK(st.st_mode)) {
1641 char *p;
1642
1643 p = strdup(arg_image);
1644 if (!p)
1645 return log_oom();
1646
1647 *device_path = p;
1648
1649 *loop_nr = -1;
1650
1651 r = fd;
1652 fd = -1;
1653
1654 return r;
1655 }
1656
1657 if (!S_ISREG(st.st_mode)) {
1658 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1659 return -EINVAL;
1660 }
1661
1662 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1663 if (control < 0)
1664 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1665
1666 nr = ioctl(control, LOOP_CTL_GET_FREE);
1667 if (nr < 0)
1668 return log_error_errno(errno, "Failed to allocate loop device: %m");
1669
1670 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1671 return log_oom();
1672
1673 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1674 if (loop < 0)
1675 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1676
1677 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1678 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1679
1680 if (arg_read_only)
1681 info.lo_flags |= LO_FLAGS_READ_ONLY;
1682
1683 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1684 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1685
1686 *device_path = loopdev;
1687 loopdev = NULL;
1688
1689 *loop_nr = nr;
1690
1691 r = loop;
1692 loop = -1;
1693
1694 return r;
1695 }
1696
/* Explanation appended to the error messages about unusable disk images. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Inspects the partition table of the block device open as fd and determines
 * which partitions to use as root, /home and /srv file systems.
 *
 * On GPT disks partitions are picked by their type UUID; of several
 * partitions of the same type the one with the lowest partition number wins,
 * and partitions with the "no auto" flag are ignored. A native root partition
 * is preferred over a secondary architecture one, which is preferred over a
 * generic Linux partition. On MBR disks only a bootable partition of type
 * 0x83 is considered, and it is treated like a generic Linux partition. A
 * generic partition is only accepted if it is the only one.
 *
 * On success 0 is returned, *root_device is set to a newly allocated device
 * node path, *root_device_rw tells whether it shall be mounted writable and
 * *secondary whether the secondary architecture root was picked.
 * *home_device/*srv_device and their rw flags are only written if such a
 * partition was found. On failure a negative errno-style value is returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel knows as many partitions as blkid found. On
         * leaving this loop via "break" e holds the final enumeration. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The kernel
                 * count is expected to be larger by one. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Of multiple candidates the one with the
                                 * lowest partition number wins. */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* MBR knows no dedicated root partition type, hence
                         * track the partition as generic one. That way a
                         * second bootable Linux partition is detected and
                         * refused below, instead of silently replacing the
                         * first one. */
                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2076
/* Mounts the block device what on the directory where, or on the
 * subdirectory `directory` of it if that is non-NULL. The file system type
 * is probed with blkid. The mount is done read-only unless rw is true and
 * arg_read_only is off. LUKS volumes are refused. Returns 0 on success, a
 * negative errno-style value on failure. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 if nothing was detected and
                 * -2 if the result is ambivalent; -1 indicates an actual
                 * error and is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the error code before logging, so that the logging
                 * call cannot interfere with it. */
                r = errno > 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2140
2141 static int mount_devices(
2142 const char *where,
2143 const char *root_device, bool root_device_rw,
2144 const char *home_device, bool home_device_rw,
2145 const char *srv_device, bool srv_device_rw) {
2146 int r;
2147
2148 assert(where);
2149
2150 if (root_device) {
2151 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2152 if (r < 0)
2153 return log_error_errno(r, "Failed to mount root directory: %m");
2154 }
2155
2156 if (home_device) {
2157 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2158 if (r < 0)
2159 return log_error_errno(r, "Failed to mount home directory: %m");
2160 }
2161
2162 if (srv_device) {
2163 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2164 if (r < 0)
2165 return log_error_errno(r, "Failed to mount server data directory: %m");
2166 }
2167
2168 return 0;
2169 }
2170
2171 static void loop_remove(int nr, int *image_fd) {
2172 _cleanup_close_ int control = -1;
2173 int r;
2174
2175 if (nr < 0)
2176 return;
2177
2178 if (image_fd && *image_fd >= 0) {
2179 r = ioctl(*image_fd, LOOP_CLR_FD);
2180 if (r < 0)
2181 log_debug_errno(errno, "Failed to close loop image: %m");
2182 *image_fd = safe_close(*image_fd);
2183 }
2184
2185 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2186 if (control < 0) {
2187 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2188 return;
2189 }
2190
2191 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2192 if (r < 0)
2193 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2194 }
2195
2196 /*
2197 * Return values:
2198 * < 0 : wait_for_terminate() failed to get the state of the
2199 * container, the container was terminated by a signal, or
2200 * failed for an unknown reason. No change is made to the
2201 * container argument.
2202 * > 0 : The program executed in the container terminated with an
2203 * error. The exit code of the program executed in the
2204 * container is returned. The container argument has been set
2205 * to CONTAINER_TERMINATED.
2206 * 0 : The container is being rebooted, has been shut down or exited
2207 * successfully. The container argument has been set to either
2208 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2209 *
2210 * That is, success is indicated by a return value of zero, and an
2211 * error is indicated by a non-zero value.
2212 */
2213 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2214 siginfo_t status;
2215 int r;
2216
2217 r = wait_for_terminate(pid, &status);
2218 if (r < 0)
2219 return log_warning_errno(r, "Failed to wait for container: %m");
2220
2221 switch (status.si_code) {
2222
2223 case CLD_EXITED:
2224 if (status.si_status == 0) {
2225 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2226
2227 } else
2228 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2229
2230 *container = CONTAINER_TERMINATED;
2231 return status.si_status;
2232
2233 case CLD_KILLED:
2234 if (status.si_status == SIGINT) {
2235
2236 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2237 *container = CONTAINER_TERMINATED;
2238 return 0;
2239
2240 } else if (status.si_status == SIGHUP) {
2241
2242 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2243 *container = CONTAINER_REBOOTED;
2244 return 0;
2245 }
2246
2247 /* CLD_KILLED fallthrough */
2248
2249 case CLD_DUMPED:
2250 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2251 return -EIO;
2252
2253 default:
2254 log_error("Container %s failed due to unknown reason.", arg_machine);
2255 return -EIO;
2256 }
2257
2258 return r;
2259 }
2260
2261 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2262 pid_t pid;
2263
2264 pid = PTR_TO_UINT32(userdata);
2265 if (pid > 0) {
2266 if (kill(pid, arg_kill_signal) >= 0) {
2267 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2268 sd_event_source_set_userdata(s, NULL);
2269 return 0;
2270 }
2271 }
2272
2273 sd_event_exit(sd_event_source_get_event(s), 0);
2274 return 0;
2275 }
2276
2277 static int determine_names(void) {
2278 int r;
2279
2280 if (arg_template && !arg_directory && arg_machine) {
2281
2282 /* If --template= was specified then we should not
2283 * search for a machine, but instead create a new one
2284 * in /var/lib/machine. */
2285
2286 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2287 if (!arg_directory)
2288 return log_oom();
2289 }
2290
2291 if (!arg_image && !arg_directory) {
2292 if (arg_machine) {
2293 _cleanup_(image_unrefp) Image *i = NULL;
2294
2295 r = image_find(arg_machine, &i);
2296 if (r < 0)
2297 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2298 else if (r == 0) {
2299 log_error("No image for machine '%s': %m", arg_machine);
2300 return -ENOENT;
2301 }
2302
2303 if (i->type == IMAGE_RAW)
2304 r = free_and_strdup(&arg_image, i->path);
2305 else
2306 r = free_and_strdup(&arg_directory, i->path);
2307 if (r < 0)
2308 return log_error_errno(r, "Invalid image directory: %m");
2309
2310 if (!arg_ephemeral)
2311 arg_read_only = arg_read_only || i->read_only;
2312 } else
2313 arg_directory = get_current_dir_name();
2314
2315 if (!arg_directory && !arg_machine) {
2316 log_error("Failed to determine path, please use -D or -i.");
2317 return -EINVAL;
2318 }
2319 }
2320
2321 if (!arg_machine) {
2322 if (arg_directory && path_equal(arg_directory, "/"))
2323 arg_machine = gethostname_malloc();
2324 else
2325 arg_machine = strdup(basename(arg_image ?: arg_directory));
2326
2327 if (!arg_machine)
2328 return log_oom();
2329
2330 hostname_cleanup(arg_machine);
2331 if (!machine_name_is_valid(arg_machine)) {
2332 log_error("Failed to determine machine name automatically, please use -M.");
2333 return -EINVAL;
2334 }
2335
2336 if (arg_ephemeral) {
2337 char *b;
2338
2339 /* Add a random suffix when this is an
2340 * ephemeral machine, so that we can run many
2341 * instances at once without manually having
2342 * to specify -M each time. */
2343
2344 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2345 return log_oom();
2346
2347 free(arg_machine);
2348 arg_machine = b;
2349 }
2350 }
2351
2352 return 0;
2353 }
2354
2355 static int determine_uid_shift(const char *directory) {
2356 int r;
2357
2358 if (!arg_userns) {
2359 arg_uid_shift = 0;
2360 return 0;
2361 }
2362
2363 if (arg_uid_shift == UID_INVALID) {
2364 struct stat st;
2365
2366 r = stat(directory, &st);
2367 if (r < 0)
2368 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2369
2370 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2371
2372 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2373 log_error("UID and GID base of %s don't match.", directory);
2374 return -EINVAL;
2375 }
2376
2377 arg_uid_range = UINT32_C(0x10000);
2378 }
2379
2380 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2381 log_error("UID base too high for UID range.");
2382 return -EINVAL;
2383 }
2384
2385 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2386 return 0;
2387 }
2388
2389 static int inner_child(
2390 Barrier *barrier,
2391 const char *directory,
2392 bool secondary,
2393 int kmsg_socket,
2394 int rtnl_socket,
2395 FDSet *fds) {
2396
2397 _cleanup_free_ char *home = NULL;
2398 unsigned n_env = 2;
2399 const char *envp[] = {
2400 "PATH=" DEFAULT_PATH_SPLIT_USR,
2401 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2402 NULL, /* TERM */
2403 NULL, /* HOME */
2404 NULL, /* USER */
2405 NULL, /* LOGNAME */
2406 NULL, /* container_uuid */
2407 NULL, /* LISTEN_FDS */
2408 NULL, /* LISTEN_PID */
2409 NULL
2410 };
2411
2412 _cleanup_strv_free_ char **env_use = NULL;
2413 int r;
2414
2415 assert(barrier);
2416 assert(directory);
2417 assert(kmsg_socket >= 0);
2418
2419 cg_unified_flush();
2420
2421 if (arg_userns) {
2422 /* Tell the parent, that it now can write the UID map. */
2423 (void) barrier_place(barrier); /* #1 */
2424
2425 /* Wait until the parent wrote the UID map */
2426 if (!barrier_place_and_sync(barrier)) { /* #2 */
2427 log_error("Parent died too early");
2428 return -ESRCH;
2429 }
2430 }
2431
2432 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
2433 if (r < 0)
2434 return r;
2435
2436 r = mount_sysfs(NULL);
2437 if (r < 0)
2438 return r;
2439
2440 /* Wait until we are cgroup-ified, so that we
2441 * can mount the right cgroup path writable */
2442 if (!barrier_place_and_sync(barrier)) { /* #3 */
2443 log_error("Parent died too early");
2444 return -ESRCH;
2445 }
2446
2447 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2448 if (r < 0)
2449 return r;
2450
2451 r = reset_uid_gid();
2452 if (r < 0)
2453 return log_error_errno(r, "Couldn't become new root: %m");
2454
2455 r = setup_boot_id(NULL);
2456 if (r < 0)
2457 return r;
2458
2459 r = setup_kmsg(NULL, kmsg_socket);
2460 if (r < 0)
2461 return r;
2462 kmsg_socket = safe_close(kmsg_socket);
2463
2464 umask(0022);
2465
2466 if (setsid() < 0)
2467 return log_error_errno(errno, "setsid() failed: %m");
2468
2469 if (arg_private_network)
2470 loopback_setup();
2471
2472 if (arg_expose_ports) {
2473 r = expose_port_send_rtnl(rtnl_socket);
2474 if (r < 0)
2475 return r;
2476 rtnl_socket = safe_close(rtnl_socket);
2477 }
2478
2479 if (drop_capabilities() < 0)
2480 return log_error_errno(errno, "drop_capabilities() failed: %m");
2481
2482 setup_hostname();
2483
2484 if (arg_personality != PERSONALITY_INVALID) {
2485 if (personality(arg_personality) < 0)
2486 return log_error_errno(errno, "personality() failed: %m");
2487 } else if (secondary) {
2488 if (personality(PER_LINUX32) < 0)
2489 return log_error_errno(errno, "personality() failed: %m");
2490 }
2491
2492 #ifdef HAVE_SELINUX
2493 if (arg_selinux_context)
2494 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2495 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2496 #endif
2497
2498 r = change_uid_gid(arg_user, &home);
2499 if (r < 0)
2500 return r;
2501
2502 envp[n_env] = strv_find_prefix(environ, "TERM=");
2503 if (envp[n_env])
2504 n_env ++;
2505
2506 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2507 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2508 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2509 return log_oom();
2510
2511 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2512 char as_uuid[37];
2513
2514 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2515 return log_oom();
2516 }
2517
2518 if (fdset_size(fds) > 0) {
2519 r = fdset_cloexec(fds, false);
2520 if (r < 0)
2521 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2522
2523 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2524 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2525 return log_oom();
2526 }
2527
2528 env_use = strv_env_merge(2, envp, arg_setenv);
2529 if (!env_use)
2530 return log_oom();
2531
2532 /* Let the parent know that we are ready and
2533 * wait until the parent is ready with the
2534 * setup, too... */
2535 if (!barrier_place_and_sync(barrier)) { /* #4 */
2536 log_error("Parent died too early");
2537 return -ESRCH;
2538 }
2539
2540 /* Now, explicitly close the log, so that we
2541 * then can close all remaining fds. Closing
2542 * the log explicitly first has the benefit
2543 * that the logging subsystem knows about it,
2544 * and is thus ready to be reopened should we
2545 * need it again. Note that the other fds
2546 * closed here are at least the locking and
2547 * barrier fds. */
2548 log_close();
2549 (void) fdset_close_others(fds);
2550
2551 if (arg_boot) {
2552 char **a;
2553 size_t m;
2554
2555 /* Automatically search for the init system */
2556
2557 m = 1 + strv_length(arg_parameters);
2558 a = newa(char*, m + 1);
2559 if (strv_isempty(arg_parameters))
2560 a[1] = NULL;
2561 else
2562 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2563
2564 a[0] = (char*) "/usr/lib/systemd/systemd";
2565 execve(a[0], a, env_use);
2566
2567 a[0] = (char*) "/lib/systemd/systemd";
2568 execve(a[0], a, env_use);
2569
2570 a[0] = (char*) "/sbin/init";
2571 execve(a[0], a, env_use);
2572 } else if (!strv_isempty(arg_parameters))
2573 execvpe(arg_parameters[0], arg_parameters, env_use);
2574 else {
2575 chdir(home ?: "/root");
2576 execle("/bin/bash", "-bash", NULL, env_use);
2577 execle("/bin/sh", "-sh", NULL, env_use);
2578 }
2579
2580 (void) log_open();
2581 return log_error_errno(errno, "execv() failed: %m");
2582 }
2583
2584 static int outer_child(
2585 Barrier *barrier,
2586 const char *directory,
2587 const char *console,
2588 const char *root_device, bool root_device_rw,
2589 const char *home_device, bool home_device_rw,
2590 const char *srv_device, bool srv_device_rw,
2591 bool interactive,
2592 bool secondary,
2593 int pid_socket,
2594 int kmsg_socket,
2595 int rtnl_socket,
2596 int uid_shift_socket,
2597 FDSet *fds) {
2598
2599 pid_t pid;
2600 ssize_t l;
2601 int r;
2602
2603 assert(barrier);
2604 assert(directory);
2605 assert(console);
2606 assert(pid_socket >= 0);
2607 assert(kmsg_socket >= 0);
2608
2609 cg_unified_flush();
2610
2611 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2612 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2613
2614 if (interactive) {
2615 close_nointr(STDIN_FILENO);
2616 close_nointr(STDOUT_FILENO);
2617 close_nointr(STDERR_FILENO);
2618
2619 r = open_terminal(console, O_RDWR);
2620 if (r != STDIN_FILENO) {
2621 if (r >= 0) {
2622 safe_close(r);
2623 r = -EINVAL;
2624 }
2625
2626 return log_error_errno(r, "Failed to open console: %m");
2627 }
2628
2629 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2630 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2631 return log_error_errno(errno, "Failed to duplicate console: %m");
2632 }
2633
2634 r = reset_audit_loginuid();
2635 if (r < 0)
2636 return r;
2637
2638 /* Mark everything as slave, so that we still
2639 * receive mounts from the real root, but don't
2640 * propagate mounts to the real root. */
2641 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2642 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2643
2644 r = mount_devices(directory,
2645 root_device, root_device_rw,
2646 home_device, home_device_rw,
2647 srv_device, srv_device_rw);
2648 if (r < 0)
2649 return r;
2650
2651 r = determine_uid_shift(directory);
2652 if (r < 0)
2653 return r;
2654
2655 if (arg_userns) {
2656 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2657 if (l < 0)
2658 return log_error_errno(errno, "Failed to send UID shift: %m");
2659 if (l != sizeof(arg_uid_shift)) {
2660 log_error("Short write while sending UID shift.");
2661 return -EIO;
2662 }
2663 }
2664
2665 /* Turn directory into bind mount */
2666 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2667 return log_error_errno(errno, "Failed to make bind mount: %m");
2668
2669 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2670 if (r < 0)
2671 return r;
2672
2673 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2674 if (r < 0)
2675 return r;
2676
2677 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2678 if (r < 0)
2679 return r;
2680
2681 if (arg_read_only) {
2682 r = bind_remount_recursive(directory, true);
2683 if (r < 0)
2684 return log_error_errno(r, "Failed to make tree read-only: %m");
2685 }
2686
2687 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2688 if (r < 0)
2689 return r;
2690
2691 r = copy_devnodes(directory);
2692 if (r < 0)
2693 return r;
2694
2695 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2696
2697 r = setup_pts(directory);
2698 if (r < 0)
2699 return r;
2700
2701 r = setup_propagate(directory);
2702 if (r < 0)
2703 return r;
2704
2705 r = setup_dev_console(directory, console);
2706 if (r < 0)
2707 return r;
2708
2709 r = setup_seccomp();
2710 if (r < 0)
2711 return r;
2712
2713 r = setup_timezone(directory);
2714 if (r < 0)
2715 return r;
2716
2717 r = setup_resolv_conf(directory);
2718 if (r < 0)
2719 return r;
2720
2721 r = setup_journal(directory);
2722 if (r < 0)
2723 return r;
2724
2725 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2726 if (r < 0)
2727 return r;
2728
2729 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2730 if (r < 0)
2731 return r;
2732
2733 r = mount_move_root(directory);
2734 if (r < 0)
2735 return log_error_errno(r, "Failed to move root directory: %m");
2736
2737 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2738 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2739 (arg_private_network ? CLONE_NEWNET : 0) |
2740 (arg_userns ? CLONE_NEWUSER : 0),
2741 NULL);
2742 if (pid < 0)
2743 return log_error_errno(errno, "Failed to fork inner child: %m");
2744 if (pid == 0) {
2745 pid_socket = safe_close(pid_socket);
2746 uid_shift_socket = safe_close(uid_shift_socket);
2747
2748 /* The inner child has all namespaces that are
2749 * requested, so that we all are owned by the user if
2750 * user namespaces are turned on. */
2751
2752 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2753 if (r < 0)
2754 _exit(EXIT_FAILURE);
2755
2756 _exit(EXIT_SUCCESS);
2757 }
2758
2759 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2760 if (l < 0)
2761 return log_error_errno(errno, "Failed to send PID: %m");
2762 if (l != sizeof(pid)) {
2763 log_error("Short write while sending PID.");
2764 return -EIO;
2765 }
2766
2767 pid_socket = safe_close(pid_socket);
2768 kmsg_socket = safe_close(kmsg_socket);
2769 rtnl_socket = safe_close(rtnl_socket);
2770
2771 return 0;
2772 }
2773
2774 static int setup_uid_map(pid_t pid) {
2775 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2776 int r;
2777
2778 assert(pid > 1);
2779
2780 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2781 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2782 r = write_string_file(uid_map, line, 0);
2783 if (r < 0)
2784 return log_error_errno(r, "Failed to write UID map: %m");
2785
2786 /* We always assign the same UID and GID ranges */
2787 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2788 r = write_string_file(uid_map, line, 0);
2789 if (r < 0)
2790 return log_error_errno(r, "Failed to write GID map: %m");
2791
2792 return 0;
2793 }
2794
2795 static int load_settings(void) {
2796 _cleanup_(settings_freep) Settings *settings = NULL;
2797 _cleanup_fclose_ FILE *f = NULL;
2798 _cleanup_free_ char *p = NULL;
2799 const char *fn, *i;
2800 int r;
2801
2802 /* If all settings are masked, there's no point in looking for
2803 * the settings file */
2804 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2805 return 0;
2806
2807 fn = strjoina(arg_machine, ".nspawn");
2808
2809 /* We first look in the admin's directories in /etc and /run */
2810 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2811 _cleanup_free_ char *j = NULL;
2812
2813 j = strjoin(i, "/", fn, NULL);
2814 if (!j)
2815 return log_oom();
2816
2817 f = fopen(j, "re");
2818 if (f) {
2819 p = j;
2820 j = NULL;
2821
2822 /* By default we trust configuration from /etc and /run */
2823 if (arg_settings_trusted < 0)
2824 arg_settings_trusted = true;
2825
2826 break;
2827 }
2828
2829 if (errno != ENOENT)
2830 return log_error_errno(errno, "Failed to open %s: %m", j);
2831 }
2832
2833 if (!f) {
2834 /* After that, let's look for a file next to the
2835 * actual image we shall boot. */
2836
2837 if (arg_image) {
2838 p = file_in_same_dir(arg_image, fn);
2839 if (!p)
2840 return log_oom();
2841 } else if (arg_directory) {
2842 p = file_in_same_dir(arg_directory, fn);
2843 if (!p)
2844 return log_oom();
2845 }
2846
2847 if (p) {
2848 f = fopen(p, "re");
2849 if (!f && errno != ENOENT)
2850 return log_error_errno(errno, "Failed to open %s: %m", p);
2851
2852 /* By default we do not trust configuration from /var/lib/machines */
2853 if (arg_settings_trusted < 0)
2854 arg_settings_trusted = false;
2855 }
2856 }
2857
2858 if (!f)
2859 return 0;
2860
2861 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2862
2863 r = settings_load(f, p, &settings);
2864 if (r < 0)
2865 return r;
2866
2867 /* Copy over bits from the settings, unless they have been
2868 * explicitly masked by command line switches. */
2869
2870 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2871 settings->boot >= 0) {
2872 arg_boot = settings->boot;
2873
2874 strv_free(arg_parameters);
2875 arg_parameters = settings->parameters;
2876 settings->parameters = NULL;
2877 }
2878
2879 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2880 settings->environment) {
2881 strv_free(arg_setenv);
2882 arg_setenv = settings->environment;
2883 settings->environment = NULL;
2884 }
2885
2886 if ((arg_settings_mask & SETTING_USER) == 0 &&
2887 settings->user) {
2888 free(arg_user);
2889 arg_user = settings->user;
2890 settings->user = NULL;
2891 }
2892
2893 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2894 uint64_t plus;
2895
2896 plus = settings->capability;
2897 if (settings_private_network(settings))
2898 plus |= (1ULL << CAP_NET_ADMIN);
2899
2900 if (!arg_settings_trusted && plus != 0) {
2901 if (settings->capability != 0)
2902 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2903 } else
2904 arg_retain |= plus;
2905
2906 arg_retain &= ~settings->drop_capability;
2907 }
2908
2909 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2910 settings->kill_signal > 0)
2911 arg_kill_signal = settings->kill_signal;
2912
2913 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2914 settings->personality != PERSONALITY_INVALID)
2915 arg_personality = settings->personality;
2916
2917 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2918 !sd_id128_is_null(settings->machine_id)) {
2919
2920 if (!arg_settings_trusted)
2921 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2922 else
2923 arg_uuid = settings->machine_id;
2924 }
2925
2926 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2927 settings->read_only >= 0)
2928 arg_read_only = settings->read_only;
2929
2930 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2931 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2932 arg_volatile_mode = settings->volatile_mode;
2933
2934 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2935 settings->n_custom_mounts > 0) {
2936
2937 if (!arg_settings_trusted)
2938 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2939 else {
2940 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2941 arg_custom_mounts = settings->custom_mounts;
2942 arg_n_custom_mounts = settings->n_custom_mounts;
2943
2944 settings->custom_mounts = NULL;
2945 settings->n_custom_mounts = 0;
2946 }
2947 }
2948
2949 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2950 (settings->private_network >= 0 ||
2951 settings->network_veth >= 0 ||
2952 settings->network_bridge ||
2953 settings->network_interfaces ||
2954 settings->network_macvlan ||
2955 settings->network_ipvlan)) {
2956
2957 if (!arg_settings_trusted)
2958 log_warning("Ignoring network settings, file %s is not trusted.", p);
2959 else {
2960 arg_network_veth = settings_private_network(settings);
2961 arg_private_network = settings_private_network(settings);
2962
2963 strv_free(arg_network_interfaces);
2964 arg_network_interfaces = settings->network_interfaces;
2965 settings->network_interfaces = NULL;
2966
2967 strv_free(arg_network_macvlan);
2968 arg_network_macvlan = settings->network_macvlan;
2969 settings->network_macvlan = NULL;
2970
2971 strv_free(arg_network_ipvlan);
2972 arg_network_ipvlan = settings->network_ipvlan;
2973 settings->network_ipvlan = NULL;
2974
2975 free(arg_network_bridge);
2976 arg_network_bridge = settings->network_bridge;
2977 settings->network_bridge = NULL;
2978 }
2979 }
2980
2981 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
2982 settings->expose_ports) {
2983
2984 if (!arg_settings_trusted)
2985 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
2986 else {
2987 expose_port_free_all(arg_expose_ports);
2988 arg_expose_ports = settings->expose_ports;
2989 settings->expose_ports = NULL;
2990 }
2991 }
2992
2993 return 0;
2994 }
2995
2996 int main(int argc, char *argv[]) {
2997
2998 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
2999 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3000 _cleanup_close_ int master = -1, image_fd = -1;
3001 _cleanup_fdset_free_ FDSet *fds = NULL;
3002 int r, n_fd_passed, loop_nr = -1;
3003 char veth_name[IFNAMSIZ];
3004 bool secondary = false, remove_subvol = false;
3005 sigset_t mask_chld;
3006 pid_t pid = 0;
3007 int ret = EXIT_SUCCESS;
3008 union in_addr_union exposed = {};
3009 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3010 bool interactive;
3011
3012 log_parse_environment();
3013 log_open();
3014
3015 r = parse_argv(argc, argv);
3016 if (r <= 0)
3017 goto finish;
3018
3019 if (geteuid() != 0) {
3020 log_error("Need to be root.");
3021 r = -EPERM;
3022 goto finish;
3023 }
3024 r = determine_names();
3025 if (r < 0)
3026 goto finish;
3027
3028 r = load_settings();
3029 if (r < 0)
3030 goto finish;
3031
3032 r = verify_arguments();
3033 if (r < 0)
3034 goto finish;
3035
3036 n_fd_passed = sd_listen_fds(false);
3037 if (n_fd_passed > 0) {
3038 r = fdset_new_listen_fds(&fds, false);
3039 if (r < 0) {
3040 log_error_errno(r, "Failed to collect file descriptors: %m");
3041 goto finish;
3042 }
3043 }
3044
3045 if (arg_directory) {
3046 assert(!arg_image);
3047
3048 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3049 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3050 r = -EINVAL;
3051 goto finish;
3052 }
3053
3054 if (arg_ephemeral) {
3055 _cleanup_free_ char *np = NULL;
3056
3057 /* If the specified path is a mount point we
3058 * generate the new snapshot immediately
3059 * inside it under a random name. However if
3060 * the specified is not a mount point we
3061 * create the new snapshot in the parent
3062 * directory, just next to it. */
3063 r = path_is_mount_point(arg_directory, 0);
3064 if (r < 0) {
3065 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3066 goto finish;
3067 }
3068 if (r > 0)
3069 r = tempfn_random_child(arg_directory, "machine.", &np);
3070 else
3071 r = tempfn_random(arg_directory, "machine.", &np);
3072 if (r < 0) {
3073 log_error_errno(r, "Failed to generate name for snapshot: %m");
3074 goto finish;
3075 }
3076
3077 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3078 if (r < 0) {
3079 log_error_errno(r, "Failed to lock %s: %m", np);
3080 goto finish;
3081 }
3082
3083 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3084 if (r < 0) {
3085 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3086 goto finish;
3087 }
3088
3089 free(arg_directory);
3090 arg_directory = np;
3091 np = NULL;
3092
3093 remove_subvol = true;
3094
3095 } else {
3096 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3097 if (r == -EBUSY) {
3098 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3099 goto finish;
3100 }
3101 if (r < 0) {
3102 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3103 return r;
3104 }
3105
3106 if (arg_template) {
3107 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3108 if (r == -EEXIST) {
3109 if (!arg_quiet)
3110 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3111 } else if (r < 0) {
3112 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3113 goto finish;
3114 } else {
3115 if (!arg_quiet)
3116 log_info("Populated %s from template %s.", arg_directory, arg_template);
3117 }
3118 }
3119 }
3120
3121 if (arg_boot) {
3122 if (path_is_os_tree(arg_directory) <= 0) {
3123 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3124 r = -EINVAL;
3125 goto finish;
3126 }
3127 } else {
3128 const char *p;
3129
3130 p = strjoina(arg_directory, "/usr/");
3131 if (laccess(p, F_OK) < 0) {
3132 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3133 r = -EINVAL;
3134 goto finish;
3135 }
3136 }
3137
3138 } else {
3139 char template[] = "/tmp/nspawn-root-XXXXXX";
3140
3141 assert(arg_image);
3142 assert(!arg_template);
3143
3144 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3145 if (r == -EBUSY) {
3146 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3147 goto finish;
3148 }
3149 if (r < 0) {
3150 r = log_error_errno(r, "Failed to create image lock: %m");
3151 goto finish;
3152 }
3153
3154 if (!mkdtemp(template)) {
3155 log_error_errno(errno, "Failed to create temporary directory: %m");
3156 r = -errno;
3157 goto finish;
3158 }
3159
3160 arg_directory = strdup(template);
3161 if (!arg_directory) {
3162 r = log_oom();
3163 goto finish;
3164 }
3165
3166 image_fd = setup_image(&device_path, &loop_nr);
3167 if (image_fd < 0) {
3168 r = image_fd;
3169 goto finish;
3170 }
3171
3172 r = dissect_image(image_fd,
3173 &root_device, &root_device_rw,
3174 &home_device, &home_device_rw,
3175 &srv_device, &srv_device_rw,
3176 &secondary);
3177 if (r < 0)
3178 goto finish;
3179 }
3180
3181 r = custom_mounts_prepare();
3182 if (r < 0)
3183 goto finish;
3184
3185 interactive =
3186 isatty(STDIN_FILENO) > 0 &&
3187 isatty(STDOUT_FILENO) > 0;
3188
3189 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3190 if (master < 0) {
3191 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3192 goto finish;
3193 }
3194
3195 r = ptsname_malloc(master, &console);
3196 if (r < 0) {
3197 r = log_error_errno(r, "Failed to determine tty name: %m");
3198 goto finish;
3199 }
3200
3201 if (unlockpt(master) < 0) {
3202 r = log_error_errno(errno, "Failed to unlock tty: %m");
3203 goto finish;
3204 }
3205
3206 if (!arg_quiet)
3207 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3208 arg_machine, arg_image ?: arg_directory);
3209
3210 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3211
3212 assert_se(sigemptyset(&mask_chld) == 0);
3213 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3214
3215 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3216 r = log_error_errno(errno, "Failed to become subreaper: %m");
3217 goto finish;
3218 }
3219
3220 for (;;) {
3221 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3222 uid_shift_socket_pair[2] = { -1, -1 };
3223 ContainerStatus container_status;
3224 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3225 static const struct sigaction sa = {
3226 .sa_handler = nop_signal_handler,
3227 .sa_flags = SA_NOCLDSTOP,
3228 };
3229 int ifi = 0;
3230 ssize_t l;
3231 _cleanup_event_unref_ sd_event *event = NULL;
3232 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3233 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3234 char last_char = 0;
3235
3236 r = barrier_create(&barrier);
3237 if (r < 0) {
3238 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3239 goto finish;
3240 }
3241
3242 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3243 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3244 goto finish;
3245 }
3246
3247 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3248 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3249 goto finish;
3250 }
3251
3252 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3253 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3254 goto finish;
3255 }
3256
3257 if (arg_userns)
3258 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3259 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3260 goto finish;
3261 }
3262
3263 /* Child can be killed before execv(), so handle SIGCHLD
3264 * in order to interrupt parent's blocking calls and
3265 * give it a chance to call wait() and terminate. */
3266 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3267 if (r < 0) {
3268 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3269 goto finish;
3270 }
3271
3272 r = sigaction(SIGCHLD, &sa, NULL);
3273 if (r < 0) {
3274 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3275 goto finish;
3276 }
3277
3278 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3279 if (pid < 0) {
3280 if (errno == EINVAL)
3281 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3282 else
3283 r = log_error_errno(errno, "clone() failed: %m");
3284
3285 goto finish;
3286 }
3287
3288 if (pid == 0) {
3289 /* The outer child only has a file system namespace. */
3290 barrier_set_role(&barrier, BARRIER_CHILD);
3291
3292 master = safe_close(master);
3293
3294 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3295 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3296 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3297 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3298
3299 (void) reset_all_signal_handlers();
3300 (void) reset_signal_mask();
3301
3302 r = outer_child(&barrier,
3303 arg_directory,
3304 console,
3305 root_device, root_device_rw,
3306 home_device, home_device_rw,
3307 srv_device, srv_device_rw,
3308 interactive,
3309 secondary,
3310 pid_socket_pair[1],
3311 kmsg_socket_pair[1],
3312 rtnl_socket_pair[1],
3313 uid_shift_socket_pair[1],
3314 fds);
3315 if (r < 0)
3316 _exit(EXIT_FAILURE);
3317
3318 _exit(EXIT_SUCCESS);
3319 }
3320
3321 barrier_set_role(&barrier, BARRIER_PARENT);
3322
3323 fds = fdset_free(fds);
3324
3325 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3326 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3327 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3328 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3329
3330 /* Wait for the outer child. */
3331 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3332 if (r < 0)
3333 goto finish;
3334 if (r != 0) {
3335 r = -EIO;
3336 goto finish;
3337 }
3338 pid = 0;
3339
3340 /* And now retrieve the PID of the inner child. */
3341 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3342 if (l < 0) {
3343 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3344 goto finish;
3345 }
3346 if (l != sizeof(pid)) {
3347 log_error("Short read while reading inner child PID.");
3348 r = EIO;
3349 goto finish;
3350 }
3351
3352 log_debug("Init process invoked as PID " PID_FMT, pid);
3353
3354 if (arg_userns) {
3355 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3356 log_error("Child died too early.");
3357 r = -ESRCH;
3358 goto finish;
3359 }
3360
3361 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3362 if (l < 0) {
3363 r = log_error_errno(errno, "Failed to read UID shift: %m");
3364 goto finish;
3365 }
3366 if (l != sizeof(arg_uid_shift)) {
3367 log_error("Short read while reading UID shift.");
3368 r = EIO;
3369 goto finish;
3370 }
3371
3372 r = setup_uid_map(pid);
3373 if (r < 0)
3374 goto finish;
3375
3376 (void) barrier_place(&barrier); /* #2 */
3377 }
3378
3379 if (arg_private_network) {
3380
3381 r = move_network_interfaces(pid, arg_network_interfaces);
3382 if (r < 0)
3383 goto finish;
3384
3385 if (arg_network_veth) {
3386 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3387 if (r < 0)
3388 goto finish;
3389 else if (r > 0)
3390 ifi = r;
3391
3392 if (arg_network_bridge) {
3393 r = setup_bridge(veth_name, arg_network_bridge);
3394 if (r < 0)
3395 goto finish;
3396 if (r > 0)
3397 ifi = r;
3398 }
3399 }
3400
3401 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3402 if (r < 0)
3403 goto finish;
3404
3405 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3406 if (r < 0)
3407 goto finish;
3408 }
3409
3410 if (arg_register) {
3411 r = register_machine(
3412 arg_machine,
3413 pid,
3414 arg_directory,
3415 arg_uuid,
3416 ifi,
3417 arg_slice,
3418 arg_custom_mounts, arg_n_custom_mounts,
3419 arg_kill_signal,
3420 arg_property,
3421 arg_keep_unit);
3422 if (r < 0)
3423 goto finish;
3424 }
3425
3426 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3427 if (r < 0)
3428 goto finish;
3429
3430 if (arg_keep_unit) {
3431 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3432 if (r < 0)
3433 goto finish;
3434 }
3435
3436 r = chown_cgroup(pid, arg_uid_shift);
3437 if (r < 0)
3438 goto finish;
3439
3440 /* Notify the child that the parent is ready with all
3441 * its setup (including cgroup-ification), and that
3442 * the child can now hand over control to the code to
3443 * run inside the container. */
3444 (void) barrier_place(&barrier); /* #3 */
3445
3446 /* Block SIGCHLD here, before notifying child.
3447 * process_pty() will handle it with the other signals. */
3448 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3449
3450 /* Reset signal to default */
3451 r = default_signals(SIGCHLD, -1);
3452 if (r < 0) {
3453 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3454 goto finish;
3455 }
3456
3457 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3458 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3459 log_error("Child died too early.");
3460 r = -ESRCH;
3461 goto finish;
3462 }
3463
3464 sd_notifyf(false,
3465 "READY=1\n"
3466 "STATUS=Container running.\n"
3467 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3468
3469 r = sd_event_new(&event);
3470 if (r < 0) {
3471 log_error_errno(r, "Failed to get default event source: %m");
3472 goto finish;
3473 }
3474
3475 if (arg_kill_signal > 0) {
3476 /* Try to kill the init system on SIGINT or SIGTERM */
3477 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3478 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3479 } else {
3480 /* Immediately exit */
3481 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3482 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3483 }
3484
3485 /* simply exit on sigchld */
3486 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3487
3488 if (arg_expose_ports) {
3489 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3490 if (r < 0)
3491 goto finish;
3492
3493 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3494 }
3495
3496 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3497
3498 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3499 if (r < 0) {
3500 log_error_errno(r, "Failed to create PTY forwarder: %m");
3501 goto finish;
3502 }
3503
3504 r = sd_event_loop(event);
3505 if (r < 0) {
3506 log_error_errno(r, "Failed to run event loop: %m");
3507 goto finish;
3508 }
3509
3510 pty_forward_get_last_char(forward, &last_char);
3511
3512 forward = pty_forward_free(forward);
3513
3514 if (!arg_quiet && last_char != '\n')
3515 putc('\n', stdout);
3516
3517 /* Kill if it is not dead yet anyway */
3518 if (arg_register && !arg_keep_unit)
3519 terminate_machine(pid);
3520
3521 /* Normally redundant, but better safe than sorry */
3522 kill(pid, SIGKILL);
3523
3524 r = wait_for_container(pid, &container_status);
3525 pid = 0;
3526
3527 if (r < 0)
3528 /* We failed to wait for the container, or the
3529 * container exited abnormally */
3530 goto finish;
3531 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3532 /* The container exited with a non-zero
3533 * status, or with zero status and no reboot
3534 * was requested. */
3535 ret = r;
3536 break;
3537 }
3538
3539 /* CONTAINER_REBOOTED, loop again */
3540
3541 if (arg_keep_unit) {
3542 /* Special handling if we are running as a
3543 * service: instead of simply restarting the
3544 * machine we want to restart the entire
3545 * service, so let's inform systemd about this
3546 * with the special exit code 133. The service
3547 * file uses RestartForceExitStatus=133 so
3548 * that this results in a full nspawn
3549 * restart. This is necessary since we might
3550 * have cgroup parameters set we want to have
3551 * flushed out. */
3552 ret = 133;
3553 r = 0;
3554 break;
3555 }
3556
3557 expose_port_flush(arg_expose_ports, &exposed);
3558 }
3559
3560 finish:
3561 sd_notify(false,
3562 "STOPPING=1\n"
3563 "STATUS=Terminating...");
3564
3565 if (pid > 0)
3566 kill(pid, SIGKILL);
3567
3568 /* Try to flush whatever is still queued in the pty */
3569 if (master >= 0)
3570 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3571
3572 loop_remove(loop_nr, &image_fd);
3573
3574 if (remove_subvol && arg_directory) {
3575 int k;
3576
3577 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
3578 if (k < 0)
3579 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3580 }
3581
3582 if (arg_machine) {
3583 const char *p;
3584
3585 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3586 (void) rm_rf(p, REMOVE_ROOT);
3587 }
3588
3589 expose_port_flush(arg_expose_ports, &exposed);
3590
3591 free(arg_directory);
3592 free(arg_template);
3593 free(arg_image);
3594 free(arg_machine);
3595 free(arg_user);
3596 strv_free(arg_setenv);
3597 free(arg_network_bridge);
3598 strv_free(arg_network_interfaces);
3599 strv_free(arg_network_macvlan);
3600 strv_free(arg_network_ipvlan);
3601 strv_free(arg_parameters);
3602 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3603 expose_port_free_all(arg_expose_ports);
3604
3605 return r < 0 ? EXIT_FAILURE : ret;
3606 }