]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #1681 from ssahani/journal
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #ifdef HAVE_BLKID
23 #include <blkid/blkid.h>
24 #endif
25 #include <errno.h>
26 #include <getopt.h>
27 #include <linux/loop.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "barrier.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
53 #include "cap-list.h"
54 #include "capability.h"
55 #include "cgroup-util.h"
56 #include "copy.h"
57 #include "dev-setup.h"
58 #include "env-util.h"
59 #include "event-util.h"
60 #include "fd-util.h"
61 #include "fdset.h"
62 #include "fileio.h"
63 #include "formats-util.h"
64 #include "gpt.h"
65 #include "hostname-util.h"
66 #include "log.h"
67 #include "loopback-setup.h"
68 #include "machine-image.h"
69 #include "macro.h"
70 #include "missing.h"
71 #include "mkdir.h"
72 #include "netlink-util.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-expose-ports.h"
75 #include "nspawn-mount.h"
76 #include "nspawn-network.h"
77 #include "nspawn-register.h"
78 #include "nspawn-settings.h"
79 #include "nspawn-setuid.h"
80 #include "path-util.h"
81 #include "process-util.h"
82 #include "ptyfwd.h"
83 #include "random-util.h"
84 #include "rm-rf.h"
85 #ifdef HAVE_SECCOMP
86 #include "seccomp-util.h"
87 #endif
88 #include "signal-util.h"
89 #include "string-util.h"
90 #include "strv.h"
91 #include "terminal-util.h"
92 #include "udev-util.h"
93 #include "util.h"
94
/* Outcome of a container run. NOTE(review): the code consuming this enum is
 * outside the visible part of the file; the names suggest it distinguishes a
 * final exit from a reboot request -- confirm against the caller. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
99
/* Modes selectable with --link-journal= ("no", "auto", "host", "guest"; the
 * "try-" variants reuse LINK_HOST/LINK_GUEST and set a separate flag). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
106
107 static char *arg_directory = NULL;
108 static char *arg_template = NULL;
109 static char *arg_user = NULL;
110 static sd_id128_t arg_uuid = {};
111 static char *arg_machine = NULL;
112 static const char *arg_selinux_context = NULL;
113 static const char *arg_selinux_apifs_context = NULL;
114 static const char *arg_slice = NULL;
115 static bool arg_private_network = false;
116 static bool arg_read_only = false;
117 static bool arg_boot = false;
118 static bool arg_ephemeral = false;
119 static LinkJournal arg_link_journal = LINK_AUTO;
120 static bool arg_link_journal_try = false;
121 static uint64_t arg_retain =
122 (1ULL << CAP_CHOWN) |
123 (1ULL << CAP_DAC_OVERRIDE) |
124 (1ULL << CAP_DAC_READ_SEARCH) |
125 (1ULL << CAP_FOWNER) |
126 (1ULL << CAP_FSETID) |
127 (1ULL << CAP_IPC_OWNER) |
128 (1ULL << CAP_KILL) |
129 (1ULL << CAP_LEASE) |
130 (1ULL << CAP_LINUX_IMMUTABLE) |
131 (1ULL << CAP_NET_BIND_SERVICE) |
132 (1ULL << CAP_NET_BROADCAST) |
133 (1ULL << CAP_NET_RAW) |
134 (1ULL << CAP_SETGID) |
135 (1ULL << CAP_SETFCAP) |
136 (1ULL << CAP_SETPCAP) |
137 (1ULL << CAP_SETUID) |
138 (1ULL << CAP_SYS_ADMIN) |
139 (1ULL << CAP_SYS_CHROOT) |
140 (1ULL << CAP_SYS_NICE) |
141 (1ULL << CAP_SYS_PTRACE) |
142 (1ULL << CAP_SYS_TTY_CONFIG) |
143 (1ULL << CAP_SYS_RESOURCE) |
144 (1ULL << CAP_SYS_BOOT) |
145 (1ULL << CAP_AUDIT_WRITE) |
146 (1ULL << CAP_AUDIT_CONTROL) |
147 (1ULL << CAP_MKNOD);
148 static CustomMount *arg_custom_mounts = NULL;
149 static unsigned arg_n_custom_mounts = 0;
150 static char **arg_setenv = NULL;
151 static bool arg_quiet = false;
152 static bool arg_share_system = false;
153 static bool arg_register = true;
154 static bool arg_keep_unit = false;
155 static char **arg_network_interfaces = NULL;
156 static char **arg_network_macvlan = NULL;
157 static char **arg_network_ipvlan = NULL;
158 static bool arg_network_veth = false;
159 static char *arg_network_bridge = NULL;
160 static unsigned long arg_personality = PERSONALITY_INVALID;
161 static char *arg_image = NULL;
162 static VolatileMode arg_volatile_mode = VOLATILE_NO;
163 static ExposePort *arg_expose_ports = NULL;
164 static char **arg_property = NULL;
165 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
166 static bool arg_userns = false;
167 static int arg_kill_signal = 0;
168 static bool arg_unified_cgroup_hierarchy = false;
169 static SettingsMask arg_settings_mask = 0;
170 static int arg_settings_trusted = -1;
171 static char **arg_parameters = NULL;
172
173 static void help(void) {
174 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
175 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
176 " -h --help Show this help\n"
177 " --version Print version string\n"
178 " -q --quiet Do not show status information\n"
179 " -D --directory=PATH Root directory for the container\n"
180 " --template=PATH Initialize root directory from template directory,\n"
181 " if missing\n"
182 " -x --ephemeral Run container with snapshot of root directory, and\n"
183 " remove it after exit\n"
184 " -i --image=PATH File system device or disk image for the container\n"
185 " -b --boot Boot up full system (i.e. invoke init)\n"
186 " -u --user=USER Run the command under specified user or uid\n"
187 " -M --machine=NAME Set the machine name for the container\n"
188 " --uuid=UUID Set a specific machine UUID for the container\n"
189 " -S --slice=SLICE Place the container in the specified slice\n"
190 " --property=NAME=VALUE Set scope unit property\n"
191 " --private-users[=UIDBASE[:NUIDS]]\n"
192 " Run within user namespace\n"
193 " --private-network Disable network in container\n"
194 " --network-interface=INTERFACE\n"
195 " Assign an existing network interface to the\n"
196 " container\n"
197 " --network-macvlan=INTERFACE\n"
198 " Create a macvlan network interface based on an\n"
199 " existing network interface to the container\n"
200 " --network-ipvlan=INTERFACE\n"
201 " Create a ipvlan network interface based on an\n"
202 " existing network interface to the container\n"
203 " -n --network-veth Add a virtual ethernet connection between host\n"
204 " and container\n"
205 " --network-bridge=INTERFACE\n"
206 " Add a virtual ethernet connection between host\n"
207 " and container and add it to an existing bridge on\n"
208 " the host\n"
209 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
210 " Expose a container IP port on the host\n"
211 " -Z --selinux-context=SECLABEL\n"
212 " Set the SELinux security context to be used by\n"
213 " processes in the container\n"
214 " -L --selinux-apifs-context=SECLABEL\n"
215 " Set the SELinux security context to be used by\n"
216 " API/tmpfs file systems in the container\n"
217 " --capability=CAP In addition to the default, retain specified\n"
218 " capability\n"
219 " --drop-capability=CAP Drop the specified capability from the default set\n"
220 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
221 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
222 " try-guest, try-host\n"
223 " -j Equivalent to --link-journal=try-guest\n"
224 " --read-only Mount the root directory read-only\n"
225 " --bind=PATH[:PATH[:OPTIONS]]\n"
226 " Bind mount a file or directory from the host into\n"
227 " the container\n"
228 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
229 " Similar, but creates a read-only bind mount\n"
230 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
231 " --overlay=PATH[:PATH...]:PATH\n"
232 " Create an overlay mount from the host to \n"
233 " the container\n"
234 " --overlay-ro=PATH[:PATH...]:PATH\n"
235 " Similar, but creates a read-only overlay mount\n"
236 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
237 " --share-system Share system namespaces with host\n"
238 " --register=BOOLEAN Register container as machine\n"
239 " --keep-unit Do not register a scope for the machine, reuse\n"
240 " the service unit nspawn is running in\n"
241 " --volatile[=MODE] Run the system in volatile mode\n"
242 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
243 , program_invocation_short_name);
244 }
245
246
247 static int custom_mounts_prepare(void) {
248 unsigned i;
249 int r;
250
251 /* Ensure the mounts are applied prefix first. */
252 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
253
254 /* Allocate working directories for the overlay file systems that need it */
255 for (i = 0; i < arg_n_custom_mounts; i++) {
256 CustomMount *m = &arg_custom_mounts[i];
257
258 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
259 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
260 return -EINVAL;
261 }
262
263 if (m->type != CUSTOM_MOUNT_OVERLAY)
264 continue;
265
266 if (m->work_dir)
267 continue;
268
269 if (m->read_only)
270 continue;
271
272 r = tempfn_random(m->source, NULL, &m->work_dir);
273 if (r < 0)
274 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
275 }
276
277 return 0;
278 }
279
280 static int detect_unified_cgroup_hierarchy(void) {
281 const char *e;
282 int r;
283
284 /* Allow the user to control whether the unified hierarchy is used */
285 e = getenv("UNIFIED_CGROUP_HIERARCHY");
286 if (e) {
287 r = parse_boolean(e);
288 if (r < 0)
289 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
290
291 arg_unified_cgroup_hierarchy = r;
292 return 0;
293 }
294
295 /* Otherwise inherit the default from the host system */
296 r = cg_unified();
297 if (r < 0)
298 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
299
300 arg_unified_cgroup_hierarchy = r;
301 return 0;
302 }
303
304 static int parse_argv(int argc, char *argv[]) {
305
306 enum {
307 ARG_VERSION = 0x100,
308 ARG_PRIVATE_NETWORK,
309 ARG_UUID,
310 ARG_READ_ONLY,
311 ARG_CAPABILITY,
312 ARG_DROP_CAPABILITY,
313 ARG_LINK_JOURNAL,
314 ARG_BIND,
315 ARG_BIND_RO,
316 ARG_TMPFS,
317 ARG_OVERLAY,
318 ARG_OVERLAY_RO,
319 ARG_SETENV,
320 ARG_SHARE_SYSTEM,
321 ARG_REGISTER,
322 ARG_KEEP_UNIT,
323 ARG_NETWORK_INTERFACE,
324 ARG_NETWORK_MACVLAN,
325 ARG_NETWORK_IPVLAN,
326 ARG_NETWORK_BRIDGE,
327 ARG_PERSONALITY,
328 ARG_VOLATILE,
329 ARG_TEMPLATE,
330 ARG_PROPERTY,
331 ARG_PRIVATE_USERS,
332 ARG_KILL_SIGNAL,
333 ARG_SETTINGS,
334 };
335
336 static const struct option options[] = {
337 { "help", no_argument, NULL, 'h' },
338 { "version", no_argument, NULL, ARG_VERSION },
339 { "directory", required_argument, NULL, 'D' },
340 { "template", required_argument, NULL, ARG_TEMPLATE },
341 { "ephemeral", no_argument, NULL, 'x' },
342 { "user", required_argument, NULL, 'u' },
343 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
344 { "boot", no_argument, NULL, 'b' },
345 { "uuid", required_argument, NULL, ARG_UUID },
346 { "read-only", no_argument, NULL, ARG_READ_ONLY },
347 { "capability", required_argument, NULL, ARG_CAPABILITY },
348 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
349 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
350 { "bind", required_argument, NULL, ARG_BIND },
351 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
352 { "tmpfs", required_argument, NULL, ARG_TMPFS },
353 { "overlay", required_argument, NULL, ARG_OVERLAY },
354 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
355 { "machine", required_argument, NULL, 'M' },
356 { "slice", required_argument, NULL, 'S' },
357 { "setenv", required_argument, NULL, ARG_SETENV },
358 { "selinux-context", required_argument, NULL, 'Z' },
359 { "selinux-apifs-context", required_argument, NULL, 'L' },
360 { "quiet", no_argument, NULL, 'q' },
361 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
362 { "register", required_argument, NULL, ARG_REGISTER },
363 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
364 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
365 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
366 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
367 { "network-veth", no_argument, NULL, 'n' },
368 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
369 { "personality", required_argument, NULL, ARG_PERSONALITY },
370 { "image", required_argument, NULL, 'i' },
371 { "volatile", optional_argument, NULL, ARG_VOLATILE },
372 { "port", required_argument, NULL, 'p' },
373 { "property", required_argument, NULL, ARG_PROPERTY },
374 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
375 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
376 { "settings", required_argument, NULL, ARG_SETTINGS },
377 {}
378 };
379
380 int c, r;
381 uint64_t plus = 0, minus = 0;
382 bool mask_all_settings = false, mask_no_settings = false;
383
384 assert(argc >= 0);
385 assert(argv);
386
387 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
388
389 switch (c) {
390
391 case 'h':
392 help();
393 return 0;
394
395 case ARG_VERSION:
396 return version();
397
398 case 'D':
399 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
400 if (r < 0)
401 return r;
402 break;
403
404 case ARG_TEMPLATE:
405 r = parse_path_argument_and_warn(optarg, false, &arg_template);
406 if (r < 0)
407 return r;
408 break;
409
410 case 'i':
411 r = parse_path_argument_and_warn(optarg, false, &arg_image);
412 if (r < 0)
413 return r;
414 break;
415
416 case 'x':
417 arg_ephemeral = true;
418 break;
419
420 case 'u':
421 r = free_and_strdup(&arg_user, optarg);
422 if (r < 0)
423 return log_oom();
424
425 arg_settings_mask |= SETTING_USER;
426 break;
427
428 case ARG_NETWORK_BRIDGE:
429 r = free_and_strdup(&arg_network_bridge, optarg);
430 if (r < 0)
431 return log_oom();
432
433 /* fall through */
434
435 case 'n':
436 arg_network_veth = true;
437 arg_private_network = true;
438 arg_settings_mask |= SETTING_NETWORK;
439 break;
440
441 case ARG_NETWORK_INTERFACE:
442 if (strv_extend(&arg_network_interfaces, optarg) < 0)
443 return log_oom();
444
445 arg_private_network = true;
446 arg_settings_mask |= SETTING_NETWORK;
447 break;
448
449 case ARG_NETWORK_MACVLAN:
450 if (strv_extend(&arg_network_macvlan, optarg) < 0)
451 return log_oom();
452
453 arg_private_network = true;
454 arg_settings_mask |= SETTING_NETWORK;
455 break;
456
457 case ARG_NETWORK_IPVLAN:
458 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
459 return log_oom();
460
461 /* fall through */
462
463 case ARG_PRIVATE_NETWORK:
464 arg_private_network = true;
465 arg_settings_mask |= SETTING_NETWORK;
466 break;
467
468 case 'b':
469 arg_boot = true;
470 arg_settings_mask |= SETTING_BOOT;
471 break;
472
473 case ARG_UUID:
474 r = sd_id128_from_string(optarg, &arg_uuid);
475 if (r < 0) {
476 log_error("Invalid UUID: %s", optarg);
477 return r;
478 }
479
480 arg_settings_mask |= SETTING_MACHINE_ID;
481 break;
482
483 case 'S':
484 arg_slice = optarg;
485 break;
486
487 case 'M':
488 if (isempty(optarg))
489 arg_machine = mfree(arg_machine);
490 else {
491 if (!machine_name_is_valid(optarg)) {
492 log_error("Invalid machine name: %s", optarg);
493 return -EINVAL;
494 }
495
496 r = free_and_strdup(&arg_machine, optarg);
497 if (r < 0)
498 return log_oom();
499
500 break;
501 }
502
503 case 'Z':
504 arg_selinux_context = optarg;
505 break;
506
507 case 'L':
508 arg_selinux_apifs_context = optarg;
509 break;
510
511 case ARG_READ_ONLY:
512 arg_read_only = true;
513 arg_settings_mask |= SETTING_READ_ONLY;
514 break;
515
516 case ARG_CAPABILITY:
517 case ARG_DROP_CAPABILITY: {
518 const char *state, *word;
519 size_t length;
520
521 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
522 _cleanup_free_ char *t;
523
524 t = strndup(word, length);
525 if (!t)
526 return log_oom();
527
528 if (streq(t, "all")) {
529 if (c == ARG_CAPABILITY)
530 plus = (uint64_t) -1;
531 else
532 minus = (uint64_t) -1;
533 } else {
534 int cap;
535
536 cap = capability_from_name(t);
537 if (cap < 0) {
538 log_error("Failed to parse capability %s.", t);
539 return -EINVAL;
540 }
541
542 if (c == ARG_CAPABILITY)
543 plus |= 1ULL << (uint64_t) cap;
544 else
545 minus |= 1ULL << (uint64_t) cap;
546 }
547 }
548
549 arg_settings_mask |= SETTING_CAPABILITY;
550 break;
551 }
552
553 case 'j':
554 arg_link_journal = LINK_GUEST;
555 arg_link_journal_try = true;
556 break;
557
558 case ARG_LINK_JOURNAL:
559 if (streq(optarg, "auto")) {
560 arg_link_journal = LINK_AUTO;
561 arg_link_journal_try = false;
562 } else if (streq(optarg, "no")) {
563 arg_link_journal = LINK_NO;
564 arg_link_journal_try = false;
565 } else if (streq(optarg, "guest")) {
566 arg_link_journal = LINK_GUEST;
567 arg_link_journal_try = false;
568 } else if (streq(optarg, "host")) {
569 arg_link_journal = LINK_HOST;
570 arg_link_journal_try = false;
571 } else if (streq(optarg, "try-guest")) {
572 arg_link_journal = LINK_GUEST;
573 arg_link_journal_try = true;
574 } else if (streq(optarg, "try-host")) {
575 arg_link_journal = LINK_HOST;
576 arg_link_journal_try = true;
577 } else {
578 log_error("Failed to parse link journal mode %s", optarg);
579 return -EINVAL;
580 }
581
582 break;
583
584 case ARG_BIND:
585 case ARG_BIND_RO:
586 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
587 if (r < 0)
588 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
589
590 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
591 break;
592
593 case ARG_TMPFS:
594 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
595 if (r < 0)
596 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
597
598 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
599 break;
600
601 case ARG_OVERLAY:
602 case ARG_OVERLAY_RO: {
603 _cleanup_free_ char *upper = NULL, *destination = NULL;
604 _cleanup_strv_free_ char **lower = NULL;
605 CustomMount *m;
606 unsigned n = 0;
607 char **i;
608
609 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
610 if (r == -ENOMEM)
611 return log_oom();
612 else if (r < 0) {
613 log_error("Invalid overlay specification: %s", optarg);
614 return r;
615 }
616
617 STRV_FOREACH(i, lower) {
618 if (!path_is_absolute(*i)) {
619 log_error("Overlay path %s is not absolute.", *i);
620 return -EINVAL;
621 }
622
623 n++;
624 }
625
626 if (n < 2) {
627 log_error("--overlay= needs at least two colon-separated directories specified.");
628 return -EINVAL;
629 }
630
631 if (n == 2) {
632 /* If two parameters are specified,
633 * the first one is the lower, the
634 * second one the upper directory. And
635 * we'll also define the destination
636 * mount point the same as the upper. */
637 upper = lower[1];
638 lower[1] = NULL;
639
640 destination = strdup(upper);
641 if (!destination)
642 return log_oom();
643
644 } else {
645 upper = lower[n - 2];
646 destination = lower[n - 1];
647 lower[n - 2] = NULL;
648 }
649
650 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
651 if (!m)
652 return log_oom();
653
654 m->destination = destination;
655 m->source = upper;
656 m->lower = lower;
657 m->read_only = c == ARG_OVERLAY_RO;
658
659 upper = destination = NULL;
660 lower = NULL;
661
662 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
663 break;
664 }
665
666 case ARG_SETENV: {
667 char **n;
668
669 if (!env_assignment_is_valid(optarg)) {
670 log_error("Environment variable assignment '%s' is not valid.", optarg);
671 return -EINVAL;
672 }
673
674 n = strv_env_set(arg_setenv, optarg);
675 if (!n)
676 return log_oom();
677
678 strv_free(arg_setenv);
679 arg_setenv = n;
680
681 arg_settings_mask |= SETTING_ENVIRONMENT;
682 break;
683 }
684
685 case 'q':
686 arg_quiet = true;
687 break;
688
689 case ARG_SHARE_SYSTEM:
690 arg_share_system = true;
691 break;
692
693 case ARG_REGISTER:
694 r = parse_boolean(optarg);
695 if (r < 0) {
696 log_error("Failed to parse --register= argument: %s", optarg);
697 return r;
698 }
699
700 arg_register = r;
701 break;
702
703 case ARG_KEEP_UNIT:
704 arg_keep_unit = true;
705 break;
706
707 case ARG_PERSONALITY:
708
709 arg_personality = personality_from_string(optarg);
710 if (arg_personality == PERSONALITY_INVALID) {
711 log_error("Unknown or unsupported personality '%s'.", optarg);
712 return -EINVAL;
713 }
714
715 arg_settings_mask |= SETTING_PERSONALITY;
716 break;
717
718 case ARG_VOLATILE:
719
720 if (!optarg)
721 arg_volatile_mode = VOLATILE_YES;
722 else {
723 VolatileMode m;
724
725 m = volatile_mode_from_string(optarg);
726 if (m < 0) {
727 log_error("Failed to parse --volatile= argument: %s", optarg);
728 return -EINVAL;
729 } else
730 arg_volatile_mode = m;
731 }
732
733 arg_settings_mask |= SETTING_VOLATILE_MODE;
734 break;
735
736 case 'p':
737 r = expose_port_parse(&arg_expose_ports, optarg);
738 if (r == -EEXIST)
739 return log_error_errno(r, "Duplicate port specification: %s", optarg);
740 if (r < 0)
741 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
742
743 arg_settings_mask |= SETTING_EXPOSE_PORTS;
744 break;
745
746 case ARG_PROPERTY:
747 if (strv_extend(&arg_property, optarg) < 0)
748 return log_oom();
749
750 break;
751
752 case ARG_PRIVATE_USERS:
753 if (optarg) {
754 _cleanup_free_ char *buffer = NULL;
755 const char *range, *shift;
756
757 range = strchr(optarg, ':');
758 if (range) {
759 buffer = strndup(optarg, range - optarg);
760 if (!buffer)
761 return log_oom();
762 shift = buffer;
763
764 range++;
765 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
766 log_error("Failed to parse UID range: %s", range);
767 return -EINVAL;
768 }
769 } else
770 shift = optarg;
771
772 if (parse_uid(shift, &arg_uid_shift) < 0) {
773 log_error("Failed to parse UID: %s", optarg);
774 return -EINVAL;
775 }
776 }
777
778 arg_userns = true;
779 break;
780
781 case ARG_KILL_SIGNAL:
782 arg_kill_signal = signal_from_string_try_harder(optarg);
783 if (arg_kill_signal < 0) {
784 log_error("Cannot parse signal: %s", optarg);
785 return -EINVAL;
786 }
787
788 arg_settings_mask |= SETTING_KILL_SIGNAL;
789 break;
790
791 case ARG_SETTINGS:
792
793 /* no → do not read files
794 * yes → read files, do not override cmdline, trust only subset
795 * override → read files, override cmdline, trust only subset
796 * trusted → read files, do not override cmdline, trust all
797 */
798
799 r = parse_boolean(optarg);
800 if (r < 0) {
801 if (streq(optarg, "trusted")) {
802 mask_all_settings = false;
803 mask_no_settings = false;
804 arg_settings_trusted = true;
805
806 } else if (streq(optarg, "override")) {
807 mask_all_settings = false;
808 mask_no_settings = true;
809 arg_settings_trusted = -1;
810 } else
811 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
812 } else if (r > 0) {
813 /* yes */
814 mask_all_settings = false;
815 mask_no_settings = false;
816 arg_settings_trusted = -1;
817 } else {
818 /* no */
819 mask_all_settings = true;
820 mask_no_settings = false;
821 arg_settings_trusted = false;
822 }
823
824 break;
825
826 case '?':
827 return -EINVAL;
828
829 default:
830 assert_not_reached("Unhandled option");
831 }
832
833 if (arg_share_system)
834 arg_register = false;
835
836 if (arg_boot && arg_share_system) {
837 log_error("--boot and --share-system may not be combined.");
838 return -EINVAL;
839 }
840
841 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
842 log_error("--keep-unit may not be used when invoked from a user session.");
843 return -EINVAL;
844 }
845
846 if (arg_directory && arg_image) {
847 log_error("--directory= and --image= may not be combined.");
848 return -EINVAL;
849 }
850
851 if (arg_template && arg_image) {
852 log_error("--template= and --image= may not be combined.");
853 return -EINVAL;
854 }
855
856 if (arg_template && !(arg_directory || arg_machine)) {
857 log_error("--template= needs --directory= or --machine=.");
858 return -EINVAL;
859 }
860
861 if (arg_ephemeral && arg_template) {
862 log_error("--ephemeral and --template= may not be combined.");
863 return -EINVAL;
864 }
865
866 if (arg_ephemeral && arg_image) {
867 log_error("--ephemeral and --image= may not be combined.");
868 return -EINVAL;
869 }
870
871 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
872 log_error("--ephemeral and --link-journal= may not be combined.");
873 return -EINVAL;
874 }
875
876 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
877 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
878
879 if (argc > optind) {
880 arg_parameters = strv_copy(argv + optind);
881 if (!arg_parameters)
882 return log_oom();
883
884 arg_settings_mask |= SETTING_BOOT;
885 }
886
887 /* Load all settings from .nspawn files */
888 if (mask_no_settings)
889 arg_settings_mask = 0;
890
891 /* Don't load any settings from .nspawn files */
892 if (mask_all_settings)
893 arg_settings_mask = _SETTINGS_MASK_ALL;
894
895 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
896
897 r = detect_unified_cgroup_hierarchy();
898 if (r < 0)
899 return r;
900
901 return 1;
902 }
903
904 static int verify_arguments(void) {
905
906 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
907 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
908 return -EINVAL;
909 }
910
911 if (arg_expose_ports && !arg_private_network) {
912 log_error("Cannot use --port= without private networking.");
913 return -EINVAL;
914 }
915
916 if (arg_boot && arg_kill_signal <= 0)
917 arg_kill_signal = SIGRTMIN+3;
918
919 return 0;
920 }
921
922 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
923 assert(p);
924
925 if (!arg_userns)
926 return 0;
927
928 if (uid == UID_INVALID && gid == GID_INVALID)
929 return 0;
930
931 if (uid != UID_INVALID) {
932 uid += arg_uid_shift;
933
934 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
935 return -EOVERFLOW;
936 }
937
938 if (gid != GID_INVALID) {
939 gid += (gid_t) arg_uid_shift;
940
941 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
942 return -EOVERFLOW;
943 }
944
945 if (lchown(p, uid, gid) < 0)
946 return -errno;
947
948 return 0;
949 }
950
/* Creates directory "path" below "root" with the given mode and, if it was
 * newly created, hands it to userns_lchown() with uid/gid.
 *
 * An already existing directory is left untouched and counts as success (0).
 * Other mkdir() failures are returned as -errno. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full_path;

        full_path = prefix_roota(root, path);

        if (mkdir(full_path, mode) >= 0)
                return userns_lchown(full_path, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
963
964 static int setup_timezone(const char *dest) {
965 _cleanup_free_ char *p = NULL, *q = NULL;
966 const char *where, *check, *what;
967 char *z, *y;
968 int r;
969
970 assert(dest);
971
972 /* Fix the timezone, if possible */
973 r = readlink_malloc("/etc/localtime", &p);
974 if (r < 0) {
975 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
976 return 0;
977 }
978
979 z = path_startswith(p, "../usr/share/zoneinfo/");
980 if (!z)
981 z = path_startswith(p, "/usr/share/zoneinfo/");
982 if (!z) {
983 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
984 return 0;
985 }
986
987 where = prefix_roota(dest, "/etc/localtime");
988 r = readlink_malloc(where, &q);
989 if (r >= 0) {
990 y = path_startswith(q, "../usr/share/zoneinfo/");
991 if (!y)
992 y = path_startswith(q, "/usr/share/zoneinfo/");
993
994 /* Already pointing to the right place? Then do nothing .. */
995 if (y && streq(y, z))
996 return 0;
997 }
998
999 check = strjoina("/usr/share/zoneinfo/", z);
1000 check = prefix_root(dest, check);
1001 if (laccess(check, F_OK) < 0) {
1002 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1003 return 0;
1004 }
1005
1006 r = unlink(where);
1007 if (r < 0 && errno != ENOENT) {
1008 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1009 return 0;
1010 }
1011
1012 what = strjoina("../usr/share/zoneinfo/", z);
1013 if (symlink(what, where) < 0) {
1014 log_error_errno(errno, "Failed to correct timezone of container: %m");
1015 return 0;
1016 }
1017
1018 r = userns_lchown(where, 0, 0);
1019 if (r < 0)
1020 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1021
1022 return 0;
1023 }
1024
1025 static int setup_resolv_conf(const char *dest) {
1026 const char *where = NULL;
1027 int r;
1028
1029 assert(dest);
1030
1031 if (arg_private_network)
1032 return 0;
1033
1034 /* Fix resolv.conf, if possible */
1035 where = prefix_roota(dest, "/etc/resolv.conf");
1036
1037 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1038 if (r < 0) {
1039 /* If the file already exists as symlink, let's
1040 * suppress the warning, under the assumption that
1041 * resolved or something similar runs inside and the
1042 * symlink points there.
1043 *
1044 * If the disk image is read-only, there's also no
1045 * point in complaining.
1046 */
1047 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1048 "Failed to copy /etc/resolv.conf to %s: %m", where);
1049 return 0;
1050 }
1051
1052 r = userns_lchown(where, 0, 0);
1053 if (r < 0)
1054 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1055
1056 return 0;
1057 }
1058
1059 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1060 assert(s);
1061
1062 snprintf(s, 37,
1063 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1064 SD_ID128_FORMAT_VAL(id));
1065
1066 return s;
1067 }
1068
1069 static int setup_boot_id(const char *dest) {
1070 const char *from, *to;
1071 sd_id128_t rnd = {};
1072 char as_uuid[37];
1073 int r;
1074
1075 if (arg_share_system)
1076 return 0;
1077
1078 /* Generate a new randomized boot ID, so that each boot-up of
1079 * the container gets a new one */
1080
1081 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1082 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1083
1084 r = sd_id128_randomize(&rnd);
1085 if (r < 0)
1086 return log_error_errno(r, "Failed to generate random boot id: %m");
1087
1088 id128_format_as_uuid(rnd, as_uuid);
1089
1090 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1091 if (r < 0)
1092 return log_error_errno(r, "Failed to write boot id: %m");
1093
1094 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1095 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1096 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1097 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1098
1099 unlink(from);
1100 return r;
1101 }
1102
1103 static int copy_devnodes(const char *dest) {
1104
1105 static const char devnodes[] =
1106 "null\0"
1107 "zero\0"
1108 "full\0"
1109 "random\0"
1110 "urandom\0"
1111 "tty\0"
1112 "net/tun\0";
1113
1114 const char *d;
1115 int r = 0;
1116 _cleanup_umask_ mode_t u;
1117
1118 assert(dest);
1119
1120 u = umask(0000);
1121
1122 /* Create /dev/net, so that we can create /dev/net/tun in it */
1123 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1124 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1125
1126 NULSTR_FOREACH(d, devnodes) {
1127 _cleanup_free_ char *from = NULL, *to = NULL;
1128 struct stat st;
1129
1130 from = strappend("/dev/", d);
1131 to = prefix_root(dest, from);
1132
1133 if (stat(from, &st) < 0) {
1134
1135 if (errno != ENOENT)
1136 return log_error_errno(errno, "Failed to stat %s: %m", from);
1137
1138 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1139
1140 log_error("%s is not a char or block device, cannot copy.", from);
1141 return -EIO;
1142
1143 } else {
1144 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1145 if (errno != EPERM)
1146 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1147
1148 /* Some systems abusively restrict mknod but
1149 * allow bind mounts. */
1150 r = touch(to);
1151 if (r < 0)
1152 return log_error_errno(r, "touch (%s) failed: %m", to);
1153 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1154 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1155 }
1156
1157 r = userns_lchown(to, 0, 0);
1158 if (r < 0)
1159 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1160 }
1161 }
1162
1163 return r;
1164 }
1165
1166 static int setup_pts(const char *dest) {
1167 _cleanup_free_ char *options = NULL;
1168 const char *p;
1169
1170 #ifdef HAVE_SELINUX
1171 if (arg_selinux_apifs_context)
1172 (void) asprintf(&options,
1173 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1174 arg_uid_shift + TTY_GID,
1175 arg_selinux_apifs_context);
1176 else
1177 #endif
1178 (void) asprintf(&options,
1179 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1180 arg_uid_shift + TTY_GID);
1181
1182 if (!options)
1183 return log_oom();
1184
1185 /* Mount /dev/pts itself */
1186 p = prefix_roota(dest, "/dev/pts");
1187 if (mkdir(p, 0755) < 0)
1188 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1189 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1190 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1191 if (userns_lchown(p, 0, 0) < 0)
1192 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1193
1194 /* Create /dev/ptmx symlink */
1195 p = prefix_roota(dest, "/dev/ptmx");
1196 if (symlink("pts/ptmx", p) < 0)
1197 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1198 if (userns_lchown(p, 0, 0) < 0)
1199 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1200
1201 /* And fix /dev/pts/ptmx ownership */
1202 p = prefix_roota(dest, "/dev/pts/ptmx");
1203 if (userns_lchown(p, 0, 0) < 0)
1204 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1205
1206 return 0;
1207 }
1208
1209 static int setup_dev_console(const char *dest, const char *console) {
1210 _cleanup_umask_ mode_t u;
1211 const char *to;
1212 int r;
1213
1214 assert(dest);
1215 assert(console);
1216
1217 u = umask(0000);
1218
1219 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1220 if (r < 0)
1221 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1222
1223 /* We need to bind mount the right tty to /dev/console since
1224 * ptys can only exist on pts file systems. To have something
1225 * to bind mount things on we create a empty regular file. */
1226
1227 to = prefix_roota(dest, "/dev/console");
1228 r = touch(to);
1229 if (r < 0)
1230 return log_error_errno(r, "touch() for /dev/console failed: %m");
1231
1232 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1233 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1234
1235 return 0;
1236 }
1237
1238 static int setup_kmsg(const char *dest, int kmsg_socket) {
1239 const char *from, *to;
1240 _cleanup_umask_ mode_t u;
1241 int fd, r;
1242
1243 assert(kmsg_socket >= 0);
1244
1245 u = umask(0000);
1246
1247 /* We create the kmsg FIFO as /run/kmsg, but immediately
1248 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1249 * on the reading side behave very similar to /proc/kmsg,
1250 * their writing side behaves differently from /dev/kmsg in
1251 * that writing blocks when nothing is reading. In order to
1252 * avoid any problems with containers deadlocking due to this
1253 * we simply make /dev/kmsg unavailable to the container. */
1254 from = prefix_roota(dest, "/run/kmsg");
1255 to = prefix_roota(dest, "/proc/kmsg");
1256
1257 if (mkfifo(from, 0600) < 0)
1258 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1259 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1260 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1261
1262 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1263 if (fd < 0)
1264 return log_error_errno(errno, "Failed to open fifo: %m");
1265
1266 /* Store away the fd in the socket, so that it stays open as
1267 * long as we run the child */
1268 r = send_one_fd(kmsg_socket, fd, 0);
1269 safe_close(fd);
1270
1271 if (r < 0)
1272 return log_error_errno(r, "Failed to send FIFO fd: %m");
1273
1274 /* And now make the FIFO unavailable as /run/kmsg... */
1275 (void) unlink(from);
1276
1277 return 0;
1278 }
1279
1280 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1281 union in_addr_union *exposed = userdata;
1282
1283 assert(rtnl);
1284 assert(m);
1285 assert(exposed);
1286
1287 expose_port_execute(rtnl, arg_expose_ports, exposed);
1288 return 0;
1289 }
1290
1291 static int setup_hostname(void) {
1292
1293 if (arg_share_system)
1294 return 0;
1295
1296 if (sethostname_idempotent(arg_machine) < 0)
1297 return -errno;
1298
1299 return 0;
1300 }
1301
1302 static int setup_journal(const char *directory) {
1303 sd_id128_t machine_id, this_id;
1304 _cleanup_free_ char *b = NULL, *d = NULL;
1305 const char *etc_machine_id, *p, *q;
1306 char *id;
1307 int r;
1308
1309 /* Don't link journals in ephemeral mode */
1310 if (arg_ephemeral)
1311 return 0;
1312
1313 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1314
1315 r = read_one_line_file(etc_machine_id, &b);
1316 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1317 return 0;
1318 else if (r < 0)
1319 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1320
1321 id = strstrip(b);
1322 if (isempty(id) && arg_link_journal == LINK_AUTO)
1323 return 0;
1324
1325 /* Verify validity */
1326 r = sd_id128_from_string(id, &machine_id);
1327 if (r < 0)
1328 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1329
1330 r = sd_id128_get_machine(&this_id);
1331 if (r < 0)
1332 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1333
1334 if (sd_id128_equal(machine_id, this_id)) {
1335 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1336 "Host and machine ids are equal (%s): refusing to link journals", id);
1337 if (arg_link_journal == LINK_AUTO)
1338 return 0;
1339 return -EEXIST;
1340 }
1341
1342 if (arg_link_journal == LINK_NO)
1343 return 0;
1344
1345 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1346 if (r < 0)
1347 return log_error_errno(r, "Failed to create /var: %m");
1348
1349 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1350 if (r < 0)
1351 return log_error_errno(r, "Failed to create /var/log: %m");
1352
1353 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1354 if (r < 0)
1355 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1356
1357 p = strjoina("/var/log/journal/", id);
1358 q = prefix_roota(directory, p);
1359
1360 if (path_is_mount_point(p, 0) > 0) {
1361 if (arg_link_journal != LINK_AUTO) {
1362 log_error("%s: already a mount point, refusing to use for journal", p);
1363 return -EEXIST;
1364 }
1365
1366 return 0;
1367 }
1368
1369 if (path_is_mount_point(q, 0) > 0) {
1370 if (arg_link_journal != LINK_AUTO) {
1371 log_error("%s: already a mount point, refusing to use for journal", q);
1372 return -EEXIST;
1373 }
1374
1375 return 0;
1376 }
1377
1378 r = readlink_and_make_absolute(p, &d);
1379 if (r >= 0) {
1380 if ((arg_link_journal == LINK_GUEST ||
1381 arg_link_journal == LINK_AUTO) &&
1382 path_equal(d, q)) {
1383
1384 r = userns_mkdir(directory, p, 0755, 0, 0);
1385 if (r < 0)
1386 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1387 return 0;
1388 }
1389
1390 if (unlink(p) < 0)
1391 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1392 } else if (r == -EINVAL) {
1393
1394 if (arg_link_journal == LINK_GUEST &&
1395 rmdir(p) < 0) {
1396
1397 if (errno == ENOTDIR) {
1398 log_error("%s already exists and is neither a symlink nor a directory", p);
1399 return r;
1400 } else {
1401 log_error_errno(errno, "Failed to remove %s: %m", p);
1402 return -errno;
1403 }
1404 }
1405 } else if (r != -ENOENT) {
1406 log_error_errno(errno, "readlink(%s) failed: %m", p);
1407 return r;
1408 }
1409
1410 if (arg_link_journal == LINK_GUEST) {
1411
1412 if (symlink(q, p) < 0) {
1413 if (arg_link_journal_try) {
1414 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1415 return 0;
1416 } else {
1417 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1418 return -errno;
1419 }
1420 }
1421
1422 r = userns_mkdir(directory, p, 0755, 0, 0);
1423 if (r < 0)
1424 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1425 return 0;
1426 }
1427
1428 if (arg_link_journal == LINK_HOST) {
1429 /* don't create parents here -- if the host doesn't have
1430 * permanent journal set up, don't force it here */
1431 r = mkdir(p, 0755);
1432 if (r < 0) {
1433 if (arg_link_journal_try) {
1434 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1435 return 0;
1436 } else {
1437 log_error_errno(errno, "Failed to create %s: %m", p);
1438 return r;
1439 }
1440 }
1441
1442 } else if (access(p, F_OK) < 0)
1443 return 0;
1444
1445 if (dir_is_empty(q) == 0)
1446 log_warning("%s is not empty, proceeding anyway.", q);
1447
1448 r = userns_mkdir(directory, p, 0755, 0, 0);
1449 if (r < 0) {
1450 log_error_errno(errno, "Failed to create %s: %m", q);
1451 return r;
1452 }
1453
1454 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1455 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1456
1457 return 0;
1458 }
1459
1460 static int drop_capabilities(void) {
1461 return capability_bounding_set_drop(~arg_retain, false);
1462 }
1463
1464 static int reset_audit_loginuid(void) {
1465 _cleanup_free_ char *p = NULL;
1466 int r;
1467
1468 if (arg_share_system)
1469 return 0;
1470
1471 r = read_one_line_file("/proc/self/loginuid", &p);
1472 if (r == -ENOENT)
1473 return 0;
1474 if (r < 0)
1475 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1476
1477 /* Already reset? */
1478 if (streq(p, "4294967295"))
1479 return 0;
1480
1481 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1482 if (r < 0) {
1483 log_error_errno(r,
1484 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1485 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1486 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1487 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1488 "using systemd-nspawn. Sleeping for 5s... (%m)");
1489
1490 sleep(5);
1491 }
1492
1493 return 0;
1494 }
1495
/* Installs a seccomp filter for the container payload (no-op when built
 * without HAVE_SECCOMP).
 *
 * The filter allows everything by default and then
 *   - makes each blacklisted system call fail with EPERM, unless the
 *     capability listed next to it is part of arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * If loading the filter fails with -EINVAL this is only logged at debug
 * level and treated as success. Returns a negative errno-style value on
 * any other failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx seccomp;
        unsigned k;
        int r;

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return log_oom();

        r = seccomp_add_secondary_archs(seccomp);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                bool retained;

                /* Leave the call alone if its capability is retained */
                retained = arg_retain & (1ULL << blacklist[k].capability);
                if (retained)
                        continue;

                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);

                /* -EFAULT means the syscall is unknown; that is not an error */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work in containers, and much of the userspace
         * audit hookup fails when run inside one. We do not care and
         * simply turn off creation of audit sockets: with this rule
         * socket(AF_NETLINK, *, NETLINK_AUDIT) fails with EAFNOSUPPORT,
         * which audit userspace takes as a sign that audit is disabled
         * in the kernel. */
        r = seccomp_rule_add(
                        seccomp,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(seccomp);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(seccomp);
        return r;
#else
        return 0;
#endif

}
1590
1591 static int setup_propagate(const char *root) {
1592 const char *p, *q;
1593
1594 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1595 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1596 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1597 (void) mkdir_p(p, 0600);
1598
1599 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1600 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1601
1602 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1603 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1604
1605 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1606 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
1607
1608 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1609 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1610 return log_error_errno(errno, "Failed to install propagation bind mount.");
1611
1612 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1613 return log_error_errno(errno, "Failed to make propagation mount read-only");
1614
1615 return 0;
1616 }
1617
1618 static int setup_image(char **device_path, int *loop_nr) {
1619 struct loop_info64 info = {
1620 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1621 };
1622 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1623 _cleanup_free_ char* loopdev = NULL;
1624 struct stat st;
1625 int r, nr;
1626
1627 assert(device_path);
1628 assert(loop_nr);
1629 assert(arg_image);
1630
1631 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1632 if (fd < 0)
1633 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1634
1635 if (fstat(fd, &st) < 0)
1636 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1637
1638 if (S_ISBLK(st.st_mode)) {
1639 char *p;
1640
1641 p = strdup(arg_image);
1642 if (!p)
1643 return log_oom();
1644
1645 *device_path = p;
1646
1647 *loop_nr = -1;
1648
1649 r = fd;
1650 fd = -1;
1651
1652 return r;
1653 }
1654
1655 if (!S_ISREG(st.st_mode)) {
1656 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1657 return -EINVAL;
1658 }
1659
1660 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1661 if (control < 0)
1662 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1663
1664 nr = ioctl(control, LOOP_CTL_GET_FREE);
1665 if (nr < 0)
1666 return log_error_errno(errno, "Failed to allocate loop device: %m");
1667
1668 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1669 return log_oom();
1670
1671 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1672 if (loop < 0)
1673 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1674
1675 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1676 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1677
1678 if (arg_read_only)
1679 info.lo_flags |= LO_FLAGS_READ_ONLY;
1680
1681 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1682 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1683
1684 *device_path = loopdev;
1685 loopdev = NULL;
1686
1687 *loop_nr = nr;
1688
1689 r = loop;
1690 loop = -1;
1691
1692 return r;
1693 }
1694
/* Explanatory text that is appended to the error messages logged when a
 * disk image has no partition layout nspawn knows how to boot. */
#define PARTITION_TABLE_BLURB                                                                    \
        "Note that the disk image needs to either contain only a single MBR partition of\n"     \
        "type 0x83 that is marked bootable, or a single GPT partition of type "                 \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                                      \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"         \
        "to be bootable with systemd-nspawn."
1701
/* Inspects the partition table of the block device open on fd and picks
 * the partitions to use as root, /home and /srv of the container.
 *
 * The partition table is read with libblkid; only GPT and MBR ("dos")
 * tables are accepted. The function then waits (re-reading the
 * partition table if necessary) until udev lists as many child devices
 * as blkid found partitions, and walks over them:
 *
 *   GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. Partitions
 *        are classified by type UUID as home, srv, native root,
 *        secondary root or generic Linux data. If several partitions
 *        of the same specific type exist, the one with the lowest
 *        partition number is used.
 *   MBR: only partitions of type 0x83 whose flags equal 0x80 (bootable)
 *        are considered; they are treated as generic partitions.
 *
 * The root is chosen in the order native root, secondary root, generic.
 * A generic partition is only accepted if it is the only one.
 *
 * On success 0 is returned, *root_device is set to a newly allocated
 * device node path (owned by the caller), *root_device_rw tells whether
 * it may be mounted writable and *secondary whether the secondary root
 * was picked. *home_device/*home_device_rw and
 * *srv_device/*srv_device_rw are only written if such a partition was
 * found. On failure a negative errno-style value is returned. When
 * built without HAVE_BLKID the function always fails with -EOPNOTSUPP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until udev's view of the device matches what blkid found.
         * At most ten rounds are attempted. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the devices enumerated by udev. The expected
                 * total is one more than blkid's partition count;
                 * presumably the extra entry is the whole-disk device
                 * itself, which the loop further down skips. */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the partition as the generic
                                 * candidate. Storing it anywhere else
                                 * would leave "generic" unset, so a
                                 * second bootable partition would never
                                 * be noticed and would silently
                                 * replace the first. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2074
/* Mounts the block device "what" at "where", or at where + directory if
 * directory is non-NULL.
 *
 * The file system type is detected with libblkid. LUKS volumes are
 * refused. The mount is made with MS_NODEV, and read-only if rw is
 * false or arg_read_only is set.
 *
 * Returns 0 on success, a negative errno-style value on failure. When
 * built without HAVE_BLKID the function always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A plain
                 * error (-1) is handled below, where errno is reported. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2138
2139 static int mount_devices(
2140 const char *where,
2141 const char *root_device, bool root_device_rw,
2142 const char *home_device, bool home_device_rw,
2143 const char *srv_device, bool srv_device_rw) {
2144 int r;
2145
2146 assert(where);
2147
2148 if (root_device) {
2149 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2150 if (r < 0)
2151 return log_error_errno(r, "Failed to mount root directory: %m");
2152 }
2153
2154 if (home_device) {
2155 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2156 if (r < 0)
2157 return log_error_errno(r, "Failed to mount home directory: %m");
2158 }
2159
2160 if (srv_device) {
2161 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2162 if (r < 0)
2163 return log_error_errno(r, "Failed to mount server data directory: %m");
2164 }
2165
2166 return 0;
2167 }
2168
2169 static void loop_remove(int nr, int *image_fd) {
2170 _cleanup_close_ int control = -1;
2171 int r;
2172
2173 if (nr < 0)
2174 return;
2175
2176 if (image_fd && *image_fd >= 0) {
2177 r = ioctl(*image_fd, LOOP_CLR_FD);
2178 if (r < 0)
2179 log_debug_errno(errno, "Failed to close loop image: %m");
2180 *image_fd = safe_close(*image_fd);
2181 }
2182
2183 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2184 if (control < 0) {
2185 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2186 return;
2187 }
2188
2189 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2190 if (r < 0)
2191 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2192 }
2193
2194 /*
2195 * Return values:
2196 * < 0 : wait_for_terminate() failed to get the state of the
2197 * container, the container was terminated by a signal, or
2198 * failed for an unknown reason. No change is made to the
2199 * container argument.
2200 * > 0 : The program executed in the container terminated with an
2201 * error. The exit code of the program executed in the
2202 * container is returned. The container argument has been set
2203 * to CONTAINER_TERMINATED.
2204 * 0 : The container is being rebooted, has been shut down or exited
2205 * successfully. The container argument has been set to either
2206 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2207 *
2208 * That is, success is indicated by a return value of zero, and an
2209 * error is indicated by a non-zero value.
2210 */
2211 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2212 siginfo_t status;
2213 int r;
2214
2215 r = wait_for_terminate(pid, &status);
2216 if (r < 0)
2217 return log_warning_errno(r, "Failed to wait for container: %m");
2218
2219 switch (status.si_code) {
2220
2221 case CLD_EXITED:
2222 if (status.si_status == 0) {
2223 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2224
2225 } else
2226 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2227
2228 *container = CONTAINER_TERMINATED;
2229 return status.si_status;
2230
2231 case CLD_KILLED:
2232 if (status.si_status == SIGINT) {
2233
2234 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2235 *container = CONTAINER_TERMINATED;
2236 return 0;
2237
2238 } else if (status.si_status == SIGHUP) {
2239
2240 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2241 *container = CONTAINER_REBOOTED;
2242 return 0;
2243 }
2244
2245 /* CLD_KILLED fallthrough */
2246
2247 case CLD_DUMPED:
2248 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2249 return -EIO;
2250
2251 default:
2252 log_error("Container %s failed due to unknown reason.", arg_machine);
2253 return -EIO;
2254 }
2255
2256 return r;
2257 }
2258
2259 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2260 pid_t pid;
2261
2262 pid = PTR_TO_UINT32(userdata);
2263 if (pid > 0) {
2264 if (kill(pid, arg_kill_signal) >= 0) {
2265 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2266 sd_event_source_set_userdata(s, NULL);
2267 return 0;
2268 }
2269 }
2270
2271 sd_event_exit(sd_event_source_get_event(s), 0);
2272 return 0;
2273 }
2274
2275 static int determine_names(void) {
2276 int r;
2277
2278 if (arg_template && !arg_directory && arg_machine) {
2279
2280 /* If --template= was specified then we should not
2281 * search for a machine, but instead create a new one
2282 * in /var/lib/machine. */
2283
2284 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2285 if (!arg_directory)
2286 return log_oom();
2287 }
2288
2289 if (!arg_image && !arg_directory) {
2290 if (arg_machine) {
2291 _cleanup_(image_unrefp) Image *i = NULL;
2292
2293 r = image_find(arg_machine, &i);
2294 if (r < 0)
2295 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2296 else if (r == 0) {
2297 log_error("No image for machine '%s': %m", arg_machine);
2298 return -ENOENT;
2299 }
2300
2301 if (i->type == IMAGE_RAW)
2302 r = free_and_strdup(&arg_image, i->path);
2303 else
2304 r = free_and_strdup(&arg_directory, i->path);
2305 if (r < 0)
2306 return log_error_errno(r, "Invalid image directory: %m");
2307
2308 if (!arg_ephemeral)
2309 arg_read_only = arg_read_only || i->read_only;
2310 } else
2311 arg_directory = get_current_dir_name();
2312
2313 if (!arg_directory && !arg_machine) {
2314 log_error("Failed to determine path, please use -D or -i.");
2315 return -EINVAL;
2316 }
2317 }
2318
2319 if (!arg_machine) {
2320 if (arg_directory && path_equal(arg_directory, "/"))
2321 arg_machine = gethostname_malloc();
2322 else
2323 arg_machine = strdup(basename(arg_image ?: arg_directory));
2324
2325 if (!arg_machine)
2326 return log_oom();
2327
2328 hostname_cleanup(arg_machine);
2329 if (!machine_name_is_valid(arg_machine)) {
2330 log_error("Failed to determine machine name automatically, please use -M.");
2331 return -EINVAL;
2332 }
2333
2334 if (arg_ephemeral) {
2335 char *b;
2336
2337 /* Add a random suffix when this is an
2338 * ephemeral machine, so that we can run many
2339 * instances at once without manually having
2340 * to specify -M each time. */
2341
2342 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2343 return log_oom();
2344
2345 free(arg_machine);
2346 arg_machine = b;
2347 }
2348 }
2349
2350 return 0;
2351 }
2352
2353 static int determine_uid_shift(const char *directory) {
2354 int r;
2355
2356 if (!arg_userns) {
2357 arg_uid_shift = 0;
2358 return 0;
2359 }
2360
2361 if (arg_uid_shift == UID_INVALID) {
2362 struct stat st;
2363
2364 r = stat(directory, &st);
2365 if (r < 0)
2366 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2367
2368 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2369
2370 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2371 log_error("UID and GID base of %s don't match.", directory);
2372 return -EINVAL;
2373 }
2374
2375 arg_uid_range = UINT32_C(0x10000);
2376 }
2377
2378 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2379 log_error("UID base too high for UID range.");
2380 return -EINVAL;
2381 }
2382
2383 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2384 return 0;
2385 }
2386
2387 static int inner_child(
2388 Barrier *barrier,
2389 const char *directory,
2390 bool secondary,
2391 int kmsg_socket,
2392 int rtnl_socket,
2393 FDSet *fds) {
2394
2395 _cleanup_free_ char *home = NULL;
2396 unsigned n_env = 2;
2397 const char *envp[] = {
2398 "PATH=" DEFAULT_PATH_SPLIT_USR,
2399 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2400 NULL, /* TERM */
2401 NULL, /* HOME */
2402 NULL, /* USER */
2403 NULL, /* LOGNAME */
2404 NULL, /* container_uuid */
2405 NULL, /* LISTEN_FDS */
2406 NULL, /* LISTEN_PID */
2407 NULL
2408 };
2409
2410 _cleanup_strv_free_ char **env_use = NULL;
2411 int r;
2412
2413 assert(barrier);
2414 assert(directory);
2415 assert(kmsg_socket >= 0);
2416
2417 cg_unified_flush();
2418
2419 if (arg_userns) {
2420 /* Tell the parent, that it now can write the UID map. */
2421 (void) barrier_place(barrier); /* #1 */
2422
2423 /* Wait until the parent wrote the UID map */
2424 if (!barrier_place_and_sync(barrier)) { /* #2 */
2425 log_error("Parent died too early");
2426 return -ESRCH;
2427 }
2428 }
2429
2430 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
2431 if (r < 0)
2432 return r;
2433
2434 r = mount_sysfs(NULL);
2435 if (r < 0)
2436 return r;
2437
2438 /* Wait until we are cgroup-ified, so that we
2439 * can mount the right cgroup path writable */
2440 if (!barrier_place_and_sync(barrier)) { /* #3 */
2441 log_error("Parent died too early");
2442 return -ESRCH;
2443 }
2444
2445 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2446 if (r < 0)
2447 return r;
2448
2449 r = reset_uid_gid();
2450 if (r < 0)
2451 return log_error_errno(r, "Couldn't become new root: %m");
2452
2453 r = setup_boot_id(NULL);
2454 if (r < 0)
2455 return r;
2456
2457 r = setup_kmsg(NULL, kmsg_socket);
2458 if (r < 0)
2459 return r;
2460 kmsg_socket = safe_close(kmsg_socket);
2461
2462 umask(0022);
2463
2464 if (setsid() < 0)
2465 return log_error_errno(errno, "setsid() failed: %m");
2466
2467 if (arg_private_network)
2468 loopback_setup();
2469
2470 if (arg_expose_ports) {
2471 r = expose_port_send_rtnl(rtnl_socket);
2472 if (r < 0)
2473 return r;
2474 rtnl_socket = safe_close(rtnl_socket);
2475 }
2476
2477 if (drop_capabilities() < 0)
2478 return log_error_errno(errno, "drop_capabilities() failed: %m");
2479
2480 setup_hostname();
2481
2482 if (arg_personality != PERSONALITY_INVALID) {
2483 if (personality(arg_personality) < 0)
2484 return log_error_errno(errno, "personality() failed: %m");
2485 } else if (secondary) {
2486 if (personality(PER_LINUX32) < 0)
2487 return log_error_errno(errno, "personality() failed: %m");
2488 }
2489
2490 #ifdef HAVE_SELINUX
2491 if (arg_selinux_context)
2492 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2493 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2494 #endif
2495
2496 r = change_uid_gid(arg_user, &home);
2497 if (r < 0)
2498 return r;
2499
2500 envp[n_env] = strv_find_prefix(environ, "TERM=");
2501 if (envp[n_env])
2502 n_env ++;
2503
2504 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2505 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2506 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2507 return log_oom();
2508
2509 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2510 char as_uuid[37];
2511
2512 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2513 return log_oom();
2514 }
2515
2516 if (fdset_size(fds) > 0) {
2517 r = fdset_cloexec(fds, false);
2518 if (r < 0)
2519 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2520
2521 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2522 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2523 return log_oom();
2524 }
2525
2526 env_use = strv_env_merge(2, envp, arg_setenv);
2527 if (!env_use)
2528 return log_oom();
2529
2530 /* Let the parent know that we are ready and
2531 * wait until the parent is ready with the
2532 * setup, too... */
2533 if (!barrier_place_and_sync(barrier)) { /* #4 */
2534 log_error("Parent died too early");
2535 return -ESRCH;
2536 }
2537
2538 /* Now, explicitly close the log, so that we
2539 * then can close all remaining fds. Closing
2540 * the log explicitly first has the benefit
2541 * that the logging subsystem knows about it,
2542 * and is thus ready to be reopened should we
2543 * need it again. Note that the other fds
2544 * closed here are at least the locking and
2545 * barrier fds. */
2546 log_close();
2547 (void) fdset_close_others(fds);
2548
2549 if (arg_boot) {
2550 char **a;
2551 size_t m;
2552
2553 /* Automatically search for the init system */
2554
2555 m = 1 + strv_length(arg_parameters);
2556 a = newa(char*, m + 1);
2557 if (strv_isempty(arg_parameters))
2558 a[1] = NULL;
2559 else
2560 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2561
2562 a[0] = (char*) "/usr/lib/systemd/systemd";
2563 execve(a[0], a, env_use);
2564
2565 a[0] = (char*) "/lib/systemd/systemd";
2566 execve(a[0], a, env_use);
2567
2568 a[0] = (char*) "/sbin/init";
2569 execve(a[0], a, env_use);
2570 } else if (!strv_isempty(arg_parameters))
2571 execvpe(arg_parameters[0], arg_parameters, env_use);
2572 else {
2573 chdir(home ?: "/root");
2574 execle("/bin/bash", "-bash", NULL, env_use);
2575 execle("/bin/sh", "-sh", NULL, env_use);
2576 }
2577
2578 (void) log_open();
2579 return log_error_errno(errno, "execv() failed: %m");
2580 }
2581
2582 static int outer_child(
2583 Barrier *barrier,
2584 const char *directory,
2585 const char *console,
2586 const char *root_device, bool root_device_rw,
2587 const char *home_device, bool home_device_rw,
2588 const char *srv_device, bool srv_device_rw,
2589 bool interactive,
2590 bool secondary,
2591 int pid_socket,
2592 int kmsg_socket,
2593 int rtnl_socket,
2594 int uid_shift_socket,
2595 FDSet *fds) {
2596
2597 pid_t pid;
2598 ssize_t l;
2599 int r;
2600
2601 assert(barrier);
2602 assert(directory);
2603 assert(console);
2604 assert(pid_socket >= 0);
2605 assert(kmsg_socket >= 0);
2606
2607 cg_unified_flush();
2608
2609 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2610 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2611
2612 if (interactive) {
2613 close_nointr(STDIN_FILENO);
2614 close_nointr(STDOUT_FILENO);
2615 close_nointr(STDERR_FILENO);
2616
2617 r = open_terminal(console, O_RDWR);
2618 if (r != STDIN_FILENO) {
2619 if (r >= 0) {
2620 safe_close(r);
2621 r = -EINVAL;
2622 }
2623
2624 return log_error_errno(r, "Failed to open console: %m");
2625 }
2626
2627 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2628 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2629 return log_error_errno(errno, "Failed to duplicate console: %m");
2630 }
2631
2632 r = reset_audit_loginuid();
2633 if (r < 0)
2634 return r;
2635
2636 /* Mark everything as slave, so that we still
2637 * receive mounts from the real root, but don't
2638 * propagate mounts to the real root. */
2639 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2640 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2641
2642 r = mount_devices(directory,
2643 root_device, root_device_rw,
2644 home_device, home_device_rw,
2645 srv_device, srv_device_rw);
2646 if (r < 0)
2647 return r;
2648
2649 r = determine_uid_shift(directory);
2650 if (r < 0)
2651 return r;
2652
2653 if (arg_userns) {
2654 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2655 if (l < 0)
2656 return log_error_errno(errno, "Failed to send UID shift: %m");
2657 if (l != sizeof(arg_uid_shift)) {
2658 log_error("Short write while sending UID shift.");
2659 return -EIO;
2660 }
2661 }
2662
2663 /* Turn directory into bind mount */
2664 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2665 return log_error_errno(errno, "Failed to make bind mount: %m");
2666
2667 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2668 if (r < 0)
2669 return r;
2670
2671 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2672 if (r < 0)
2673 return r;
2674
2675 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2676 if (r < 0)
2677 return r;
2678
2679 if (arg_read_only) {
2680 r = bind_remount_recursive(directory, true);
2681 if (r < 0)
2682 return log_error_errno(r, "Failed to make tree read-only: %m");
2683 }
2684
2685 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2686 if (r < 0)
2687 return r;
2688
2689 r = copy_devnodes(directory);
2690 if (r < 0)
2691 return r;
2692
2693 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2694
2695 r = setup_pts(directory);
2696 if (r < 0)
2697 return r;
2698
2699 r = setup_propagate(directory);
2700 if (r < 0)
2701 return r;
2702
2703 r = setup_dev_console(directory, console);
2704 if (r < 0)
2705 return r;
2706
2707 r = setup_seccomp();
2708 if (r < 0)
2709 return r;
2710
2711 r = setup_timezone(directory);
2712 if (r < 0)
2713 return r;
2714
2715 r = setup_resolv_conf(directory);
2716 if (r < 0)
2717 return r;
2718
2719 r = setup_journal(directory);
2720 if (r < 0)
2721 return r;
2722
2723 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2724 if (r < 0)
2725 return r;
2726
2727 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2728 if (r < 0)
2729 return r;
2730
2731 r = mount_move_root(directory);
2732 if (r < 0)
2733 return log_error_errno(r, "Failed to move root directory: %m");
2734
2735 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2736 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2737 (arg_private_network ? CLONE_NEWNET : 0) |
2738 (arg_userns ? CLONE_NEWUSER : 0),
2739 NULL);
2740 if (pid < 0)
2741 return log_error_errno(errno, "Failed to fork inner child: %m");
2742 if (pid == 0) {
2743 pid_socket = safe_close(pid_socket);
2744 uid_shift_socket = safe_close(uid_shift_socket);
2745
2746 /* The inner child has all namespaces that are
2747 * requested, so that we all are owned by the user if
2748 * user namespaces are turned on. */
2749
2750 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2751 if (r < 0)
2752 _exit(EXIT_FAILURE);
2753
2754 _exit(EXIT_SUCCESS);
2755 }
2756
2757 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2758 if (l < 0)
2759 return log_error_errno(errno, "Failed to send PID: %m");
2760 if (l != sizeof(pid)) {
2761 log_error("Short write while sending PID.");
2762 return -EIO;
2763 }
2764
2765 pid_socket = safe_close(pid_socket);
2766 kmsg_socket = safe_close(kmsg_socket);
2767 rtnl_socket = safe_close(rtnl_socket);
2768
2769 return 0;
2770 }
2771
2772 static int setup_uid_map(pid_t pid) {
2773 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2774 int r;
2775
2776 assert(pid > 1);
2777
2778 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2779 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2780 r = write_string_file(uid_map, line, 0);
2781 if (r < 0)
2782 return log_error_errno(r, "Failed to write UID map: %m");
2783
2784 /* We always assign the same UID and GID ranges */
2785 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2786 r = write_string_file(uid_map, line, 0);
2787 if (r < 0)
2788 return log_error_errno(r, "Failed to write GID map: %m");
2789
2790 return 0;
2791 }
2792
2793 static int load_settings(void) {
2794 _cleanup_(settings_freep) Settings *settings = NULL;
2795 _cleanup_fclose_ FILE *f = NULL;
2796 _cleanup_free_ char *p = NULL;
2797 const char *fn, *i;
2798 int r;
2799
2800 /* If all settings are masked, there's no point in looking for
2801 * the settings file */
2802 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2803 return 0;
2804
2805 fn = strjoina(arg_machine, ".nspawn");
2806
2807 /* We first look in the admin's directories in /etc and /run */
2808 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2809 _cleanup_free_ char *j = NULL;
2810
2811 j = strjoin(i, "/", fn, NULL);
2812 if (!j)
2813 return log_oom();
2814
2815 f = fopen(j, "re");
2816 if (f) {
2817 p = j;
2818 j = NULL;
2819
2820 /* By default we trust configuration from /etc and /run */
2821 if (arg_settings_trusted < 0)
2822 arg_settings_trusted = true;
2823
2824 break;
2825 }
2826
2827 if (errno != ENOENT)
2828 return log_error_errno(errno, "Failed to open %s: %m", j);
2829 }
2830
2831 if (!f) {
2832 /* After that, let's look for a file next to the
2833 * actual image we shall boot. */
2834
2835 if (arg_image) {
2836 p = file_in_same_dir(arg_image, fn);
2837 if (!p)
2838 return log_oom();
2839 } else if (arg_directory) {
2840 p = file_in_same_dir(arg_directory, fn);
2841 if (!p)
2842 return log_oom();
2843 }
2844
2845 if (p) {
2846 f = fopen(p, "re");
2847 if (!f && errno != ENOENT)
2848 return log_error_errno(errno, "Failed to open %s: %m", p);
2849
2850 /* By default we do not trust configuration from /var/lib/machines */
2851 if (arg_settings_trusted < 0)
2852 arg_settings_trusted = false;
2853 }
2854 }
2855
2856 if (!f)
2857 return 0;
2858
2859 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2860
2861 r = settings_load(f, p, &settings);
2862 if (r < 0)
2863 return r;
2864
2865 /* Copy over bits from the settings, unless they have been
2866 * explicitly masked by command line switches. */
2867
2868 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2869 settings->boot >= 0) {
2870 arg_boot = settings->boot;
2871
2872 strv_free(arg_parameters);
2873 arg_parameters = settings->parameters;
2874 settings->parameters = NULL;
2875 }
2876
2877 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2878 settings->environment) {
2879 strv_free(arg_setenv);
2880 arg_setenv = settings->environment;
2881 settings->environment = NULL;
2882 }
2883
2884 if ((arg_settings_mask & SETTING_USER) == 0 &&
2885 settings->user) {
2886 free(arg_user);
2887 arg_user = settings->user;
2888 settings->user = NULL;
2889 }
2890
2891 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2892 uint64_t plus;
2893
2894 plus = settings->capability;
2895 if (settings_private_network(settings))
2896 plus |= (1ULL << CAP_NET_ADMIN);
2897
2898 if (!arg_settings_trusted && plus != 0) {
2899 if (settings->capability != 0)
2900 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2901 } else
2902 arg_retain |= plus;
2903
2904 arg_retain &= ~settings->drop_capability;
2905 }
2906
2907 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2908 settings->kill_signal > 0)
2909 arg_kill_signal = settings->kill_signal;
2910
2911 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2912 settings->personality != PERSONALITY_INVALID)
2913 arg_personality = settings->personality;
2914
2915 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2916 !sd_id128_is_null(settings->machine_id)) {
2917
2918 if (!arg_settings_trusted)
2919 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2920 else
2921 arg_uuid = settings->machine_id;
2922 }
2923
2924 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2925 settings->read_only >= 0)
2926 arg_read_only = settings->read_only;
2927
2928 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2929 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2930 arg_volatile_mode = settings->volatile_mode;
2931
2932 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2933 settings->n_custom_mounts > 0) {
2934
2935 if (!arg_settings_trusted)
2936 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2937 else {
2938 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2939 arg_custom_mounts = settings->custom_mounts;
2940 arg_n_custom_mounts = settings->n_custom_mounts;
2941
2942 settings->custom_mounts = NULL;
2943 settings->n_custom_mounts = 0;
2944 }
2945 }
2946
2947 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2948 (settings->private_network >= 0 ||
2949 settings->network_veth >= 0 ||
2950 settings->network_bridge ||
2951 settings->network_interfaces ||
2952 settings->network_macvlan ||
2953 settings->network_ipvlan)) {
2954
2955 if (!arg_settings_trusted)
2956 log_warning("Ignoring network settings, file %s is not trusted.", p);
2957 else {
2958 arg_network_veth = settings_private_network(settings);
2959 arg_private_network = settings_private_network(settings);
2960
2961 strv_free(arg_network_interfaces);
2962 arg_network_interfaces = settings->network_interfaces;
2963 settings->network_interfaces = NULL;
2964
2965 strv_free(arg_network_macvlan);
2966 arg_network_macvlan = settings->network_macvlan;
2967 settings->network_macvlan = NULL;
2968
2969 strv_free(arg_network_ipvlan);
2970 arg_network_ipvlan = settings->network_ipvlan;
2971 settings->network_ipvlan = NULL;
2972
2973 free(arg_network_bridge);
2974 arg_network_bridge = settings->network_bridge;
2975 settings->network_bridge = NULL;
2976 }
2977 }
2978
2979 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
2980 settings->expose_ports) {
2981
2982 if (!arg_settings_trusted)
2983 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
2984 else {
2985 expose_port_free_all(arg_expose_ports);
2986 arg_expose_ports = settings->expose_ports;
2987 settings->expose_ports = NULL;
2988 }
2989 }
2990
2991 return 0;
2992 }
2993
2994 int main(int argc, char *argv[]) {
2995
2996 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
2997 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2998 _cleanup_close_ int master = -1, image_fd = -1;
2999 _cleanup_fdset_free_ FDSet *fds = NULL;
3000 int r, n_fd_passed, loop_nr = -1;
3001 char veth_name[IFNAMSIZ];
3002 bool secondary = false, remove_subvol = false;
3003 sigset_t mask_chld;
3004 pid_t pid = 0;
3005 int ret = EXIT_SUCCESS;
3006 union in_addr_union exposed = {};
3007 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3008 bool interactive;
3009
3010 log_parse_environment();
3011 log_open();
3012
3013 r = parse_argv(argc, argv);
3014 if (r <= 0)
3015 goto finish;
3016
3017 if (geteuid() != 0) {
3018 log_error("Need to be root.");
3019 r = -EPERM;
3020 goto finish;
3021 }
3022 r = determine_names();
3023 if (r < 0)
3024 goto finish;
3025
3026 r = load_settings();
3027 if (r < 0)
3028 goto finish;
3029
3030 r = verify_arguments();
3031 if (r < 0)
3032 goto finish;
3033
3034 n_fd_passed = sd_listen_fds(false);
3035 if (n_fd_passed > 0) {
3036 r = fdset_new_listen_fds(&fds, false);
3037 if (r < 0) {
3038 log_error_errno(r, "Failed to collect file descriptors: %m");
3039 goto finish;
3040 }
3041 }
3042
3043 if (arg_directory) {
3044 assert(!arg_image);
3045
3046 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3047 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3048 r = -EINVAL;
3049 goto finish;
3050 }
3051
3052 if (arg_ephemeral) {
3053 _cleanup_free_ char *np = NULL;
3054
3055 /* If the specified path is a mount point we
3056 * generate the new snapshot immediately
3057 * inside it under a random name. However if
3058 * the specified is not a mount point we
3059 * create the new snapshot in the parent
3060 * directory, just next to it. */
3061 r = path_is_mount_point(arg_directory, 0);
3062 if (r < 0) {
3063 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3064 goto finish;
3065 }
3066 if (r > 0)
3067 r = tempfn_random_child(arg_directory, "machine.", &np);
3068 else
3069 r = tempfn_random(arg_directory, "machine.", &np);
3070 if (r < 0) {
3071 log_error_errno(r, "Failed to generate name for snapshot: %m");
3072 goto finish;
3073 }
3074
3075 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3076 if (r < 0) {
3077 log_error_errno(r, "Failed to lock %s: %m", np);
3078 goto finish;
3079 }
3080
3081 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3082 if (r < 0) {
3083 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3084 goto finish;
3085 }
3086
3087 free(arg_directory);
3088 arg_directory = np;
3089 np = NULL;
3090
3091 remove_subvol = true;
3092
3093 } else {
3094 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3095 if (r == -EBUSY) {
3096 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3097 goto finish;
3098 }
3099 if (r < 0) {
3100 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3101 return r;
3102 }
3103
3104 if (arg_template) {
3105 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3106 if (r == -EEXIST) {
3107 if (!arg_quiet)
3108 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3109 } else if (r < 0) {
3110 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3111 goto finish;
3112 } else {
3113 if (!arg_quiet)
3114 log_info("Populated %s from template %s.", arg_directory, arg_template);
3115 }
3116 }
3117 }
3118
3119 if (arg_boot) {
3120 if (path_is_os_tree(arg_directory) <= 0) {
3121 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3122 r = -EINVAL;
3123 goto finish;
3124 }
3125 } else {
3126 const char *p;
3127
3128 p = strjoina(arg_directory, "/usr/");
3129 if (laccess(p, F_OK) < 0) {
3130 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3131 r = -EINVAL;
3132 goto finish;
3133 }
3134 }
3135
3136 } else {
3137 char template[] = "/tmp/nspawn-root-XXXXXX";
3138
3139 assert(arg_image);
3140 assert(!arg_template);
3141
3142 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3143 if (r == -EBUSY) {
3144 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3145 goto finish;
3146 }
3147 if (r < 0) {
3148 r = log_error_errno(r, "Failed to create image lock: %m");
3149 goto finish;
3150 }
3151
3152 if (!mkdtemp(template)) {
3153 log_error_errno(errno, "Failed to create temporary directory: %m");
3154 r = -errno;
3155 goto finish;
3156 }
3157
3158 arg_directory = strdup(template);
3159 if (!arg_directory) {
3160 r = log_oom();
3161 goto finish;
3162 }
3163
3164 image_fd = setup_image(&device_path, &loop_nr);
3165 if (image_fd < 0) {
3166 r = image_fd;
3167 goto finish;
3168 }
3169
3170 r = dissect_image(image_fd,
3171 &root_device, &root_device_rw,
3172 &home_device, &home_device_rw,
3173 &srv_device, &srv_device_rw,
3174 &secondary);
3175 if (r < 0)
3176 goto finish;
3177 }
3178
3179 r = custom_mounts_prepare();
3180 if (r < 0)
3181 goto finish;
3182
3183 interactive =
3184 isatty(STDIN_FILENO) > 0 &&
3185 isatty(STDOUT_FILENO) > 0;
3186
3187 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3188 if (master < 0) {
3189 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3190 goto finish;
3191 }
3192
3193 r = ptsname_malloc(master, &console);
3194 if (r < 0) {
3195 r = log_error_errno(r, "Failed to determine tty name: %m");
3196 goto finish;
3197 }
3198
3199 if (unlockpt(master) < 0) {
3200 r = log_error_errno(errno, "Failed to unlock tty: %m");
3201 goto finish;
3202 }
3203
3204 if (!arg_quiet)
3205 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3206 arg_machine, arg_image ?: arg_directory);
3207
3208 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3209
3210 assert_se(sigemptyset(&mask_chld) == 0);
3211 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3212
3213 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3214 r = log_error_errno(errno, "Failed to become subreaper: %m");
3215 goto finish;
3216 }
3217
3218 for (;;) {
3219 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3220 uid_shift_socket_pair[2] = { -1, -1 };
3221 ContainerStatus container_status;
3222 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3223 static const struct sigaction sa = {
3224 .sa_handler = nop_signal_handler,
3225 .sa_flags = SA_NOCLDSTOP,
3226 };
3227 int ifi = 0;
3228 ssize_t l;
3229 _cleanup_event_unref_ sd_event *event = NULL;
3230 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3231 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3232 char last_char = 0;
3233
3234 r = barrier_create(&barrier);
3235 if (r < 0) {
3236 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3237 goto finish;
3238 }
3239
3240 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3241 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3242 goto finish;
3243 }
3244
3245 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3246 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3247 goto finish;
3248 }
3249
3250 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3251 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3252 goto finish;
3253 }
3254
3255 if (arg_userns)
3256 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3257 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3258 goto finish;
3259 }
3260
3261 /* Child can be killed before execv(), so handle SIGCHLD
3262 * in order to interrupt parent's blocking calls and
3263 * give it a chance to call wait() and terminate. */
3264 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3265 if (r < 0) {
3266 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3267 goto finish;
3268 }
3269
3270 r = sigaction(SIGCHLD, &sa, NULL);
3271 if (r < 0) {
3272 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3273 goto finish;
3274 }
3275
3276 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3277 if (pid < 0) {
3278 if (errno == EINVAL)
3279 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3280 else
3281 r = log_error_errno(errno, "clone() failed: %m");
3282
3283 goto finish;
3284 }
3285
3286 if (pid == 0) {
3287 /* The outer child only has a file system namespace. */
3288 barrier_set_role(&barrier, BARRIER_CHILD);
3289
3290 master = safe_close(master);
3291
3292 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3293 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3294 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3295 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3296
3297 (void) reset_all_signal_handlers();
3298 (void) reset_signal_mask();
3299
3300 r = outer_child(&barrier,
3301 arg_directory,
3302 console,
3303 root_device, root_device_rw,
3304 home_device, home_device_rw,
3305 srv_device, srv_device_rw,
3306 interactive,
3307 secondary,
3308 pid_socket_pair[1],
3309 kmsg_socket_pair[1],
3310 rtnl_socket_pair[1],
3311 uid_shift_socket_pair[1],
3312 fds);
3313 if (r < 0)
3314 _exit(EXIT_FAILURE);
3315
3316 _exit(EXIT_SUCCESS);
3317 }
3318
3319 barrier_set_role(&barrier, BARRIER_PARENT);
3320
3321 fds = fdset_free(fds);
3322
3323 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3324 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3325 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3326 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3327
3328 /* Wait for the outer child. */
3329 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3330 if (r < 0)
3331 goto finish;
3332 if (r != 0) {
3333 r = -EIO;
3334 goto finish;
3335 }
3336 pid = 0;
3337
3338 /* And now retrieve the PID of the inner child. */
3339 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3340 if (l < 0) {
3341 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3342 goto finish;
3343 }
3344 if (l != sizeof(pid)) {
3345 log_error("Short read while reading inner child PID.");
3346 r = EIO;
3347 goto finish;
3348 }
3349
3350 log_debug("Init process invoked as PID " PID_FMT, pid);
3351
3352 if (arg_userns) {
3353 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3354 log_error("Child died too early.");
3355 r = -ESRCH;
3356 goto finish;
3357 }
3358
3359 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3360 if (l < 0) {
3361 r = log_error_errno(errno, "Failed to read UID shift: %m");
3362 goto finish;
3363 }
3364 if (l != sizeof(arg_uid_shift)) {
3365 log_error("Short read while reading UID shift.");
3366 r = EIO;
3367 goto finish;
3368 }
3369
3370 r = setup_uid_map(pid);
3371 if (r < 0)
3372 goto finish;
3373
3374 (void) barrier_place(&barrier); /* #2 */
3375 }
3376
3377 if (arg_private_network) {
3378
3379 r = move_network_interfaces(pid, arg_network_interfaces);
3380 if (r < 0)
3381 goto finish;
3382
3383 if (arg_network_veth) {
3384 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3385 if (r < 0)
3386 goto finish;
3387 else if (r > 0)
3388 ifi = r;
3389
3390 if (arg_network_bridge) {
3391 r = setup_bridge(veth_name, arg_network_bridge);
3392 if (r < 0)
3393 goto finish;
3394 if (r > 0)
3395 ifi = r;
3396 }
3397 }
3398
3399 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3400 if (r < 0)
3401 goto finish;
3402
3403 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3404 if (r < 0)
3405 goto finish;
3406 }
3407
3408 if (arg_register) {
3409 r = register_machine(
3410 arg_machine,
3411 pid,
3412 arg_directory,
3413 arg_uuid,
3414 ifi,
3415 arg_slice,
3416 arg_custom_mounts, arg_n_custom_mounts,
3417 arg_kill_signal,
3418 arg_property,
3419 arg_keep_unit);
3420 if (r < 0)
3421 goto finish;
3422 }
3423
3424 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3425 if (r < 0)
3426 goto finish;
3427
3428 if (arg_keep_unit) {
3429 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3430 if (r < 0)
3431 goto finish;
3432 }
3433
3434 r = chown_cgroup(pid, arg_uid_shift);
3435 if (r < 0)
3436 goto finish;
3437
3438 /* Notify the child that the parent is ready with all
3439 * its setup (including cgroup-ification), and that
3440 * the child can now hand over control to the code to
3441 * run inside the container. */
3442 (void) barrier_place(&barrier); /* #3 */
3443
3444 /* Block SIGCHLD here, before notifying child.
3445 * process_pty() will handle it with the other signals. */
3446 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3447
3448 /* Reset signal to default */
3449 r = default_signals(SIGCHLD, -1);
3450 if (r < 0) {
3451 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3452 goto finish;
3453 }
3454
3455 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3456 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3457 log_error("Child died too early.");
3458 r = -ESRCH;
3459 goto finish;
3460 }
3461
3462 sd_notifyf(false,
3463 "READY=1\n"
3464 "STATUS=Container running.\n"
3465 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3466
3467 r = sd_event_new(&event);
3468 if (r < 0) {
3469 log_error_errno(r, "Failed to get default event source: %m");
3470 goto finish;
3471 }
3472
3473 if (arg_kill_signal > 0) {
3474 /* Try to kill the init system on SIGINT or SIGTERM */
3475 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3476 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3477 } else {
3478 /* Immediately exit */
3479 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3480 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3481 }
3482
3483 /* simply exit on sigchld */
3484 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3485
3486 if (arg_expose_ports) {
3487 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3488 if (r < 0)
3489 goto finish;
3490
3491 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3492 }
3493
3494 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3495
3496 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3497 if (r < 0) {
3498 log_error_errno(r, "Failed to create PTY forwarder: %m");
3499 goto finish;
3500 }
3501
3502 r = sd_event_loop(event);
3503 if (r < 0) {
3504 log_error_errno(r, "Failed to run event loop: %m");
3505 goto finish;
3506 }
3507
3508 pty_forward_get_last_char(forward, &last_char);
3509
3510 forward = pty_forward_free(forward);
3511
3512 if (!arg_quiet && last_char != '\n')
3513 putc('\n', stdout);
3514
3515 /* Kill if it is not dead yet anyway */
3516 if (arg_register && !arg_keep_unit)
3517 terminate_machine(pid);
3518
3519 /* Normally redundant, but better safe than sorry */
3520 kill(pid, SIGKILL);
3521
3522 r = wait_for_container(pid, &container_status);
3523 pid = 0;
3524
3525 if (r < 0)
3526 /* We failed to wait for the container, or the
3527 * container exited abnormally */
3528 goto finish;
3529 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3530 /* The container exited with a non-zero
3531 * status, or with zero status and no reboot
3532 * was requested. */
3533 ret = r;
3534 break;
3535 }
3536
3537 /* CONTAINER_REBOOTED, loop again */
3538
3539 if (arg_keep_unit) {
3540 /* Special handling if we are running as a
3541 * service: instead of simply restarting the
3542 * machine we want to restart the entire
3543 * service, so let's inform systemd about this
3544 * with the special exit code 133. The service
3545 * file uses RestartForceExitStatus=133 so
3546 * that this results in a full nspawn
3547 * restart. This is necessary since we might
3548 * have cgroup parameters set we want to have
3549 * flushed out. */
3550 ret = 133;
3551 r = 0;
3552 break;
3553 }
3554
3555 expose_port_flush(arg_expose_ports, &exposed);
3556 }
3557
3558 finish:
3559 sd_notify(false,
3560 "STOPPING=1\n"
3561 "STATUS=Terminating...");
3562
3563 if (pid > 0)
3564 kill(pid, SIGKILL);
3565
3566 /* Try to flush whatever is still queued in the pty */
3567 if (master >= 0)
3568 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3569
3570 loop_remove(loop_nr, &image_fd);
3571
3572 if (remove_subvol && arg_directory) {
3573 int k;
3574
3575 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
3576 if (k < 0)
3577 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3578 }
3579
3580 if (arg_machine) {
3581 const char *p;
3582
3583 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3584 (void) rm_rf(p, REMOVE_ROOT);
3585 }
3586
3587 expose_port_flush(arg_expose_ports, &exposed);
3588
3589 free(arg_directory);
3590 free(arg_template);
3591 free(arg_image);
3592 free(arg_machine);
3593 free(arg_user);
3594 strv_free(arg_setenv);
3595 free(arg_network_bridge);
3596 strv_free(arg_network_interfaces);
3597 strv_free(arg_network_macvlan);
3598 strv_free(arg_network_ipvlan);
3599 strv_free(arg_parameters);
3600 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3601 expose_port_free_all(arg_expose_ports);
3602
3603 return r < 0 ? EXIT_FAILURE : ret;
3604 }