]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
util-lib: split our string related calls from util.[ch] into its own file string...
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #ifdef HAVE_BLKID
23 #include <blkid/blkid.h>
24 #endif
25 #include <errno.h>
26 #include <getopt.h>
27 #include <linux/loop.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "barrier.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
53 #include "cap-list.h"
54 #include "capability.h"
55 #include "cgroup-util.h"
56 #include "copy.h"
57 #include "dev-setup.h"
58 #include "env-util.h"
59 #include "event-util.h"
60 #include "fdset.h"
61 #include "fileio.h"
62 #include "formats-util.h"
63 #include "gpt.h"
64 #include "hostname-util.h"
65 #include "log.h"
66 #include "loopback-setup.h"
67 #include "machine-image.h"
68 #include "macro.h"
69 #include "missing.h"
70 #include "mkdir.h"
71 #include "netlink-util.h"
72 #include "nspawn-cgroup.h"
73 #include "nspawn-expose-ports.h"
74 #include "nspawn-mount.h"
75 #include "nspawn-network.h"
76 #include "nspawn-register.h"
77 #include "nspawn-settings.h"
78 #include "nspawn-setuid.h"
79 #include "path-util.h"
80 #include "process-util.h"
81 #include "ptyfwd.h"
82 #include "random-util.h"
83 #include "rm-rf.h"
84 #ifdef HAVE_SECCOMP
85 #include "seccomp-util.h"
86 #endif
87 #include "signal-util.h"
88 #include "string-util.h"
89 #include "strv.h"
90 #include "terminal-util.h"
91 #include "udev-util.h"
92 #include "util.h"
93
/* Outcome of a container run. The code consuming this enum is outside the
 * visible part of the file; judging by the names, TERMINATED presumably means
 * the container exited for good and REBOOTED that it asked to be restarted —
 * TODO confirm against the users of this type. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
98
/* Journal linking mode selected with --link-journal= (or -j), see parse_argv().
 * The "try-guest"/"try-host" spellings map to LINK_GUEST/LINK_HOST as well;
 * the "try" part is tracked separately in arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,
        LINK_AUTO,
        LINK_HOST,
        LINK_GUEST
} LinkJournal;
105
/* Runtime configuration of this invocation. parse_argv() fills these in from
 * the command line; for many options it additionally records a SETTING_* bit
 * in arg_settings_mask. */
static char *arg_directory = NULL;                      /* -D, --directory= */
static char *arg_template = NULL;                       /* --template= */
static char *arg_user = NULL;                           /* -u, --user= */
static sd_id128_t arg_uuid = {};                        /* --uuid= */
static char *arg_machine = NULL;                        /* -M, --machine= */
/* The following three are assigned optarg directly in parse_argv(); no copy is made there. */
static const char *arg_selinux_context = NULL;          /* -Z, --selinux-context= */
static const char *arg_selinux_apifs_context = NULL;    /* -L, --selinux-apifs-context= */
static const char *arg_slice = NULL;                    /* -S, --slice= */
static bool arg_private_network = false;                /* --private-network; also switched on by every --network-* option */
static bool arg_read_only = false;                      /* --read-only */
static bool arg_boot = false;                           /* -b, --boot */
static bool arg_ephemeral = false;                      /* -x, --ephemeral */
static LinkJournal arg_link_journal = LINK_AUTO;        /* --link-journal=, -j */
static bool arg_link_journal_try = false;               /* true for -j and the "try-" modes of --link-journal= */
/* Capabilities retained by default. parse_argv() ORs in the --capability=
 * entries (plus CAP_NET_ADMIN when private networking is enabled) and then
 * masks out the --drop-capability= entries. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
static CustomMount *arg_custom_mounts = NULL;           /* --bind=, --bind-ro=, --tmpfs=, --overlay=, --overlay-ro= */
static unsigned arg_n_custom_mounts = 0;                /* number of entries in arg_custom_mounts */
static char **arg_setenv = NULL;                        /* --setenv= */
static bool arg_quiet = false;                          /* -q, --quiet */
static bool arg_share_system = false;                   /* --share-system; parse_argv() then forces arg_register off */
static bool arg_register = true;                        /* --register= */
static bool arg_keep_unit = false;                      /* --keep-unit */
static char **arg_network_interfaces = NULL;            /* --network-interface= */
static char **arg_network_macvlan = NULL;               /* --network-macvlan= */
static char **arg_network_ipvlan = NULL;                /* --network-ipvlan= */
static bool arg_network_veth = false;                   /* -n, --network-veth; also set by --network-bridge= */
static char *arg_network_bridge = NULL;                 /* --network-bridge= */
static unsigned long arg_personality = PERSONALITY_INVALID; /* --personality= */
static char *arg_image = NULL;                          /* -i, --image= */
static VolatileMode arg_volatile_mode = VOLATILE_NO;    /* --volatile[=MODE]; VOLATILE_YES when no mode is given */
static ExposePort *arg_expose_ports = NULL;             /* -p, --port= */
static char **arg_property = NULL;                      /* --property= */
/* --private-users[=UIDBASE[:NUIDS]]: the shift stays UID_INVALID unless a base is given. */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns = false;                         /* --private-users */
static int arg_kill_signal = 0;                         /* --kill-signal=; verify_arguments() picks SIGRTMIN+3 when booting and this is <= 0 */
static bool arg_unified_cgroup_hierarchy = false;       /* set by detect_unified_cgroup_hierarchy() */
static SettingsMask arg_settings_mask = 0;              /* SETTING_* bits accumulated by parse_argv() */
static int arg_settings_trusted = -1;                   /* --settings=: true for "trusted", false for "no", -1 otherwise */
static char **arg_parameters = NULL;                    /* copy of the arguments following the options */
171
172 static void help(void) {
173 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
175 " -h --help Show this help\n"
176 " --version Print version string\n"
177 " -q --quiet Do not show status information\n"
178 " -D --directory=PATH Root directory for the container\n"
179 " --template=PATH Initialize root directory from template directory,\n"
180 " if missing\n"
181 " -x --ephemeral Run container with snapshot of root directory, and\n"
182 " remove it after exit\n"
183 " -i --image=PATH File system device or disk image for the container\n"
184 " -b --boot Boot up full system (i.e. invoke init)\n"
185 " -u --user=USER Run the command under specified user or uid\n"
186 " -M --machine=NAME Set the machine name for the container\n"
187 " --uuid=UUID Set a specific machine UUID for the container\n"
188 " -S --slice=SLICE Place the container in the specified slice\n"
189 " --property=NAME=VALUE Set scope unit property\n"
190 " --private-users[=UIDBASE[:NUIDS]]\n"
191 " Run within user namespace\n"
192 " --private-network Disable network in container\n"
193 " --network-interface=INTERFACE\n"
194 " Assign an existing network interface to the\n"
195 " container\n"
196 " --network-macvlan=INTERFACE\n"
197 " Create a macvlan network interface based on an\n"
198 " existing network interface to the container\n"
199 " --network-ipvlan=INTERFACE\n"
200 " Create a ipvlan network interface based on an\n"
201 " existing network interface to the container\n"
202 " -n --network-veth Add a virtual ethernet connection between host\n"
203 " and container\n"
204 " --network-bridge=INTERFACE\n"
205 " Add a virtual ethernet connection between host\n"
206 " and container and add it to an existing bridge on\n"
207 " the host\n"
208 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
209 " Expose a container IP port on the host\n"
210 " -Z --selinux-context=SECLABEL\n"
211 " Set the SELinux security context to be used by\n"
212 " processes in the container\n"
213 " -L --selinux-apifs-context=SECLABEL\n"
214 " Set the SELinux security context to be used by\n"
215 " API/tmpfs file systems in the container\n"
216 " --capability=CAP In addition to the default, retain specified\n"
217 " capability\n"
218 " --drop-capability=CAP Drop the specified capability from the default set\n"
219 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
220 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
221 " try-guest, try-host\n"
222 " -j Equivalent to --link-journal=try-guest\n"
223 " --read-only Mount the root directory read-only\n"
224 " --bind=PATH[:PATH[:OPTIONS]]\n"
225 " Bind mount a file or directory from the host into\n"
226 " the container\n"
227 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
228 " Similar, but creates a read-only bind mount\n"
229 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
230 " --overlay=PATH[:PATH...]:PATH\n"
231 " Create an overlay mount from the host to \n"
232 " the container\n"
233 " --overlay-ro=PATH[:PATH...]:PATH\n"
234 " Similar, but creates a read-only overlay mount\n"
235 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
236 " --share-system Share system namespaces with host\n"
237 " --register=BOOLEAN Register container as machine\n"
238 " --keep-unit Do not register a scope for the machine, reuse\n"
239 " the service unit nspawn is running in\n"
240 " --volatile[=MODE] Run the system in volatile mode\n"
241 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
242 , program_invocation_short_name);
243 }
244
245
246 static int custom_mounts_prepare(void) {
247 unsigned i;
248 int r;
249
250 /* Ensure the mounts are applied prefix first. */
251 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
252
253 /* Allocate working directories for the overlay file systems that need it */
254 for (i = 0; i < arg_n_custom_mounts; i++) {
255 CustomMount *m = &arg_custom_mounts[i];
256
257 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
258 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
259 return -EINVAL;
260 }
261
262 if (m->type != CUSTOM_MOUNT_OVERLAY)
263 continue;
264
265 if (m->work_dir)
266 continue;
267
268 if (m->read_only)
269 continue;
270
271 r = tempfn_random(m->source, NULL, &m->work_dir);
272 if (r < 0)
273 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
274 }
275
276 return 0;
277 }
278
279 static int detect_unified_cgroup_hierarchy(void) {
280 const char *e;
281 int r;
282
283 /* Allow the user to control whether the unified hierarchy is used */
284 e = getenv("UNIFIED_CGROUP_HIERARCHY");
285 if (e) {
286 r = parse_boolean(e);
287 if (r < 0)
288 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
289
290 arg_unified_cgroup_hierarchy = r;
291 return 0;
292 }
293
294 /* Otherwise inherit the default from the host system */
295 r = cg_unified();
296 if (r < 0)
297 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
298
299 arg_unified_cgroup_hierarchy = r;
300 return 0;
301 }
302
303 static int parse_argv(int argc, char *argv[]) {
304
305 enum {
306 ARG_VERSION = 0x100,
307 ARG_PRIVATE_NETWORK,
308 ARG_UUID,
309 ARG_READ_ONLY,
310 ARG_CAPABILITY,
311 ARG_DROP_CAPABILITY,
312 ARG_LINK_JOURNAL,
313 ARG_BIND,
314 ARG_BIND_RO,
315 ARG_TMPFS,
316 ARG_OVERLAY,
317 ARG_OVERLAY_RO,
318 ARG_SETENV,
319 ARG_SHARE_SYSTEM,
320 ARG_REGISTER,
321 ARG_KEEP_UNIT,
322 ARG_NETWORK_INTERFACE,
323 ARG_NETWORK_MACVLAN,
324 ARG_NETWORK_IPVLAN,
325 ARG_NETWORK_BRIDGE,
326 ARG_PERSONALITY,
327 ARG_VOLATILE,
328 ARG_TEMPLATE,
329 ARG_PROPERTY,
330 ARG_PRIVATE_USERS,
331 ARG_KILL_SIGNAL,
332 ARG_SETTINGS,
333 };
334
335 static const struct option options[] = {
336 { "help", no_argument, NULL, 'h' },
337 { "version", no_argument, NULL, ARG_VERSION },
338 { "directory", required_argument, NULL, 'D' },
339 { "template", required_argument, NULL, ARG_TEMPLATE },
340 { "ephemeral", no_argument, NULL, 'x' },
341 { "user", required_argument, NULL, 'u' },
342 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
343 { "boot", no_argument, NULL, 'b' },
344 { "uuid", required_argument, NULL, ARG_UUID },
345 { "read-only", no_argument, NULL, ARG_READ_ONLY },
346 { "capability", required_argument, NULL, ARG_CAPABILITY },
347 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
348 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
349 { "bind", required_argument, NULL, ARG_BIND },
350 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
351 { "tmpfs", required_argument, NULL, ARG_TMPFS },
352 { "overlay", required_argument, NULL, ARG_OVERLAY },
353 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
354 { "machine", required_argument, NULL, 'M' },
355 { "slice", required_argument, NULL, 'S' },
356 { "setenv", required_argument, NULL, ARG_SETENV },
357 { "selinux-context", required_argument, NULL, 'Z' },
358 { "selinux-apifs-context", required_argument, NULL, 'L' },
359 { "quiet", no_argument, NULL, 'q' },
360 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
361 { "register", required_argument, NULL, ARG_REGISTER },
362 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
363 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
364 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
365 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
366 { "network-veth", no_argument, NULL, 'n' },
367 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
368 { "personality", required_argument, NULL, ARG_PERSONALITY },
369 { "image", required_argument, NULL, 'i' },
370 { "volatile", optional_argument, NULL, ARG_VOLATILE },
371 { "port", required_argument, NULL, 'p' },
372 { "property", required_argument, NULL, ARG_PROPERTY },
373 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
374 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
375 { "settings", required_argument, NULL, ARG_SETTINGS },
376 {}
377 };
378
379 int c, r;
380 uint64_t plus = 0, minus = 0;
381 bool mask_all_settings = false, mask_no_settings = false;
382
383 assert(argc >= 0);
384 assert(argv);
385
386 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
387
388 switch (c) {
389
390 case 'h':
391 help();
392 return 0;
393
394 case ARG_VERSION:
395 return version();
396
397 case 'D':
398 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
399 if (r < 0)
400 return r;
401 break;
402
403 case ARG_TEMPLATE:
404 r = parse_path_argument_and_warn(optarg, false, &arg_template);
405 if (r < 0)
406 return r;
407 break;
408
409 case 'i':
410 r = parse_path_argument_and_warn(optarg, false, &arg_image);
411 if (r < 0)
412 return r;
413 break;
414
415 case 'x':
416 arg_ephemeral = true;
417 break;
418
419 case 'u':
420 r = free_and_strdup(&arg_user, optarg);
421 if (r < 0)
422 return log_oom();
423
424 arg_settings_mask |= SETTING_USER;
425 break;
426
427 case ARG_NETWORK_BRIDGE:
428 r = free_and_strdup(&arg_network_bridge, optarg);
429 if (r < 0)
430 return log_oom();
431
432 /* fall through */
433
434 case 'n':
435 arg_network_veth = true;
436 arg_private_network = true;
437 arg_settings_mask |= SETTING_NETWORK;
438 break;
439
440 case ARG_NETWORK_INTERFACE:
441 if (strv_extend(&arg_network_interfaces, optarg) < 0)
442 return log_oom();
443
444 arg_private_network = true;
445 arg_settings_mask |= SETTING_NETWORK;
446 break;
447
448 case ARG_NETWORK_MACVLAN:
449 if (strv_extend(&arg_network_macvlan, optarg) < 0)
450 return log_oom();
451
452 arg_private_network = true;
453 arg_settings_mask |= SETTING_NETWORK;
454 break;
455
456 case ARG_NETWORK_IPVLAN:
457 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
458 return log_oom();
459
460 /* fall through */
461
462 case ARG_PRIVATE_NETWORK:
463 arg_private_network = true;
464 arg_settings_mask |= SETTING_NETWORK;
465 break;
466
467 case 'b':
468 arg_boot = true;
469 arg_settings_mask |= SETTING_BOOT;
470 break;
471
472 case ARG_UUID:
473 r = sd_id128_from_string(optarg, &arg_uuid);
474 if (r < 0) {
475 log_error("Invalid UUID: %s", optarg);
476 return r;
477 }
478
479 arg_settings_mask |= SETTING_MACHINE_ID;
480 break;
481
482 case 'S':
483 arg_slice = optarg;
484 break;
485
486 case 'M':
487 if (isempty(optarg))
488 arg_machine = mfree(arg_machine);
489 else {
490 if (!machine_name_is_valid(optarg)) {
491 log_error("Invalid machine name: %s", optarg);
492 return -EINVAL;
493 }
494
495 r = free_and_strdup(&arg_machine, optarg);
496 if (r < 0)
497 return log_oom();
498
499 break;
500 }
501
502 case 'Z':
503 arg_selinux_context = optarg;
504 break;
505
506 case 'L':
507 arg_selinux_apifs_context = optarg;
508 break;
509
510 case ARG_READ_ONLY:
511 arg_read_only = true;
512 arg_settings_mask |= SETTING_READ_ONLY;
513 break;
514
515 case ARG_CAPABILITY:
516 case ARG_DROP_CAPABILITY: {
517 const char *state, *word;
518 size_t length;
519
520 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
521 _cleanup_free_ char *t;
522
523 t = strndup(word, length);
524 if (!t)
525 return log_oom();
526
527 if (streq(t, "all")) {
528 if (c == ARG_CAPABILITY)
529 plus = (uint64_t) -1;
530 else
531 minus = (uint64_t) -1;
532 } else {
533 int cap;
534
535 cap = capability_from_name(t);
536 if (cap < 0) {
537 log_error("Failed to parse capability %s.", t);
538 return -EINVAL;
539 }
540
541 if (c == ARG_CAPABILITY)
542 plus |= 1ULL << (uint64_t) cap;
543 else
544 minus |= 1ULL << (uint64_t) cap;
545 }
546 }
547
548 arg_settings_mask |= SETTING_CAPABILITY;
549 break;
550 }
551
552 case 'j':
553 arg_link_journal = LINK_GUEST;
554 arg_link_journal_try = true;
555 break;
556
557 case ARG_LINK_JOURNAL:
558 if (streq(optarg, "auto")) {
559 arg_link_journal = LINK_AUTO;
560 arg_link_journal_try = false;
561 } else if (streq(optarg, "no")) {
562 arg_link_journal = LINK_NO;
563 arg_link_journal_try = false;
564 } else if (streq(optarg, "guest")) {
565 arg_link_journal = LINK_GUEST;
566 arg_link_journal_try = false;
567 } else if (streq(optarg, "host")) {
568 arg_link_journal = LINK_HOST;
569 arg_link_journal_try = false;
570 } else if (streq(optarg, "try-guest")) {
571 arg_link_journal = LINK_GUEST;
572 arg_link_journal_try = true;
573 } else if (streq(optarg, "try-host")) {
574 arg_link_journal = LINK_HOST;
575 arg_link_journal_try = true;
576 } else {
577 log_error("Failed to parse link journal mode %s", optarg);
578 return -EINVAL;
579 }
580
581 break;
582
583 case ARG_BIND:
584 case ARG_BIND_RO:
585 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
586 if (r < 0)
587 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
588
589 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
590 break;
591
592 case ARG_TMPFS:
593 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
594 if (r < 0)
595 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
596
597 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
598 break;
599
600 case ARG_OVERLAY:
601 case ARG_OVERLAY_RO: {
602 _cleanup_free_ char *upper = NULL, *destination = NULL;
603 _cleanup_strv_free_ char **lower = NULL;
604 CustomMount *m;
605 unsigned n = 0;
606 char **i;
607
608 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
609 if (r == -ENOMEM)
610 return log_oom();
611 else if (r < 0) {
612 log_error("Invalid overlay specification: %s", optarg);
613 return r;
614 }
615
616 STRV_FOREACH(i, lower) {
617 if (!path_is_absolute(*i)) {
618 log_error("Overlay path %s is not absolute.", *i);
619 return -EINVAL;
620 }
621
622 n++;
623 }
624
625 if (n < 2) {
626 log_error("--overlay= needs at least two colon-separated directories specified.");
627 return -EINVAL;
628 }
629
630 if (n == 2) {
631 /* If two parameters are specified,
632 * the first one is the lower, the
633 * second one the upper directory. And
634 * we'll also define the destination
635 * mount point the same as the upper. */
636 upper = lower[1];
637 lower[1] = NULL;
638
639 destination = strdup(upper);
640 if (!destination)
641 return log_oom();
642
643 } else {
644 upper = lower[n - 2];
645 destination = lower[n - 1];
646 lower[n - 2] = NULL;
647 }
648
649 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
650 if (!m)
651 return log_oom();
652
653 m->destination = destination;
654 m->source = upper;
655 m->lower = lower;
656 m->read_only = c == ARG_OVERLAY_RO;
657
658 upper = destination = NULL;
659 lower = NULL;
660
661 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
662 break;
663 }
664
665 case ARG_SETENV: {
666 char **n;
667
668 if (!env_assignment_is_valid(optarg)) {
669 log_error("Environment variable assignment '%s' is not valid.", optarg);
670 return -EINVAL;
671 }
672
673 n = strv_env_set(arg_setenv, optarg);
674 if (!n)
675 return log_oom();
676
677 strv_free(arg_setenv);
678 arg_setenv = n;
679
680 arg_settings_mask |= SETTING_ENVIRONMENT;
681 break;
682 }
683
684 case 'q':
685 arg_quiet = true;
686 break;
687
688 case ARG_SHARE_SYSTEM:
689 arg_share_system = true;
690 break;
691
692 case ARG_REGISTER:
693 r = parse_boolean(optarg);
694 if (r < 0) {
695 log_error("Failed to parse --register= argument: %s", optarg);
696 return r;
697 }
698
699 arg_register = r;
700 break;
701
702 case ARG_KEEP_UNIT:
703 arg_keep_unit = true;
704 break;
705
706 case ARG_PERSONALITY:
707
708 arg_personality = personality_from_string(optarg);
709 if (arg_personality == PERSONALITY_INVALID) {
710 log_error("Unknown or unsupported personality '%s'.", optarg);
711 return -EINVAL;
712 }
713
714 arg_settings_mask |= SETTING_PERSONALITY;
715 break;
716
717 case ARG_VOLATILE:
718
719 if (!optarg)
720 arg_volatile_mode = VOLATILE_YES;
721 else {
722 VolatileMode m;
723
724 m = volatile_mode_from_string(optarg);
725 if (m < 0) {
726 log_error("Failed to parse --volatile= argument: %s", optarg);
727 return -EINVAL;
728 } else
729 arg_volatile_mode = m;
730 }
731
732 arg_settings_mask |= SETTING_VOLATILE_MODE;
733 break;
734
735 case 'p':
736 r = expose_port_parse(&arg_expose_ports, optarg);
737 if (r == -EEXIST)
738 return log_error_errno(r, "Duplicate port specification: %s", optarg);
739 if (r < 0)
740 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
741
742 arg_settings_mask |= SETTING_EXPOSE_PORTS;
743 break;
744
745 case ARG_PROPERTY:
746 if (strv_extend(&arg_property, optarg) < 0)
747 return log_oom();
748
749 break;
750
751 case ARG_PRIVATE_USERS:
752 if (optarg) {
753 _cleanup_free_ char *buffer = NULL;
754 const char *range, *shift;
755
756 range = strchr(optarg, ':');
757 if (range) {
758 buffer = strndup(optarg, range - optarg);
759 if (!buffer)
760 return log_oom();
761 shift = buffer;
762
763 range++;
764 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
765 log_error("Failed to parse UID range: %s", range);
766 return -EINVAL;
767 }
768 } else
769 shift = optarg;
770
771 if (parse_uid(shift, &arg_uid_shift) < 0) {
772 log_error("Failed to parse UID: %s", optarg);
773 return -EINVAL;
774 }
775 }
776
777 arg_userns = true;
778 break;
779
780 case ARG_KILL_SIGNAL:
781 arg_kill_signal = signal_from_string_try_harder(optarg);
782 if (arg_kill_signal < 0) {
783 log_error("Cannot parse signal: %s", optarg);
784 return -EINVAL;
785 }
786
787 arg_settings_mask |= SETTING_KILL_SIGNAL;
788 break;
789
790 case ARG_SETTINGS:
791
792 /* no → do not read files
793 * yes → read files, do not override cmdline, trust only subset
794 * override → read files, override cmdline, trust only subset
795 * trusted → read files, do not override cmdline, trust all
796 */
797
798 r = parse_boolean(optarg);
799 if (r < 0) {
800 if (streq(optarg, "trusted")) {
801 mask_all_settings = false;
802 mask_no_settings = false;
803 arg_settings_trusted = true;
804
805 } else if (streq(optarg, "override")) {
806 mask_all_settings = false;
807 mask_no_settings = true;
808 arg_settings_trusted = -1;
809 } else
810 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
811 } else if (r > 0) {
812 /* yes */
813 mask_all_settings = false;
814 mask_no_settings = false;
815 arg_settings_trusted = -1;
816 } else {
817 /* no */
818 mask_all_settings = true;
819 mask_no_settings = false;
820 arg_settings_trusted = false;
821 }
822
823 break;
824
825 case '?':
826 return -EINVAL;
827
828 default:
829 assert_not_reached("Unhandled option");
830 }
831
832 if (arg_share_system)
833 arg_register = false;
834
835 if (arg_boot && arg_share_system) {
836 log_error("--boot and --share-system may not be combined.");
837 return -EINVAL;
838 }
839
840 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
841 log_error("--keep-unit may not be used when invoked from a user session.");
842 return -EINVAL;
843 }
844
845 if (arg_directory && arg_image) {
846 log_error("--directory= and --image= may not be combined.");
847 return -EINVAL;
848 }
849
850 if (arg_template && arg_image) {
851 log_error("--template= and --image= may not be combined.");
852 return -EINVAL;
853 }
854
855 if (arg_template && !(arg_directory || arg_machine)) {
856 log_error("--template= needs --directory= or --machine=.");
857 return -EINVAL;
858 }
859
860 if (arg_ephemeral && arg_template) {
861 log_error("--ephemeral and --template= may not be combined.");
862 return -EINVAL;
863 }
864
865 if (arg_ephemeral && arg_image) {
866 log_error("--ephemeral and --image= may not be combined.");
867 return -EINVAL;
868 }
869
870 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
871 log_error("--ephemeral and --link-journal= may not be combined.");
872 return -EINVAL;
873 }
874
875 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
876 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
877
878 if (argc > optind) {
879 arg_parameters = strv_copy(argv + optind);
880 if (!arg_parameters)
881 return log_oom();
882
883 arg_settings_mask |= SETTING_BOOT;
884 }
885
886 /* Load all settings from .nspawn files */
887 if (mask_no_settings)
888 arg_settings_mask = 0;
889
890 /* Don't load any settings from .nspawn files */
891 if (mask_all_settings)
892 arg_settings_mask = _SETTINGS_MASK_ALL;
893
894 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
895
896 r = detect_unified_cgroup_hierarchy();
897 if (r < 0)
898 return r;
899
900 return 1;
901 }
902
903 static int verify_arguments(void) {
904
905 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
906 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
907 return -EINVAL;
908 }
909
910 if (arg_expose_ports && !arg_private_network) {
911 log_error("Cannot use --port= without private networking.");
912 return -EINVAL;
913 }
914
915 if (arg_boot && arg_kill_signal <= 0)
916 arg_kill_signal = SIGRTMIN+3;
917
918 return 0;
919 }
920
921 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
922 assert(p);
923
924 if (!arg_userns)
925 return 0;
926
927 if (uid == UID_INVALID && gid == GID_INVALID)
928 return 0;
929
930 if (uid != UID_INVALID) {
931 uid += arg_uid_shift;
932
933 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
934 return -EOVERFLOW;
935 }
936
937 if (gid != GID_INVALID) {
938 gid += (gid_t) arg_uid_shift;
939
940 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
941 return -EOVERFLOW;
942 }
943
944 if (lchown(p, uid, gid) < 0)
945 return -errno;
946
947 return 0;
948 }
949
/* Creates the directory path below root with the given mode and, if it was
 * newly created, hands it to userns_lchown() with uid/gid.
 *
 * Returns 0 if the directory already exists (ownership is then left alone),
 * -errno if mkdir() fails for another reason, otherwise the result of
 * userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
962
963 static int setup_timezone(const char *dest) {
964 _cleanup_free_ char *p = NULL, *q = NULL;
965 const char *where, *check, *what;
966 char *z, *y;
967 int r;
968
969 assert(dest);
970
971 /* Fix the timezone, if possible */
972 r = readlink_malloc("/etc/localtime", &p);
973 if (r < 0) {
974 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
975 return 0;
976 }
977
978 z = path_startswith(p, "../usr/share/zoneinfo/");
979 if (!z)
980 z = path_startswith(p, "/usr/share/zoneinfo/");
981 if (!z) {
982 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
983 return 0;
984 }
985
986 where = prefix_roota(dest, "/etc/localtime");
987 r = readlink_malloc(where, &q);
988 if (r >= 0) {
989 y = path_startswith(q, "../usr/share/zoneinfo/");
990 if (!y)
991 y = path_startswith(q, "/usr/share/zoneinfo/");
992
993 /* Already pointing to the right place? Then do nothing .. */
994 if (y && streq(y, z))
995 return 0;
996 }
997
998 check = strjoina("/usr/share/zoneinfo/", z);
999 check = prefix_root(dest, check);
1000 if (laccess(check, F_OK) < 0) {
1001 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1002 return 0;
1003 }
1004
1005 r = unlink(where);
1006 if (r < 0 && errno != ENOENT) {
1007 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1008 return 0;
1009 }
1010
1011 what = strjoina("../usr/share/zoneinfo/", z);
1012 if (symlink(what, where) < 0) {
1013 log_error_errno(errno, "Failed to correct timezone of container: %m");
1014 return 0;
1015 }
1016
1017 r = userns_lchown(where, 0, 0);
1018 if (r < 0)
1019 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1020
1021 return 0;
1022 }
1023
1024 static int setup_resolv_conf(const char *dest) {
1025 const char *where = NULL;
1026 int r;
1027
1028 assert(dest);
1029
1030 if (arg_private_network)
1031 return 0;
1032
1033 /* Fix resolv.conf, if possible */
1034 where = prefix_roota(dest, "/etc/resolv.conf");
1035
1036 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1037 if (r < 0) {
1038 /* If the file already exists as symlink, let's
1039 * suppress the warning, under the assumption that
1040 * resolved or something similar runs inside and the
1041 * symlink points there.
1042 *
1043 * If the disk image is read-only, there's also no
1044 * point in complaining.
1045 */
1046 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1047 "Failed to copy /etc/resolv.conf to %s: %m", where);
1048 return 0;
1049 }
1050
1051 r = userns_lchown(where, 0, 0);
1052 if (r < 0)
1053 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1054
1055 return 0;
1056 }
1057
1058 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1059 assert(s);
1060
1061 snprintf(s, 37,
1062 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1063 SD_ID128_FORMAT_VAL(id));
1064
1065 return s;
1066 }
1067
1068 static int setup_boot_id(const char *dest) {
1069 const char *from, *to;
1070 sd_id128_t rnd = {};
1071 char as_uuid[37];
1072 int r;
1073
1074 if (arg_share_system)
1075 return 0;
1076
1077 /* Generate a new randomized boot ID, so that each boot-up of
1078 * the container gets a new one */
1079
1080 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1081 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1082
1083 r = sd_id128_randomize(&rnd);
1084 if (r < 0)
1085 return log_error_errno(r, "Failed to generate random boot id: %m");
1086
1087 id128_format_as_uuid(rnd, as_uuid);
1088
1089 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1090 if (r < 0)
1091 return log_error_errno(r, "Failed to write boot id: %m");
1092
1093 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1094 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1095 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1096 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1097
1098 unlink(from);
1099 return r;
1100 }
1101
1102 static int copy_devnodes(const char *dest) {
1103
1104 static const char devnodes[] =
1105 "null\0"
1106 "zero\0"
1107 "full\0"
1108 "random\0"
1109 "urandom\0"
1110 "tty\0"
1111 "net/tun\0";
1112
1113 const char *d;
1114 int r = 0;
1115 _cleanup_umask_ mode_t u;
1116
1117 assert(dest);
1118
1119 u = umask(0000);
1120
1121 /* Create /dev/net, so that we can create /dev/net/tun in it */
1122 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1123 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1124
1125 NULSTR_FOREACH(d, devnodes) {
1126 _cleanup_free_ char *from = NULL, *to = NULL;
1127 struct stat st;
1128
1129 from = strappend("/dev/", d);
1130 to = prefix_root(dest, from);
1131
1132 if (stat(from, &st) < 0) {
1133
1134 if (errno != ENOENT)
1135 return log_error_errno(errno, "Failed to stat %s: %m", from);
1136
1137 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1138
1139 log_error("%s is not a char or block device, cannot copy.", from);
1140 return -EIO;
1141
1142 } else {
1143 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1144 if (errno != EPERM)
1145 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1146
1147 /* Some systems abusively restrict mknod but
1148 * allow bind mounts. */
1149 r = touch(to);
1150 if (r < 0)
1151 return log_error_errno(r, "touch (%s) failed: %m", to);
1152 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1153 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1154 }
1155
1156 r = userns_lchown(to, 0, 0);
1157 if (r < 0)
1158 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1159 }
1160 }
1161
1162 return r;
1163 }
1164
1165 static int setup_pts(const char *dest) {
1166 _cleanup_free_ char *options = NULL;
1167 const char *p;
1168
1169 #ifdef HAVE_SELINUX
1170 if (arg_selinux_apifs_context)
1171 (void) asprintf(&options,
1172 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1173 arg_uid_shift + TTY_GID,
1174 arg_selinux_apifs_context);
1175 else
1176 #endif
1177 (void) asprintf(&options,
1178 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1179 arg_uid_shift + TTY_GID);
1180
1181 if (!options)
1182 return log_oom();
1183
1184 /* Mount /dev/pts itself */
1185 p = prefix_roota(dest, "/dev/pts");
1186 if (mkdir(p, 0755) < 0)
1187 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1188 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1189 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1190 if (userns_lchown(p, 0, 0) < 0)
1191 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1192
1193 /* Create /dev/ptmx symlink */
1194 p = prefix_roota(dest, "/dev/ptmx");
1195 if (symlink("pts/ptmx", p) < 0)
1196 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1197 if (userns_lchown(p, 0, 0) < 0)
1198 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1199
1200 /* And fix /dev/pts/ptmx ownership */
1201 p = prefix_roota(dest, "/dev/pts/ptmx");
1202 if (userns_lchown(p, 0, 0) < 0)
1203 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1204
1205 return 0;
1206 }
1207
1208 static int setup_dev_console(const char *dest, const char *console) {
1209 _cleanup_umask_ mode_t u;
1210 const char *to;
1211 int r;
1212
1213 assert(dest);
1214 assert(console);
1215
1216 u = umask(0000);
1217
1218 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1219 if (r < 0)
1220 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1221
1222 /* We need to bind mount the right tty to /dev/console since
1223 * ptys can only exist on pts file systems. To have something
1224 * to bind mount things on we create a empty regular file. */
1225
1226 to = prefix_roota(dest, "/dev/console");
1227 r = touch(to);
1228 if (r < 0)
1229 return log_error_errno(r, "touch() for /dev/console failed: %m");
1230
1231 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1232 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1233
1234 return 0;
1235 }
1236
1237 static int setup_kmsg(const char *dest, int kmsg_socket) {
1238 const char *from, *to;
1239 _cleanup_umask_ mode_t u;
1240 int fd, r;
1241
1242 assert(kmsg_socket >= 0);
1243
1244 u = umask(0000);
1245
1246 /* We create the kmsg FIFO as /run/kmsg, but immediately
1247 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1248 * on the reading side behave very similar to /proc/kmsg,
1249 * their writing side behaves differently from /dev/kmsg in
1250 * that writing blocks when nothing is reading. In order to
1251 * avoid any problems with containers deadlocking due to this
1252 * we simply make /dev/kmsg unavailable to the container. */
1253 from = prefix_roota(dest, "/run/kmsg");
1254 to = prefix_roota(dest, "/proc/kmsg");
1255
1256 if (mkfifo(from, 0600) < 0)
1257 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1258 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1259 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1260
1261 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1262 if (fd < 0)
1263 return log_error_errno(errno, "Failed to open fifo: %m");
1264
1265 /* Store away the fd in the socket, so that it stays open as
1266 * long as we run the child */
1267 r = send_one_fd(kmsg_socket, fd, 0);
1268 safe_close(fd);
1269
1270 if (r < 0)
1271 return log_error_errno(r, "Failed to send FIFO fd: %m");
1272
1273 /* And now make the FIFO unavailable as /run/kmsg... */
1274 (void) unlink(from);
1275
1276 return 0;
1277 }
1278
1279 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1280 union in_addr_union *exposed = userdata;
1281
1282 assert(rtnl);
1283 assert(m);
1284 assert(exposed);
1285
1286 expose_port_execute(rtnl, arg_expose_ports, exposed);
1287 return 0;
1288 }
1289
1290 static int setup_hostname(void) {
1291
1292 if (arg_share_system)
1293 return 0;
1294
1295 if (sethostname_idempotent(arg_machine) < 0)
1296 return -errno;
1297
1298 return 0;
1299 }
1300
1301 static int setup_journal(const char *directory) {
1302 sd_id128_t machine_id, this_id;
1303 _cleanup_free_ char *b = NULL, *d = NULL;
1304 const char *etc_machine_id, *p, *q;
1305 char *id;
1306 int r;
1307
1308 /* Don't link journals in ephemeral mode */
1309 if (arg_ephemeral)
1310 return 0;
1311
1312 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1313
1314 r = read_one_line_file(etc_machine_id, &b);
1315 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1316 return 0;
1317 else if (r < 0)
1318 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1319
1320 id = strstrip(b);
1321 if (isempty(id) && arg_link_journal == LINK_AUTO)
1322 return 0;
1323
1324 /* Verify validity */
1325 r = sd_id128_from_string(id, &machine_id);
1326 if (r < 0)
1327 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1328
1329 r = sd_id128_get_machine(&this_id);
1330 if (r < 0)
1331 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1332
1333 if (sd_id128_equal(machine_id, this_id)) {
1334 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1335 "Host and machine ids are equal (%s): refusing to link journals", id);
1336 if (arg_link_journal == LINK_AUTO)
1337 return 0;
1338 return -EEXIST;
1339 }
1340
1341 if (arg_link_journal == LINK_NO)
1342 return 0;
1343
1344 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1345 if (r < 0)
1346 return log_error_errno(r, "Failed to create /var: %m");
1347
1348 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1349 if (r < 0)
1350 return log_error_errno(r, "Failed to create /var/log: %m");
1351
1352 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1353 if (r < 0)
1354 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1355
1356 p = strjoina("/var/log/journal/", id);
1357 q = prefix_roota(directory, p);
1358
1359 if (path_is_mount_point(p, 0) > 0) {
1360 if (arg_link_journal != LINK_AUTO) {
1361 log_error("%s: already a mount point, refusing to use for journal", p);
1362 return -EEXIST;
1363 }
1364
1365 return 0;
1366 }
1367
1368 if (path_is_mount_point(q, 0) > 0) {
1369 if (arg_link_journal != LINK_AUTO) {
1370 log_error("%s: already a mount point, refusing to use for journal", q);
1371 return -EEXIST;
1372 }
1373
1374 return 0;
1375 }
1376
1377 r = readlink_and_make_absolute(p, &d);
1378 if (r >= 0) {
1379 if ((arg_link_journal == LINK_GUEST ||
1380 arg_link_journal == LINK_AUTO) &&
1381 path_equal(d, q)) {
1382
1383 r = userns_mkdir(directory, p, 0755, 0, 0);
1384 if (r < 0)
1385 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1386 return 0;
1387 }
1388
1389 if (unlink(p) < 0)
1390 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1391 } else if (r == -EINVAL) {
1392
1393 if (arg_link_journal == LINK_GUEST &&
1394 rmdir(p) < 0) {
1395
1396 if (errno == ENOTDIR) {
1397 log_error("%s already exists and is neither a symlink nor a directory", p);
1398 return r;
1399 } else {
1400 log_error_errno(errno, "Failed to remove %s: %m", p);
1401 return -errno;
1402 }
1403 }
1404 } else if (r != -ENOENT) {
1405 log_error_errno(errno, "readlink(%s) failed: %m", p);
1406 return r;
1407 }
1408
1409 if (arg_link_journal == LINK_GUEST) {
1410
1411 if (symlink(q, p) < 0) {
1412 if (arg_link_journal_try) {
1413 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1414 return 0;
1415 } else {
1416 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1417 return -errno;
1418 }
1419 }
1420
1421 r = userns_mkdir(directory, p, 0755, 0, 0);
1422 if (r < 0)
1423 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1424 return 0;
1425 }
1426
1427 if (arg_link_journal == LINK_HOST) {
1428 /* don't create parents here -- if the host doesn't have
1429 * permanent journal set up, don't force it here */
1430 r = mkdir(p, 0755);
1431 if (r < 0) {
1432 if (arg_link_journal_try) {
1433 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1434 return 0;
1435 } else {
1436 log_error_errno(errno, "Failed to create %s: %m", p);
1437 return r;
1438 }
1439 }
1440
1441 } else if (access(p, F_OK) < 0)
1442 return 0;
1443
1444 if (dir_is_empty(q) == 0)
1445 log_warning("%s is not empty, proceeding anyway.", q);
1446
1447 r = userns_mkdir(directory, p, 0755, 0, 0);
1448 if (r < 0) {
1449 log_error_errno(errno, "Failed to create %s: %m", q);
1450 return r;
1451 }
1452
1453 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1454 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1455
1456 return 0;
1457 }
1458
1459 static int drop_capabilities(void) {
1460 return capability_bounding_set_drop(~arg_retain, false);
1461 }
1462
1463 static int reset_audit_loginuid(void) {
1464 _cleanup_free_ char *p = NULL;
1465 int r;
1466
1467 if (arg_share_system)
1468 return 0;
1469
1470 r = read_one_line_file("/proc/self/loginuid", &p);
1471 if (r == -ENOENT)
1472 return 0;
1473 if (r < 0)
1474 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1475
1476 /* Already reset? */
1477 if (streq(p, "4294967295"))
1478 return 0;
1479
1480 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1481 if (r < 0) {
1482 log_error_errno(r,
1483 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1484 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1485 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1486 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1487 "using systemd-nspawn. Sleeping for 5s... (%m)");
1488
1489 sleep(5);
1490 }
1491
1492 return 0;
1493 }
1494
/* Installs the seccomp filter for the container payload.
 *
 * The filter allows everything by default and then
 *   - makes the system calls of the table below fail with EPERM, unless the
 *     capability listed next to them is part of arg_retain,
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * The NO_NEW_PRIVS filter attribute is turned off before loading. If loading
 * fails with EINVAL this is only logged at debug level and treated as
 * success.
 *
 * Returns 0 on success (always, when built without seccomp support), a
 * negative errno-style value on failure (already logged). */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {

                /* A retained capability keeps its system calls available */
                if (arg_retain & (1ULL << blacklist[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work in containers, and much of the userspace audit
         * hookup fails when run inside of one. We don't care and simply turn
         * off the creation of audit sockets.
         *
         * socket(AF_NETLINK, *, NETLINK_AUDIT) will hence fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that audit
         * is disabled in the kernel. */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1589
1590 static int setup_propagate(const char *root) {
1591 const char *p, *q;
1592
1593 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1594 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1595 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1596 (void) mkdir_p(p, 0600);
1597
1598 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1599 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1600
1601 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1602 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1603
1604 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1605 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
1606
1607 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1608 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1609 return log_error_errno(errno, "Failed to install propagation bind mount.");
1610
1611 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1612 return log_error_errno(errno, "Failed to make propagation mount read-only");
1613
1614 return 0;
1615 }
1616
1617 static int setup_image(char **device_path, int *loop_nr) {
1618 struct loop_info64 info = {
1619 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1620 };
1621 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1622 _cleanup_free_ char* loopdev = NULL;
1623 struct stat st;
1624 int r, nr;
1625
1626 assert(device_path);
1627 assert(loop_nr);
1628 assert(arg_image);
1629
1630 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1631 if (fd < 0)
1632 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1633
1634 if (fstat(fd, &st) < 0)
1635 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1636
1637 if (S_ISBLK(st.st_mode)) {
1638 char *p;
1639
1640 p = strdup(arg_image);
1641 if (!p)
1642 return log_oom();
1643
1644 *device_path = p;
1645
1646 *loop_nr = -1;
1647
1648 r = fd;
1649 fd = -1;
1650
1651 return r;
1652 }
1653
1654 if (!S_ISREG(st.st_mode)) {
1655 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1656 return -EINVAL;
1657 }
1658
1659 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1660 if (control < 0)
1661 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1662
1663 nr = ioctl(control, LOOP_CTL_GET_FREE);
1664 if (nr < 0)
1665 return log_error_errno(errno, "Failed to allocate loop device: %m");
1666
1667 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1668 return log_oom();
1669
1670 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1671 if (loop < 0)
1672 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1673
1674 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1675 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1676
1677 if (arg_read_only)
1678 info.lo_flags |= LO_FLAGS_READ_ONLY;
1679
1680 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1681 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1682
1683 *device_path = loopdev;
1684 loopdev = NULL;
1685
1686 *loop_nr = nr;
1687
1688 r = loop;
1689 loop = -1;
1690
1691 return r;
1692 }
1693
/* Explanation appended to all error messages about unusable partition tables */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Determines which partitions of the block device "fd" are to be used as
 * root, /home and /srv of the container.
 *
 * The partition table (GPT or MBR) is probed with blkid, then the function
 * waits for the kernel to expose the matching partition devices
 * (re-triggering a partition table reread if needed) and classifies them:
 *
 *   GPT: partitions are picked by type UUID (home, srv, native root,
 *        secondary root, generic Linux). Partitions carrying the "no auto"
 *        flag are ignored. If there are multiple partitions of a type, the
 *        one with the lowest partition number wins.
 *   MBR: only partitions of type 0x83 with the bootable flag are considered,
 *        and they are treated like generic Linux partitions.
 *
 * For the root device the native root is preferred over the secondary root,
 * which is preferred over a generic partition. A generic partition is only
 * used if there is exactly one.
 *
 * On success 0 is returned, *root_device is set to a newly allocated device
 * node path, *root_device_rw tells whether it may be mounted writable and
 * *secondary whether the secondary root was picked. *home_device/*srv_device
 * and their *_rw flags are only written if such partitions were found. On
 * failure a negative errno-style value is returned (already logged). When
 * built without blkid support -EOPNOTSUPP is returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel exposes as many partition devices as blkid
         * found in the partition table. Give up after ten attempts. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The kernel's list
                 * is compared against m + 1 entries. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        /* Now classify each partition device the kernel knows about */
        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Among multiple candidates the one with the
                                 * lowest partition number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the partition as generic one, so
                                 * that a second bootable Linux partition is
                                 * detected and refused below, instead of
                                 * silently replacing the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2073
/* Mounts the block device "what" on "where", or on the sub-directory
 * "directory" of "where" if that is non-NULL.
 *
 * The file system type is detected with blkid. LUKS volumes are refused. The
 * file system is mounted with MS_NODEV, and read-only unless "rw" is true and
 * arg_read_only is off.
 *
 * Returns 0 on success, a negative errno-style value on failure (already
 * logged). When built without blkid support -EOPNOTSUPP is returned. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2 is an ambivalent result, 1 means nothing was detected:
                 * in both cases the file system type is unknown. A real probe
                 * failure (-1) is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2137
2138 static int mount_devices(
2139 const char *where,
2140 const char *root_device, bool root_device_rw,
2141 const char *home_device, bool home_device_rw,
2142 const char *srv_device, bool srv_device_rw) {
2143 int r;
2144
2145 assert(where);
2146
2147 if (root_device) {
2148 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2149 if (r < 0)
2150 return log_error_errno(r, "Failed to mount root directory: %m");
2151 }
2152
2153 if (home_device) {
2154 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2155 if (r < 0)
2156 return log_error_errno(r, "Failed to mount home directory: %m");
2157 }
2158
2159 if (srv_device) {
2160 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2161 if (r < 0)
2162 return log_error_errno(r, "Failed to mount server data directory: %m");
2163 }
2164
2165 return 0;
2166 }
2167
2168 static void loop_remove(int nr, int *image_fd) {
2169 _cleanup_close_ int control = -1;
2170 int r;
2171
2172 if (nr < 0)
2173 return;
2174
2175 if (image_fd && *image_fd >= 0) {
2176 r = ioctl(*image_fd, LOOP_CLR_FD);
2177 if (r < 0)
2178 log_debug_errno(errno, "Failed to close loop image: %m");
2179 *image_fd = safe_close(*image_fd);
2180 }
2181
2182 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2183 if (control < 0) {
2184 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2185 return;
2186 }
2187
2188 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2189 if (r < 0)
2190 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2191 }
2192
2193 /*
2194 * Return values:
2195 * < 0 : wait_for_terminate() failed to get the state of the
2196 * container, the container was terminated by a signal, or
2197 * failed for an unknown reason. No change is made to the
2198 * container argument.
2199 * > 0 : The program executed in the container terminated with an
2200 * error. The exit code of the program executed in the
2201 * container is returned. The container argument has been set
2202 * to CONTAINER_TERMINATED.
2203 * 0 : The container is being rebooted, has been shut down or exited
2204 * successfully. The container argument has been set to either
2205 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2206 *
2207 * That is, success is indicated by a return value of zero, and an
2208 * error is indicated by a non-zero value.
2209 */
2210 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2211 siginfo_t status;
2212 int r;
2213
2214 r = wait_for_terminate(pid, &status);
2215 if (r < 0)
2216 return log_warning_errno(r, "Failed to wait for container: %m");
2217
2218 switch (status.si_code) {
2219
2220 case CLD_EXITED:
2221 if (status.si_status == 0) {
2222 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2223
2224 } else
2225 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2226
2227 *container = CONTAINER_TERMINATED;
2228 return status.si_status;
2229
2230 case CLD_KILLED:
2231 if (status.si_status == SIGINT) {
2232
2233 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2234 *container = CONTAINER_TERMINATED;
2235 return 0;
2236
2237 } else if (status.si_status == SIGHUP) {
2238
2239 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2240 *container = CONTAINER_REBOOTED;
2241 return 0;
2242 }
2243
2244 /* CLD_KILLED fallthrough */
2245
2246 case CLD_DUMPED:
2247 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2248 return -EIO;
2249
2250 default:
2251 log_error("Container %s failed due to unknown reason.", arg_machine);
2252 return -EIO;
2253 }
2254
2255 return r;
2256 }
2257
2258 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2259 pid_t pid;
2260
2261 pid = PTR_TO_UINT32(userdata);
2262 if (pid > 0) {
2263 if (kill(pid, arg_kill_signal) >= 0) {
2264 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2265 sd_event_source_set_userdata(s, NULL);
2266 return 0;
2267 }
2268 }
2269
2270 sd_event_exit(sd_event_source_get_event(s), 0);
2271 return 0;
2272 }
2273
2274 static int determine_names(void) {
2275 int r;
2276
2277 if (arg_template && !arg_directory && arg_machine) {
2278
2279 /* If --template= was specified then we should not
2280 * search for a machine, but instead create a new one
2281 * in /var/lib/machine. */
2282
2283 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2284 if (!arg_directory)
2285 return log_oom();
2286 }
2287
2288 if (!arg_image && !arg_directory) {
2289 if (arg_machine) {
2290 _cleanup_(image_unrefp) Image *i = NULL;
2291
2292 r = image_find(arg_machine, &i);
2293 if (r < 0)
2294 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2295 else if (r == 0) {
2296 log_error("No image for machine '%s': %m", arg_machine);
2297 return -ENOENT;
2298 }
2299
2300 if (i->type == IMAGE_RAW)
2301 r = free_and_strdup(&arg_image, i->path);
2302 else
2303 r = free_and_strdup(&arg_directory, i->path);
2304 if (r < 0)
2305 return log_error_errno(r, "Invalid image directory: %m");
2306
2307 if (!arg_ephemeral)
2308 arg_read_only = arg_read_only || i->read_only;
2309 } else
2310 arg_directory = get_current_dir_name();
2311
2312 if (!arg_directory && !arg_machine) {
2313 log_error("Failed to determine path, please use -D or -i.");
2314 return -EINVAL;
2315 }
2316 }
2317
2318 if (!arg_machine) {
2319 if (arg_directory && path_equal(arg_directory, "/"))
2320 arg_machine = gethostname_malloc();
2321 else
2322 arg_machine = strdup(basename(arg_image ?: arg_directory));
2323
2324 if (!arg_machine)
2325 return log_oom();
2326
2327 hostname_cleanup(arg_machine);
2328 if (!machine_name_is_valid(arg_machine)) {
2329 log_error("Failed to determine machine name automatically, please use -M.");
2330 return -EINVAL;
2331 }
2332
2333 if (arg_ephemeral) {
2334 char *b;
2335
2336 /* Add a random suffix when this is an
2337 * ephemeral machine, so that we can run many
2338 * instances at once without manually having
2339 * to specify -M each time. */
2340
2341 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2342 return log_oom();
2343
2344 free(arg_machine);
2345 arg_machine = b;
2346 }
2347 }
2348
2349 return 0;
2350 }
2351
2352 static int determine_uid_shift(const char *directory) {
2353 int r;
2354
2355 if (!arg_userns) {
2356 arg_uid_shift = 0;
2357 return 0;
2358 }
2359
2360 if (arg_uid_shift == UID_INVALID) {
2361 struct stat st;
2362
2363 r = stat(directory, &st);
2364 if (r < 0)
2365 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2366
2367 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2368
2369 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2370 log_error("UID and GID base of %s don't match.", directory);
2371 return -EINVAL;
2372 }
2373
2374 arg_uid_range = UINT32_C(0x10000);
2375 }
2376
2377 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2378 log_error("UID base too high for UID range.");
2379 return -EINVAL;
2380 }
2381
2382 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2383 return 0;
2384 }
2385
2386 static int inner_child(
2387 Barrier *barrier,
2388 const char *directory,
2389 bool secondary,
2390 int kmsg_socket,
2391 int rtnl_socket,
2392 FDSet *fds) {
2393
2394 _cleanup_free_ char *home = NULL;
2395 unsigned n_env = 2;
2396 const char *envp[] = {
2397 "PATH=" DEFAULT_PATH_SPLIT_USR,
2398 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2399 NULL, /* TERM */
2400 NULL, /* HOME */
2401 NULL, /* USER */
2402 NULL, /* LOGNAME */
2403 NULL, /* container_uuid */
2404 NULL, /* LISTEN_FDS */
2405 NULL, /* LISTEN_PID */
2406 NULL
2407 };
2408
2409 _cleanup_strv_free_ char **env_use = NULL;
2410 int r;
2411
2412 assert(barrier);
2413 assert(directory);
2414 assert(kmsg_socket >= 0);
2415
2416 cg_unified_flush();
2417
2418 if (arg_userns) {
2419 /* Tell the parent, that it now can write the UID map. */
2420 (void) barrier_place(barrier); /* #1 */
2421
2422 /* Wait until the parent wrote the UID map */
2423 if (!barrier_place_and_sync(barrier)) { /* #2 */
2424 log_error("Parent died too early");
2425 return -ESRCH;
2426 }
2427 }
2428
2429 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
2430 if (r < 0)
2431 return r;
2432
2433 r = mount_sysfs(NULL);
2434 if (r < 0)
2435 return r;
2436
2437 /* Wait until we are cgroup-ified, so that we
2438 * can mount the right cgroup path writable */
2439 if (!barrier_place_and_sync(barrier)) { /* #3 */
2440 log_error("Parent died too early");
2441 return -ESRCH;
2442 }
2443
2444 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2445 if (r < 0)
2446 return r;
2447
2448 r = reset_uid_gid();
2449 if (r < 0)
2450 return log_error_errno(r, "Couldn't become new root: %m");
2451
2452 r = setup_boot_id(NULL);
2453 if (r < 0)
2454 return r;
2455
2456 r = setup_kmsg(NULL, kmsg_socket);
2457 if (r < 0)
2458 return r;
2459 kmsg_socket = safe_close(kmsg_socket);
2460
2461 umask(0022);
2462
2463 if (setsid() < 0)
2464 return log_error_errno(errno, "setsid() failed: %m");
2465
2466 if (arg_private_network)
2467 loopback_setup();
2468
2469 if (arg_expose_ports) {
2470 r = expose_port_send_rtnl(rtnl_socket);
2471 if (r < 0)
2472 return r;
2473 rtnl_socket = safe_close(rtnl_socket);
2474 }
2475
2476 if (drop_capabilities() < 0)
2477 return log_error_errno(errno, "drop_capabilities() failed: %m");
2478
2479 setup_hostname();
2480
2481 if (arg_personality != PERSONALITY_INVALID) {
2482 if (personality(arg_personality) < 0)
2483 return log_error_errno(errno, "personality() failed: %m");
2484 } else if (secondary) {
2485 if (personality(PER_LINUX32) < 0)
2486 return log_error_errno(errno, "personality() failed: %m");
2487 }
2488
2489 #ifdef HAVE_SELINUX
2490 if (arg_selinux_context)
2491 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2492 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2493 #endif
2494
2495 r = change_uid_gid(arg_user, &home);
2496 if (r < 0)
2497 return r;
2498
2499 envp[n_env] = strv_find_prefix(environ, "TERM=");
2500 if (envp[n_env])
2501 n_env ++;
2502
2503 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2504 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2505 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2506 return log_oom();
2507
2508 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2509 char as_uuid[37];
2510
2511 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2512 return log_oom();
2513 }
2514
2515 if (fdset_size(fds) > 0) {
2516 r = fdset_cloexec(fds, false);
2517 if (r < 0)
2518 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2519
2520 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2521 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2522 return log_oom();
2523 }
2524
2525 env_use = strv_env_merge(2, envp, arg_setenv);
2526 if (!env_use)
2527 return log_oom();
2528
2529 /* Let the parent know that we are ready and
2530 * wait until the parent is ready with the
2531 * setup, too... */
2532 if (!barrier_place_and_sync(barrier)) { /* #4 */
2533 log_error("Parent died too early");
2534 return -ESRCH;
2535 }
2536
2537 /* Now, explicitly close the log, so that we
2538 * then can close all remaining fds. Closing
2539 * the log explicitly first has the benefit
2540 * that the logging subsystem knows about it,
2541 * and is thus ready to be reopened should we
2542 * need it again. Note that the other fds
2543 * closed here are at least the locking and
2544 * barrier fds. */
2545 log_close();
2546 (void) fdset_close_others(fds);
2547
2548 if (arg_boot) {
2549 char **a;
2550 size_t m;
2551
2552 /* Automatically search for the init system */
2553
2554 m = 1 + strv_length(arg_parameters);
2555 a = newa(char*, m + 1);
2556 if (strv_isempty(arg_parameters))
2557 a[1] = NULL;
2558 else
2559 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2560
2561 a[0] = (char*) "/usr/lib/systemd/systemd";
2562 execve(a[0], a, env_use);
2563
2564 a[0] = (char*) "/lib/systemd/systemd";
2565 execve(a[0], a, env_use);
2566
2567 a[0] = (char*) "/sbin/init";
2568 execve(a[0], a, env_use);
2569 } else if (!strv_isempty(arg_parameters))
2570 execvpe(arg_parameters[0], arg_parameters, env_use);
2571 else {
2572 chdir(home ?: "/root");
2573 execle("/bin/bash", "-bash", NULL, env_use);
2574 execle("/bin/sh", "-sh", NULL, env_use);
2575 }
2576
2577 (void) log_open();
2578 return log_error_errno(errno, "execv() failed: %m");
2579 }
2580
2581 static int outer_child(
2582 Barrier *barrier,
2583 const char *directory,
2584 const char *console,
2585 const char *root_device, bool root_device_rw,
2586 const char *home_device, bool home_device_rw,
2587 const char *srv_device, bool srv_device_rw,
2588 bool interactive,
2589 bool secondary,
2590 int pid_socket,
2591 int kmsg_socket,
2592 int rtnl_socket,
2593 int uid_shift_socket,
2594 FDSet *fds) {
2595
2596 pid_t pid;
2597 ssize_t l;
2598 int r;
2599
2600 assert(barrier);
2601 assert(directory);
2602 assert(console);
2603 assert(pid_socket >= 0);
2604 assert(kmsg_socket >= 0);
2605
2606 cg_unified_flush();
2607
2608 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2609 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2610
2611 if (interactive) {
2612 close_nointr(STDIN_FILENO);
2613 close_nointr(STDOUT_FILENO);
2614 close_nointr(STDERR_FILENO);
2615
2616 r = open_terminal(console, O_RDWR);
2617 if (r != STDIN_FILENO) {
2618 if (r >= 0) {
2619 safe_close(r);
2620 r = -EINVAL;
2621 }
2622
2623 return log_error_errno(r, "Failed to open console: %m");
2624 }
2625
2626 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2627 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2628 return log_error_errno(errno, "Failed to duplicate console: %m");
2629 }
2630
2631 r = reset_audit_loginuid();
2632 if (r < 0)
2633 return r;
2634
2635 /* Mark everything as slave, so that we still
2636 * receive mounts from the real root, but don't
2637 * propagate mounts to the real root. */
2638 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2639 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2640
2641 r = mount_devices(directory,
2642 root_device, root_device_rw,
2643 home_device, home_device_rw,
2644 srv_device, srv_device_rw);
2645 if (r < 0)
2646 return r;
2647
2648 r = determine_uid_shift(directory);
2649 if (r < 0)
2650 return r;
2651
2652 if (arg_userns) {
2653 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2654 if (l < 0)
2655 return log_error_errno(errno, "Failed to send UID shift: %m");
2656 if (l != sizeof(arg_uid_shift)) {
2657 log_error("Short write while sending UID shift.");
2658 return -EIO;
2659 }
2660 }
2661
2662 /* Turn directory into bind mount */
2663 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2664 return log_error_errno(errno, "Failed to make bind mount: %m");
2665
2666 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2667 if (r < 0)
2668 return r;
2669
2670 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2671 if (r < 0)
2672 return r;
2673
2674 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2675 if (r < 0)
2676 return r;
2677
2678 if (arg_read_only) {
2679 r = bind_remount_recursive(directory, true);
2680 if (r < 0)
2681 return log_error_errno(r, "Failed to make tree read-only: %m");
2682 }
2683
2684 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2685 if (r < 0)
2686 return r;
2687
2688 r = copy_devnodes(directory);
2689 if (r < 0)
2690 return r;
2691
2692 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2693
2694 r = setup_pts(directory);
2695 if (r < 0)
2696 return r;
2697
2698 r = setup_propagate(directory);
2699 if (r < 0)
2700 return r;
2701
2702 r = setup_dev_console(directory, console);
2703 if (r < 0)
2704 return r;
2705
2706 r = setup_seccomp();
2707 if (r < 0)
2708 return r;
2709
2710 r = setup_timezone(directory);
2711 if (r < 0)
2712 return r;
2713
2714 r = setup_resolv_conf(directory);
2715 if (r < 0)
2716 return r;
2717
2718 r = setup_journal(directory);
2719 if (r < 0)
2720 return r;
2721
2722 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2723 if (r < 0)
2724 return r;
2725
2726 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2727 if (r < 0)
2728 return r;
2729
2730 r = mount_move_root(directory);
2731 if (r < 0)
2732 return log_error_errno(r, "Failed to move root directory: %m");
2733
2734 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2735 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2736 (arg_private_network ? CLONE_NEWNET : 0) |
2737 (arg_userns ? CLONE_NEWUSER : 0),
2738 NULL);
2739 if (pid < 0)
2740 return log_error_errno(errno, "Failed to fork inner child: %m");
2741 if (pid == 0) {
2742 pid_socket = safe_close(pid_socket);
2743 uid_shift_socket = safe_close(uid_shift_socket);
2744
2745 /* The inner child has all namespaces that are
2746 * requested, so that we all are owned by the user if
2747 * user namespaces are turned on. */
2748
2749 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2750 if (r < 0)
2751 _exit(EXIT_FAILURE);
2752
2753 _exit(EXIT_SUCCESS);
2754 }
2755
2756 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2757 if (l < 0)
2758 return log_error_errno(errno, "Failed to send PID: %m");
2759 if (l != sizeof(pid)) {
2760 log_error("Short write while sending PID.");
2761 return -EIO;
2762 }
2763
2764 pid_socket = safe_close(pid_socket);
2765 kmsg_socket = safe_close(kmsg_socket);
2766 rtnl_socket = safe_close(rtnl_socket);
2767
2768 return 0;
2769 }
2770
2771 static int setup_uid_map(pid_t pid) {
2772 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2773 int r;
2774
2775 assert(pid > 1);
2776
2777 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2778 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2779 r = write_string_file(uid_map, line, 0);
2780 if (r < 0)
2781 return log_error_errno(r, "Failed to write UID map: %m");
2782
2783 /* We always assign the same UID and GID ranges */
2784 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2785 r = write_string_file(uid_map, line, 0);
2786 if (r < 0)
2787 return log_error_errno(r, "Failed to write GID map: %m");
2788
2789 return 0;
2790 }
2791
2792 static int load_settings(void) {
2793 _cleanup_(settings_freep) Settings *settings = NULL;
2794 _cleanup_fclose_ FILE *f = NULL;
2795 _cleanup_free_ char *p = NULL;
2796 const char *fn, *i;
2797 int r;
2798
2799 /* If all settings are masked, there's no point in looking for
2800 * the settings file */
2801 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2802 return 0;
2803
2804 fn = strjoina(arg_machine, ".nspawn");
2805
2806 /* We first look in the admin's directories in /etc and /run */
2807 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2808 _cleanup_free_ char *j = NULL;
2809
2810 j = strjoin(i, "/", fn, NULL);
2811 if (!j)
2812 return log_oom();
2813
2814 f = fopen(j, "re");
2815 if (f) {
2816 p = j;
2817 j = NULL;
2818
2819 /* By default we trust configuration from /etc and /run */
2820 if (arg_settings_trusted < 0)
2821 arg_settings_trusted = true;
2822
2823 break;
2824 }
2825
2826 if (errno != ENOENT)
2827 return log_error_errno(errno, "Failed to open %s: %m", j);
2828 }
2829
2830 if (!f) {
2831 /* After that, let's look for a file next to the
2832 * actual image we shall boot. */
2833
2834 if (arg_image) {
2835 p = file_in_same_dir(arg_image, fn);
2836 if (!p)
2837 return log_oom();
2838 } else if (arg_directory) {
2839 p = file_in_same_dir(arg_directory, fn);
2840 if (!p)
2841 return log_oom();
2842 }
2843
2844 if (p) {
2845 f = fopen(p, "re");
2846 if (!f && errno != ENOENT)
2847 return log_error_errno(errno, "Failed to open %s: %m", p);
2848
2849 /* By default we do not trust configuration from /var/lib/machines */
2850 if (arg_settings_trusted < 0)
2851 arg_settings_trusted = false;
2852 }
2853 }
2854
2855 if (!f)
2856 return 0;
2857
2858 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2859
2860 r = settings_load(f, p, &settings);
2861 if (r < 0)
2862 return r;
2863
2864 /* Copy over bits from the settings, unless they have been
2865 * explicitly masked by command line switches. */
2866
2867 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2868 settings->boot >= 0) {
2869 arg_boot = settings->boot;
2870
2871 strv_free(arg_parameters);
2872 arg_parameters = settings->parameters;
2873 settings->parameters = NULL;
2874 }
2875
2876 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2877 settings->environment) {
2878 strv_free(arg_setenv);
2879 arg_setenv = settings->environment;
2880 settings->environment = NULL;
2881 }
2882
2883 if ((arg_settings_mask & SETTING_USER) == 0 &&
2884 settings->user) {
2885 free(arg_user);
2886 arg_user = settings->user;
2887 settings->user = NULL;
2888 }
2889
2890 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2891 uint64_t plus;
2892
2893 plus = settings->capability;
2894 if (settings_private_network(settings))
2895 plus |= (1ULL << CAP_NET_ADMIN);
2896
2897 if (!arg_settings_trusted && plus != 0) {
2898 if (settings->capability != 0)
2899 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2900 } else
2901 arg_retain |= plus;
2902
2903 arg_retain &= ~settings->drop_capability;
2904 }
2905
2906 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2907 settings->kill_signal > 0)
2908 arg_kill_signal = settings->kill_signal;
2909
2910 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2911 settings->personality != PERSONALITY_INVALID)
2912 arg_personality = settings->personality;
2913
2914 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2915 !sd_id128_is_null(settings->machine_id)) {
2916
2917 if (!arg_settings_trusted)
2918 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2919 else
2920 arg_uuid = settings->machine_id;
2921 }
2922
2923 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2924 settings->read_only >= 0)
2925 arg_read_only = settings->read_only;
2926
2927 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2928 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2929 arg_volatile_mode = settings->volatile_mode;
2930
2931 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2932 settings->n_custom_mounts > 0) {
2933
2934 if (!arg_settings_trusted)
2935 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2936 else {
2937 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2938 arg_custom_mounts = settings->custom_mounts;
2939 arg_n_custom_mounts = settings->n_custom_mounts;
2940
2941 settings->custom_mounts = NULL;
2942 settings->n_custom_mounts = 0;
2943 }
2944 }
2945
2946 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2947 (settings->private_network >= 0 ||
2948 settings->network_veth >= 0 ||
2949 settings->network_bridge ||
2950 settings->network_interfaces ||
2951 settings->network_macvlan ||
2952 settings->network_ipvlan)) {
2953
2954 if (!arg_settings_trusted)
2955 log_warning("Ignoring network settings, file %s is not trusted.", p);
2956 else {
2957 arg_network_veth = settings_private_network(settings);
2958 arg_private_network = settings_private_network(settings);
2959
2960 strv_free(arg_network_interfaces);
2961 arg_network_interfaces = settings->network_interfaces;
2962 settings->network_interfaces = NULL;
2963
2964 strv_free(arg_network_macvlan);
2965 arg_network_macvlan = settings->network_macvlan;
2966 settings->network_macvlan = NULL;
2967
2968 strv_free(arg_network_ipvlan);
2969 arg_network_ipvlan = settings->network_ipvlan;
2970 settings->network_ipvlan = NULL;
2971
2972 free(arg_network_bridge);
2973 arg_network_bridge = settings->network_bridge;
2974 settings->network_bridge = NULL;
2975 }
2976 }
2977
2978 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
2979 settings->expose_ports) {
2980
2981 if (!arg_settings_trusted)
2982 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
2983 else {
2984 expose_port_free_all(arg_expose_ports);
2985 arg_expose_ports = settings->expose_ports;
2986 settings->expose_ports = NULL;
2987 }
2988 }
2989
2990 return 0;
2991 }
2992
2993 int main(int argc, char *argv[]) {
2994
2995 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
2996 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2997 _cleanup_close_ int master = -1, image_fd = -1;
2998 _cleanup_fdset_free_ FDSet *fds = NULL;
2999 int r, n_fd_passed, loop_nr = -1;
3000 char veth_name[IFNAMSIZ];
3001 bool secondary = false, remove_subvol = false;
3002 sigset_t mask_chld;
3003 pid_t pid = 0;
3004 int ret = EXIT_SUCCESS;
3005 union in_addr_union exposed = {};
3006 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3007 bool interactive;
3008
3009 log_parse_environment();
3010 log_open();
3011
3012 r = parse_argv(argc, argv);
3013 if (r <= 0)
3014 goto finish;
3015
3016 if (geteuid() != 0) {
3017 log_error("Need to be root.");
3018 r = -EPERM;
3019 goto finish;
3020 }
3021 r = determine_names();
3022 if (r < 0)
3023 goto finish;
3024
3025 r = load_settings();
3026 if (r < 0)
3027 goto finish;
3028
3029 r = verify_arguments();
3030 if (r < 0)
3031 goto finish;
3032
3033 n_fd_passed = sd_listen_fds(false);
3034 if (n_fd_passed > 0) {
3035 r = fdset_new_listen_fds(&fds, false);
3036 if (r < 0) {
3037 log_error_errno(r, "Failed to collect file descriptors: %m");
3038 goto finish;
3039 }
3040 }
3041
3042 if (arg_directory) {
3043 assert(!arg_image);
3044
3045 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3046 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3047 r = -EINVAL;
3048 goto finish;
3049 }
3050
3051 if (arg_ephemeral) {
3052 _cleanup_free_ char *np = NULL;
3053
3054 /* If the specified path is a mount point we
3055 * generate the new snapshot immediately
3056 * inside it under a random name. However if
3057 * the specified is not a mount point we
3058 * create the new snapshot in the parent
3059 * directory, just next to it. */
3060 r = path_is_mount_point(arg_directory, 0);
3061 if (r < 0) {
3062 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3063 goto finish;
3064 }
3065 if (r > 0)
3066 r = tempfn_random_child(arg_directory, "machine.", &np);
3067 else
3068 r = tempfn_random(arg_directory, "machine.", &np);
3069 if (r < 0) {
3070 log_error_errno(r, "Failed to generate name for snapshot: %m");
3071 goto finish;
3072 }
3073
3074 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3075 if (r < 0) {
3076 log_error_errno(r, "Failed to lock %s: %m", np);
3077 goto finish;
3078 }
3079
3080 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3081 if (r < 0) {
3082 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3083 goto finish;
3084 }
3085
3086 free(arg_directory);
3087 arg_directory = np;
3088 np = NULL;
3089
3090 remove_subvol = true;
3091
3092 } else {
3093 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3094 if (r == -EBUSY) {
3095 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3096 goto finish;
3097 }
3098 if (r < 0) {
3099 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3100 return r;
3101 }
3102
3103 if (arg_template) {
3104 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3105 if (r == -EEXIST) {
3106 if (!arg_quiet)
3107 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3108 } else if (r < 0) {
3109 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3110 goto finish;
3111 } else {
3112 if (!arg_quiet)
3113 log_info("Populated %s from template %s.", arg_directory, arg_template);
3114 }
3115 }
3116 }
3117
3118 if (arg_boot) {
3119 if (path_is_os_tree(arg_directory) <= 0) {
3120 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3121 r = -EINVAL;
3122 goto finish;
3123 }
3124 } else {
3125 const char *p;
3126
3127 p = strjoina(arg_directory, "/usr/");
3128 if (laccess(p, F_OK) < 0) {
3129 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3130 r = -EINVAL;
3131 goto finish;
3132 }
3133 }
3134
3135 } else {
3136 char template[] = "/tmp/nspawn-root-XXXXXX";
3137
3138 assert(arg_image);
3139 assert(!arg_template);
3140
3141 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3142 if (r == -EBUSY) {
3143 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3144 goto finish;
3145 }
3146 if (r < 0) {
3147 r = log_error_errno(r, "Failed to create image lock: %m");
3148 goto finish;
3149 }
3150
3151 if (!mkdtemp(template)) {
3152 log_error_errno(errno, "Failed to create temporary directory: %m");
3153 r = -errno;
3154 goto finish;
3155 }
3156
3157 arg_directory = strdup(template);
3158 if (!arg_directory) {
3159 r = log_oom();
3160 goto finish;
3161 }
3162
3163 image_fd = setup_image(&device_path, &loop_nr);
3164 if (image_fd < 0) {
3165 r = image_fd;
3166 goto finish;
3167 }
3168
3169 r = dissect_image(image_fd,
3170 &root_device, &root_device_rw,
3171 &home_device, &home_device_rw,
3172 &srv_device, &srv_device_rw,
3173 &secondary);
3174 if (r < 0)
3175 goto finish;
3176 }
3177
3178 r = custom_mounts_prepare();
3179 if (r < 0)
3180 goto finish;
3181
3182 interactive =
3183 isatty(STDIN_FILENO) > 0 &&
3184 isatty(STDOUT_FILENO) > 0;
3185
3186 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3187 if (master < 0) {
3188 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3189 goto finish;
3190 }
3191
3192 r = ptsname_malloc(master, &console);
3193 if (r < 0) {
3194 r = log_error_errno(r, "Failed to determine tty name: %m");
3195 goto finish;
3196 }
3197
3198 if (unlockpt(master) < 0) {
3199 r = log_error_errno(errno, "Failed to unlock tty: %m");
3200 goto finish;
3201 }
3202
3203 if (!arg_quiet)
3204 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3205 arg_machine, arg_image ?: arg_directory);
3206
3207 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3208
3209 assert_se(sigemptyset(&mask_chld) == 0);
3210 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3211
3212 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3213 r = log_error_errno(errno, "Failed to become subreaper: %m");
3214 goto finish;
3215 }
3216
3217 for (;;) {
3218 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3219 uid_shift_socket_pair[2] = { -1, -1 };
3220 ContainerStatus container_status;
3221 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3222 static const struct sigaction sa = {
3223 .sa_handler = nop_signal_handler,
3224 .sa_flags = SA_NOCLDSTOP,
3225 };
3226 int ifi = 0;
3227 ssize_t l;
3228 _cleanup_event_unref_ sd_event *event = NULL;
3229 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3230 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3231 char last_char = 0;
3232
3233 r = barrier_create(&barrier);
3234 if (r < 0) {
3235 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3236 goto finish;
3237 }
3238
3239 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3240 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3241 goto finish;
3242 }
3243
3244 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3245 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3246 goto finish;
3247 }
3248
3249 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3250 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3251 goto finish;
3252 }
3253
3254 if (arg_userns)
3255 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3256 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3257 goto finish;
3258 }
3259
3260 /* Child can be killed before execv(), so handle SIGCHLD
3261 * in order to interrupt parent's blocking calls and
3262 * give it a chance to call wait() and terminate. */
3263 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3264 if (r < 0) {
3265 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3266 goto finish;
3267 }
3268
3269 r = sigaction(SIGCHLD, &sa, NULL);
3270 if (r < 0) {
3271 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3272 goto finish;
3273 }
3274
3275 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3276 if (pid < 0) {
3277 if (errno == EINVAL)
3278 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3279 else
3280 r = log_error_errno(errno, "clone() failed: %m");
3281
3282 goto finish;
3283 }
3284
3285 if (pid == 0) {
3286 /* The outer child only has a file system namespace. */
3287 barrier_set_role(&barrier, BARRIER_CHILD);
3288
3289 master = safe_close(master);
3290
3291 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3292 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3293 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3294 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3295
3296 (void) reset_all_signal_handlers();
3297 (void) reset_signal_mask();
3298
3299 r = outer_child(&barrier,
3300 arg_directory,
3301 console,
3302 root_device, root_device_rw,
3303 home_device, home_device_rw,
3304 srv_device, srv_device_rw,
3305 interactive,
3306 secondary,
3307 pid_socket_pair[1],
3308 kmsg_socket_pair[1],
3309 rtnl_socket_pair[1],
3310 uid_shift_socket_pair[1],
3311 fds);
3312 if (r < 0)
3313 _exit(EXIT_FAILURE);
3314
3315 _exit(EXIT_SUCCESS);
3316 }
3317
3318 barrier_set_role(&barrier, BARRIER_PARENT);
3319
3320 fds = fdset_free(fds);
3321
3322 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3323 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3324 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3325 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3326
3327 /* Wait for the outer child. */
3328 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3329 if (r < 0)
3330 goto finish;
3331 if (r != 0) {
3332 r = -EIO;
3333 goto finish;
3334 }
3335 pid = 0;
3336
3337 /* And now retrieve the PID of the inner child. */
3338 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3339 if (l < 0) {
3340 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3341 goto finish;
3342 }
3343 if (l != sizeof(pid)) {
3344 log_error("Short read while reading inner child PID.");
3345 r = EIO;
3346 goto finish;
3347 }
3348
3349 log_debug("Init process invoked as PID " PID_FMT, pid);
3350
3351 if (arg_userns) {
3352 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3353 log_error("Child died too early.");
3354 r = -ESRCH;
3355 goto finish;
3356 }
3357
3358 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3359 if (l < 0) {
3360 r = log_error_errno(errno, "Failed to read UID shift: %m");
3361 goto finish;
3362 }
3363 if (l != sizeof(arg_uid_shift)) {
3364 log_error("Short read while reading UID shift.");
3365 r = EIO;
3366 goto finish;
3367 }
3368
3369 r = setup_uid_map(pid);
3370 if (r < 0)
3371 goto finish;
3372
3373 (void) barrier_place(&barrier); /* #2 */
3374 }
3375
3376 if (arg_private_network) {
3377
3378 r = move_network_interfaces(pid, arg_network_interfaces);
3379 if (r < 0)
3380 goto finish;
3381
3382 if (arg_network_veth) {
3383 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3384 if (r < 0)
3385 goto finish;
3386 else if (r > 0)
3387 ifi = r;
3388
3389 if (arg_network_bridge) {
3390 r = setup_bridge(veth_name, arg_network_bridge);
3391 if (r < 0)
3392 goto finish;
3393 if (r > 0)
3394 ifi = r;
3395 }
3396 }
3397
3398 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3399 if (r < 0)
3400 goto finish;
3401
3402 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3403 if (r < 0)
3404 goto finish;
3405 }
3406
3407 if (arg_register) {
3408 r = register_machine(
3409 arg_machine,
3410 pid,
3411 arg_directory,
3412 arg_uuid,
3413 ifi,
3414 arg_slice,
3415 arg_custom_mounts, arg_n_custom_mounts,
3416 arg_kill_signal,
3417 arg_property,
3418 arg_keep_unit);
3419 if (r < 0)
3420 goto finish;
3421 }
3422
3423 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3424 if (r < 0)
3425 goto finish;
3426
3427 if (arg_keep_unit) {
3428 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3429 if (r < 0)
3430 goto finish;
3431 }
3432
3433 r = chown_cgroup(pid, arg_uid_shift);
3434 if (r < 0)
3435 goto finish;
3436
3437 /* Notify the child that the parent is ready with all
3438 * its setup (including cgroup-ification), and that
3439 * the child can now hand over control to the code to
3440 * run inside the container. */
3441 (void) barrier_place(&barrier); /* #3 */
3442
3443 /* Block SIGCHLD here, before notifying child.
3444 * process_pty() will handle it with the other signals. */
3445 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3446
3447 /* Reset signal to default */
3448 r = default_signals(SIGCHLD, -1);
3449 if (r < 0) {
3450 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3451 goto finish;
3452 }
3453
3454 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3455 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3456 log_error("Child died too early.");
3457 r = -ESRCH;
3458 goto finish;
3459 }
3460
3461 sd_notifyf(false,
3462 "READY=1\n"
3463 "STATUS=Container running.\n"
3464 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3465
3466 r = sd_event_new(&event);
3467 if (r < 0) {
3468 log_error_errno(r, "Failed to get default event source: %m");
3469 goto finish;
3470 }
3471
3472 if (arg_kill_signal > 0) {
3473 /* Try to kill the init system on SIGINT or SIGTERM */
3474 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3475 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3476 } else {
3477 /* Immediately exit */
3478 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3479 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3480 }
3481
3482 /* simply exit on sigchld */
3483 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3484
3485 if (arg_expose_ports) {
3486 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3487 if (r < 0)
3488 goto finish;
3489
3490 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3491 }
3492
3493 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3494
3495 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3496 if (r < 0) {
3497 log_error_errno(r, "Failed to create PTY forwarder: %m");
3498 goto finish;
3499 }
3500
3501 r = sd_event_loop(event);
3502 if (r < 0) {
3503 log_error_errno(r, "Failed to run event loop: %m");
3504 goto finish;
3505 }
3506
3507 pty_forward_get_last_char(forward, &last_char);
3508
3509 forward = pty_forward_free(forward);
3510
3511 if (!arg_quiet && last_char != '\n')
3512 putc('\n', stdout);
3513
3514 /* Kill if it is not dead yet anyway */
3515 if (arg_register && !arg_keep_unit)
3516 terminate_machine(pid);
3517
3518 /* Normally redundant, but better safe than sorry */
3519 kill(pid, SIGKILL);
3520
3521 r = wait_for_container(pid, &container_status);
3522 pid = 0;
3523
3524 if (r < 0)
3525 /* We failed to wait for the container, or the
3526 * container exited abnormally */
3527 goto finish;
3528 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3529 /* The container exited with a non-zero
3530 * status, or with zero status and no reboot
3531 * was requested. */
3532 ret = r;
3533 break;
3534 }
3535
3536 /* CONTAINER_REBOOTED, loop again */
3537
3538 if (arg_keep_unit) {
3539 /* Special handling if we are running as a
3540 * service: instead of simply restarting the
3541 * machine we want to restart the entire
3542 * service, so let's inform systemd about this
3543 * with the special exit code 133. The service
3544 * file uses RestartForceExitStatus=133 so
3545 * that this results in a full nspawn
3546 * restart. This is necessary since we might
3547 * have cgroup parameters set we want to have
3548 * flushed out. */
3549 ret = 133;
3550 r = 0;
3551 break;
3552 }
3553
3554 expose_port_flush(arg_expose_ports, &exposed);
3555 }
3556
3557 finish:
3558 sd_notify(false,
3559 "STOPPING=1\n"
3560 "STATUS=Terminating...");
3561
3562 if (pid > 0)
3563 kill(pid, SIGKILL);
3564
3565 /* Try to flush whatever is still queued in the pty */
3566 if (master >= 0)
3567 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3568
3569 loop_remove(loop_nr, &image_fd);
3570
3571 if (remove_subvol && arg_directory) {
3572 int k;
3573
3574 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
3575 if (k < 0)
3576 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3577 }
3578
3579 if (arg_machine) {
3580 const char *p;
3581
3582 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3583 (void) rm_rf(p, REMOVE_ROOT);
3584 }
3585
3586 expose_port_flush(arg_expose_ports, &exposed);
3587
3588 free(arg_directory);
3589 free(arg_template);
3590 free(arg_image);
3591 free(arg_machine);
3592 free(arg_user);
3593 strv_free(arg_setenv);
3594 free(arg_network_bridge);
3595 strv_free(arg_network_interfaces);
3596 strv_free(arg_network_macvlan);
3597 strv_free(arg_network_ipvlan);
3598 strv_free(arg_parameters);
3599 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3600 expose_port_free_all(arg_expose_ports);
3601
3602 return r < 0 ? EXIT_FAILURE : ret;
3603 }