]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
f01a376af8333664de37ca4343ade8f53195666e
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #ifdef HAVE_BLKID
23 #include <blkid/blkid.h>
24 #endif
25 #include <errno.h>
26 #include <getopt.h>
27 #include <linux/loop.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "barrier.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
53 #include "cap-list.h"
54 #include "capability.h"
55 #include "cgroup-util.h"
56 #include "copy.h"
57 #include "dev-setup.h"
58 #include "env-util.h"
59 #include "event-util.h"
60 #include "fd-util.h"
61 #include "fdset.h"
62 #include "fileio.h"
63 #include "formats-util.h"
64 #include "gpt.h"
65 #include "hostname-util.h"
66 #include "log.h"
67 #include "loopback-setup.h"
68 #include "machine-image.h"
69 #include "macro.h"
70 #include "missing.h"
71 #include "mkdir.h"
72 #include "netlink-util.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-expose-ports.h"
75 #include "nspawn-mount.h"
76 #include "nspawn-network.h"
77 #include "nspawn-register.h"
78 #include "nspawn-settings.h"
79 #include "nspawn-setuid.h"
80 #include "path-util.h"
81 #include "process-util.h"
82 #include "ptyfwd.h"
83 #include "random-util.h"
84 #include "rm-rf.h"
85 #ifdef HAVE_SECCOMP
86 #include "seccomp-util.h"
87 #endif
88 #include "signal-util.h"
89 #include "string-util.h"
90 #include "strv.h"
91 #include "terminal-util.h"
92 #include "udev-util.h"
93 #include "user-util.h"
94 #include "util.h"
95
/* How a container run ended.
 * NOTE(review): the code consuming these values lies outside this part of the file — confirm semantics there. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1,
} ContainerStatus;
100
/* Journal linking mode, as selected by --link-journal= (and -j) in parse_argv(). */
typedef enum LinkJournal {
        LINK_NO = 0,    /* "no" */
        LINK_AUTO = 1,  /* "auto"; initial value of arg_link_journal */
        LINK_HOST = 2,  /* "host" and "try-host" */
        LINK_GUEST = 3, /* "guest" and "try-guest" (also -j) */
} LinkJournal;
107
108 static char *arg_directory = NULL;
109 static char *arg_template = NULL;
110 static char *arg_user = NULL;
111 static sd_id128_t arg_uuid = {};
112 static char *arg_machine = NULL;
113 static const char *arg_selinux_context = NULL;
114 static const char *arg_selinux_apifs_context = NULL;
115 static const char *arg_slice = NULL;
116 static bool arg_private_network = false;
117 static bool arg_read_only = false;
118 static bool arg_boot = false;
119 static bool arg_ephemeral = false;
120 static LinkJournal arg_link_journal = LINK_AUTO;
121 static bool arg_link_journal_try = false;
122 static uint64_t arg_retain =
123 (1ULL << CAP_CHOWN) |
124 (1ULL << CAP_DAC_OVERRIDE) |
125 (1ULL << CAP_DAC_READ_SEARCH) |
126 (1ULL << CAP_FOWNER) |
127 (1ULL << CAP_FSETID) |
128 (1ULL << CAP_IPC_OWNER) |
129 (1ULL << CAP_KILL) |
130 (1ULL << CAP_LEASE) |
131 (1ULL << CAP_LINUX_IMMUTABLE) |
132 (1ULL << CAP_NET_BIND_SERVICE) |
133 (1ULL << CAP_NET_BROADCAST) |
134 (1ULL << CAP_NET_RAW) |
135 (1ULL << CAP_SETGID) |
136 (1ULL << CAP_SETFCAP) |
137 (1ULL << CAP_SETPCAP) |
138 (1ULL << CAP_SETUID) |
139 (1ULL << CAP_SYS_ADMIN) |
140 (1ULL << CAP_SYS_CHROOT) |
141 (1ULL << CAP_SYS_NICE) |
142 (1ULL << CAP_SYS_PTRACE) |
143 (1ULL << CAP_SYS_TTY_CONFIG) |
144 (1ULL << CAP_SYS_RESOURCE) |
145 (1ULL << CAP_SYS_BOOT) |
146 (1ULL << CAP_AUDIT_WRITE) |
147 (1ULL << CAP_AUDIT_CONTROL) |
148 (1ULL << CAP_MKNOD);
149 static CustomMount *arg_custom_mounts = NULL;
150 static unsigned arg_n_custom_mounts = 0;
151 static char **arg_setenv = NULL;
152 static bool arg_quiet = false;
153 static bool arg_share_system = false;
154 static bool arg_register = true;
155 static bool arg_keep_unit = false;
156 static char **arg_network_interfaces = NULL;
157 static char **arg_network_macvlan = NULL;
158 static char **arg_network_ipvlan = NULL;
159 static bool arg_network_veth = false;
160 static char *arg_network_bridge = NULL;
161 static unsigned long arg_personality = PERSONALITY_INVALID;
162 static char *arg_image = NULL;
163 static VolatileMode arg_volatile_mode = VOLATILE_NO;
164 static ExposePort *arg_expose_ports = NULL;
165 static char **arg_property = NULL;
166 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
167 static bool arg_userns = false;
168 static int arg_kill_signal = 0;
169 static bool arg_unified_cgroup_hierarchy = false;
170 static SettingsMask arg_settings_mask = 0;
171 static int arg_settings_trusted = -1;
172 static char **arg_parameters = NULL;
173
174 static void help(void) {
175 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
176 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
177 " -h --help Show this help\n"
178 " --version Print version string\n"
179 " -q --quiet Do not show status information\n"
180 " -D --directory=PATH Root directory for the container\n"
181 " --template=PATH Initialize root directory from template directory,\n"
182 " if missing\n"
183 " -x --ephemeral Run container with snapshot of root directory, and\n"
184 " remove it after exit\n"
185 " -i --image=PATH File system device or disk image for the container\n"
186 " -b --boot Boot up full system (i.e. invoke init)\n"
187 " -u --user=USER Run the command under specified user or uid\n"
188 " -M --machine=NAME Set the machine name for the container\n"
189 " --uuid=UUID Set a specific machine UUID for the container\n"
190 " -S --slice=SLICE Place the container in the specified slice\n"
191 " --property=NAME=VALUE Set scope unit property\n"
192 " --private-users[=UIDBASE[:NUIDS]]\n"
193 " Run within user namespace\n"
194 " --private-network Disable network in container\n"
195 " --network-interface=INTERFACE\n"
196 " Assign an existing network interface to the\n"
197 " container\n"
198 " --network-macvlan=INTERFACE\n"
199 " Create a macvlan network interface based on an\n"
200 " existing network interface to the container\n"
201 " --network-ipvlan=INTERFACE\n"
202 " Create a ipvlan network interface based on an\n"
203 " existing network interface to the container\n"
204 " -n --network-veth Add a virtual ethernet connection between host\n"
205 " and container\n"
206 " --network-bridge=INTERFACE\n"
207 " Add a virtual ethernet connection between host\n"
208 " and container and add it to an existing bridge on\n"
209 " the host\n"
210 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
211 " Expose a container IP port on the host\n"
212 " -Z --selinux-context=SECLABEL\n"
213 " Set the SELinux security context to be used by\n"
214 " processes in the container\n"
215 " -L --selinux-apifs-context=SECLABEL\n"
216 " Set the SELinux security context to be used by\n"
217 " API/tmpfs file systems in the container\n"
218 " --capability=CAP In addition to the default, retain specified\n"
219 " capability\n"
220 " --drop-capability=CAP Drop the specified capability from the default set\n"
221 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
222 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
223 " try-guest, try-host\n"
224 " -j Equivalent to --link-journal=try-guest\n"
225 " --read-only Mount the root directory read-only\n"
226 " --bind=PATH[:PATH[:OPTIONS]]\n"
227 " Bind mount a file or directory from the host into\n"
228 " the container\n"
229 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
230 " Similar, but creates a read-only bind mount\n"
231 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
232 " --overlay=PATH[:PATH...]:PATH\n"
233 " Create an overlay mount from the host to \n"
234 " the container\n"
235 " --overlay-ro=PATH[:PATH...]:PATH\n"
236 " Similar, but creates a read-only overlay mount\n"
237 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
238 " --share-system Share system namespaces with host\n"
239 " --register=BOOLEAN Register container as machine\n"
240 " --keep-unit Do not register a scope for the machine, reuse\n"
241 " the service unit nspawn is running in\n"
242 " --volatile[=MODE] Run the system in volatile mode\n"
243 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
244 , program_invocation_short_name);
245 }
246
247
248 static int custom_mounts_prepare(void) {
249 unsigned i;
250 int r;
251
252 /* Ensure the mounts are applied prefix first. */
253 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
254
255 /* Allocate working directories for the overlay file systems that need it */
256 for (i = 0; i < arg_n_custom_mounts; i++) {
257 CustomMount *m = &arg_custom_mounts[i];
258
259 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
260 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
261 return -EINVAL;
262 }
263
264 if (m->type != CUSTOM_MOUNT_OVERLAY)
265 continue;
266
267 if (m->work_dir)
268 continue;
269
270 if (m->read_only)
271 continue;
272
273 r = tempfn_random(m->source, NULL, &m->work_dir);
274 if (r < 0)
275 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
276 }
277
278 return 0;
279 }
280
281 static int detect_unified_cgroup_hierarchy(void) {
282 const char *e;
283 int r;
284
285 /* Allow the user to control whether the unified hierarchy is used */
286 e = getenv("UNIFIED_CGROUP_HIERARCHY");
287 if (e) {
288 r = parse_boolean(e);
289 if (r < 0)
290 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
291
292 arg_unified_cgroup_hierarchy = r;
293 return 0;
294 }
295
296 /* Otherwise inherit the default from the host system */
297 r = cg_unified();
298 if (r < 0)
299 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
300
301 arg_unified_cgroup_hierarchy = r;
302 return 0;
303 }
304
305 static int parse_argv(int argc, char *argv[]) {
306
307 enum {
308 ARG_VERSION = 0x100,
309 ARG_PRIVATE_NETWORK,
310 ARG_UUID,
311 ARG_READ_ONLY,
312 ARG_CAPABILITY,
313 ARG_DROP_CAPABILITY,
314 ARG_LINK_JOURNAL,
315 ARG_BIND,
316 ARG_BIND_RO,
317 ARG_TMPFS,
318 ARG_OVERLAY,
319 ARG_OVERLAY_RO,
320 ARG_SETENV,
321 ARG_SHARE_SYSTEM,
322 ARG_REGISTER,
323 ARG_KEEP_UNIT,
324 ARG_NETWORK_INTERFACE,
325 ARG_NETWORK_MACVLAN,
326 ARG_NETWORK_IPVLAN,
327 ARG_NETWORK_BRIDGE,
328 ARG_PERSONALITY,
329 ARG_VOLATILE,
330 ARG_TEMPLATE,
331 ARG_PROPERTY,
332 ARG_PRIVATE_USERS,
333 ARG_KILL_SIGNAL,
334 ARG_SETTINGS,
335 };
336
337 static const struct option options[] = {
338 { "help", no_argument, NULL, 'h' },
339 { "version", no_argument, NULL, ARG_VERSION },
340 { "directory", required_argument, NULL, 'D' },
341 { "template", required_argument, NULL, ARG_TEMPLATE },
342 { "ephemeral", no_argument, NULL, 'x' },
343 { "user", required_argument, NULL, 'u' },
344 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
345 { "boot", no_argument, NULL, 'b' },
346 { "uuid", required_argument, NULL, ARG_UUID },
347 { "read-only", no_argument, NULL, ARG_READ_ONLY },
348 { "capability", required_argument, NULL, ARG_CAPABILITY },
349 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
350 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
351 { "bind", required_argument, NULL, ARG_BIND },
352 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
353 { "tmpfs", required_argument, NULL, ARG_TMPFS },
354 { "overlay", required_argument, NULL, ARG_OVERLAY },
355 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
356 { "machine", required_argument, NULL, 'M' },
357 { "slice", required_argument, NULL, 'S' },
358 { "setenv", required_argument, NULL, ARG_SETENV },
359 { "selinux-context", required_argument, NULL, 'Z' },
360 { "selinux-apifs-context", required_argument, NULL, 'L' },
361 { "quiet", no_argument, NULL, 'q' },
362 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
363 { "register", required_argument, NULL, ARG_REGISTER },
364 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
365 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
366 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
367 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
368 { "network-veth", no_argument, NULL, 'n' },
369 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
370 { "personality", required_argument, NULL, ARG_PERSONALITY },
371 { "image", required_argument, NULL, 'i' },
372 { "volatile", optional_argument, NULL, ARG_VOLATILE },
373 { "port", required_argument, NULL, 'p' },
374 { "property", required_argument, NULL, ARG_PROPERTY },
375 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
376 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
377 { "settings", required_argument, NULL, ARG_SETTINGS },
378 {}
379 };
380
381 int c, r;
382 uint64_t plus = 0, minus = 0;
383 bool mask_all_settings = false, mask_no_settings = false;
384
385 assert(argc >= 0);
386 assert(argv);
387
388 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
389
390 switch (c) {
391
392 case 'h':
393 help();
394 return 0;
395
396 case ARG_VERSION:
397 return version();
398
399 case 'D':
400 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
401 if (r < 0)
402 return r;
403 break;
404
405 case ARG_TEMPLATE:
406 r = parse_path_argument_and_warn(optarg, false, &arg_template);
407 if (r < 0)
408 return r;
409 break;
410
411 case 'i':
412 r = parse_path_argument_and_warn(optarg, false, &arg_image);
413 if (r < 0)
414 return r;
415 break;
416
417 case 'x':
418 arg_ephemeral = true;
419 break;
420
421 case 'u':
422 r = free_and_strdup(&arg_user, optarg);
423 if (r < 0)
424 return log_oom();
425
426 arg_settings_mask |= SETTING_USER;
427 break;
428
429 case ARG_NETWORK_BRIDGE:
430 r = free_and_strdup(&arg_network_bridge, optarg);
431 if (r < 0)
432 return log_oom();
433
434 /* fall through */
435
436 case 'n':
437 arg_network_veth = true;
438 arg_private_network = true;
439 arg_settings_mask |= SETTING_NETWORK;
440 break;
441
442 case ARG_NETWORK_INTERFACE:
443 if (strv_extend(&arg_network_interfaces, optarg) < 0)
444 return log_oom();
445
446 arg_private_network = true;
447 arg_settings_mask |= SETTING_NETWORK;
448 break;
449
450 case ARG_NETWORK_MACVLAN:
451 if (strv_extend(&arg_network_macvlan, optarg) < 0)
452 return log_oom();
453
454 arg_private_network = true;
455 arg_settings_mask |= SETTING_NETWORK;
456 break;
457
458 case ARG_NETWORK_IPVLAN:
459 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
460 return log_oom();
461
462 /* fall through */
463
464 case ARG_PRIVATE_NETWORK:
465 arg_private_network = true;
466 arg_settings_mask |= SETTING_NETWORK;
467 break;
468
469 case 'b':
470 arg_boot = true;
471 arg_settings_mask |= SETTING_BOOT;
472 break;
473
474 case ARG_UUID:
475 r = sd_id128_from_string(optarg, &arg_uuid);
476 if (r < 0) {
477 log_error("Invalid UUID: %s", optarg);
478 return r;
479 }
480
481 arg_settings_mask |= SETTING_MACHINE_ID;
482 break;
483
484 case 'S':
485 arg_slice = optarg;
486 break;
487
488 case 'M':
489 if (isempty(optarg))
490 arg_machine = mfree(arg_machine);
491 else {
492 if (!machine_name_is_valid(optarg)) {
493 log_error("Invalid machine name: %s", optarg);
494 return -EINVAL;
495 }
496
497 r = free_and_strdup(&arg_machine, optarg);
498 if (r < 0)
499 return log_oom();
500
501 break;
502 }
503
504 case 'Z':
505 arg_selinux_context = optarg;
506 break;
507
508 case 'L':
509 arg_selinux_apifs_context = optarg;
510 break;
511
512 case ARG_READ_ONLY:
513 arg_read_only = true;
514 arg_settings_mask |= SETTING_READ_ONLY;
515 break;
516
517 case ARG_CAPABILITY:
518 case ARG_DROP_CAPABILITY: {
519 const char *state, *word;
520 size_t length;
521
522 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
523 _cleanup_free_ char *t;
524
525 t = strndup(word, length);
526 if (!t)
527 return log_oom();
528
529 if (streq(t, "all")) {
530 if (c == ARG_CAPABILITY)
531 plus = (uint64_t) -1;
532 else
533 minus = (uint64_t) -1;
534 } else {
535 int cap;
536
537 cap = capability_from_name(t);
538 if (cap < 0) {
539 log_error("Failed to parse capability %s.", t);
540 return -EINVAL;
541 }
542
543 if (c == ARG_CAPABILITY)
544 plus |= 1ULL << (uint64_t) cap;
545 else
546 minus |= 1ULL << (uint64_t) cap;
547 }
548 }
549
550 arg_settings_mask |= SETTING_CAPABILITY;
551 break;
552 }
553
554 case 'j':
555 arg_link_journal = LINK_GUEST;
556 arg_link_journal_try = true;
557 break;
558
559 case ARG_LINK_JOURNAL:
560 if (streq(optarg, "auto")) {
561 arg_link_journal = LINK_AUTO;
562 arg_link_journal_try = false;
563 } else if (streq(optarg, "no")) {
564 arg_link_journal = LINK_NO;
565 arg_link_journal_try = false;
566 } else if (streq(optarg, "guest")) {
567 arg_link_journal = LINK_GUEST;
568 arg_link_journal_try = false;
569 } else if (streq(optarg, "host")) {
570 arg_link_journal = LINK_HOST;
571 arg_link_journal_try = false;
572 } else if (streq(optarg, "try-guest")) {
573 arg_link_journal = LINK_GUEST;
574 arg_link_journal_try = true;
575 } else if (streq(optarg, "try-host")) {
576 arg_link_journal = LINK_HOST;
577 arg_link_journal_try = true;
578 } else {
579 log_error("Failed to parse link journal mode %s", optarg);
580 return -EINVAL;
581 }
582
583 break;
584
585 case ARG_BIND:
586 case ARG_BIND_RO:
587 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
588 if (r < 0)
589 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
590
591 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
592 break;
593
594 case ARG_TMPFS:
595 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
596 if (r < 0)
597 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
598
599 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
600 break;
601
602 case ARG_OVERLAY:
603 case ARG_OVERLAY_RO: {
604 _cleanup_free_ char *upper = NULL, *destination = NULL;
605 _cleanup_strv_free_ char **lower = NULL;
606 CustomMount *m;
607 unsigned n = 0;
608 char **i;
609
610 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
611 if (r == -ENOMEM)
612 return log_oom();
613 else if (r < 0) {
614 log_error("Invalid overlay specification: %s", optarg);
615 return r;
616 }
617
618 STRV_FOREACH(i, lower) {
619 if (!path_is_absolute(*i)) {
620 log_error("Overlay path %s is not absolute.", *i);
621 return -EINVAL;
622 }
623
624 n++;
625 }
626
627 if (n < 2) {
628 log_error("--overlay= needs at least two colon-separated directories specified.");
629 return -EINVAL;
630 }
631
632 if (n == 2) {
633 /* If two parameters are specified,
634 * the first one is the lower, the
635 * second one the upper directory. And
636 * we'll also define the destination
637 * mount point the same as the upper. */
638 upper = lower[1];
639 lower[1] = NULL;
640
641 destination = strdup(upper);
642 if (!destination)
643 return log_oom();
644
645 } else {
646 upper = lower[n - 2];
647 destination = lower[n - 1];
648 lower[n - 2] = NULL;
649 }
650
651 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
652 if (!m)
653 return log_oom();
654
655 m->destination = destination;
656 m->source = upper;
657 m->lower = lower;
658 m->read_only = c == ARG_OVERLAY_RO;
659
660 upper = destination = NULL;
661 lower = NULL;
662
663 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
664 break;
665 }
666
667 case ARG_SETENV: {
668 char **n;
669
670 if (!env_assignment_is_valid(optarg)) {
671 log_error("Environment variable assignment '%s' is not valid.", optarg);
672 return -EINVAL;
673 }
674
675 n = strv_env_set(arg_setenv, optarg);
676 if (!n)
677 return log_oom();
678
679 strv_free(arg_setenv);
680 arg_setenv = n;
681
682 arg_settings_mask |= SETTING_ENVIRONMENT;
683 break;
684 }
685
686 case 'q':
687 arg_quiet = true;
688 break;
689
690 case ARG_SHARE_SYSTEM:
691 arg_share_system = true;
692 break;
693
694 case ARG_REGISTER:
695 r = parse_boolean(optarg);
696 if (r < 0) {
697 log_error("Failed to parse --register= argument: %s", optarg);
698 return r;
699 }
700
701 arg_register = r;
702 break;
703
704 case ARG_KEEP_UNIT:
705 arg_keep_unit = true;
706 break;
707
708 case ARG_PERSONALITY:
709
710 arg_personality = personality_from_string(optarg);
711 if (arg_personality == PERSONALITY_INVALID) {
712 log_error("Unknown or unsupported personality '%s'.", optarg);
713 return -EINVAL;
714 }
715
716 arg_settings_mask |= SETTING_PERSONALITY;
717 break;
718
719 case ARG_VOLATILE:
720
721 if (!optarg)
722 arg_volatile_mode = VOLATILE_YES;
723 else {
724 VolatileMode m;
725
726 m = volatile_mode_from_string(optarg);
727 if (m < 0) {
728 log_error("Failed to parse --volatile= argument: %s", optarg);
729 return -EINVAL;
730 } else
731 arg_volatile_mode = m;
732 }
733
734 arg_settings_mask |= SETTING_VOLATILE_MODE;
735 break;
736
737 case 'p':
738 r = expose_port_parse(&arg_expose_ports, optarg);
739 if (r == -EEXIST)
740 return log_error_errno(r, "Duplicate port specification: %s", optarg);
741 if (r < 0)
742 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
743
744 arg_settings_mask |= SETTING_EXPOSE_PORTS;
745 break;
746
747 case ARG_PROPERTY:
748 if (strv_extend(&arg_property, optarg) < 0)
749 return log_oom();
750
751 break;
752
753 case ARG_PRIVATE_USERS:
754 if (optarg) {
755 _cleanup_free_ char *buffer = NULL;
756 const char *range, *shift;
757
758 range = strchr(optarg, ':');
759 if (range) {
760 buffer = strndup(optarg, range - optarg);
761 if (!buffer)
762 return log_oom();
763 shift = buffer;
764
765 range++;
766 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
767 log_error("Failed to parse UID range: %s", range);
768 return -EINVAL;
769 }
770 } else
771 shift = optarg;
772
773 if (parse_uid(shift, &arg_uid_shift) < 0) {
774 log_error("Failed to parse UID: %s", optarg);
775 return -EINVAL;
776 }
777 }
778
779 arg_userns = true;
780 break;
781
782 case ARG_KILL_SIGNAL:
783 arg_kill_signal = signal_from_string_try_harder(optarg);
784 if (arg_kill_signal < 0) {
785 log_error("Cannot parse signal: %s", optarg);
786 return -EINVAL;
787 }
788
789 arg_settings_mask |= SETTING_KILL_SIGNAL;
790 break;
791
792 case ARG_SETTINGS:
793
794 /* no → do not read files
795 * yes → read files, do not override cmdline, trust only subset
796 * override → read files, override cmdline, trust only subset
797 * trusted → read files, do not override cmdline, trust all
798 */
799
800 r = parse_boolean(optarg);
801 if (r < 0) {
802 if (streq(optarg, "trusted")) {
803 mask_all_settings = false;
804 mask_no_settings = false;
805 arg_settings_trusted = true;
806
807 } else if (streq(optarg, "override")) {
808 mask_all_settings = false;
809 mask_no_settings = true;
810 arg_settings_trusted = -1;
811 } else
812 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
813 } else if (r > 0) {
814 /* yes */
815 mask_all_settings = false;
816 mask_no_settings = false;
817 arg_settings_trusted = -1;
818 } else {
819 /* no */
820 mask_all_settings = true;
821 mask_no_settings = false;
822 arg_settings_trusted = false;
823 }
824
825 break;
826
827 case '?':
828 return -EINVAL;
829
830 default:
831 assert_not_reached("Unhandled option");
832 }
833
834 if (arg_share_system)
835 arg_register = false;
836
837 if (arg_boot && arg_share_system) {
838 log_error("--boot and --share-system may not be combined.");
839 return -EINVAL;
840 }
841
842 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
843 log_error("--keep-unit may not be used when invoked from a user session.");
844 return -EINVAL;
845 }
846
847 if (arg_directory && arg_image) {
848 log_error("--directory= and --image= may not be combined.");
849 return -EINVAL;
850 }
851
852 if (arg_template && arg_image) {
853 log_error("--template= and --image= may not be combined.");
854 return -EINVAL;
855 }
856
857 if (arg_template && !(arg_directory || arg_machine)) {
858 log_error("--template= needs --directory= or --machine=.");
859 return -EINVAL;
860 }
861
862 if (arg_ephemeral && arg_template) {
863 log_error("--ephemeral and --template= may not be combined.");
864 return -EINVAL;
865 }
866
867 if (arg_ephemeral && arg_image) {
868 log_error("--ephemeral and --image= may not be combined.");
869 return -EINVAL;
870 }
871
872 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
873 log_error("--ephemeral and --link-journal= may not be combined.");
874 return -EINVAL;
875 }
876
877 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
878 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
879
880 if (argc > optind) {
881 arg_parameters = strv_copy(argv + optind);
882 if (!arg_parameters)
883 return log_oom();
884
885 arg_settings_mask |= SETTING_BOOT;
886 }
887
888 /* Load all settings from .nspawn files */
889 if (mask_no_settings)
890 arg_settings_mask = 0;
891
892 /* Don't load any settings from .nspawn files */
893 if (mask_all_settings)
894 arg_settings_mask = _SETTINGS_MASK_ALL;
895
896 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
897
898 r = detect_unified_cgroup_hierarchy();
899 if (r < 0)
900 return r;
901
902 return 1;
903 }
904
905 static int verify_arguments(void) {
906
907 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
908 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
909 return -EINVAL;
910 }
911
912 if (arg_expose_ports && !arg_private_network) {
913 log_error("Cannot use --port= without private networking.");
914 return -EINVAL;
915 }
916
917 if (arg_boot && arg_kill_signal <= 0)
918 arg_kill_signal = SIGRTMIN+3;
919
920 return 0;
921 }
922
923 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
924 assert(p);
925
926 if (!arg_userns)
927 return 0;
928
929 if (uid == UID_INVALID && gid == GID_INVALID)
930 return 0;
931
932 if (uid != UID_INVALID) {
933 uid += arg_uid_shift;
934
935 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
936 return -EOVERFLOW;
937 }
938
939 if (gid != GID_INVALID) {
940 gid += (gid_t) arg_uid_shift;
941
942 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
943 return -EOVERFLOW;
944 }
945
946 if (lchown(p, uid, gid) < 0)
947 return -errno;
948
949 return 0;
950 }
951
/* Creates directory path below root with the given mode and hands it to
 * userns_lchown() for the given uid/gid.
 *
 * An already existing directory is left alone (no chown) and counts as success.
 * Returns 0 on success, -errno from mkdir(), or the userns_lchown() error. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
964
965 static int setup_timezone(const char *dest) {
966 _cleanup_free_ char *p = NULL, *q = NULL;
967 const char *where, *check, *what;
968 char *z, *y;
969 int r;
970
971 assert(dest);
972
973 /* Fix the timezone, if possible */
974 r = readlink_malloc("/etc/localtime", &p);
975 if (r < 0) {
976 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
977 return 0;
978 }
979
980 z = path_startswith(p, "../usr/share/zoneinfo/");
981 if (!z)
982 z = path_startswith(p, "/usr/share/zoneinfo/");
983 if (!z) {
984 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
985 return 0;
986 }
987
988 where = prefix_roota(dest, "/etc/localtime");
989 r = readlink_malloc(where, &q);
990 if (r >= 0) {
991 y = path_startswith(q, "../usr/share/zoneinfo/");
992 if (!y)
993 y = path_startswith(q, "/usr/share/zoneinfo/");
994
995 /* Already pointing to the right place? Then do nothing .. */
996 if (y && streq(y, z))
997 return 0;
998 }
999
1000 check = strjoina("/usr/share/zoneinfo/", z);
1001 check = prefix_root(dest, check);
1002 if (laccess(check, F_OK) < 0) {
1003 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1004 return 0;
1005 }
1006
1007 r = unlink(where);
1008 if (r < 0 && errno != ENOENT) {
1009 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1010 return 0;
1011 }
1012
1013 what = strjoina("../usr/share/zoneinfo/", z);
1014 if (symlink(what, where) < 0) {
1015 log_error_errno(errno, "Failed to correct timezone of container: %m");
1016 return 0;
1017 }
1018
1019 r = userns_lchown(where, 0, 0);
1020 if (r < 0)
1021 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1022
1023 return 0;
1024 }
1025
1026 static int setup_resolv_conf(const char *dest) {
1027 const char *where = NULL;
1028 int r;
1029
1030 assert(dest);
1031
1032 if (arg_private_network)
1033 return 0;
1034
1035 /* Fix resolv.conf, if possible */
1036 where = prefix_roota(dest, "/etc/resolv.conf");
1037
1038 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1039 if (r < 0) {
1040 /* If the file already exists as symlink, let's
1041 * suppress the warning, under the assumption that
1042 * resolved or something similar runs inside and the
1043 * symlink points there.
1044 *
1045 * If the disk image is read-only, there's also no
1046 * point in complaining.
1047 */
1048 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1049 "Failed to copy /etc/resolv.conf to %s: %m", where);
1050 return 0;
1051 }
1052
1053 r = userns_lchown(where, 0, 0);
1054 if (r < 0)
1055 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1056
1057 return 0;
1058 }
1059
1060 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1061 assert(s);
1062
1063 snprintf(s, 37,
1064 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1065 SD_ID128_FORMAT_VAL(id));
1066
1067 return s;
1068 }
1069
1070 static int setup_boot_id(const char *dest) {
1071 const char *from, *to;
1072 sd_id128_t rnd = {};
1073 char as_uuid[37];
1074 int r;
1075
1076 if (arg_share_system)
1077 return 0;
1078
1079 /* Generate a new randomized boot ID, so that each boot-up of
1080 * the container gets a new one */
1081
1082 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1083 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1084
1085 r = sd_id128_randomize(&rnd);
1086 if (r < 0)
1087 return log_error_errno(r, "Failed to generate random boot id: %m");
1088
1089 id128_format_as_uuid(rnd, as_uuid);
1090
1091 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1092 if (r < 0)
1093 return log_error_errno(r, "Failed to write boot id: %m");
1094
1095 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1096 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1097 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1098 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1099
1100 unlink(from);
1101 return r;
1102 }
1103
1104 static int copy_devnodes(const char *dest) {
1105
1106 static const char devnodes[] =
1107 "null\0"
1108 "zero\0"
1109 "full\0"
1110 "random\0"
1111 "urandom\0"
1112 "tty\0"
1113 "net/tun\0";
1114
1115 const char *d;
1116 int r = 0;
1117 _cleanup_umask_ mode_t u;
1118
1119 assert(dest);
1120
1121 u = umask(0000);
1122
1123 /* Create /dev/net, so that we can create /dev/net/tun in it */
1124 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1125 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1126
1127 NULSTR_FOREACH(d, devnodes) {
1128 _cleanup_free_ char *from = NULL, *to = NULL;
1129 struct stat st;
1130
1131 from = strappend("/dev/", d);
1132 to = prefix_root(dest, from);
1133
1134 if (stat(from, &st) < 0) {
1135
1136 if (errno != ENOENT)
1137 return log_error_errno(errno, "Failed to stat %s: %m", from);
1138
1139 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1140
1141 log_error("%s is not a char or block device, cannot copy.", from);
1142 return -EIO;
1143
1144 } else {
1145 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1146 if (errno != EPERM)
1147 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1148
1149 /* Some systems abusively restrict mknod but
1150 * allow bind mounts. */
1151 r = touch(to);
1152 if (r < 0)
1153 return log_error_errno(r, "touch (%s) failed: %m", to);
1154 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1155 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1156 }
1157
1158 r = userns_lchown(to, 0, 0);
1159 if (r < 0)
1160 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1161 }
1162 }
1163
1164 return r;
1165 }
1166
1167 static int setup_pts(const char *dest) {
1168 _cleanup_free_ char *options = NULL;
1169 const char *p;
1170
1171 #ifdef HAVE_SELINUX
1172 if (arg_selinux_apifs_context)
1173 (void) asprintf(&options,
1174 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1175 arg_uid_shift + TTY_GID,
1176 arg_selinux_apifs_context);
1177 else
1178 #endif
1179 (void) asprintf(&options,
1180 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1181 arg_uid_shift + TTY_GID);
1182
1183 if (!options)
1184 return log_oom();
1185
1186 /* Mount /dev/pts itself */
1187 p = prefix_roota(dest, "/dev/pts");
1188 if (mkdir(p, 0755) < 0)
1189 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1190 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1191 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1192 if (userns_lchown(p, 0, 0) < 0)
1193 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1194
1195 /* Create /dev/ptmx symlink */
1196 p = prefix_roota(dest, "/dev/ptmx");
1197 if (symlink("pts/ptmx", p) < 0)
1198 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1199 if (userns_lchown(p, 0, 0) < 0)
1200 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1201
1202 /* And fix /dev/pts/ptmx ownership */
1203 p = prefix_roota(dest, "/dev/pts/ptmx");
1204 if (userns_lchown(p, 0, 0) < 0)
1205 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1206
1207 return 0;
1208 }
1209
1210 static int setup_dev_console(const char *dest, const char *console) {
1211 _cleanup_umask_ mode_t u;
1212 const char *to;
1213 int r;
1214
1215 assert(dest);
1216 assert(console);
1217
1218 u = umask(0000);
1219
1220 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1221 if (r < 0)
1222 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1223
1224 /* We need to bind mount the right tty to /dev/console since
1225 * ptys can only exist on pts file systems. To have something
1226 * to bind mount things on we create a empty regular file. */
1227
1228 to = prefix_roota(dest, "/dev/console");
1229 r = touch(to);
1230 if (r < 0)
1231 return log_error_errno(r, "touch() for /dev/console failed: %m");
1232
1233 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1234 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1235
1236 return 0;
1237 }
1238
1239 static int setup_kmsg(const char *dest, int kmsg_socket) {
1240 const char *from, *to;
1241 _cleanup_umask_ mode_t u;
1242 int fd, r;
1243
1244 assert(kmsg_socket >= 0);
1245
1246 u = umask(0000);
1247
1248 /* We create the kmsg FIFO as /run/kmsg, but immediately
1249 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1250 * on the reading side behave very similar to /proc/kmsg,
1251 * their writing side behaves differently from /dev/kmsg in
1252 * that writing blocks when nothing is reading. In order to
1253 * avoid any problems with containers deadlocking due to this
1254 * we simply make /dev/kmsg unavailable to the container. */
1255 from = prefix_roota(dest, "/run/kmsg");
1256 to = prefix_roota(dest, "/proc/kmsg");
1257
1258 if (mkfifo(from, 0600) < 0)
1259 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1260 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1261 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1262
1263 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1264 if (fd < 0)
1265 return log_error_errno(errno, "Failed to open fifo: %m");
1266
1267 /* Store away the fd in the socket, so that it stays open as
1268 * long as we run the child */
1269 r = send_one_fd(kmsg_socket, fd, 0);
1270 safe_close(fd);
1271
1272 if (r < 0)
1273 return log_error_errno(r, "Failed to send FIFO fd: %m");
1274
1275 /* And now make the FIFO unavailable as /run/kmsg... */
1276 (void) unlink(from);
1277
1278 return 0;
1279 }
1280
1281 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1282 union in_addr_union *exposed = userdata;
1283
1284 assert(rtnl);
1285 assert(m);
1286 assert(exposed);
1287
1288 expose_port_execute(rtnl, arg_expose_ports, exposed);
1289 return 0;
1290 }
1291
1292 static int setup_hostname(void) {
1293
1294 if (arg_share_system)
1295 return 0;
1296
1297 if (sethostname_idempotent(arg_machine) < 0)
1298 return -errno;
1299
1300 return 0;
1301 }
1302
1303 static int setup_journal(const char *directory) {
1304 sd_id128_t machine_id, this_id;
1305 _cleanup_free_ char *b = NULL, *d = NULL;
1306 const char *etc_machine_id, *p, *q;
1307 char *id;
1308 int r;
1309
1310 /* Don't link journals in ephemeral mode */
1311 if (arg_ephemeral)
1312 return 0;
1313
1314 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1315
1316 r = read_one_line_file(etc_machine_id, &b);
1317 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1318 return 0;
1319 else if (r < 0)
1320 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1321
1322 id = strstrip(b);
1323 if (isempty(id) && arg_link_journal == LINK_AUTO)
1324 return 0;
1325
1326 /* Verify validity */
1327 r = sd_id128_from_string(id, &machine_id);
1328 if (r < 0)
1329 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1330
1331 r = sd_id128_get_machine(&this_id);
1332 if (r < 0)
1333 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1334
1335 if (sd_id128_equal(machine_id, this_id)) {
1336 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1337 "Host and machine ids are equal (%s): refusing to link journals", id);
1338 if (arg_link_journal == LINK_AUTO)
1339 return 0;
1340 return -EEXIST;
1341 }
1342
1343 if (arg_link_journal == LINK_NO)
1344 return 0;
1345
1346 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1347 if (r < 0)
1348 return log_error_errno(r, "Failed to create /var: %m");
1349
1350 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1351 if (r < 0)
1352 return log_error_errno(r, "Failed to create /var/log: %m");
1353
1354 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1355 if (r < 0)
1356 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1357
1358 p = strjoina("/var/log/journal/", id);
1359 q = prefix_roota(directory, p);
1360
1361 if (path_is_mount_point(p, 0) > 0) {
1362 if (arg_link_journal != LINK_AUTO) {
1363 log_error("%s: already a mount point, refusing to use for journal", p);
1364 return -EEXIST;
1365 }
1366
1367 return 0;
1368 }
1369
1370 if (path_is_mount_point(q, 0) > 0) {
1371 if (arg_link_journal != LINK_AUTO) {
1372 log_error("%s: already a mount point, refusing to use for journal", q);
1373 return -EEXIST;
1374 }
1375
1376 return 0;
1377 }
1378
1379 r = readlink_and_make_absolute(p, &d);
1380 if (r >= 0) {
1381 if ((arg_link_journal == LINK_GUEST ||
1382 arg_link_journal == LINK_AUTO) &&
1383 path_equal(d, q)) {
1384
1385 r = userns_mkdir(directory, p, 0755, 0, 0);
1386 if (r < 0)
1387 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1388 return 0;
1389 }
1390
1391 if (unlink(p) < 0)
1392 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1393 } else if (r == -EINVAL) {
1394
1395 if (arg_link_journal == LINK_GUEST &&
1396 rmdir(p) < 0) {
1397
1398 if (errno == ENOTDIR) {
1399 log_error("%s already exists and is neither a symlink nor a directory", p);
1400 return r;
1401 } else {
1402 log_error_errno(errno, "Failed to remove %s: %m", p);
1403 return -errno;
1404 }
1405 }
1406 } else if (r != -ENOENT) {
1407 log_error_errno(errno, "readlink(%s) failed: %m", p);
1408 return r;
1409 }
1410
1411 if (arg_link_journal == LINK_GUEST) {
1412
1413 if (symlink(q, p) < 0) {
1414 if (arg_link_journal_try) {
1415 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1416 return 0;
1417 } else {
1418 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1419 return -errno;
1420 }
1421 }
1422
1423 r = userns_mkdir(directory, p, 0755, 0, 0);
1424 if (r < 0)
1425 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1426 return 0;
1427 }
1428
1429 if (arg_link_journal == LINK_HOST) {
1430 /* don't create parents here -- if the host doesn't have
1431 * permanent journal set up, don't force it here */
1432 r = mkdir(p, 0755);
1433 if (r < 0) {
1434 if (arg_link_journal_try) {
1435 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1436 return 0;
1437 } else {
1438 log_error_errno(errno, "Failed to create %s: %m", p);
1439 return r;
1440 }
1441 }
1442
1443 } else if (access(p, F_OK) < 0)
1444 return 0;
1445
1446 if (dir_is_empty(q) == 0)
1447 log_warning("%s is not empty, proceeding anyway.", q);
1448
1449 r = userns_mkdir(directory, p, 0755, 0, 0);
1450 if (r < 0) {
1451 log_error_errno(errno, "Failed to create %s: %m", q);
1452 return r;
1453 }
1454
1455 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1456 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1457
1458 return 0;
1459 }
1460
1461 static int drop_capabilities(void) {
1462 return capability_bounding_set_drop(~arg_retain, false);
1463 }
1464
1465 static int reset_audit_loginuid(void) {
1466 _cleanup_free_ char *p = NULL;
1467 int r;
1468
1469 if (arg_share_system)
1470 return 0;
1471
1472 r = read_one_line_file("/proc/self/loginuid", &p);
1473 if (r == -ENOENT)
1474 return 0;
1475 if (r < 0)
1476 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1477
1478 /* Already reset? */
1479 if (streq(p, "4294967295"))
1480 return 0;
1481
1482 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1483 if (r < 0) {
1484 log_error_errno(r,
1485 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1486 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1487 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1488 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1489 "using systemd-nspawn. Sleeping for 5s... (%m)");
1490
1491 sleep(5);
1492 }
1493
1494 return 0;
1495 }
1496
/* Installs the seccomp filter used for the container.
 *
 * The filter allows everything by default. Each system call in the table
 * below is made to fail with EPERM unless the capability listed next to it
 * is part of arg_retain, and socket(AF_NETLINK, *, NETLINK_AUDIT) is made to
 * fail with EAFNOSUPPORT.
 *
 * Returns 0 on success, or if seccomp_load() fails with -EINVAL (which is
 * only logged at debug level), and a negative error otherwise. Without
 * HAVE_SECCOMP this is a no-op returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {

                /* Leave the call alone if its capability is retained */
                if ((arg_retain & (1ULL << blacklist[k].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work in containers, and much of the userspace
         * audit hookup fails when run inside of one. We do not care, and
         * simply turn off creation of audit sockets.
         *
         * As a result socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
         * EAFNOSUPPORT, which audit userspace takes as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1591
1592 static int setup_propagate(const char *root) {
1593 const char *p, *q;
1594
1595 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1596 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1597 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1598 (void) mkdir_p(p, 0600);
1599
1600 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1601 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1602
1603 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1604 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1605
1606 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1607 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
1608
1609 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1610 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1611 return log_error_errno(errno, "Failed to install propagation bind mount.");
1612
1613 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1614 return log_error_errno(errno, "Failed to make propagation mount read-only");
1615
1616 return 0;
1617 }
1618
1619 static int setup_image(char **device_path, int *loop_nr) {
1620 struct loop_info64 info = {
1621 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1622 };
1623 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1624 _cleanup_free_ char* loopdev = NULL;
1625 struct stat st;
1626 int r, nr;
1627
1628 assert(device_path);
1629 assert(loop_nr);
1630 assert(arg_image);
1631
1632 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1633 if (fd < 0)
1634 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1635
1636 if (fstat(fd, &st) < 0)
1637 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1638
1639 if (S_ISBLK(st.st_mode)) {
1640 char *p;
1641
1642 p = strdup(arg_image);
1643 if (!p)
1644 return log_oom();
1645
1646 *device_path = p;
1647
1648 *loop_nr = -1;
1649
1650 r = fd;
1651 fd = -1;
1652
1653 return r;
1654 }
1655
1656 if (!S_ISREG(st.st_mode)) {
1657 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1658 return -EINVAL;
1659 }
1660
1661 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1662 if (control < 0)
1663 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1664
1665 nr = ioctl(control, LOOP_CTL_GET_FREE);
1666 if (nr < 0)
1667 return log_error_errno(errno, "Failed to allocate loop device: %m");
1668
1669 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1670 return log_oom();
1671
1672 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1673 if (loop < 0)
1674 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1675
1676 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1677 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1678
1679 if (arg_read_only)
1680 info.lo_flags |= LO_FLAGS_READ_ONLY;
1681
1682 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1683 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1684
1685 *device_path = loopdev;
1686 loopdev = NULL;
1687
1688 *loop_nr = nr;
1689
1690 r = loop;
1691 loop = -1;
1692
1693 return r;
1694 }
1695
/* Explanation appended to the error messages of dissect_image() whenever no
 * usable partition layout could be found in the image. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Analyzes the partition table of the block device open as "fd" and picks
 * the partitions to mount for the container.
 *
 * On GPT, partitions are classified by their type UUID (home, srv, native
 * root, secondary root, generic Linux); partitions carrying
 * GPT_FLAG_NO_AUTO are ignored, and among several partitions of the same
 * dedicated type the one with the lowest partition number wins. On MBR, a
 * partition of type 0x83 whose flags are exactly 0x80 (bootable) is treated
 * as generic Linux partition.
 *
 * The root device is, in this order of preference: the native root
 * partition, the secondary root partition (*secondary is set to true in this
 * case), or the generic partition, provided there is exactly one.
 *
 * Output: *root_device/*root_device_rw and *secondary are always set on
 * success. *home_device/*home_device_rw and *srv_device/*srv_device_rw are
 * only written if such a partition was found. The device strings are newly
 * allocated and owned by the caller.
 *
 * Returns 0 on success, a negative error otherwise; -EOPNOTSUPP if built
 * without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* No errno set: treat it as allocation failure */
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has caught up with the partition table as
         * seen by blkid, giving up after 10 rounds. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev
                 * enumeration is expected to contain one entry more than
                 * that. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Stick to the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* This has to be recorded as generic
                                 * partition, as otherwise a second
                                 * candidate would go unnoticed and
                                 * silently replace the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2075
/* Mounts the block device "what" on "where", or on the concatenation of
 * "where" and "directory" if the latter is non-NULL.
 *
 * The file system type is determined by probing the device with blkid;
 * devices of type crypto_LUKS are refused with -EOPNOTSUPP. The mount is
 * done with MS_NODEV, and read-only if "rw" is false or arg_read_only is
 * set.
 *
 * Returns 0 on success, a negative error otherwise; -EOPNOTSUPP if built
 * without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                /* No errno set: treat it as allocation failure */
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result (-2), or nothing detected (1), the
                 * same way dissect_image() tells these cases apart from
                 * actual probing errors (-1). */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2139
2140 static int mount_devices(
2141 const char *where,
2142 const char *root_device, bool root_device_rw,
2143 const char *home_device, bool home_device_rw,
2144 const char *srv_device, bool srv_device_rw) {
2145 int r;
2146
2147 assert(where);
2148
2149 if (root_device) {
2150 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2151 if (r < 0)
2152 return log_error_errno(r, "Failed to mount root directory: %m");
2153 }
2154
2155 if (home_device) {
2156 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2157 if (r < 0)
2158 return log_error_errno(r, "Failed to mount home directory: %m");
2159 }
2160
2161 if (srv_device) {
2162 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2163 if (r < 0)
2164 return log_error_errno(r, "Failed to mount server data directory: %m");
2165 }
2166
2167 return 0;
2168 }
2169
2170 static void loop_remove(int nr, int *image_fd) {
2171 _cleanup_close_ int control = -1;
2172 int r;
2173
2174 if (nr < 0)
2175 return;
2176
2177 if (image_fd && *image_fd >= 0) {
2178 r = ioctl(*image_fd, LOOP_CLR_FD);
2179 if (r < 0)
2180 log_debug_errno(errno, "Failed to close loop image: %m");
2181 *image_fd = safe_close(*image_fd);
2182 }
2183
2184 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2185 if (control < 0) {
2186 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2187 return;
2188 }
2189
2190 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2191 if (r < 0)
2192 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2193 }
2194
2195 /*
2196 * Return values:
2197 * < 0 : wait_for_terminate() failed to get the state of the
2198 * container, the container was terminated by a signal, or
2199 * failed for an unknown reason. No change is made to the
2200 * container argument.
2201 * > 0 : The program executed in the container terminated with an
2202 * error. The exit code of the program executed in the
2203 * container is returned. The container argument has been set
2204 * to CONTAINER_TERMINATED.
2205 * 0 : The container is being rebooted, has been shut down or exited
2206 * successfully. The container argument has been set to either
2207 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2208 *
2209 * That is, success is indicated by a return value of zero, and an
2210 * error is indicated by a non-zero value.
2211 */
2212 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2213 siginfo_t status;
2214 int r;
2215
2216 r = wait_for_terminate(pid, &status);
2217 if (r < 0)
2218 return log_warning_errno(r, "Failed to wait for container: %m");
2219
2220 switch (status.si_code) {
2221
2222 case CLD_EXITED:
2223 if (status.si_status == 0) {
2224 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2225
2226 } else
2227 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2228
2229 *container = CONTAINER_TERMINATED;
2230 return status.si_status;
2231
2232 case CLD_KILLED:
2233 if (status.si_status == SIGINT) {
2234
2235 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2236 *container = CONTAINER_TERMINATED;
2237 return 0;
2238
2239 } else if (status.si_status == SIGHUP) {
2240
2241 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2242 *container = CONTAINER_REBOOTED;
2243 return 0;
2244 }
2245
2246 /* CLD_KILLED fallthrough */
2247
2248 case CLD_DUMPED:
2249 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2250 return -EIO;
2251
2252 default:
2253 log_error("Container %s failed due to unknown reason.", arg_machine);
2254 return -EIO;
2255 }
2256
2257 return r;
2258 }
2259
2260 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2261 pid_t pid;
2262
2263 pid = PTR_TO_UINT32(userdata);
2264 if (pid > 0) {
2265 if (kill(pid, arg_kill_signal) >= 0) {
2266 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2267 sd_event_source_set_userdata(s, NULL);
2268 return 0;
2269 }
2270 }
2271
2272 sd_event_exit(sd_event_source_get_event(s), 0);
2273 return 0;
2274 }
2275
2276 static int determine_names(void) {
2277 int r;
2278
2279 if (arg_template && !arg_directory && arg_machine) {
2280
2281 /* If --template= was specified then we should not
2282 * search for a machine, but instead create a new one
2283 * in /var/lib/machine. */
2284
2285 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2286 if (!arg_directory)
2287 return log_oom();
2288 }
2289
2290 if (!arg_image && !arg_directory) {
2291 if (arg_machine) {
2292 _cleanup_(image_unrefp) Image *i = NULL;
2293
2294 r = image_find(arg_machine, &i);
2295 if (r < 0)
2296 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2297 else if (r == 0) {
2298 log_error("No image for machine '%s': %m", arg_machine);
2299 return -ENOENT;
2300 }
2301
2302 if (i->type == IMAGE_RAW)
2303 r = free_and_strdup(&arg_image, i->path);
2304 else
2305 r = free_and_strdup(&arg_directory, i->path);
2306 if (r < 0)
2307 return log_error_errno(r, "Invalid image directory: %m");
2308
2309 if (!arg_ephemeral)
2310 arg_read_only = arg_read_only || i->read_only;
2311 } else
2312 arg_directory = get_current_dir_name();
2313
2314 if (!arg_directory && !arg_machine) {
2315 log_error("Failed to determine path, please use -D or -i.");
2316 return -EINVAL;
2317 }
2318 }
2319
2320 if (!arg_machine) {
2321 if (arg_directory && path_equal(arg_directory, "/"))
2322 arg_machine = gethostname_malloc();
2323 else
2324 arg_machine = strdup(basename(arg_image ?: arg_directory));
2325
2326 if (!arg_machine)
2327 return log_oom();
2328
2329 hostname_cleanup(arg_machine);
2330 if (!machine_name_is_valid(arg_machine)) {
2331 log_error("Failed to determine machine name automatically, please use -M.");
2332 return -EINVAL;
2333 }
2334
2335 if (arg_ephemeral) {
2336 char *b;
2337
2338 /* Add a random suffix when this is an
2339 * ephemeral machine, so that we can run many
2340 * instances at once without manually having
2341 * to specify -M each time. */
2342
2343 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2344 return log_oom();
2345
2346 free(arg_machine);
2347 arg_machine = b;
2348 }
2349 }
2350
2351 return 0;
2352 }
2353
2354 static int determine_uid_shift(const char *directory) {
2355 int r;
2356
2357 if (!arg_userns) {
2358 arg_uid_shift = 0;
2359 return 0;
2360 }
2361
2362 if (arg_uid_shift == UID_INVALID) {
2363 struct stat st;
2364
2365 r = stat(directory, &st);
2366 if (r < 0)
2367 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2368
2369 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2370
2371 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2372 log_error("UID and GID base of %s don't match.", directory);
2373 return -EINVAL;
2374 }
2375
2376 arg_uid_range = UINT32_C(0x10000);
2377 }
2378
2379 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2380 log_error("UID base too high for UID range.");
2381 return -EINVAL;
2382 }
2383
2384 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2385 return 0;
2386 }
2387
2388 static int inner_child(
2389 Barrier *barrier,
2390 const char *directory,
2391 bool secondary,
2392 int kmsg_socket,
2393 int rtnl_socket,
2394 FDSet *fds) {
2395
2396 _cleanup_free_ char *home = NULL;
2397 unsigned n_env = 2;
2398 const char *envp[] = {
2399 "PATH=" DEFAULT_PATH_SPLIT_USR,
2400 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2401 NULL, /* TERM */
2402 NULL, /* HOME */
2403 NULL, /* USER */
2404 NULL, /* LOGNAME */
2405 NULL, /* container_uuid */
2406 NULL, /* LISTEN_FDS */
2407 NULL, /* LISTEN_PID */
2408 NULL
2409 };
2410
2411 _cleanup_strv_free_ char **env_use = NULL;
2412 int r;
2413
2414 assert(barrier);
2415 assert(directory);
2416 assert(kmsg_socket >= 0);
2417
2418 cg_unified_flush();
2419
2420 if (arg_userns) {
2421 /* Tell the parent, that it now can write the UID map. */
2422 (void) barrier_place(barrier); /* #1 */
2423
2424 /* Wait until the parent wrote the UID map */
2425 if (!barrier_place_and_sync(barrier)) { /* #2 */
2426 log_error("Parent died too early");
2427 return -ESRCH;
2428 }
2429 }
2430
2431 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
2432 if (r < 0)
2433 return r;
2434
2435 r = mount_sysfs(NULL);
2436 if (r < 0)
2437 return r;
2438
2439 /* Wait until we are cgroup-ified, so that we
2440 * can mount the right cgroup path writable */
2441 if (!barrier_place_and_sync(barrier)) { /* #3 */
2442 log_error("Parent died too early");
2443 return -ESRCH;
2444 }
2445
2446 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2447 if (r < 0)
2448 return r;
2449
2450 r = reset_uid_gid();
2451 if (r < 0)
2452 return log_error_errno(r, "Couldn't become new root: %m");
2453
2454 r = setup_boot_id(NULL);
2455 if (r < 0)
2456 return r;
2457
2458 r = setup_kmsg(NULL, kmsg_socket);
2459 if (r < 0)
2460 return r;
2461 kmsg_socket = safe_close(kmsg_socket);
2462
2463 umask(0022);
2464
2465 if (setsid() < 0)
2466 return log_error_errno(errno, "setsid() failed: %m");
2467
2468 if (arg_private_network)
2469 loopback_setup();
2470
2471 if (arg_expose_ports) {
2472 r = expose_port_send_rtnl(rtnl_socket);
2473 if (r < 0)
2474 return r;
2475 rtnl_socket = safe_close(rtnl_socket);
2476 }
2477
2478 if (drop_capabilities() < 0)
2479 return log_error_errno(errno, "drop_capabilities() failed: %m");
2480
2481 setup_hostname();
2482
2483 if (arg_personality != PERSONALITY_INVALID) {
2484 if (personality(arg_personality) < 0)
2485 return log_error_errno(errno, "personality() failed: %m");
2486 } else if (secondary) {
2487 if (personality(PER_LINUX32) < 0)
2488 return log_error_errno(errno, "personality() failed: %m");
2489 }
2490
2491 #ifdef HAVE_SELINUX
2492 if (arg_selinux_context)
2493 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2494 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2495 #endif
2496
2497 r = change_uid_gid(arg_user, &home);
2498 if (r < 0)
2499 return r;
2500
2501 envp[n_env] = strv_find_prefix(environ, "TERM=");
2502 if (envp[n_env])
2503 n_env ++;
2504
2505 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2506 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2507 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2508 return log_oom();
2509
2510 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2511 char as_uuid[37];
2512
2513 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2514 return log_oom();
2515 }
2516
2517 if (fdset_size(fds) > 0) {
2518 r = fdset_cloexec(fds, false);
2519 if (r < 0)
2520 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2521
2522 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2523 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2524 return log_oom();
2525 }
2526
2527 env_use = strv_env_merge(2, envp, arg_setenv);
2528 if (!env_use)
2529 return log_oom();
2530
2531 /* Let the parent know that we are ready and
2532 * wait until the parent is ready with the
2533 * setup, too... */
2534 if (!barrier_place_and_sync(barrier)) { /* #4 */
2535 log_error("Parent died too early");
2536 return -ESRCH;
2537 }
2538
2539 /* Now, explicitly close the log, so that we
2540 * then can close all remaining fds. Closing
2541 * the log explicitly first has the benefit
2542 * that the logging subsystem knows about it,
2543 * and is thus ready to be reopened should we
2544 * need it again. Note that the other fds
2545 * closed here are at least the locking and
2546 * barrier fds. */
2547 log_close();
2548 (void) fdset_close_others(fds);
2549
2550 if (arg_boot) {
2551 char **a;
2552 size_t m;
2553
2554 /* Automatically search for the init system */
2555
2556 m = 1 + strv_length(arg_parameters);
2557 a = newa(char*, m + 1);
2558 if (strv_isempty(arg_parameters))
2559 a[1] = NULL;
2560 else
2561 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2562
2563 a[0] = (char*) "/usr/lib/systemd/systemd";
2564 execve(a[0], a, env_use);
2565
2566 a[0] = (char*) "/lib/systemd/systemd";
2567 execve(a[0], a, env_use);
2568
2569 a[0] = (char*) "/sbin/init";
2570 execve(a[0], a, env_use);
2571 } else if (!strv_isempty(arg_parameters))
2572 execvpe(arg_parameters[0], arg_parameters, env_use);
2573 else {
2574 chdir(home ?: "/root");
2575 execle("/bin/bash", "-bash", NULL, env_use);
2576 execle("/bin/sh", "-sh", NULL, env_use);
2577 }
2578
2579 (void) log_open();
2580 return log_error_errno(errno, "execv() failed: %m");
2581 }
2582
2583 static int outer_child(
2584 Barrier *barrier,
2585 const char *directory,
2586 const char *console,
2587 const char *root_device, bool root_device_rw,
2588 const char *home_device, bool home_device_rw,
2589 const char *srv_device, bool srv_device_rw,
2590 bool interactive,
2591 bool secondary,
2592 int pid_socket,
2593 int kmsg_socket,
2594 int rtnl_socket,
2595 int uid_shift_socket,
2596 FDSet *fds) {
2597
2598 pid_t pid;
2599 ssize_t l;
2600 int r;
2601
2602 assert(barrier);
2603 assert(directory);
2604 assert(console);
2605 assert(pid_socket >= 0);
2606 assert(kmsg_socket >= 0);
2607
2608 cg_unified_flush();
2609
2610 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2611 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2612
2613 if (interactive) {
2614 close_nointr(STDIN_FILENO);
2615 close_nointr(STDOUT_FILENO);
2616 close_nointr(STDERR_FILENO);
2617
2618 r = open_terminal(console, O_RDWR);
2619 if (r != STDIN_FILENO) {
2620 if (r >= 0) {
2621 safe_close(r);
2622 r = -EINVAL;
2623 }
2624
2625 return log_error_errno(r, "Failed to open console: %m");
2626 }
2627
2628 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2629 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2630 return log_error_errno(errno, "Failed to duplicate console: %m");
2631 }
2632
2633 r = reset_audit_loginuid();
2634 if (r < 0)
2635 return r;
2636
2637 /* Mark everything as slave, so that we still
2638 * receive mounts from the real root, but don't
2639 * propagate mounts to the real root. */
2640 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2641 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2642
2643 r = mount_devices(directory,
2644 root_device, root_device_rw,
2645 home_device, home_device_rw,
2646 srv_device, srv_device_rw);
2647 if (r < 0)
2648 return r;
2649
2650 r = determine_uid_shift(directory);
2651 if (r < 0)
2652 return r;
2653
2654 if (arg_userns) {
2655 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2656 if (l < 0)
2657 return log_error_errno(errno, "Failed to send UID shift: %m");
2658 if (l != sizeof(arg_uid_shift)) {
2659 log_error("Short write while sending UID shift.");
2660 return -EIO;
2661 }
2662 }
2663
2664 /* Turn directory into bind mount */
2665 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2666 return log_error_errno(errno, "Failed to make bind mount: %m");
2667
2668 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2669 if (r < 0)
2670 return r;
2671
2672 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2673 if (r < 0)
2674 return r;
2675
2676 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2677 if (r < 0)
2678 return r;
2679
2680 if (arg_read_only) {
2681 r = bind_remount_recursive(directory, true);
2682 if (r < 0)
2683 return log_error_errno(r, "Failed to make tree read-only: %m");
2684 }
2685
2686 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2687 if (r < 0)
2688 return r;
2689
2690 r = copy_devnodes(directory);
2691 if (r < 0)
2692 return r;
2693
2694 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2695
2696 r = setup_pts(directory);
2697 if (r < 0)
2698 return r;
2699
2700 r = setup_propagate(directory);
2701 if (r < 0)
2702 return r;
2703
2704 r = setup_dev_console(directory, console);
2705 if (r < 0)
2706 return r;
2707
2708 r = setup_seccomp();
2709 if (r < 0)
2710 return r;
2711
2712 r = setup_timezone(directory);
2713 if (r < 0)
2714 return r;
2715
2716 r = setup_resolv_conf(directory);
2717 if (r < 0)
2718 return r;
2719
2720 r = setup_journal(directory);
2721 if (r < 0)
2722 return r;
2723
2724 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2725 if (r < 0)
2726 return r;
2727
2728 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2729 if (r < 0)
2730 return r;
2731
2732 r = mount_move_root(directory);
2733 if (r < 0)
2734 return log_error_errno(r, "Failed to move root directory: %m");
2735
2736 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2737 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2738 (arg_private_network ? CLONE_NEWNET : 0) |
2739 (arg_userns ? CLONE_NEWUSER : 0),
2740 NULL);
2741 if (pid < 0)
2742 return log_error_errno(errno, "Failed to fork inner child: %m");
2743 if (pid == 0) {
2744 pid_socket = safe_close(pid_socket);
2745 uid_shift_socket = safe_close(uid_shift_socket);
2746
2747 /* The inner child has all namespaces that are
2748 * requested, so that we all are owned by the user if
2749 * user namespaces are turned on. */
2750
2751 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2752 if (r < 0)
2753 _exit(EXIT_FAILURE);
2754
2755 _exit(EXIT_SUCCESS);
2756 }
2757
2758 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2759 if (l < 0)
2760 return log_error_errno(errno, "Failed to send PID: %m");
2761 if (l != sizeof(pid)) {
2762 log_error("Short write while sending PID.");
2763 return -EIO;
2764 }
2765
2766 pid_socket = safe_close(pid_socket);
2767 kmsg_socket = safe_close(kmsg_socket);
2768 rtnl_socket = safe_close(rtnl_socket);
2769
2770 return 0;
2771 }
2772
2773 static int setup_uid_map(pid_t pid) {
2774 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2775 int r;
2776
2777 assert(pid > 1);
2778
2779 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2780 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2781 r = write_string_file(uid_map, line, 0);
2782 if (r < 0)
2783 return log_error_errno(r, "Failed to write UID map: %m");
2784
2785 /* We always assign the same UID and GID ranges */
2786 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2787 r = write_string_file(uid_map, line, 0);
2788 if (r < 0)
2789 return log_error_errno(r, "Failed to write GID map: %m");
2790
2791 return 0;
2792 }
2793
2794 static int load_settings(void) {
2795 _cleanup_(settings_freep) Settings *settings = NULL;
2796 _cleanup_fclose_ FILE *f = NULL;
2797 _cleanup_free_ char *p = NULL;
2798 const char *fn, *i;
2799 int r;
2800
2801 /* If all settings are masked, there's no point in looking for
2802 * the settings file */
2803 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2804 return 0;
2805
2806 fn = strjoina(arg_machine, ".nspawn");
2807
2808 /* We first look in the admin's directories in /etc and /run */
2809 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2810 _cleanup_free_ char *j = NULL;
2811
2812 j = strjoin(i, "/", fn, NULL);
2813 if (!j)
2814 return log_oom();
2815
2816 f = fopen(j, "re");
2817 if (f) {
2818 p = j;
2819 j = NULL;
2820
2821 /* By default we trust configuration from /etc and /run */
2822 if (arg_settings_trusted < 0)
2823 arg_settings_trusted = true;
2824
2825 break;
2826 }
2827
2828 if (errno != ENOENT)
2829 return log_error_errno(errno, "Failed to open %s: %m", j);
2830 }
2831
2832 if (!f) {
2833 /* After that, let's look for a file next to the
2834 * actual image we shall boot. */
2835
2836 if (arg_image) {
2837 p = file_in_same_dir(arg_image, fn);
2838 if (!p)
2839 return log_oom();
2840 } else if (arg_directory) {
2841 p = file_in_same_dir(arg_directory, fn);
2842 if (!p)
2843 return log_oom();
2844 }
2845
2846 if (p) {
2847 f = fopen(p, "re");
2848 if (!f && errno != ENOENT)
2849 return log_error_errno(errno, "Failed to open %s: %m", p);
2850
2851 /* By default we do not trust configuration from /var/lib/machines */
2852 if (arg_settings_trusted < 0)
2853 arg_settings_trusted = false;
2854 }
2855 }
2856
2857 if (!f)
2858 return 0;
2859
2860 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2861
2862 r = settings_load(f, p, &settings);
2863 if (r < 0)
2864 return r;
2865
2866 /* Copy over bits from the settings, unless they have been
2867 * explicitly masked by command line switches. */
2868
2869 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2870 settings->boot >= 0) {
2871 arg_boot = settings->boot;
2872
2873 strv_free(arg_parameters);
2874 arg_parameters = settings->parameters;
2875 settings->parameters = NULL;
2876 }
2877
2878 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2879 settings->environment) {
2880 strv_free(arg_setenv);
2881 arg_setenv = settings->environment;
2882 settings->environment = NULL;
2883 }
2884
2885 if ((arg_settings_mask & SETTING_USER) == 0 &&
2886 settings->user) {
2887 free(arg_user);
2888 arg_user = settings->user;
2889 settings->user = NULL;
2890 }
2891
2892 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2893 uint64_t plus;
2894
2895 plus = settings->capability;
2896 if (settings_private_network(settings))
2897 plus |= (1ULL << CAP_NET_ADMIN);
2898
2899 if (!arg_settings_trusted && plus != 0) {
2900 if (settings->capability != 0)
2901 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2902 } else
2903 arg_retain |= plus;
2904
2905 arg_retain &= ~settings->drop_capability;
2906 }
2907
2908 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2909 settings->kill_signal > 0)
2910 arg_kill_signal = settings->kill_signal;
2911
2912 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2913 settings->personality != PERSONALITY_INVALID)
2914 arg_personality = settings->personality;
2915
2916 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2917 !sd_id128_is_null(settings->machine_id)) {
2918
2919 if (!arg_settings_trusted)
2920 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2921 else
2922 arg_uuid = settings->machine_id;
2923 }
2924
2925 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2926 settings->read_only >= 0)
2927 arg_read_only = settings->read_only;
2928
2929 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2930 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2931 arg_volatile_mode = settings->volatile_mode;
2932
2933 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2934 settings->n_custom_mounts > 0) {
2935
2936 if (!arg_settings_trusted)
2937 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2938 else {
2939 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2940 arg_custom_mounts = settings->custom_mounts;
2941 arg_n_custom_mounts = settings->n_custom_mounts;
2942
2943 settings->custom_mounts = NULL;
2944 settings->n_custom_mounts = 0;
2945 }
2946 }
2947
2948 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2949 (settings->private_network >= 0 ||
2950 settings->network_veth >= 0 ||
2951 settings->network_bridge ||
2952 settings->network_interfaces ||
2953 settings->network_macvlan ||
2954 settings->network_ipvlan)) {
2955
2956 if (!arg_settings_trusted)
2957 log_warning("Ignoring network settings, file %s is not trusted.", p);
2958 else {
2959 arg_network_veth = settings_private_network(settings);
2960 arg_private_network = settings_private_network(settings);
2961
2962 strv_free(arg_network_interfaces);
2963 arg_network_interfaces = settings->network_interfaces;
2964 settings->network_interfaces = NULL;
2965
2966 strv_free(arg_network_macvlan);
2967 arg_network_macvlan = settings->network_macvlan;
2968 settings->network_macvlan = NULL;
2969
2970 strv_free(arg_network_ipvlan);
2971 arg_network_ipvlan = settings->network_ipvlan;
2972 settings->network_ipvlan = NULL;
2973
2974 free(arg_network_bridge);
2975 arg_network_bridge = settings->network_bridge;
2976 settings->network_bridge = NULL;
2977 }
2978 }
2979
2980 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
2981 settings->expose_ports) {
2982
2983 if (!arg_settings_trusted)
2984 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
2985 else {
2986 expose_port_free_all(arg_expose_ports);
2987 arg_expose_ports = settings->expose_ports;
2988 settings->expose_ports = NULL;
2989 }
2990 }
2991
2992 return 0;
2993 }
2994
2995 int main(int argc, char *argv[]) {
2996
2997 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
2998 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2999 _cleanup_close_ int master = -1, image_fd = -1;
3000 _cleanup_fdset_free_ FDSet *fds = NULL;
3001 int r, n_fd_passed, loop_nr = -1;
3002 char veth_name[IFNAMSIZ];
3003 bool secondary = false, remove_subvol = false;
3004 sigset_t mask_chld;
3005 pid_t pid = 0;
3006 int ret = EXIT_SUCCESS;
3007 union in_addr_union exposed = {};
3008 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3009 bool interactive;
3010
3011 log_parse_environment();
3012 log_open();
3013
3014 r = parse_argv(argc, argv);
3015 if (r <= 0)
3016 goto finish;
3017
3018 if (geteuid() != 0) {
3019 log_error("Need to be root.");
3020 r = -EPERM;
3021 goto finish;
3022 }
3023 r = determine_names();
3024 if (r < 0)
3025 goto finish;
3026
3027 r = load_settings();
3028 if (r < 0)
3029 goto finish;
3030
3031 r = verify_arguments();
3032 if (r < 0)
3033 goto finish;
3034
3035 n_fd_passed = sd_listen_fds(false);
3036 if (n_fd_passed > 0) {
3037 r = fdset_new_listen_fds(&fds, false);
3038 if (r < 0) {
3039 log_error_errno(r, "Failed to collect file descriptors: %m");
3040 goto finish;
3041 }
3042 }
3043
3044 if (arg_directory) {
3045 assert(!arg_image);
3046
3047 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3048 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3049 r = -EINVAL;
3050 goto finish;
3051 }
3052
3053 if (arg_ephemeral) {
3054 _cleanup_free_ char *np = NULL;
3055
3056 /* If the specified path is a mount point we
3057 * generate the new snapshot immediately
3058 * inside it under a random name. However if
3059 * the specified is not a mount point we
3060 * create the new snapshot in the parent
3061 * directory, just next to it. */
3062 r = path_is_mount_point(arg_directory, 0);
3063 if (r < 0) {
3064 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3065 goto finish;
3066 }
3067 if (r > 0)
3068 r = tempfn_random_child(arg_directory, "machine.", &np);
3069 else
3070 r = tempfn_random(arg_directory, "machine.", &np);
3071 if (r < 0) {
3072 log_error_errno(r, "Failed to generate name for snapshot: %m");
3073 goto finish;
3074 }
3075
3076 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3077 if (r < 0) {
3078 log_error_errno(r, "Failed to lock %s: %m", np);
3079 goto finish;
3080 }
3081
3082 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3083 if (r < 0) {
3084 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3085 goto finish;
3086 }
3087
3088 free(arg_directory);
3089 arg_directory = np;
3090 np = NULL;
3091
3092 remove_subvol = true;
3093
3094 } else {
3095 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3096 if (r == -EBUSY) {
3097 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3098 goto finish;
3099 }
3100 if (r < 0) {
3101 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3102 return r;
3103 }
3104
3105 if (arg_template) {
3106 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3107 if (r == -EEXIST) {
3108 if (!arg_quiet)
3109 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3110 } else if (r < 0) {
3111 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3112 goto finish;
3113 } else {
3114 if (!arg_quiet)
3115 log_info("Populated %s from template %s.", arg_directory, arg_template);
3116 }
3117 }
3118 }
3119
3120 if (arg_boot) {
3121 if (path_is_os_tree(arg_directory) <= 0) {
3122 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3123 r = -EINVAL;
3124 goto finish;
3125 }
3126 } else {
3127 const char *p;
3128
3129 p = strjoina(arg_directory, "/usr/");
3130 if (laccess(p, F_OK) < 0) {
3131 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3132 r = -EINVAL;
3133 goto finish;
3134 }
3135 }
3136
3137 } else {
3138 char template[] = "/tmp/nspawn-root-XXXXXX";
3139
3140 assert(arg_image);
3141 assert(!arg_template);
3142
3143 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3144 if (r == -EBUSY) {
3145 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3146 goto finish;
3147 }
3148 if (r < 0) {
3149 r = log_error_errno(r, "Failed to create image lock: %m");
3150 goto finish;
3151 }
3152
3153 if (!mkdtemp(template)) {
3154 log_error_errno(errno, "Failed to create temporary directory: %m");
3155 r = -errno;
3156 goto finish;
3157 }
3158
3159 arg_directory = strdup(template);
3160 if (!arg_directory) {
3161 r = log_oom();
3162 goto finish;
3163 }
3164
3165 image_fd = setup_image(&device_path, &loop_nr);
3166 if (image_fd < 0) {
3167 r = image_fd;
3168 goto finish;
3169 }
3170
3171 r = dissect_image(image_fd,
3172 &root_device, &root_device_rw,
3173 &home_device, &home_device_rw,
3174 &srv_device, &srv_device_rw,
3175 &secondary);
3176 if (r < 0)
3177 goto finish;
3178 }
3179
3180 r = custom_mounts_prepare();
3181 if (r < 0)
3182 goto finish;
3183
3184 interactive =
3185 isatty(STDIN_FILENO) > 0 &&
3186 isatty(STDOUT_FILENO) > 0;
3187
3188 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3189 if (master < 0) {
3190 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3191 goto finish;
3192 }
3193
3194 r = ptsname_malloc(master, &console);
3195 if (r < 0) {
3196 r = log_error_errno(r, "Failed to determine tty name: %m");
3197 goto finish;
3198 }
3199
3200 if (unlockpt(master) < 0) {
3201 r = log_error_errno(errno, "Failed to unlock tty: %m");
3202 goto finish;
3203 }
3204
3205 if (!arg_quiet)
3206 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3207 arg_machine, arg_image ?: arg_directory);
3208
3209 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3210
3211 assert_se(sigemptyset(&mask_chld) == 0);
3212 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3213
3214 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3215 r = log_error_errno(errno, "Failed to become subreaper: %m");
3216 goto finish;
3217 }
3218
3219 for (;;) {
3220 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3221 uid_shift_socket_pair[2] = { -1, -1 };
3222 ContainerStatus container_status;
3223 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3224 static const struct sigaction sa = {
3225 .sa_handler = nop_signal_handler,
3226 .sa_flags = SA_NOCLDSTOP,
3227 };
3228 int ifi = 0;
3229 ssize_t l;
3230 _cleanup_event_unref_ sd_event *event = NULL;
3231 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3232 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3233 char last_char = 0;
3234
3235 r = barrier_create(&barrier);
3236 if (r < 0) {
3237 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3238 goto finish;
3239 }
3240
3241 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3242 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3243 goto finish;
3244 }
3245
3246 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3247 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3248 goto finish;
3249 }
3250
3251 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3252 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3253 goto finish;
3254 }
3255
3256 if (arg_userns)
3257 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3258 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3259 goto finish;
3260 }
3261
3262 /* Child can be killed before execv(), so handle SIGCHLD
3263 * in order to interrupt parent's blocking calls and
3264 * give it a chance to call wait() and terminate. */
3265 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3266 if (r < 0) {
3267 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3268 goto finish;
3269 }
3270
3271 r = sigaction(SIGCHLD, &sa, NULL);
3272 if (r < 0) {
3273 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3274 goto finish;
3275 }
3276
3277 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3278 if (pid < 0) {
3279 if (errno == EINVAL)
3280 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3281 else
3282 r = log_error_errno(errno, "clone() failed: %m");
3283
3284 goto finish;
3285 }
3286
3287 if (pid == 0) {
3288 /* The outer child only has a file system namespace. */
3289 barrier_set_role(&barrier, BARRIER_CHILD);
3290
3291 master = safe_close(master);
3292
3293 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3294 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3295 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3296 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3297
3298 (void) reset_all_signal_handlers();
3299 (void) reset_signal_mask();
3300
3301 r = outer_child(&barrier,
3302 arg_directory,
3303 console,
3304 root_device, root_device_rw,
3305 home_device, home_device_rw,
3306 srv_device, srv_device_rw,
3307 interactive,
3308 secondary,
3309 pid_socket_pair[1],
3310 kmsg_socket_pair[1],
3311 rtnl_socket_pair[1],
3312 uid_shift_socket_pair[1],
3313 fds);
3314 if (r < 0)
3315 _exit(EXIT_FAILURE);
3316
3317 _exit(EXIT_SUCCESS);
3318 }
3319
3320 barrier_set_role(&barrier, BARRIER_PARENT);
3321
3322 fds = fdset_free(fds);
3323
3324 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3325 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3326 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3327 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3328
3329 /* Wait for the outer child. */
3330 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3331 if (r < 0)
3332 goto finish;
3333 if (r != 0) {
3334 r = -EIO;
3335 goto finish;
3336 }
3337 pid = 0;
3338
3339 /* And now retrieve the PID of the inner child. */
3340 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3341 if (l < 0) {
3342 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3343 goto finish;
3344 }
3345 if (l != sizeof(pid)) {
3346 log_error("Short read while reading inner child PID.");
3347 r = EIO;
3348 goto finish;
3349 }
3350
3351 log_debug("Init process invoked as PID " PID_FMT, pid);
3352
3353 if (arg_userns) {
3354 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3355 log_error("Child died too early.");
3356 r = -ESRCH;
3357 goto finish;
3358 }
3359
3360 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3361 if (l < 0) {
3362 r = log_error_errno(errno, "Failed to read UID shift: %m");
3363 goto finish;
3364 }
3365 if (l != sizeof(arg_uid_shift)) {
3366 log_error("Short read while reading UID shift.");
3367 r = EIO;
3368 goto finish;
3369 }
3370
3371 r = setup_uid_map(pid);
3372 if (r < 0)
3373 goto finish;
3374
3375 (void) barrier_place(&barrier); /* #2 */
3376 }
3377
3378 if (arg_private_network) {
3379
3380 r = move_network_interfaces(pid, arg_network_interfaces);
3381 if (r < 0)
3382 goto finish;
3383
3384 if (arg_network_veth) {
3385 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3386 if (r < 0)
3387 goto finish;
3388 else if (r > 0)
3389 ifi = r;
3390
3391 if (arg_network_bridge) {
3392 r = setup_bridge(veth_name, arg_network_bridge);
3393 if (r < 0)
3394 goto finish;
3395 if (r > 0)
3396 ifi = r;
3397 }
3398 }
3399
3400 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3401 if (r < 0)
3402 goto finish;
3403
3404 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3405 if (r < 0)
3406 goto finish;
3407 }
3408
3409 if (arg_register) {
3410 r = register_machine(
3411 arg_machine,
3412 pid,
3413 arg_directory,
3414 arg_uuid,
3415 ifi,
3416 arg_slice,
3417 arg_custom_mounts, arg_n_custom_mounts,
3418 arg_kill_signal,
3419 arg_property,
3420 arg_keep_unit);
3421 if (r < 0)
3422 goto finish;
3423 }
3424
3425 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3426 if (r < 0)
3427 goto finish;
3428
3429 if (arg_keep_unit) {
3430 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3431 if (r < 0)
3432 goto finish;
3433 }
3434
3435 r = chown_cgroup(pid, arg_uid_shift);
3436 if (r < 0)
3437 goto finish;
3438
3439 /* Notify the child that the parent is ready with all
3440 * its setup (including cgroup-ification), and that
3441 * the child can now hand over control to the code to
3442 * run inside the container. */
3443 (void) barrier_place(&barrier); /* #3 */
3444
3445 /* Block SIGCHLD here, before notifying child.
3446 * process_pty() will handle it with the other signals. */
3447 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3448
3449 /* Reset signal to default */
3450 r = default_signals(SIGCHLD, -1);
3451 if (r < 0) {
3452 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3453 goto finish;
3454 }
3455
3456 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3457 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3458 log_error("Child died too early.");
3459 r = -ESRCH;
3460 goto finish;
3461 }
3462
3463 sd_notifyf(false,
3464 "READY=1\n"
3465 "STATUS=Container running.\n"
3466 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3467
3468 r = sd_event_new(&event);
3469 if (r < 0) {
3470 log_error_errno(r, "Failed to get default event source: %m");
3471 goto finish;
3472 }
3473
3474 if (arg_kill_signal > 0) {
3475 /* Try to kill the init system on SIGINT or SIGTERM */
3476 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3477 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3478 } else {
3479 /* Immediately exit */
3480 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3481 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3482 }
3483
3484 /* simply exit on sigchld */
3485 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3486
3487 if (arg_expose_ports) {
3488 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3489 if (r < 0)
3490 goto finish;
3491
3492 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3493 }
3494
3495 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3496
3497 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3498 if (r < 0) {
3499 log_error_errno(r, "Failed to create PTY forwarder: %m");
3500 goto finish;
3501 }
3502
3503 r = sd_event_loop(event);
3504 if (r < 0) {
3505 log_error_errno(r, "Failed to run event loop: %m");
3506 goto finish;
3507 }
3508
3509 pty_forward_get_last_char(forward, &last_char);
3510
3511 forward = pty_forward_free(forward);
3512
3513 if (!arg_quiet && last_char != '\n')
3514 putc('\n', stdout);
3515
3516 /* Kill if it is not dead yet anyway */
3517 if (arg_register && !arg_keep_unit)
3518 terminate_machine(pid);
3519
3520 /* Normally redundant, but better safe than sorry */
3521 kill(pid, SIGKILL);
3522
3523 r = wait_for_container(pid, &container_status);
3524 pid = 0;
3525
3526 if (r < 0)
3527 /* We failed to wait for the container, or the
3528 * container exited abnormally */
3529 goto finish;
3530 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3531 /* The container exited with a non-zero
3532 * status, or with zero status and no reboot
3533 * was requested. */
3534 ret = r;
3535 break;
3536 }
3537
3538 /* CONTAINER_REBOOTED, loop again */
3539
3540 if (arg_keep_unit) {
3541 /* Special handling if we are running as a
3542 * service: instead of simply restarting the
3543 * machine we want to restart the entire
3544 * service, so let's inform systemd about this
3545 * with the special exit code 133. The service
3546 * file uses RestartForceExitStatus=133 so
3547 * that this results in a full nspawn
3548 * restart. This is necessary since we might
3549 * have cgroup parameters set we want to have
3550 * flushed out. */
3551 ret = 133;
3552 r = 0;
3553 break;
3554 }
3555
3556 expose_port_flush(arg_expose_ports, &exposed);
3557 }
3558
3559 finish:
3560 sd_notify(false,
3561 "STOPPING=1\n"
3562 "STATUS=Terminating...");
3563
3564 if (pid > 0)
3565 kill(pid, SIGKILL);
3566
3567 /* Try to flush whatever is still queued in the pty */
3568 if (master >= 0)
3569 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3570
3571 loop_remove(loop_nr, &image_fd);
3572
3573 if (remove_subvol && arg_directory) {
3574 int k;
3575
3576 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
3577 if (k < 0)
3578 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3579 }
3580
3581 if (arg_machine) {
3582 const char *p;
3583
3584 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3585 (void) rm_rf(p, REMOVE_ROOT);
3586 }
3587
3588 expose_port_flush(arg_expose_ports, &exposed);
3589
3590 free(arg_directory);
3591 free(arg_template);
3592 free(arg_image);
3593 free(arg_machine);
3594 free(arg_user);
3595 strv_free(arg_setenv);
3596 free(arg_network_bridge);
3597 strv_free(arg_network_interfaces);
3598 strv_free(arg_network_macvlan);
3599 strv_free(arg_network_ipvlan);
3600 strv_free(arg_parameters);
3601 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3602 expose_port_free_all(arg_expose_ports);
3603
3604 return r < 0 ? EXIT_FAILURE : ret;
3605 }