]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
72f7d66782ecd81e5ffea76d0dd52596d8829ee8
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93 #include "cap-list.h"
94 #include "btrfs-util.h"
95
96 #ifdef HAVE_SECCOMP
97 #include "seccomp-util.h"
98 #endif
99
/* How a container run ended.
 * NOTE(review): the code consuming this enum lies outside the visible
 * part of the file -- confirm the exact semantics there. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;

/* Journal linking mode, selected with --link-journal= or -j. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;

/* Volatile mode, selected with --volatile[=MODE]: off, full ("yes")
 * or "state" only. */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
} Volatile;
117
118 static char *arg_directory = NULL;
119 static char *arg_template = NULL;
120 static char *arg_user = NULL;
121 static sd_id128_t arg_uuid = {};
122 static char *arg_machine = NULL;
123 static const char *arg_selinux_context = NULL;
124 static const char *arg_selinux_apifs_context = NULL;
125 static const char *arg_slice = NULL;
126 static bool arg_private_network = false;
127 static bool arg_read_only = false;
128 static bool arg_boot = false;
129 static bool arg_ephemeral = false;
130 static LinkJournal arg_link_journal = LINK_AUTO;
131 static bool arg_link_journal_try = false;
132 static uint64_t arg_retain =
133 (1ULL << CAP_CHOWN) |
134 (1ULL << CAP_DAC_OVERRIDE) |
135 (1ULL << CAP_DAC_READ_SEARCH) |
136 (1ULL << CAP_FOWNER) |
137 (1ULL << CAP_FSETID) |
138 (1ULL << CAP_IPC_OWNER) |
139 (1ULL << CAP_KILL) |
140 (1ULL << CAP_LEASE) |
141 (1ULL << CAP_LINUX_IMMUTABLE) |
142 (1ULL << CAP_NET_BIND_SERVICE) |
143 (1ULL << CAP_NET_BROADCAST) |
144 (1ULL << CAP_NET_RAW) |
145 (1ULL << CAP_SETGID) |
146 (1ULL << CAP_SETFCAP) |
147 (1ULL << CAP_SETPCAP) |
148 (1ULL << CAP_SETUID) |
149 (1ULL << CAP_SYS_ADMIN) |
150 (1ULL << CAP_SYS_CHROOT) |
151 (1ULL << CAP_SYS_NICE) |
152 (1ULL << CAP_SYS_PTRACE) |
153 (1ULL << CAP_SYS_TTY_CONFIG) |
154 (1ULL << CAP_SYS_RESOURCE) |
155 (1ULL << CAP_SYS_BOOT) |
156 (1ULL << CAP_AUDIT_WRITE) |
157 (1ULL << CAP_AUDIT_CONTROL) |
158 (1ULL << CAP_MKNOD);
159 static char **arg_bind = NULL;
160 static char **arg_bind_ro = NULL;
161 static char **arg_tmpfs = NULL;
162 static char **arg_setenv = NULL;
163 static bool arg_quiet = false;
164 static bool arg_share_system = false;
165 static bool arg_register = true;
166 static bool arg_keep_unit = false;
167 static char **arg_network_interfaces = NULL;
168 static char **arg_network_macvlan = NULL;
169 static bool arg_network_veth = false;
170 static const char *arg_network_bridge = NULL;
171 static unsigned long arg_personality = 0xffffffffLU;
172 static char *arg_image = NULL;
173 static Volatile arg_volatile = VOLATILE_NO;
174
175 static void help(void) {
176 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
177 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
178 " -h --help Show this help\n"
179 " --version Print version string\n"
180 " -q --quiet Do not show status information\n"
181 " -D --directory=PATH Root directory for the container\n"
182 " --template=PATH Initialize root directory from template directory,\n"
183 " if missing\n"
184 " -x --ephemeral Run container with snapshot of root directory, and\n"
185 " remove it after exit\n"
186 " -i --image=PATH File system device or disk image for the container\n"
187 " -b --boot Boot up full system (i.e. invoke init)\n"
188 " -u --user=USER Run the command under specified user or uid\n"
189 " -M --machine=NAME Set the machine name for the container\n"
190 " --uuid=UUID Set a specific machine UUID for the container\n"
191 " -S --slice=SLICE Place the container in the specified slice\n"
192 " --private-network Disable network in container\n"
193 " --network-interface=INTERFACE\n"
194 " Assign an existing network interface to the\n"
195 " container\n"
196 " --network-macvlan=INTERFACE\n"
197 " Create a macvlan network interface based on an\n"
198 " existing network interface to the container\n"
199 " --network-veth Add a virtual ethernet connection between host\n"
200 " and container\n"
201 " --network-bridge=INTERFACE\n"
202 " Add a virtual ethernet connection between host\n"
203 " and container and add it to an existing bridge on\n"
204 " the host\n"
205 " -Z --selinux-context=SECLABEL\n"
206 " Set the SELinux security context to be used by\n"
207 " processes in the container\n"
208 " -L --selinux-apifs-context=SECLABEL\n"
209 " Set the SELinux security context to be used by\n"
210 " API/tmpfs file systems in the container\n"
211 " --capability=CAP In addition to the default, retain specified\n"
212 " capability\n"
213 " --drop-capability=CAP Drop the specified capability from the default set\n"
214 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
215 " try-guest, try-host\n"
216 " -j Equivalent to --link-journal=try-guest\n"
217 " --read-only Mount the root directory read-only\n"
218 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
219 " the container\n"
220 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
221 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
222 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
223 " --share-system Share system namespaces with host\n"
224 " --register=BOOLEAN Register container as machine\n"
225 " --keep-unit Do not register a scope for the machine, reuse\n"
226 " the service unit nspawn is running in\n"
227 " --volatile[=MODE] Run the system in volatile mode\n",
228 program_invocation_short_name);
229 }
230
/* Replaces *b with a cleaned-up absolute version of path.
 *
 * The path is canonicalized if it exists; if it does not exist (ENOENT)
 * it is merely made absolute relative to the current working directory.
 * Returns 0 on success, a negative errno-style value otherwise, in which
 * case *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Nothing there yet, settle for an absolute path */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
251
252 static int parse_argv(int argc, char *argv[]) {
253
254 enum {
255 ARG_VERSION = 0x100,
256 ARG_PRIVATE_NETWORK,
257 ARG_UUID,
258 ARG_READ_ONLY,
259 ARG_CAPABILITY,
260 ARG_DROP_CAPABILITY,
261 ARG_LINK_JOURNAL,
262 ARG_BIND,
263 ARG_BIND_RO,
264 ARG_TMPFS,
265 ARG_SETENV,
266 ARG_SHARE_SYSTEM,
267 ARG_REGISTER,
268 ARG_KEEP_UNIT,
269 ARG_NETWORK_INTERFACE,
270 ARG_NETWORK_MACVLAN,
271 ARG_NETWORK_VETH,
272 ARG_NETWORK_BRIDGE,
273 ARG_PERSONALITY,
274 ARG_VOLATILE,
275 ARG_TEMPLATE,
276 };
277
278 static const struct option options[] = {
279 { "help", no_argument, NULL, 'h' },
280 { "version", no_argument, NULL, ARG_VERSION },
281 { "directory", required_argument, NULL, 'D' },
282 { "template", required_argument, NULL, ARG_TEMPLATE },
283 { "ephemeral", no_argument, NULL, 'x' },
284 { "user", required_argument, NULL, 'u' },
285 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
286 { "boot", no_argument, NULL, 'b' },
287 { "uuid", required_argument, NULL, ARG_UUID },
288 { "read-only", no_argument, NULL, ARG_READ_ONLY },
289 { "capability", required_argument, NULL, ARG_CAPABILITY },
290 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
291 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
292 { "bind", required_argument, NULL, ARG_BIND },
293 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
294 { "tmpfs", required_argument, NULL, ARG_TMPFS },
295 { "machine", required_argument, NULL, 'M' },
296 { "slice", required_argument, NULL, 'S' },
297 { "setenv", required_argument, NULL, ARG_SETENV },
298 { "selinux-context", required_argument, NULL, 'Z' },
299 { "selinux-apifs-context", required_argument, NULL, 'L' },
300 { "quiet", no_argument, NULL, 'q' },
301 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
302 { "register", required_argument, NULL, ARG_REGISTER },
303 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
304 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
305 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
306 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
307 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
308 { "personality", required_argument, NULL, ARG_PERSONALITY },
309 { "image", required_argument, NULL, 'i' },
310 { "volatile", optional_argument, NULL, ARG_VOLATILE },
311 {}
312 };
313
314 int c, r;
315 uint64_t plus = 0, minus = 0;
316
317 assert(argc >= 0);
318 assert(argv);
319
320 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:x", options, NULL)) >= 0)
321
322 switch (c) {
323
324 case 'h':
325 help();
326 return 0;
327
328 case ARG_VERSION:
329 puts(PACKAGE_STRING);
330 puts(SYSTEMD_FEATURES);
331 return 0;
332
333 case 'D':
334 r = set_sanitized_path(&arg_directory, optarg);
335 if (r < 0)
336 return log_error_errno(r, "Invalid root directory: %m");
337
338 break;
339
340 case ARG_TEMPLATE:
341 r = set_sanitized_path(&arg_template, optarg);
342 if (r < 0)
343 return log_error_errno(r, "Invalid template directory: %m");
344
345 break;
346
347 case 'i':
348 r = set_sanitized_path(&arg_image, optarg);
349 if (r < 0)
350 return log_error_errno(r, "Invalid image path: %m");
351
352 break;
353
354 case 'x':
355 arg_ephemeral = true;
356 break;
357
358 case 'u':
359 free(arg_user);
360 arg_user = strdup(optarg);
361 if (!arg_user)
362 return log_oom();
363
364 break;
365
366 case ARG_NETWORK_BRIDGE:
367 arg_network_bridge = optarg;
368
369 /* fall through */
370
371 case ARG_NETWORK_VETH:
372 arg_network_veth = true;
373 arg_private_network = true;
374 break;
375
376 case ARG_NETWORK_INTERFACE:
377 if (strv_extend(&arg_network_interfaces, optarg) < 0)
378 return log_oom();
379
380 arg_private_network = true;
381 break;
382
383 case ARG_NETWORK_MACVLAN:
384 if (strv_extend(&arg_network_macvlan, optarg) < 0)
385 return log_oom();
386
387 /* fall through */
388
389 case ARG_PRIVATE_NETWORK:
390 arg_private_network = true;
391 break;
392
393 case 'b':
394 arg_boot = true;
395 break;
396
397 case ARG_UUID:
398 r = sd_id128_from_string(optarg, &arg_uuid);
399 if (r < 0) {
400 log_error("Invalid UUID: %s", optarg);
401 return r;
402 }
403 break;
404
405 case 'S':
406 arg_slice = optarg;
407 break;
408
409 case 'M':
410 if (isempty(optarg)) {
411 free(arg_machine);
412 arg_machine = NULL;
413 } else {
414 if (!machine_name_is_valid(optarg)) {
415 log_error("Invalid machine name: %s", optarg);
416 return -EINVAL;
417 }
418
419 r = free_and_strdup(&arg_machine, optarg);
420 if (r < 0)
421 return log_oom();
422
423 break;
424 }
425
426 case 'Z':
427 arg_selinux_context = optarg;
428 break;
429
430 case 'L':
431 arg_selinux_apifs_context = optarg;
432 break;
433
434 case ARG_READ_ONLY:
435 arg_read_only = true;
436 break;
437
438 case ARG_CAPABILITY:
439 case ARG_DROP_CAPABILITY: {
440 const char *state, *word;
441 size_t length;
442
443 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
444 _cleanup_free_ char *t;
445
446 t = strndup(word, length);
447 if (!t)
448 return log_oom();
449
450 if (streq(t, "all")) {
451 if (c == ARG_CAPABILITY)
452 plus = (uint64_t) -1;
453 else
454 minus = (uint64_t) -1;
455 } else {
456 int cap;
457
458 cap = capability_from_name(t);
459 if (cap < 0) {
460 log_error("Failed to parse capability %s.", t);
461 return -EINVAL;
462 }
463
464 if (c == ARG_CAPABILITY)
465 plus |= 1ULL << (uint64_t) cap;
466 else
467 minus |= 1ULL << (uint64_t) cap;
468 }
469 }
470
471 break;
472 }
473
474 case 'j':
475 arg_link_journal = LINK_GUEST;
476 arg_link_journal_try = true;
477 break;
478
479 case ARG_LINK_JOURNAL:
480 if (streq(optarg, "auto")) {
481 arg_link_journal = LINK_AUTO;
482 arg_link_journal_try = false;
483 } else if (streq(optarg, "no")) {
484 arg_link_journal = LINK_NO;
485 arg_link_journal_try = false;
486 } else if (streq(optarg, "guest")) {
487 arg_link_journal = LINK_GUEST;
488 arg_link_journal_try = false;
489 } else if (streq(optarg, "host")) {
490 arg_link_journal = LINK_HOST;
491 arg_link_journal_try = false;
492 } else if (streq(optarg, "try-guest")) {
493 arg_link_journal = LINK_GUEST;
494 arg_link_journal_try = true;
495 } else if (streq(optarg, "try-host")) {
496 arg_link_journal = LINK_HOST;
497 arg_link_journal_try = true;
498 } else {
499 log_error("Failed to parse link journal mode %s", optarg);
500 return -EINVAL;
501 }
502
503 break;
504
505 case ARG_BIND:
506 case ARG_BIND_RO: {
507 _cleanup_free_ char *a = NULL, *b = NULL;
508 char *e;
509 char ***x;
510
511 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
512
513 e = strchr(optarg, ':');
514 if (e) {
515 a = strndup(optarg, e - optarg);
516 b = strdup(e + 1);
517 } else {
518 a = strdup(optarg);
519 b = strdup(optarg);
520 }
521
522 if (!a || !b)
523 return log_oom();
524
525 if (!path_is_absolute(a) || !path_is_absolute(b)) {
526 log_error("Invalid bind mount specification: %s", optarg);
527 return -EINVAL;
528 }
529
530 r = strv_extend(x, a);
531 if (r < 0)
532 return log_oom();
533
534 r = strv_extend(x, b);
535 if (r < 0)
536 return log_oom();
537
538 break;
539 }
540
541 case ARG_TMPFS: {
542 _cleanup_free_ char *a = NULL, *b = NULL;
543 char *e;
544
545 e = strchr(optarg, ':');
546 if (e) {
547 a = strndup(optarg, e - optarg);
548 b = strdup(e + 1);
549 } else {
550 a = strdup(optarg);
551 b = strdup("mode=0755");
552 }
553
554 if (!a || !b)
555 return log_oom();
556
557 if (!path_is_absolute(a)) {
558 log_error("Invalid tmpfs specification: %s", optarg);
559 return -EINVAL;
560 }
561
562 r = strv_push(&arg_tmpfs, a);
563 if (r < 0)
564 return log_oom();
565
566 a = NULL;
567
568 r = strv_push(&arg_tmpfs, b);
569 if (r < 0)
570 return log_oom();
571
572 b = NULL;
573
574 break;
575 }
576
577 case ARG_SETENV: {
578 char **n;
579
580 if (!env_assignment_is_valid(optarg)) {
581 log_error("Environment variable assignment '%s' is not valid.", optarg);
582 return -EINVAL;
583 }
584
585 n = strv_env_set(arg_setenv, optarg);
586 if (!n)
587 return log_oom();
588
589 strv_free(arg_setenv);
590 arg_setenv = n;
591 break;
592 }
593
594 case 'q':
595 arg_quiet = true;
596 break;
597
598 case ARG_SHARE_SYSTEM:
599 arg_share_system = true;
600 break;
601
602 case ARG_REGISTER:
603 r = parse_boolean(optarg);
604 if (r < 0) {
605 log_error("Failed to parse --register= argument: %s", optarg);
606 return r;
607 }
608
609 arg_register = r;
610 break;
611
612 case ARG_KEEP_UNIT:
613 arg_keep_unit = true;
614 break;
615
616 case ARG_PERSONALITY:
617
618 arg_personality = personality_from_string(optarg);
619 if (arg_personality == 0xffffffffLU) {
620 log_error("Unknown or unsupported personality '%s'.", optarg);
621 return -EINVAL;
622 }
623
624 break;
625
626 case ARG_VOLATILE:
627
628 if (!optarg)
629 arg_volatile = VOLATILE_YES;
630 else {
631 r = parse_boolean(optarg);
632 if (r < 0) {
633 if (streq(optarg, "state"))
634 arg_volatile = VOLATILE_STATE;
635 else {
636 log_error("Failed to parse --volatile= argument: %s", optarg);
637 return r;
638 }
639 } else
640 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
641 }
642
643 break;
644
645 case '?':
646 return -EINVAL;
647
648 default:
649 assert_not_reached("Unhandled option");
650 }
651
652 if (arg_share_system)
653 arg_register = false;
654
655 if (arg_boot && arg_share_system) {
656 log_error("--boot and --share-system may not be combined.");
657 return -EINVAL;
658 }
659
660 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
661 log_error("--keep-unit may not be used when invoked from a user session.");
662 return -EINVAL;
663 }
664
665 if (arg_directory && arg_image) {
666 log_error("--directory= and --image= may not be combined.");
667 return -EINVAL;
668 }
669
670 if (arg_template && arg_image) {
671 log_error("--template= and --image= may not be combined.");
672 return -EINVAL;
673 }
674
675 if (arg_template && !(arg_directory || arg_machine)) {
676 log_error("--template= needs --directory= or --machine=.");
677 return -EINVAL;
678 }
679
680 if (arg_ephemeral && arg_template) {
681 log_error("--ephemeral and --template= may not be combined.");
682 return -EINVAL;
683 }
684
685 if (arg_ephemeral && arg_image) {
686 log_error("--ephemeral and --image= may not be combined.");
687 return -EINVAL;
688 }
689
690 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
691 log_error("--ephemeral and --link-journal= may not be combined.");
692 return -EINVAL;
693 }
694
695 if (arg_volatile != VOLATILE_NO && arg_read_only) {
696 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
697 return -EINVAL;
698 }
699
700 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
701
702 return 1;
703 }
704
705 static int mount_all(const char *dest) {
706
707 typedef struct MountPoint {
708 const char *what;
709 const char *where;
710 const char *type;
711 const char *options;
712 unsigned long flags;
713 bool fatal;
714 } MountPoint;
715
716 static const MountPoint mount_table[] = {
717 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
718 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
719 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
720 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
721 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
722 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
723 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
724 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
725 #ifdef HAVE_SELINUX
726 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
727 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
728 #endif
729 };
730
731 unsigned k;
732 int r = 0;
733
734 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
735 _cleanup_free_ char *where = NULL;
736 #ifdef HAVE_SELINUX
737 _cleanup_free_ char *options = NULL;
738 #endif
739 const char *o;
740 int t;
741
742 where = strjoin(dest, "/", mount_table[k].where, NULL);
743 if (!where)
744 return log_oom();
745
746 t = path_is_mount_point(where, true);
747 if (t < 0) {
748 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
749
750 if (r == 0)
751 r = t;
752
753 continue;
754 }
755
756 /* Skip this entry if it is not a remount. */
757 if (mount_table[k].what && t > 0)
758 continue;
759
760 t = mkdir_p(where, 0755);
761 if (t < 0) {
762 if (mount_table[k].fatal) {
763 log_error_errno(t, "Failed to create directory %s: %m", where);
764
765 if (r == 0)
766 r = t;
767 } else
768 log_warning_errno(t, "Failed to create directory %s: %m", where);
769
770 continue;
771 }
772
773 #ifdef HAVE_SELINUX
774 if (arg_selinux_apifs_context &&
775 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
776 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
777 if (!options)
778 return log_oom();
779
780 o = options;
781 } else
782 #endif
783 o = mount_table[k].options;
784
785
786 if (mount(mount_table[k].what,
787 where,
788 mount_table[k].type,
789 mount_table[k].flags,
790 o) < 0) {
791
792 if (mount_table[k].fatal) {
793 log_error_errno(errno, "mount(%s) failed: %m", where);
794
795 if (r == 0)
796 r = -errno;
797 } else
798 log_warning_errno(errno, "mount(%s) failed: %m", where);
799 }
800 }
801
802 return r;
803 }
804
805 static int mount_binds(const char *dest, char **l, bool ro) {
806 char **x, **y;
807
808 STRV_FOREACH_PAIR(x, y, l) {
809 _cleanup_free_ char *where = NULL;
810 struct stat source_st, dest_st;
811 int r;
812
813 if (stat(*x, &source_st) < 0)
814 return log_error_errno(errno, "Failed to stat %s: %m", *x);
815
816 where = strappend(dest, *y);
817 if (!where)
818 return log_oom();
819
820 r = stat(where, &dest_st);
821 if (r == 0) {
822 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
823 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
824 return -EINVAL;
825 }
826 } else if (errno == ENOENT) {
827 r = mkdir_parents_label(where, 0755);
828 if (r < 0)
829 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
830 } else {
831 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
832 return -errno;
833 }
834
835 /* Create the mount point, but be conservative -- refuse to create block
836 * and char devices. */
837 if (S_ISDIR(source_st.st_mode)) {
838 r = mkdir_label(where, 0755);
839 if (r < 0 && errno != EEXIST)
840 return log_error_errno(r, "Failed to create mount point %s: %m", where);
841 } else if (S_ISFIFO(source_st.st_mode)) {
842 r = mkfifo(where, 0644);
843 if (r < 0 && errno != EEXIST)
844 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
845 } else if (S_ISSOCK(source_st.st_mode)) {
846 r = mknod(where, 0644 | S_IFSOCK, 0);
847 if (r < 0 && errno != EEXIST)
848 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
849 } else if (S_ISREG(source_st.st_mode)) {
850 r = touch(where);
851 if (r < 0)
852 return log_error_errno(r, "Failed to create mount point %s: %m", where);
853 } else {
854 log_error("Refusing to create mountpoint for file: %s", *x);
855 return -ENOTSUP;
856 }
857
858 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
859 return log_error_errno(errno, "mount(%s) failed: %m", where);
860
861 if (ro) {
862 r = bind_remount_recursive(where, true);
863 if (r < 0)
864 return log_error_errno(r, "Read-Only bind mount failed: %m");
865 }
866 }
867
868 return 0;
869 }
870
871 static int mount_tmpfs(const char *dest) {
872 char **i, **o;
873
874 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
875 _cleanup_free_ char *where = NULL;
876 int r;
877
878 where = strappend(dest, *i);
879 if (!where)
880 return log_oom();
881
882 r = mkdir_label(where, 0755);
883 if (r < 0 && r != -EEXIST)
884 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
885
886 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
887 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
888 }
889
890 return 0;
891 }
892
893 static int setup_timezone(const char *dest) {
894 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
895 char *z, *y;
896 int r;
897
898 assert(dest);
899
900 /* Fix the timezone, if possible */
901 r = readlink_malloc("/etc/localtime", &p);
902 if (r < 0) {
903 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
904 return 0;
905 }
906
907 z = path_startswith(p, "../usr/share/zoneinfo/");
908 if (!z)
909 z = path_startswith(p, "/usr/share/zoneinfo/");
910 if (!z) {
911 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
912 return 0;
913 }
914
915 where = strappend(dest, "/etc/localtime");
916 if (!where)
917 return log_oom();
918
919 r = readlink_malloc(where, &q);
920 if (r >= 0) {
921 y = path_startswith(q, "../usr/share/zoneinfo/");
922 if (!y)
923 y = path_startswith(q, "/usr/share/zoneinfo/");
924
925 /* Already pointing to the right place? Then do nothing .. */
926 if (y && streq(y, z))
927 return 0;
928 }
929
930 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
931 if (!check)
932 return log_oom();
933
934 if (access(check, F_OK) < 0) {
935 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
936 return 0;
937 }
938
939 what = strappend("../usr/share/zoneinfo/", z);
940 if (!what)
941 return log_oom();
942
943 r = mkdir_parents(where, 0755);
944 if (r < 0) {
945 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
946
947 return 0;
948 }
949
950 r = unlink(where);
951 if (r < 0 && errno != ENOENT) {
952 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
953
954 return 0;
955 }
956
957 if (symlink(what, where) < 0) {
958 log_error_errno(errno, "Failed to correct timezone of container: %m");
959 return 0;
960 }
961
962 return 0;
963 }
964
965 static int setup_resolv_conf(const char *dest) {
966 _cleanup_free_ char *where = NULL;
967 int r;
968
969 assert(dest);
970
971 if (arg_private_network)
972 return 0;
973
974 /* Fix resolv.conf, if possible */
975 where = strappend(dest, "/etc/resolv.conf");
976 if (!where)
977 return log_oom();
978
979 /* We don't really care for the results of this really. If it
980 * fails, it fails, but meh... */
981 r = mkdir_parents(where, 0755);
982 if (r < 0) {
983 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
984
985 return 0;
986 }
987
988 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
989 if (r < 0) {
990 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
991
992 return 0;
993 }
994
995 return 0;
996 }
997
998 static int setup_volatile_state(const char *directory) {
999 const char *p;
1000 int r;
1001
1002 assert(directory);
1003
1004 if (arg_volatile != VOLATILE_STATE)
1005 return 0;
1006
1007 /* --volatile=state means we simply overmount /var
1008 with a tmpfs, and the rest read-only. */
1009
1010 r = bind_remount_recursive(directory, true);
1011 if (r < 0)
1012 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1013
1014 p = strappenda(directory, "/var");
1015 r = mkdir(p, 0755);
1016 if (r < 0 && errno != EEXIST)
1017 return log_error_errno(errno, "Failed to create %s: %m", directory);
1018
1019 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1020 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1021
1022 return 0;
1023 }
1024
1025 static int setup_volatile(const char *directory) {
1026 bool tmpfs_mounted = false, bind_mounted = false;
1027 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1028 const char *f, *t;
1029 int r;
1030
1031 assert(directory);
1032
1033 if (arg_volatile != VOLATILE_YES)
1034 return 0;
1035
1036 /* --volatile=yes means we mount a tmpfs to the root dir, and
1037 the original /usr to use inside it, and that read-only. */
1038
1039 if (!mkdtemp(template))
1040 return log_error_errno(errno, "Failed to create temporary directory: %m");
1041
1042 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1043 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1044 r = -errno;
1045 goto fail;
1046 }
1047
1048 tmpfs_mounted = true;
1049
1050 f = strappenda(directory, "/usr");
1051 t = strappenda(template, "/usr");
1052
1053 r = mkdir(t, 0755);
1054 if (r < 0 && errno != EEXIST) {
1055 log_error_errno(errno, "Failed to create %s: %m", t);
1056 r = -errno;
1057 goto fail;
1058 }
1059
1060 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1061 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1062 r = -errno;
1063 goto fail;
1064 }
1065
1066 bind_mounted = true;
1067
1068 r = bind_remount_recursive(t, true);
1069 if (r < 0) {
1070 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1071 goto fail;
1072 }
1073
1074 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1075 log_error_errno(errno, "Failed to move root mount: %m");
1076 r = -errno;
1077 goto fail;
1078 }
1079
1080 rmdir(template);
1081
1082 return 0;
1083
1084 fail:
1085 if (bind_mounted)
1086 umount(t);
1087 if (tmpfs_mounted)
1088 umount(template);
1089 rmdir(template);
1090 return r;
1091 }
1092
1093 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1094
1095 snprintf(s, 37,
1096 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1097 SD_ID128_FORMAT_VAL(id));
1098
1099 return s;
1100 }
1101
1102 static int setup_boot_id(const char *dest) {
1103 _cleanup_free_ char *from = NULL, *to = NULL;
1104 sd_id128_t rnd = {};
1105 char as_uuid[37];
1106 int r;
1107
1108 assert(dest);
1109
1110 if (arg_share_system)
1111 return 0;
1112
1113 /* Generate a new randomized boot ID, so that each boot-up of
1114 * the container gets a new one */
1115
1116 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1117 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1118 if (!from || !to)
1119 return log_oom();
1120
1121 r = sd_id128_randomize(&rnd);
1122 if (r < 0)
1123 return log_error_errno(r, "Failed to generate random boot id: %m");
1124
1125 id128_format_as_uuid(rnd, as_uuid);
1126
1127 r = write_string_file(from, as_uuid);
1128 if (r < 0)
1129 return log_error_errno(r, "Failed to write boot id: %m");
1130
1131 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1132 log_error_errno(errno, "Failed to bind mount boot id: %m");
1133 r = -errno;
1134 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1135 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1136
1137 unlink(from);
1138 return r;
1139 }
1140
1141 static int copy_devnodes(const char *dest) {
1142
1143 static const char devnodes[] =
1144 "null\0"
1145 "zero\0"
1146 "full\0"
1147 "random\0"
1148 "urandom\0"
1149 "tty\0"
1150 "net/tun\0";
1151
1152 const char *d;
1153 int r = 0;
1154 _cleanup_umask_ mode_t u;
1155
1156 assert(dest);
1157
1158 u = umask(0000);
1159
1160 NULSTR_FOREACH(d, devnodes) {
1161 _cleanup_free_ char *from = NULL, *to = NULL;
1162 struct stat st;
1163
1164 from = strappend("/dev/", d);
1165 to = strjoin(dest, "/dev/", d, NULL);
1166 if (!from || !to)
1167 return log_oom();
1168
1169 if (stat(from, &st) < 0) {
1170
1171 if (errno != ENOENT)
1172 return log_error_errno(errno, "Failed to stat %s: %m", from);
1173
1174 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1175
1176 log_error("%s is not a char or block device, cannot copy", from);
1177 return -EIO;
1178
1179 } else {
1180 r = mkdir_parents(to, 0775);
1181 if (r < 0) {
1182 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1183 return -r;
1184 }
1185
1186 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1187 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
1188 }
1189 }
1190
1191 return r;
1192 }
1193
1194 static int setup_ptmx(const char *dest) {
1195 _cleanup_free_ char *p = NULL;
1196
1197 p = strappend(dest, "/dev/ptmx");
1198 if (!p)
1199 return log_oom();
1200
1201 if (symlink("pts/ptmx", p) < 0)
1202 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1203
1204 return 0;
1205 }
1206
1207 static int setup_dev_console(const char *dest, const char *console) {
1208 _cleanup_umask_ mode_t u;
1209 const char *to;
1210 struct stat st;
1211 int r;
1212
1213 assert(dest);
1214 assert(console);
1215
1216 u = umask(0000);
1217
1218 if (stat("/dev/null", &st) < 0)
1219 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1220
1221 r = chmod_and_chown(console, 0600, 0, 0);
1222 if (r < 0)
1223 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1224
1225 /* We need to bind mount the right tty to /dev/console since
1226 * ptys can only exist on pts file systems. To have something
1227 * to bind mount things on we create a device node first, and
1228 * use /dev/null for that since we the cgroups device policy
1229 * allows us to create that freely, while we cannot create
1230 * /dev/console. (Note that the major minor doesn't actually
1231 * matter here, since we mount it over anyway). */
1232
1233 to = strappenda(dest, "/dev/console");
1234 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1235 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1236
1237 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1238 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1239
1240 return 0;
1241 }
1242
1243 static int setup_kmsg(const char *dest, int kmsg_socket) {
1244 _cleanup_free_ char *from = NULL, *to = NULL;
1245 int r, fd, k;
1246 _cleanup_umask_ mode_t u;
1247 union {
1248 struct cmsghdr cmsghdr;
1249 uint8_t buf[CMSG_SPACE(sizeof(int))];
1250 } control = {};
1251 struct msghdr mh = {
1252 .msg_control = &control,
1253 .msg_controllen = sizeof(control),
1254 };
1255 struct cmsghdr *cmsg;
1256
1257 assert(dest);
1258 assert(kmsg_socket >= 0);
1259
1260 u = umask(0000);
1261
1262 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1263 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1264 * on the reading side behave very similar to /proc/kmsg,
1265 * their writing side behaves differently from /dev/kmsg in
1266 * that writing blocks when nothing is reading. In order to
1267 * avoid any problems with containers deadlocking due to this
1268 * we simply make /dev/kmsg unavailable to the container. */
1269 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1270 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1271 return log_oom();
1272
1273 if (mkfifo(from, 0600) < 0)
1274 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1275
1276 r = chmod_and_chown(from, 0600, 0, 0);
1277 if (r < 0)
1278 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1279
1280 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1281 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1282
1283 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1284 if (fd < 0)
1285 return log_error_errno(errno, "Failed to open fifo: %m");
1286
1287 cmsg = CMSG_FIRSTHDR(&mh);
1288 cmsg->cmsg_level = SOL_SOCKET;
1289 cmsg->cmsg_type = SCM_RIGHTS;
1290 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1291 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1292
1293 mh.msg_controllen = cmsg->cmsg_len;
1294
1295 /* Store away the fd in the socket, so that it stays open as
1296 * long as we run the child */
1297 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1298 safe_close(fd);
1299
1300 if (k < 0)
1301 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1302
1303 /* And now make the FIFO unavailable as /dev/kmsg... */
1304 unlink(from);
1305 return 0;
1306 }
1307
1308 static int setup_hostname(void) {
1309
1310 if (arg_share_system)
1311 return 0;
1312
1313 if (sethostname_idempotent(arg_machine) < 0)
1314 return -errno;
1315
1316 return 0;
1317 }
1318
1319 static int setup_journal(const char *directory) {
1320 sd_id128_t machine_id, this_id;
1321 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1322 char *id;
1323 int r;
1324
1325 /* Don't link journals in ephemeral mode */
1326 if (arg_ephemeral)
1327 return 0;
1328
1329 p = strappend(directory, "/etc/machine-id");
1330 if (!p)
1331 return log_oom();
1332
1333 r = read_one_line_file(p, &b);
1334 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1335 return 0;
1336 else if (r < 0)
1337 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1338
1339 id = strstrip(b);
1340 if (isempty(id) && arg_link_journal == LINK_AUTO)
1341 return 0;
1342
1343 /* Verify validity */
1344 r = sd_id128_from_string(id, &machine_id);
1345 if (r < 0)
1346 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1347
1348 r = sd_id128_get_machine(&this_id);
1349 if (r < 0)
1350 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1351
1352 if (sd_id128_equal(machine_id, this_id)) {
1353 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1354 "Host and machine ids are equal (%s): refusing to link journals", id);
1355 if (arg_link_journal == LINK_AUTO)
1356 return 0;
1357 return -EEXIST;
1358 }
1359
1360 if (arg_link_journal == LINK_NO)
1361 return 0;
1362
1363 free(p);
1364 p = strappend("/var/log/journal/", id);
1365 q = strjoin(directory, "/var/log/journal/", id, NULL);
1366 if (!p || !q)
1367 return log_oom();
1368
1369 if (path_is_mount_point(p, false) > 0) {
1370 if (arg_link_journal != LINK_AUTO) {
1371 log_error("%s: already a mount point, refusing to use for journal", p);
1372 return -EEXIST;
1373 }
1374
1375 return 0;
1376 }
1377
1378 if (path_is_mount_point(q, false) > 0) {
1379 if (arg_link_journal != LINK_AUTO) {
1380 log_error("%s: already a mount point, refusing to use for journal", q);
1381 return -EEXIST;
1382 }
1383
1384 return 0;
1385 }
1386
1387 r = readlink_and_make_absolute(p, &d);
1388 if (r >= 0) {
1389 if ((arg_link_journal == LINK_GUEST ||
1390 arg_link_journal == LINK_AUTO) &&
1391 path_equal(d, q)) {
1392
1393 r = mkdir_p(q, 0755);
1394 if (r < 0)
1395 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1396 return 0;
1397 }
1398
1399 if (unlink(p) < 0)
1400 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1401 } else if (r == -EINVAL) {
1402
1403 if (arg_link_journal == LINK_GUEST &&
1404 rmdir(p) < 0) {
1405
1406 if (errno == ENOTDIR) {
1407 log_error("%s already exists and is neither a symlink nor a directory", p);
1408 return r;
1409 } else {
1410 log_error_errno(errno, "Failed to remove %s: %m", p);
1411 return -errno;
1412 }
1413 }
1414 } else if (r != -ENOENT) {
1415 log_error_errno(errno, "readlink(%s) failed: %m", p);
1416 return r;
1417 }
1418
1419 if (arg_link_journal == LINK_GUEST) {
1420
1421 if (symlink(q, p) < 0) {
1422 if (arg_link_journal_try) {
1423 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1424 return 0;
1425 } else {
1426 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1427 return -errno;
1428 }
1429 }
1430
1431 r = mkdir_p(q, 0755);
1432 if (r < 0)
1433 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1434 return 0;
1435 }
1436
1437 if (arg_link_journal == LINK_HOST) {
1438 /* don't create parents here -- if the host doesn't have
1439 * permanent journal set up, don't force it here */
1440 r = mkdir(p, 0755);
1441 if (r < 0) {
1442 if (arg_link_journal_try) {
1443 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1444 return 0;
1445 } else {
1446 log_error_errno(errno, "Failed to create %s: %m", p);
1447 return r;
1448 }
1449 }
1450
1451 } else if (access(p, F_OK) < 0)
1452 return 0;
1453
1454 if (dir_is_empty(q) == 0)
1455 log_warning("%s is not empty, proceeding anyway.", q);
1456
1457 r = mkdir_p(q, 0755);
1458 if (r < 0) {
1459 log_error_errno(errno, "Failed to create %s: %m", q);
1460 return r;
1461 }
1462
1463 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1464 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1465
1466 return 0;
1467 }
1468
1469 static int drop_capabilities(void) {
1470 return capability_bounding_set_drop(~arg_retain, false);
1471 }
1472
1473 static int register_machine(pid_t pid, int local_ifindex) {
1474 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1475 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1476 int r;
1477
1478 if (!arg_register)
1479 return 0;
1480
1481 r = sd_bus_default_system(&bus);
1482 if (r < 0)
1483 return log_error_errno(r, "Failed to open system bus: %m");
1484
1485 if (arg_keep_unit) {
1486 r = sd_bus_call_method(
1487 bus,
1488 "org.freedesktop.machine1",
1489 "/org/freedesktop/machine1",
1490 "org.freedesktop.machine1.Manager",
1491 "RegisterMachineWithNetwork",
1492 &error,
1493 NULL,
1494 "sayssusai",
1495 arg_machine,
1496 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1497 "nspawn",
1498 "container",
1499 (uint32_t) pid,
1500 strempty(arg_directory),
1501 local_ifindex > 0 ? 1 : 0, local_ifindex);
1502 } else {
1503 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1504
1505 r = sd_bus_message_new_method_call(
1506 bus,
1507 &m,
1508 "org.freedesktop.machine1",
1509 "/org/freedesktop/machine1",
1510 "org.freedesktop.machine1.Manager",
1511 "CreateMachineWithNetwork");
1512 if (r < 0)
1513 return log_error_errno(r, "Failed to create message: %m");
1514
1515 r = sd_bus_message_append(
1516 m,
1517 "sayssusai",
1518 arg_machine,
1519 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1520 "nspawn",
1521 "container",
1522 (uint32_t) pid,
1523 strempty(arg_directory),
1524 local_ifindex > 0 ? 1 : 0, local_ifindex);
1525 if (r < 0)
1526 return log_error_errno(r, "Failed to append message arguments: %m");
1527
1528 r = sd_bus_message_open_container(m, 'a', "(sv)");
1529 if (r < 0)
1530 return log_error_errno(r, "Failed to open container: %m");
1531
1532 if (!isempty(arg_slice)) {
1533 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1534 if (r < 0)
1535 return log_error_errno(r, "Failed to append slice: %m");
1536 }
1537
1538 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1539 if (r < 0)
1540 return log_error_errno(r, "Failed to add device policy: %m");
1541
1542 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1543 /* Allow the container to
1544 * access and create the API
1545 * device nodes, so that
1546 * PrivateDevices= in the
1547 * container can work
1548 * fine */
1549 "/dev/null", "rwm",
1550 "/dev/zero", "rwm",
1551 "/dev/full", "rwm",
1552 "/dev/random", "rwm",
1553 "/dev/urandom", "rwm",
1554 "/dev/tty", "rwm",
1555 "/dev/net/tun", "rwm",
1556 /* Allow the container
1557 * access to ptys. However,
1558 * do not permit the
1559 * container to ever create
1560 * these device nodes. */
1561 "/dev/pts/ptmx", "rw",
1562 "char-pts", "rw");
1563 if (r < 0)
1564 return log_error_errno(r, "Failed to add device whitelist: %m");
1565
1566 r = sd_bus_message_close_container(m);
1567 if (r < 0)
1568 return log_error_errno(r, "Failed to close container: %m");
1569
1570 r = sd_bus_call(bus, m, 0, &error, NULL);
1571 }
1572
1573 if (r < 0) {
1574 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1575 return r;
1576 }
1577
1578 return 0;
1579 }
1580
1581 static int terminate_machine(pid_t pid) {
1582 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1583 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1584 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1585 const char *path;
1586 int r;
1587
1588 if (!arg_register)
1589 return 0;
1590
1591 r = sd_bus_default_system(&bus);
1592 if (r < 0)
1593 return log_error_errno(r, "Failed to open system bus: %m");
1594
1595 r = sd_bus_call_method(
1596 bus,
1597 "org.freedesktop.machine1",
1598 "/org/freedesktop/machine1",
1599 "org.freedesktop.machine1.Manager",
1600 "GetMachineByPID",
1601 &error,
1602 &reply,
1603 "u",
1604 (uint32_t) pid);
1605 if (r < 0) {
1606 /* Note that the machine might already have been
1607 * cleaned up automatically, hence don't consider it a
1608 * failure if we cannot get the machine object. */
1609 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1610 return 0;
1611 }
1612
1613 r = sd_bus_message_read(reply, "o", &path);
1614 if (r < 0)
1615 return bus_log_parse_error(r);
1616
1617 r = sd_bus_call_method(
1618 bus,
1619 "org.freedesktop.machine1",
1620 path,
1621 "org.freedesktop.machine1.Machine",
1622 "Terminate",
1623 &error,
1624 NULL,
1625 NULL);
1626 if (r < 0) {
1627 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1628 return 0;
1629 }
1630
1631 return 0;
1632 }
1633
1634 static int reset_audit_loginuid(void) {
1635 _cleanup_free_ char *p = NULL;
1636 int r;
1637
1638 if (arg_share_system)
1639 return 0;
1640
1641 r = read_one_line_file("/proc/self/loginuid", &p);
1642 if (r == -ENOENT)
1643 return 0;
1644 if (r < 0)
1645 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1646
1647 /* Already reset? */
1648 if (streq(p, "4294967295"))
1649 return 0;
1650
1651 r = write_string_file("/proc/self/loginuid", "4294967295");
1652 if (r < 0) {
1653 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1654 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1655 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1656 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1657 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1658
1659 sleep(5);
1660 }
1661
1662 return 0;
1663 }
1664
1665 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1666 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1667 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
1668
1669 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1670 uint8_t result[8];
1671 size_t l, sz;
1672 uint8_t *v, *i;
1673 int r;
1674
1675 l = strlen(arg_machine);
1676 sz = sizeof(sd_id128_t) + l;
1677 if (idx > 0)
1678 sz += sizeof(idx);
1679
1680 v = alloca(sz);
1681
1682 /* fetch some persistent data unique to the host */
1683 r = sd_id128_get_machine((sd_id128_t*) v);
1684 if (r < 0)
1685 return r;
1686
1687 /* combine with some data unique (on this host) to this
1688 * container instance */
1689 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1690 if (idx > 0) {
1691 idx = htole64(idx);
1692 memcpy(i, &idx, sizeof(idx));
1693 }
1694
1695 /* Let's hash the host machine ID plus the container name. We
1696 * use a fixed, but originally randomly created hash key here. */
1697 siphash24(result, v, sz, hash_key.bytes);
1698
1699 assert_cc(ETH_ALEN <= sizeof(result));
1700 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1701
1702 /* see eth_random_addr in the kernel */
1703 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1704 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1705
1706 return 0;
1707 }
1708
1709 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1710 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1711 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1712 struct ether_addr mac_host, mac_container;
1713 int r, i;
1714
1715 if (!arg_private_network)
1716 return 0;
1717
1718 if (!arg_network_veth)
1719 return 0;
1720
1721 /* Use two different interface name prefixes depending whether
1722 * we are in bridge mode or not. */
1723 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1724 arg_network_bridge ? "vb" : "ve", arg_machine);
1725
1726 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1727 if (r < 0)
1728 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1729
1730 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1731 if (r < 0)
1732 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1733
1734 r = sd_rtnl_open(&rtnl, 0);
1735 if (r < 0)
1736 return log_error_errno(r, "Failed to connect to netlink: %m");
1737
1738 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1739 if (r < 0)
1740 return log_error_errno(r, "Failed to allocate netlink message: %m");
1741
1742 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1743 if (r < 0)
1744 return log_error_errno(r, "Failed to add netlink interface name: %m");
1745
1746 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1747 if (r < 0)
1748 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1749
1750 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1751 if (r < 0)
1752 return log_error_errno(r, "Failed to open netlink container: %m");
1753
1754 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1755 if (r < 0)
1756 return log_error_errno(r, "Failed to open netlink container: %m");
1757
1758 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1759 if (r < 0)
1760 return log_error_errno(r, "Failed to open netlink container: %m");
1761
1762 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1763 if (r < 0)
1764 return log_error_errno(r, "Failed to add netlink interface name: %m");
1765
1766 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1767 if (r < 0)
1768 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1769
1770 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1771 if (r < 0)
1772 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1773
1774 r = sd_rtnl_message_close_container(m);
1775 if (r < 0)
1776 return log_error_errno(r, "Failed to close netlink container: %m");
1777
1778 r = sd_rtnl_message_close_container(m);
1779 if (r < 0)
1780 return log_error_errno(r, "Failed to close netlink container: %m");
1781
1782 r = sd_rtnl_message_close_container(m);
1783 if (r < 0)
1784 return log_error_errno(r, "Failed to close netlink container: %m");
1785
1786 r = sd_rtnl_call(rtnl, m, 0, NULL);
1787 if (r < 0)
1788 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1789
1790 i = (int) if_nametoindex(iface_name);
1791 if (i <= 0)
1792 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
1793
1794 *ifi = i;
1795
1796 return 0;
1797 }
1798
1799 static int setup_bridge(const char veth_name[], int *ifi) {
1800 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1801 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1802 int r, bridge;
1803
1804 if (!arg_private_network)
1805 return 0;
1806
1807 if (!arg_network_veth)
1808 return 0;
1809
1810 if (!arg_network_bridge)
1811 return 0;
1812
1813 bridge = (int) if_nametoindex(arg_network_bridge);
1814 if (bridge <= 0)
1815 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1816
1817 *ifi = bridge;
1818
1819 r = sd_rtnl_open(&rtnl, 0);
1820 if (r < 0)
1821 return log_error_errno(r, "Failed to connect to netlink: %m");
1822
1823 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1824 if (r < 0)
1825 return log_error_errno(r, "Failed to allocate netlink message: %m");
1826
1827 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1828 if (r < 0)
1829 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1830
1831 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1832 if (r < 0)
1833 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1834
1835 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1836 if (r < 0)
1837 return log_error_errno(r, "Failed to add netlink master field: %m");
1838
1839 r = sd_rtnl_call(rtnl, m, 0, NULL);
1840 if (r < 0)
1841 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
1842
1843 return 0;
1844 }
1845
1846 static int parse_interface(struct udev *udev, const char *name) {
1847 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1848 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1849 int ifi;
1850
1851 ifi = (int) if_nametoindex(name);
1852 if (ifi <= 0)
1853 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1854
1855 sprintf(ifi_str, "n%i", ifi);
1856 d = udev_device_new_from_device_id(udev, ifi_str);
1857 if (!d)
1858 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1859
1860 if (udev_device_get_is_initialized(d) <= 0) {
1861 log_error("Network interface %s is not initialized yet.", name);
1862 return -EBUSY;
1863 }
1864
1865 return ifi;
1866 }
1867
1868 static int move_network_interfaces(pid_t pid) {
1869 _cleanup_udev_unref_ struct udev *udev = NULL;
1870 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1871 char **i;
1872 int r;
1873
1874 if (!arg_private_network)
1875 return 0;
1876
1877 if (strv_isempty(arg_network_interfaces))
1878 return 0;
1879
1880 r = sd_rtnl_open(&rtnl, 0);
1881 if (r < 0)
1882 return log_error_errno(r, "Failed to connect to netlink: %m");
1883
1884 udev = udev_new();
1885 if (!udev) {
1886 log_error("Failed to connect to udev.");
1887 return -ENOMEM;
1888 }
1889
1890 STRV_FOREACH(i, arg_network_interfaces) {
1891 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1892 int ifi;
1893
1894 ifi = parse_interface(udev, *i);
1895 if (ifi < 0)
1896 return ifi;
1897
1898 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1899 if (r < 0)
1900 return log_error_errno(r, "Failed to allocate netlink message: %m");
1901
1902 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1903 if (r < 0)
1904 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1905
1906 r = sd_rtnl_call(rtnl, m, 0, NULL);
1907 if (r < 0)
1908 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
1909 }
1910
1911 return 0;
1912 }
1913
1914 static int setup_macvlan(pid_t pid) {
1915 _cleanup_udev_unref_ struct udev *udev = NULL;
1916 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1917 unsigned idx = 0;
1918 char **i;
1919 int r;
1920
1921 if (!arg_private_network)
1922 return 0;
1923
1924 if (strv_isempty(arg_network_macvlan))
1925 return 0;
1926
1927 r = sd_rtnl_open(&rtnl, 0);
1928 if (r < 0)
1929 return log_error_errno(r, "Failed to connect to netlink: %m");
1930
1931 udev = udev_new();
1932 if (!udev) {
1933 log_error("Failed to connect to udev.");
1934 return -ENOMEM;
1935 }
1936
1937 STRV_FOREACH(i, arg_network_macvlan) {
1938 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1939 _cleanup_free_ char *n = NULL;
1940 struct ether_addr mac;
1941 int ifi;
1942
1943 ifi = parse_interface(udev, *i);
1944 if (ifi < 0)
1945 return ifi;
1946
1947 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1948 if (r < 0)
1949 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1950
1951 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1952 if (r < 0)
1953 return log_error_errno(r, "Failed to allocate netlink message: %m");
1954
1955 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1956 if (r < 0)
1957 return log_error_errno(r, "Failed to add netlink interface index: %m");
1958
1959 n = strappend("mv-", *i);
1960 if (!n)
1961 return log_oom();
1962
1963 strshorten(n, IFNAMSIZ-1);
1964
1965 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1966 if (r < 0)
1967 return log_error_errno(r, "Failed to add netlink interface name: %m");
1968
1969 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1970 if (r < 0)
1971 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1972
1973 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1974 if (r < 0)
1975 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1976
1977 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1978 if (r < 0)
1979 return log_error_errno(r, "Failed to open netlink container: %m");
1980
1981 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1982 if (r < 0)
1983 return log_error_errno(r, "Failed to open netlink container: %m");
1984
1985 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1986 if (r < 0)
1987 return log_error_errno(r, "Failed to append macvlan mode: %m");
1988
1989 r = sd_rtnl_message_close_container(m);
1990 if (r < 0)
1991 return log_error_errno(r, "Failed to close netlink container: %m");
1992
1993 r = sd_rtnl_message_close_container(m);
1994 if (r < 0)
1995 return log_error_errno(r, "Failed to close netlink container: %m");
1996
1997 r = sd_rtnl_call(rtnl, m, 0, NULL);
1998 if (r < 0)
1999 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2000 }
2001
2002 return 0;
2003 }
2004
/* Installs a seccomp filter that allows everything by default, but
 * makes a fixed list of system calls fail with EPERM, and creation of
 * NETLINK_AUDIT sockets fail with EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this does nothing.
 *
 * Returns a non-negative value on success, a negative errno-style
 * value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k], 0);

                /* -EFAULT means: unknown syscall, which is skipped */
                if (r >= 0 || r == -EFAULT)
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                goto finish;
        }

        /*
           Audit does not work in containers, and much of the
           userspace audit hookup fails when run inside of one. We
           do not care, and simply turn off creation of audit
           sockets.

           With this rule socket(AF_NETLINK, *, NETLINK_AUDIT) fails
           with EAFNOSUPPORT, which audit userspace takes as
           indication that audit is disabled in the kernel.
        */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on success and failure alike */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2084
2085 static int setup_propagate(const char *root) {
2086 const char *p, *q;
2087
2088 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2089 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2090 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2091 (void) mkdir_p(p, 0600);
2092
2093 q = strappenda(root, "/run/systemd/nspawn/incoming");
2094 mkdir_parents(q, 0755);
2095 mkdir_p(q, 0600);
2096
2097 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2098 return log_error_errno(errno, "Failed to install propagation bind mount.");
2099
2100 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2101 return log_error_errno(errno, "Failed to make propagation mount read-only");
2102
2103 return 0;
2104 }
2105
2106 static int setup_image(char **device_path, int *loop_nr) {
2107 struct loop_info64 info = {
2108 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2109 };
2110 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2111 _cleanup_free_ char* loopdev = NULL;
2112 struct stat st;
2113 int r, nr;
2114
2115 assert(device_path);
2116 assert(loop_nr);
2117 assert(arg_image);
2118
2119 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2120 if (fd < 0)
2121 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2122
2123 if (fstat(fd, &st) < 0)
2124 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2125
2126 if (S_ISBLK(st.st_mode)) {
2127 char *p;
2128
2129 p = strdup(arg_image);
2130 if (!p)
2131 return log_oom();
2132
2133 *device_path = p;
2134
2135 *loop_nr = -1;
2136
2137 r = fd;
2138 fd = -1;
2139
2140 return r;
2141 }
2142
2143 if (!S_ISREG(st.st_mode)) {
2144 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2145 return -EINVAL;
2146 }
2147
2148 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2149 if (control < 0)
2150 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2151
2152 nr = ioctl(control, LOOP_CTL_GET_FREE);
2153 if (nr < 0)
2154 return log_error_errno(errno, "Failed to allocate loop device: %m");
2155
2156 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2157 return log_oom();
2158
2159 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2160 if (loop < 0)
2161 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2162
2163 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2164 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2165
2166 if (arg_read_only)
2167 info.lo_flags |= LO_FLAGS_READ_ONLY;
2168
2169 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2170 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2171
2172 *device_path = loopdev;
2173 loopdev = NULL;
2174
2175 *loop_nr = nr;
2176
2177 r = loop;
2178 loop = -1;
2179
2180 return r;
2181 }
2182
#ifdef HAVE_BLKID
/* Records "node" as the device for one partition type, unless a device with
 * a lower or equal partition number has already been recorded for that type.
 *
 * device/device_nr/device_rw are the per-type slots that get updated; the
 * read-write flag is derived from GPT_FLAG_READ_ONLY in "flags".
 *
 * Returns 0 if the slot was updated or deliberately left alone, and the
 * result of log_oom() if duplicating the node path failed (in which case the
 * slot is left untouched). */
static int dissect_image_pick_partition(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, unsigned long long flags) {

        char *copy;

        assert(device);
        assert(device_nr);
        assert(device_rw);
        assert(node);

        /* The partition with the lowest number wins */
        if (*device && nr >= *device_nr)
                return 0;

        copy = strdup(node);
        if (!copy)
                return log_oom();

        free(*device);
        *device = copy;
        *device_nr = nr;
        *device_rw = !(flags & GPT_FLAG_READ_ONLY);

        return 0;
}
#endif

/* Probes the block device referenced by fd for a GPT partition table and
 * picks out the root, /home and /srv partitions by their GPT type IDs.
 *
 * Partitions carrying GPT_FLAG_NO_AUTO are ignored. If several partitions of
 * the same type exist, the one with the lowest partition number is used. The
 * native root partition is preferred over the secondary one.
 *
 * On success 0 is returned, *root_device/*root_device_rw are set, and
 * *secondary tells whether the secondary root partition type was used.
 * *home_device/*home_device_rw and *srv_device/*srv_device_rw are only
 * written if a matching partition was found. The device strings are newly
 * allocated and ownership passes to the caller.
 *
 * On failure a negative errno-style value is returned and none of the output
 * parameters are written. Without blkid support this always fails with
 * -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* libblkid does not reliably set errno, hence treat a failure with
         * errno still at 0 as OOM */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        /* pttype stays NULL if the lookup fails, which streq_ptr() copes with */
        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to list partitions of %s", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                if (sd_id128_equal(type_id, GPT_HOME)) {
                        r = dissect_image_pick_partition(&home, &home_nr, &home_rw, node, nr, flags);
                        if (r < 0)
                                return r;
                } else if (sd_id128_equal(type_id, GPT_SRV)) {
                        r = dissect_image_pick_partition(&srv, &srv_nr, &srv_rw, node, nr, flags);
                        if (r < 0)
                                return r;
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
                        r = dissect_image_pick_partition(&root, &root_nr, &root_rw, node, nr, flags);
                        if (r < 0)
                                return r;
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
                        r = dissect_image_pick_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, flags);
                        if (r < 0)
                                return r;
                }
#endif
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* From here on nothing can fail anymore, hence hand the results over
         * to the caller. Setting the local pointers to NULL keeps the
         * cleanup handlers from freeing what the caller now owns. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2436
/* Mounts the block device "what" on "where", or on "where" with "directory"
 * appended if "directory" is non-NULL. The file system type is detected with
 * libblkid.
 *
 * The mount is done with MS_NODEV, and read-only if either rw is false or
 * the global arg_read_only is set. LUKS volumes are refused with -ENOTSUP.
 *
 * Returns 0 on success, a negative errno-style value on failure (after
 * logging the problem). Without blkid support this always fails with
 * -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence treat a failure with
         * errno still at 0 as OOM */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. In both cases
                 * the file system type is simply unknown. A return value of
                 * -1 is a real probing error and is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                return log_error_errno(errno, "Failed to determine file system type of %s", what);
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2500
2501 static int mount_devices(
2502 const char *where,
2503 const char *root_device, bool root_device_rw,
2504 const char *home_device, bool home_device_rw,
2505 const char *srv_device, bool srv_device_rw) {
2506 int r;
2507
2508 assert(where);
2509
2510 if (root_device) {
2511 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2512 if (r < 0)
2513 return log_error_errno(r, "Failed to mount root directory: %m");
2514 }
2515
2516 if (home_device) {
2517 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2518 if (r < 0)
2519 return log_error_errno(r, "Failed to mount home directory: %m");
2520 }
2521
2522 if (srv_device) {
2523 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2524 if (r < 0)
2525 return log_error_errno(r, "Failed to mount server data directory: %m");
2526 }
2527
2528 return 0;
2529 }
2530
2531 static void loop_remove(int nr, int *image_fd) {
2532 _cleanup_close_ int control = -1;
2533 int r;
2534
2535 if (nr < 0)
2536 return;
2537
2538 if (image_fd && *image_fd >= 0) {
2539 r = ioctl(*image_fd, LOOP_CLR_FD);
2540 if (r < 0)
2541 log_warning_errno(errno, "Failed to close loop image: %m");
2542 *image_fd = safe_close(*image_fd);
2543 }
2544
2545 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2546 if (control < 0) {
2547 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2548 return;
2549 }
2550
2551 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2552 if (r < 0)
2553 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2554 }
2555
2556 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2557 int pipe_fds[2];
2558 pid_t pid;
2559
2560 assert(database);
2561 assert(key);
2562 assert(rpid);
2563
2564 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2565 return log_error_errno(errno, "Failed to allocate pipe: %m");
2566
2567 pid = fork();
2568 if (pid < 0)
2569 return log_error_errno(errno, "Failed to fork getent child: %m");
2570 else if (pid == 0) {
2571 int nullfd;
2572 char *empty_env = NULL;
2573
2574 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2575 _exit(EXIT_FAILURE);
2576
2577 if (pipe_fds[0] > 2)
2578 safe_close(pipe_fds[0]);
2579 if (pipe_fds[1] > 2)
2580 safe_close(pipe_fds[1]);
2581
2582 nullfd = open("/dev/null", O_RDWR);
2583 if (nullfd < 0)
2584 _exit(EXIT_FAILURE);
2585
2586 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2587 _exit(EXIT_FAILURE);
2588
2589 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2590 _exit(EXIT_FAILURE);
2591
2592 if (nullfd > 2)
2593 safe_close(nullfd);
2594
2595 reset_all_signal_handlers();
2596 close_all_fds(NULL, 0);
2597
2598 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2599 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2600 _exit(EXIT_FAILURE);
2601 }
2602
2603 pipe_fds[1] = safe_close(pipe_fds[1]);
2604
2605 *rpid = pid;
2606
2607 return pipe_fds[0];
2608 }
2609
2610 static int change_uid_gid(char **_home) {
2611 char line[LINE_MAX], *x, *u, *g, *h;
2612 const char *word, *state;
2613 _cleanup_free_ uid_t *uids = NULL;
2614 _cleanup_free_ char *home = NULL;
2615 _cleanup_fclose_ FILE *f = NULL;
2616 _cleanup_close_ int fd = -1;
2617 unsigned n_uids = 0;
2618 size_t sz = 0, l;
2619 uid_t uid;
2620 gid_t gid;
2621 pid_t pid;
2622 int r;
2623
2624 assert(_home);
2625
2626 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2627 /* Reset everything fully to 0, just in case */
2628
2629 if (setgroups(0, NULL) < 0)
2630 return log_error_errno(errno, "setgroups() failed: %m");
2631
2632 if (setresgid(0, 0, 0) < 0)
2633 return log_error_errno(errno, "setregid() failed: %m");
2634
2635 if (setresuid(0, 0, 0) < 0)
2636 return log_error_errno(errno, "setreuid() failed: %m");
2637
2638 *_home = NULL;
2639 return 0;
2640 }
2641
2642 /* First, get user credentials */
2643 fd = spawn_getent("passwd", arg_user, &pid);
2644 if (fd < 0)
2645 return fd;
2646
2647 f = fdopen(fd, "r");
2648 if (!f)
2649 return log_oom();
2650 fd = -1;
2651
2652 if (!fgets(line, sizeof(line), f)) {
2653
2654 if (!ferror(f)) {
2655 log_error("Failed to resolve user %s.", arg_user);
2656 return -ESRCH;
2657 }
2658
2659 log_error_errno(errno, "Failed to read from getent: %m");
2660 return -errno;
2661 }
2662
2663 truncate_nl(line);
2664
2665 wait_for_terminate_and_warn("getent passwd", pid, true);
2666
2667 x = strchr(line, ':');
2668 if (!x) {
2669 log_error("/etc/passwd entry has invalid user field.");
2670 return -EIO;
2671 }
2672
2673 u = strchr(x+1, ':');
2674 if (!u) {
2675 log_error("/etc/passwd entry has invalid password field.");
2676 return -EIO;
2677 }
2678
2679 u++;
2680 g = strchr(u, ':');
2681 if (!g) {
2682 log_error("/etc/passwd entry has invalid UID field.");
2683 return -EIO;
2684 }
2685
2686 *g = 0;
2687 g++;
2688 x = strchr(g, ':');
2689 if (!x) {
2690 log_error("/etc/passwd entry has invalid GID field.");
2691 return -EIO;
2692 }
2693
2694 *x = 0;
2695 h = strchr(x+1, ':');
2696 if (!h) {
2697 log_error("/etc/passwd entry has invalid GECOS field.");
2698 return -EIO;
2699 }
2700
2701 h++;
2702 x = strchr(h, ':');
2703 if (!x) {
2704 log_error("/etc/passwd entry has invalid home directory field.");
2705 return -EIO;
2706 }
2707
2708 *x = 0;
2709
2710 r = parse_uid(u, &uid);
2711 if (r < 0) {
2712 log_error("Failed to parse UID of user.");
2713 return -EIO;
2714 }
2715
2716 r = parse_gid(g, &gid);
2717 if (r < 0) {
2718 log_error("Failed to parse GID of user.");
2719 return -EIO;
2720 }
2721
2722 home = strdup(h);
2723 if (!home)
2724 return log_oom();
2725
2726 /* Second, get group memberships */
2727 fd = spawn_getent("initgroups", arg_user, &pid);
2728 if (fd < 0)
2729 return fd;
2730
2731 fclose(f);
2732 f = fdopen(fd, "r");
2733 if (!f)
2734 return log_oom();
2735 fd = -1;
2736
2737 if (!fgets(line, sizeof(line), f)) {
2738 if (!ferror(f)) {
2739 log_error("Failed to resolve user %s.", arg_user);
2740 return -ESRCH;
2741 }
2742
2743 log_error_errno(errno, "Failed to read from getent: %m");
2744 return -errno;
2745 }
2746
2747 truncate_nl(line);
2748
2749 wait_for_terminate_and_warn("getent initgroups", pid, true);
2750
2751 /* Skip over the username and subsequent separator whitespace */
2752 x = line;
2753 x += strcspn(x, WHITESPACE);
2754 x += strspn(x, WHITESPACE);
2755
2756 FOREACH_WORD(word, l, x, state) {
2757 char c[l+1];
2758
2759 memcpy(c, word, l);
2760 c[l] = 0;
2761
2762 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2763 return log_oom();
2764
2765 r = parse_uid(c, &uids[n_uids++]);
2766 if (r < 0) {
2767 log_error("Failed to parse group data from getent.");
2768 return -EIO;
2769 }
2770 }
2771
2772 r = mkdir_parents(home, 0775);
2773 if (r < 0)
2774 return log_error_errno(r, "Failed to make home root directory: %m");
2775
2776 r = mkdir_safe(home, 0755, uid, gid);
2777 if (r < 0 && r != -EEXIST)
2778 return log_error_errno(r, "Failed to make home directory: %m");
2779
2780 fchown(STDIN_FILENO, uid, gid);
2781 fchown(STDOUT_FILENO, uid, gid);
2782 fchown(STDERR_FILENO, uid, gid);
2783
2784 if (setgroups(n_uids, uids) < 0)
2785 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2786
2787 if (setresgid(gid, gid, gid) < 0)
2788 return log_error_errno(errno, "setregid() failed: %m");
2789
2790 if (setresuid(uid, uid, uid) < 0)
2791 return log_error_errno(errno, "setreuid() failed: %m");
2792
2793 if (_home) {
2794 *_home = home;
2795 home = NULL;
2796 }
2797
2798 return 0;
2799 }
2800
2801 /*
2802 * Return values:
2803 * < 0 : wait_for_terminate() failed to get the state of the
2804 * container, the container was terminated by a signal, or
2805 * failed for an unknown reason. No change is made to the
2806 * container argument.
2807 * > 0 : The program executed in the container terminated with an
2808 * error. The exit code of the program executed in the
2809 * container is returned. The container argument has been set
2810 * to CONTAINER_TERMINATED.
2811 * 0 : The container is being rebooted, has been shut down or exited
2812 * successfully. The container argument has been set to either
2813 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2814 *
2815 * That is, success is indicated by a return value of zero, and an
2816 * error is indicated by a non-zero value.
2817 */
2818 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2819 siginfo_t status;
2820 int r;
2821
2822 r = wait_for_terminate(pid, &status);
2823 if (r < 0)
2824 return log_warning_errno(r, "Failed to wait for container: %m");
2825
2826 switch (status.si_code) {
2827
2828 case CLD_EXITED:
2829 if (status.si_status == 0) {
2830 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2831
2832 } else
2833 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2834
2835 *container = CONTAINER_TERMINATED;
2836 return status.si_status;
2837
2838 case CLD_KILLED:
2839 if (status.si_status == SIGINT) {
2840
2841 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2842 *container = CONTAINER_TERMINATED;
2843 return 0;
2844
2845 } else if (status.si_status == SIGHUP) {
2846
2847 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2848 *container = CONTAINER_REBOOTED;
2849 return 0;
2850 }
2851
2852 /* CLD_KILLED fallthrough */
2853
2854 case CLD_DUMPED:
2855 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2856 return -EIO;
2857
2858 default:
2859 log_error("Container %s failed due to unknown reason.", arg_machine);
2860 return -EIO;
2861 }
2862
2863 return r;
2864 }
2865
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
2867
2868 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2869 pid_t pid;
2870
2871 pid = PTR_TO_UINT32(userdata);
2872 if (pid > 0) {
2873 if (kill(pid, SIGRTMIN+3) >= 0) {
2874 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2875 sd_event_source_set_userdata(s, NULL);
2876 return 0;
2877 }
2878 }
2879
2880 sd_event_exit(sd_event_source_get_event(s), 0);
2881 return 0;
2882 }
2883
2884 static int determine_names(void) {
2885
2886 if (!arg_image && !arg_directory) {
2887 if (arg_machine)
2888 arg_directory = strappend("/var/lib/container/", arg_machine);
2889 else
2890 arg_directory = get_current_dir_name();
2891
2892 if (!arg_directory) {
2893 log_error("Failed to determine path, please use -D.");
2894 return -EINVAL;
2895 }
2896 }
2897
2898 if (!arg_machine) {
2899 if (arg_directory && path_equal(arg_directory, "/"))
2900 arg_machine = gethostname_malloc();
2901 else
2902 arg_machine = strdup(basename(arg_image ?: arg_directory));
2903
2904 if (!arg_machine)
2905 return log_oom();
2906
2907 hostname_cleanup(arg_machine, false);
2908 if (!machine_name_is_valid(arg_machine)) {
2909 log_error("Failed to determine machine name automatically, please use -M.");
2910 return -EINVAL;
2911 }
2912
2913 if (arg_ephemeral) {
2914 char *b;
2915
2916 /* Add a random suffix when this is an
2917 * ephemeral machine, so that we can run many
2918 * instances at once without manually having
2919 * to specify -M each time. */
2920
2921 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2922 return log_oom();
2923
2924 free(arg_machine);
2925 arg_machine = b;
2926 }
2927 }
2928
2929 return 0;
2930 }
2931
2932 int main(int argc, char *argv[]) {
2933
2934 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2935 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2936 _cleanup_close_ int master = -1, image_fd = -1;
2937 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2938 _cleanup_fdset_free_ FDSet *fds = NULL;
2939 int r, n_fd_passed, loop_nr = -1;
2940 const char *console = NULL;
2941 char veth_name[IFNAMSIZ];
2942 bool secondary = false, remove_subvol = false;
2943 sigset_t mask, mask_chld;
2944 pid_t pid = 0;
2945 int ret = EXIT_SUCCESS;
2946
2947 log_parse_environment();
2948 log_open();
2949
2950 r = parse_argv(argc, argv);
2951 if (r <= 0)
2952 goto finish;
2953
2954 r = determine_names();
2955 if (r < 0)
2956 goto finish;
2957
2958 if (geteuid() != 0) {
2959 log_error("Need to be root.");
2960 r = -EPERM;
2961 goto finish;
2962 }
2963
2964 if (sd_booted() <= 0) {
2965 log_error("Not running on a systemd system.");
2966 r = -EINVAL;
2967 goto finish;
2968 }
2969
2970 log_close();
2971 n_fd_passed = sd_listen_fds(false);
2972 if (n_fd_passed > 0) {
2973 r = fdset_new_listen_fds(&fds, false);
2974 if (r < 0) {
2975 log_error_errno(r, "Failed to collect file descriptors: %m");
2976 goto finish;
2977 }
2978 }
2979 fdset_close_others(fds);
2980 log_open();
2981
2982 if (arg_directory) {
2983 assert(!arg_image);
2984
2985 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
2986 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
2987 r = -EINVAL;
2988 goto finish;
2989 }
2990
2991 if (arg_template) {
2992 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
2993 if (r == -EEXIST) {
2994 if (!arg_quiet)
2995 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
2996 } else if (r < 0) {
2997 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
2998 goto finish;
2999 } else {
3000 if (!arg_quiet)
3001 log_info("Populated %s from template %s.", arg_directory, arg_template);
3002 }
3003
3004 } else if (arg_ephemeral) {
3005 char *np;
3006
3007 /* If the specified path is a mount point we
3008 * generate the new snapshot immediately
3009 * inside it under a random name. However if
3010 * the specified is not a mount point we
3011 * create the new snapshot in the parent
3012 * directory, just next to it. */
3013 r = path_is_mount_point(arg_directory, false);
3014 if (r < 0) {
3015 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3016 goto finish;
3017 }
3018 if (r > 0)
3019 r = tempfn_random_child(arg_directory, &np);
3020 else
3021 r = tempfn_random(arg_directory, &np);
3022 if (r < 0) {
3023 log_error_errno(r, "Failed to generate name for snapshot: %m");
3024 goto finish;
3025 }
3026
3027 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3028 if (r < 0) {
3029 free(np);
3030 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3031 goto finish;
3032 }
3033
3034 free(arg_directory);
3035 arg_directory = np;
3036
3037 remove_subvol = true;
3038 }
3039
3040 if (arg_boot) {
3041 if (path_is_os_tree(arg_directory) <= 0) {
3042 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3043 r = -EINVAL;
3044 goto finish;
3045 }
3046 } else {
3047 const char *p;
3048
3049 p = strappenda(arg_directory,
3050 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3051 if (access(p, F_OK) < 0) {
3052 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3053 r = -EINVAL;
3054 goto finish;
3055 }
3056 }
3057
3058 } else {
3059 char template[] = "/tmp/nspawn-root-XXXXXX";
3060
3061 assert(arg_image);
3062 assert(!arg_template);
3063
3064 if (!mkdtemp(template)) {
3065 log_error_errno(errno, "Failed to create temporary directory: %m");
3066 r = -errno;
3067 goto finish;
3068 }
3069
3070 arg_directory = strdup(template);
3071 if (!arg_directory) {
3072 r = log_oom();
3073 goto finish;
3074 }
3075
3076 image_fd = setup_image(&device_path, &loop_nr);
3077 if (image_fd < 0) {
3078 r = image_fd;
3079 goto finish;
3080 }
3081
3082 r = dissect_image(image_fd,
3083 &root_device, &root_device_rw,
3084 &home_device, &home_device_rw,
3085 &srv_device, &srv_device_rw,
3086 &secondary);
3087 if (r < 0)
3088 goto finish;
3089 }
3090
3091 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3092 if (master < 0) {
3093 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3094 goto finish;
3095 }
3096
3097 console = ptsname(master);
3098 if (!console) {
3099 r = log_error_errno(errno, "Failed to determine tty name: %m");
3100 goto finish;
3101 }
3102
3103 if (!arg_quiet)
3104 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3105 arg_machine, arg_image ?: arg_directory);
3106
3107 if (unlockpt(master) < 0) {
3108 r = log_error_errno(errno, "Failed to unlock tty: %m");
3109 goto finish;
3110 }
3111
3112 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3113 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3114 goto finish;
3115 }
3116
3117 sd_notify(false,
3118 "READY=1\n"
3119 "STATUS=Container running.");
3120
3121 assert_se(sigemptyset(&mask) == 0);
3122 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3123 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3124
3125 assert_se(sigemptyset(&mask_chld) == 0);
3126 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3127
3128 for (;;) {
3129 ContainerStatus container_status;
3130 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3131 struct sigaction sa = {
3132 .sa_handler = nop_handler,
3133 .sa_flags = SA_NOCLDSTOP,
3134 };
3135
3136 r = barrier_create(&barrier);
3137 if (r < 0) {
3138 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3139 goto finish;
3140 }
3141
3142 /* Child can be killed before execv(), so handle SIGCHLD
3143 * in order to interrupt parent's blocking calls and
3144 * give it a chance to call wait() and terminate. */
3145 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3146 if (r < 0) {
3147 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3148 goto finish;
3149 }
3150
3151 r = sigaction(SIGCHLD, &sa, NULL);
3152 if (r < 0) {
3153 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3154 goto finish;
3155 }
3156
3157 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3158 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3159 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3160 if (pid < 0) {
3161 if (errno == EINVAL)
3162 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3163 else
3164 r = log_error_errno(errno, "clone() failed: %m");
3165
3166 goto finish;
3167 }
3168
3169 if (pid == 0) {
3170 /* child */
3171 _cleanup_free_ char *home = NULL;
3172 unsigned n_env = 2;
3173 const char *envp[] = {
3174 "PATH=" DEFAULT_PATH_SPLIT_USR,
3175 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3176 NULL, /* TERM */
3177 NULL, /* HOME */
3178 NULL, /* USER */
3179 NULL, /* LOGNAME */
3180 NULL, /* container_uuid */
3181 NULL, /* LISTEN_FDS */
3182 NULL, /* LISTEN_PID */
3183 NULL
3184 };
3185 char **env_use;
3186
3187 barrier_set_role(&barrier, BARRIER_CHILD);
3188
3189 envp[n_env] = strv_find_prefix(environ, "TERM=");
3190 if (envp[n_env])
3191 n_env ++;
3192
3193 master = safe_close(master);
3194
3195 close_nointr(STDIN_FILENO);
3196 close_nointr(STDOUT_FILENO);
3197 close_nointr(STDERR_FILENO);
3198
3199 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3200
3201 reset_all_signal_handlers();
3202 reset_signal_mask();
3203
3204 r = open_terminal(console, O_RDWR);
3205 if (r != STDIN_FILENO) {
3206 if (r >= 0) {
3207 safe_close(r);
3208 r = -EINVAL;
3209 }
3210
3211 log_error_errno(r, "Failed to open console: %m");
3212 _exit(EXIT_FAILURE);
3213 }
3214
3215 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3216 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3217 log_error_errno(errno, "Failed to duplicate console: %m");
3218 _exit(EXIT_FAILURE);
3219 }
3220
3221 if (setsid() < 0) {
3222 log_error_errno(errno, "setsid() failed: %m");
3223 _exit(EXIT_FAILURE);
3224 }
3225
3226 if (reset_audit_loginuid() < 0)
3227 _exit(EXIT_FAILURE);
3228
3229 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3230 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3231 _exit(EXIT_FAILURE);
3232 }
3233
3234 /* Mark everything as slave, so that we still
3235 * receive mounts from the real root, but don't
3236 * propagate mounts to the real root. */
3237 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3238 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3239 _exit(EXIT_FAILURE);
3240 }
3241
3242 if (mount_devices(arg_directory,
3243 root_device, root_device_rw,
3244 home_device, home_device_rw,
3245 srv_device, srv_device_rw) < 0)
3246 _exit(EXIT_FAILURE);
3247
3248 /* Turn directory into bind mount */
3249 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3250 log_error_errno(errno, "Failed to make bind mount: %m");
3251 _exit(EXIT_FAILURE);
3252 }
3253
3254 r = setup_volatile(arg_directory);
3255 if (r < 0)
3256 _exit(EXIT_FAILURE);
3257
3258 if (setup_volatile_state(arg_directory) < 0)
3259 _exit(EXIT_FAILURE);
3260
3261 r = base_filesystem_create(arg_directory);
3262 if (r < 0)
3263 _exit(EXIT_FAILURE);
3264
3265 if (arg_read_only) {
3266 r = bind_remount_recursive(arg_directory, true);
3267 if (r < 0) {
3268 log_error_errno(r, "Failed to make tree read-only: %m");
3269 _exit(EXIT_FAILURE);
3270 }
3271 }
3272
3273 if (mount_all(arg_directory) < 0)
3274 _exit(EXIT_FAILURE);
3275
3276 if (copy_devnodes(arg_directory) < 0)
3277 _exit(EXIT_FAILURE);
3278
3279 if (setup_ptmx(arg_directory) < 0)
3280 _exit(EXIT_FAILURE);
3281
3282 dev_setup(arg_directory);
3283
3284 if (setup_propagate(arg_directory) < 0)
3285 _exit(EXIT_FAILURE);
3286
3287 if (setup_seccomp() < 0)
3288 _exit(EXIT_FAILURE);
3289
3290 if (setup_dev_console(arg_directory, console) < 0)
3291 _exit(EXIT_FAILURE);
3292
3293 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3294 _exit(EXIT_FAILURE);
3295
3296 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3297
3298 if (setup_boot_id(arg_directory) < 0)
3299 _exit(EXIT_FAILURE);
3300
3301 if (setup_timezone(arg_directory) < 0)
3302 _exit(EXIT_FAILURE);
3303
3304 if (setup_resolv_conf(arg_directory) < 0)
3305 _exit(EXIT_FAILURE);
3306
3307 if (setup_journal(arg_directory) < 0)
3308 _exit(EXIT_FAILURE);
3309
3310 if (mount_binds(arg_directory, arg_bind, false) < 0)
3311 _exit(EXIT_FAILURE);
3312
3313 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3314 _exit(EXIT_FAILURE);
3315
3316 if (mount_tmpfs(arg_directory) < 0)
3317 _exit(EXIT_FAILURE);
3318
3319 /* Tell the parent that we are ready, and that
3320 * it can cgroupify us to that we lack access
3321 * to certain devices and resources. */
3322 (void)barrier_place(&barrier);
3323
3324 if (chdir(arg_directory) < 0) {
3325 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3326 _exit(EXIT_FAILURE);
3327 }
3328
3329 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3330 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3331 _exit(EXIT_FAILURE);
3332 }
3333
3334 if (chroot(".") < 0) {
3335 log_error_errno(errno, "chroot() failed: %m");
3336 _exit(EXIT_FAILURE);
3337 }
3338
3339 if (chdir("/") < 0) {
3340 log_error_errno(errno, "chdir() failed: %m");
3341 _exit(EXIT_FAILURE);
3342 }
3343
3344 umask(0022);
3345
3346 if (arg_private_network)
3347 loopback_setup();
3348
3349 if (drop_capabilities() < 0) {
3350 log_error_errno(errno, "drop_capabilities() failed: %m");
3351 _exit(EXIT_FAILURE);
3352 }
3353
3354 r = change_uid_gid(&home);
3355 if (r < 0)
3356 _exit(EXIT_FAILURE);
3357
3358 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3359 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3360 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3361 log_oom();
3362 _exit(EXIT_FAILURE);
3363 }
3364
3365 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3366 char as_uuid[37];
3367
3368 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3369 log_oom();
3370 _exit(EXIT_FAILURE);
3371 }
3372 }
3373
3374 if (fdset_size(fds) > 0) {
3375 r = fdset_cloexec(fds, false);
3376 if (r < 0) {
3377 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3378 _exit(EXIT_FAILURE);
3379 }
3380
3381 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3382 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3383 log_oom();
3384 _exit(EXIT_FAILURE);
3385 }
3386 }
3387
3388 setup_hostname();
3389
3390 if (arg_personality != 0xffffffffLU) {
3391 if (personality(arg_personality) < 0) {
3392 log_error_errno(errno, "personality() failed: %m");
3393 _exit(EXIT_FAILURE);
3394 }
3395 } else if (secondary) {
3396 if (personality(PER_LINUX32) < 0) {
3397 log_error_errno(errno, "personality() failed: %m");
3398 _exit(EXIT_FAILURE);
3399 }
3400 }
3401
3402 #ifdef HAVE_SELINUX
3403 if (arg_selinux_context)
3404 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3405 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3406 _exit(EXIT_FAILURE);
3407 }
3408 #endif
3409
3410 if (!strv_isempty(arg_setenv)) {
3411 char **n;
3412
3413 n = strv_env_merge(2, envp, arg_setenv);
3414 if (!n) {
3415 log_oom();
3416 _exit(EXIT_FAILURE);
3417 }
3418
3419 env_use = n;
3420 } else
3421 env_use = (char**) envp;
3422
3423 /* Wait until the parent is ready with the setup, too... */
3424 if (!barrier_place_and_sync(&barrier))
3425 _exit(EXIT_FAILURE);
3426
3427 if (arg_boot) {
3428 char **a;
3429 size_t l;
3430
3431 /* Automatically search for the init system */
3432
3433 l = 1 + argc - optind;
3434 a = newa(char*, l + 1);
3435 memcpy(a + 1, argv + optind, l * sizeof(char*));
3436
3437 a[0] = (char*) "/usr/lib/systemd/systemd";
3438 execve(a[0], a, env_use);
3439
3440 a[0] = (char*) "/lib/systemd/systemd";
3441 execve(a[0], a, env_use);
3442
3443 a[0] = (char*) "/sbin/init";
3444 execve(a[0], a, env_use);
3445 } else if (argc > optind)
3446 execvpe(argv[optind], argv + optind, env_use);
3447 else {
3448 chdir(home ? home : "/root");
3449 execle("/bin/bash", "-bash", NULL, env_use);
3450 execle("/bin/sh", "-sh", NULL, env_use);
3451 }
3452
3453 log_error_errno(errno, "execv() failed: %m");
3454 _exit(EXIT_FAILURE);
3455 }
3456
3457 barrier_set_role(&barrier, BARRIER_PARENT);
3458 fdset_free(fds);
3459 fds = NULL;
3460
3461 /* wait for child-setup to be done */
3462 if (barrier_place_and_sync(&barrier)) {
3463 _cleanup_event_unref_ sd_event *event = NULL;
3464 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3465 int ifi = 0;
3466
3467 r = move_network_interfaces(pid);
3468 if (r < 0)
3469 goto finish;
3470
3471 r = setup_veth(pid, veth_name, &ifi);
3472 if (r < 0)
3473 goto finish;
3474
3475 r = setup_bridge(veth_name, &ifi);
3476 if (r < 0)
3477 goto finish;
3478
3479 r = setup_macvlan(pid);
3480 if (r < 0)
3481 goto finish;
3482
3483 r = register_machine(pid, ifi);
3484 if (r < 0)
3485 goto finish;
3486
3487 /* Block SIGCHLD here, before notifying child.
3488 * process_pty() will handle it with the other signals. */
3489 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3490 if (r < 0)
3491 goto finish;
3492
3493 /* Reset signal to default */
3494 r = default_signals(SIGCHLD, -1);
3495 if (r < 0)
3496 goto finish;
3497
3498 /* Notify the child that the parent is ready with all
3499 * its setup, and that the child can now hand over
3500 * control to the code to run inside the container. */
3501 (void)barrier_place(&barrier);
3502
3503 r = sd_event_new(&event);
3504 if (r < 0) {
3505 log_error_errno(r, "Failed to get default event source: %m");
3506 goto finish;
3507 }
3508
3509 if (arg_boot) {
3510 /* Try to kill the init system on SIGINT or SIGTERM */
3511 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3512 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3513 } else {
3514 /* Immediately exit */
3515 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3516 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3517 }
3518
3519 /* simply exit on sigchld */
3520 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3521
3522 r = pty_forward_new(event, master, &forward);
3523 if (r < 0) {
3524 log_error_errno(r, "Failed to create PTY forwarder: %m");
3525 goto finish;
3526 }
3527
3528 r = sd_event_loop(event);
3529 if (r < 0) {
3530 log_error_errno(r, "Failed to run event loop: %m");
3531 goto finish;
3532 }
3533
3534 forward = pty_forward_free(forward);
3535
3536 if (!arg_quiet)
3537 putc('\n', stdout);
3538
3539 /* Kill if it is not dead yet anyway */
3540 terminate_machine(pid);
3541 }
3542
3543 /* Normally redundant, but better safe than sorry */
3544 kill(pid, SIGKILL);
3545
3546 r = wait_for_container(pid, &container_status);
3547 pid = 0;
3548
3549 if (r < 0)
3550 /* We failed to wait for the container, or the
3551 * container exited abnormally */
3552 goto finish;
3553 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3554 /* The container exited with a non-zero
3555 * status, or with zero status and no reboot
3556 * was requested. */
3557 ret = r;
3558 break;
3559 }
3560
3561 /* CONTAINER_REBOOTED, loop again */
3562
3563 if (arg_keep_unit) {
3564 /* Special handling if we are running as a
3565 * service: instead of simply restarting the
3566 * machine we want to restart the entire
3567 * service, so let's inform systemd about this
3568 * with the special exit code 133. The service
3569 * file uses RestartForceExitStatus=133 so
3570 * that this results in a full nspawn
3571 * restart. This is necessary since we might
3572 * have cgroup parameters set we want to have
3573 * flushed out. */
3574 ret = 133;
3575 r = 0;
3576 break;
3577 }
3578 }
3579
3580 finish:
3581 sd_notify(false,
3582 "STOPPING=1\n"
3583 "STATUS=Terminating...");
3584
3585 loop_remove(loop_nr, &image_fd);
3586
3587 if (pid > 0)
3588 kill(pid, SIGKILL);
3589
3590 if (remove_subvol && arg_directory) {
3591 int k;
3592
3593 k = btrfs_subvol_remove(arg_directory);
3594 if (k < 0)
3595 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3596 }
3597
3598 if (arg_machine) {
3599 const char *p;
3600
3601 p = strappenda("/run/systemd/nspawn/propagate", arg_machine);
3602 (void) rm_rf(p, false, true, false);
3603 }
3604
3605 free(arg_directory);
3606 free(arg_template);
3607 free(arg_image);
3608 free(arg_machine);
3609 free(arg_user);
3610 strv_free(arg_setenv);
3611 strv_free(arg_network_interfaces);
3612 strv_free(arg_network_macvlan);
3613 strv_free(arg_bind);
3614 strv_free(arg_bind_ro);
3615 strv_free(arg_tmpfs);
3616
3617 return r < 0 ? EXIT_FAILURE : ret;
3618 }